Commit bc5ebf0f authored by luopl

Initial commit

import torch
from transformers import AutoTokenizer, AutoModel
import warnings
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import pandas as pd
import string
class H2OVLChat(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='h2oai/h2ovl-mississippi-2b', **kwargs):
assert model_path is not None
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
device = torch.cuda.current_device()
self.device = device
self.model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval()
self.model = self.model.to(device)
self.image_size = self.model.config.vision_config.image_size
kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=1)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def use_custom_prompt(self, dataset):
return True
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
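# Illustrative example (hypothetical row values, not taken from any benchmark): for a line with
# question='Which animal is larger?', A='Cat', B='Whale' and no hint, the returned prompt is
# "Which animal is larger?\nA. Cat\nB. Whale\nAnswer with the option's letter from the given choices directly."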
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and listinstr(['MME'], dataset):
question = line['question']
prompt = question + ' Answer the question using a single word or phrase.'
elif dataset is not None and listinstr(['HallusionBench'], dataset):
question = line['question']
prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
if 'MathVista' in dataset:
prompt = line['question']
elif listinstr(['LLaVABench'], dataset):
question = line['question']
prompt = question + '\nAnswer this question in detail.'
elif listinstr(['MMVet'], dataset):
prompt = line['question']
else:
question = line['question']
prompt = question + '\nAnswer the question using a single word or phrase.'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def generate_inner(self, message, dataset=None):
image_num = len([x for x in message if x['type'] == 'image'])
question = ''
image_files = [x['value'] for x in message if x['type'] == 'image']
if image_num == 1:
question = '<image>\n' + '\n'.join([x['value'] for x in message if x['type'] == 'text'])
elif image_num > 1:
text_part = ' '.join([x['value'] for x in message if x['type'] == 'text'])
image_part = ' '.join([f'<image-{i + 1}>: <image>' for i in range(image_num)])
question = image_part + '\n' + text_part
else:
question = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image_files = None
response, history = self.model.chat(
self.tokenizer,
image_files=image_files,
question=question,
generation_config=self.kwargs,
max_tiles=6,
history=None,
return_history=True)
return response
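# Minimal usage sketch (assumes the default h2oai/h2ovl-mississippi-2b checkpoint is available and
# 'demo.jpg' is a placeholder image path, not a file shipped with this repo):
#   model = H2OVLChat()
#   reply = model.generate_inner([
#       dict(type='image', value='demo.jpg'),
#       dict(type='text', value='Describe the image.'),
#   ])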
import torch
import os.path as osp
import warnings
from .base import BaseModel
from ..smp import splitlen, listinstr
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
class IDEFICS(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='HuggingFaceM4/idefics-9b-instruct', **kwargs):
assert osp.exists(model_path) or splitlen(model_path) == 2
from transformers import IdeficsForVisionText2Text, AutoProcessor
self.model = IdeficsForVisionText2Text.from_pretrained(
model_path, torch_dtype=torch.bfloat16, device_map='auto'
)
self.processor = AutoProcessor.from_pretrained(model_path)
kwargs_default = {'max_new_tokens': 512}
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
self.file_root = osp.dirname(__file__)
warnings.warn(
f'Following kwargs received: {self.kwargs}, will use as generation config. '
)
def generate_inner(self, message, dataset=None):
prompts = (
['Users:']
+ [msg['value'] if msg['type'] == 'text' else Image.open(msg['value']) for msg in message]
+ ['<end_of_utterance>', '\nAssistant: ']
)
inputs = self.processor(
prompts, add_end_of_utterance_token=False, return_tensors='pt'
).to('cuda')
exit_condition = self.processor.tokenizer(
'<end_of_utterance>', add_special_tokens=False
).input_ids
bad_words_ids = self.processor.tokenizer(
['<image>', '<fake_token_around_image>'], add_special_tokens=False
).input_ids
generated_ids = self.model.generate(
**inputs,
eos_token_id=exit_condition,
bad_words_ids=bad_words_ids,
**self.kwargs,
)
generated_text = self.processor.batch_decode(
generated_ids, skip_special_tokens=True
)
text = generated_text[0].split('\nAssistant: ')[-1]
return text
class IDEFICS2(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def __init__(self, model_path='HuggingFaceM4/idefics2-8b', **kwargs):
assert model_path is not None
self.model_path = model_path
if 'idefics3' in self.model_path.lower():
warnings.warn('Install transformers from source: PR https://github.com/open-compass/VLMEvalKit/pull/379')
warnings.warn('Reference: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3')
self.processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForVision2Seq.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
_attn_implementation='flash_attention_2',
device_map='cpu')
self.model = model.to('cuda')
kwargs_default = {'max_new_tokens': 1024}
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(
f'Following kwargs received: {self.kwargs}, will use as generation config. '
)
torch.cuda.empty_cache()
def _process(self, formatted_messages, formatted_images):
inputs = self.processor(
text=formatted_messages, images=formatted_images, return_tensors='pt'
)
inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
return inputs
def build_prompt_default(self, message, add_brief=False, add_yes_or_no=False, change_the_img_place=False):
if change_the_img_place:
new_message = []
for s in message:
if s['type'] == 'image':
new_message.append(s)
for s in message:
if s['type'] == 'text':
new_message.append(s)
message = new_message
prompt, images = 'User:', []
for msg in message:
if msg['type'] == 'image':
img = load_image(msg['value'])
images.append(img)
prompt += '<image>'
elif msg['type'] == 'text':
prompt += msg['value'].strip()
if add_brief:
prompt += '\nGive a very brief answer.'
if add_yes_or_no:
prompt += '\nAnswer yes or no.'
prompt += '<end_of_utterance>\nAssistant:'
return prompt, images
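# Example (illustrative): one image plus the text 'How many dogs are there?' with add_brief=True yields
#   'User:<image>How many dogs are there?\nGive a very brief answer.<end_of_utterance>\nAssistant:'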
def build_prompt_puremcq(self, message):
replace_mapping = {
'\nOptions:': '\nChoices:',
'Please select the correct answer from the options above.': 'Answer with the letter.',
}
prompt, images = 'User:', []
for msg in message:
if msg['type'] == 'image':
img = load_image(msg['value'])
images.append(img)
prompt += '<image>'
elif msg['type'] == 'text':
instruction = msg['value'].strip()
for k, v in replace_mapping.items():
instruction = instruction.replace(k, v)
prompt += instruction
prompt += '<end_of_utterance>\nAssistant: Answer:'
return prompt, images
def build_prompt_mt(self, message):
prompt, images = '', []
for msg in message:
if msg['role'] == 'user':
prompt += 'User: '
elif msg['role'] == 'assistant':
prompt += 'Assistant: '
for item in msg['content']:
if item['type'] == 'image':
img = load_image(item['value'])
images.append(img)
prompt += '<image>'
elif item['type'] == 'text':
prompt += item['value'].strip()
prompt += '<end_of_utterance>\n'
return prompt + 'Assistant: '
def build_prompt_mmbench(self, message):
replace_mapping = {
'\nOptions:': '\nChoices:',
'Please select the correct answer from the options above.': 'Answer with a letter.',
}
prompt, images = 'User:', []
for msg in message:
if msg['type'] == 'image':
img = load_image(msg['value'])
images.append(img)
prompt += '<image>'
elif msg['type'] == 'text':
instruction = msg['value'].strip()
for k, v in replace_mapping.items():
instruction = instruction.replace(k, v)
# Swap hint and question
if instruction.startswith('Hint:'):
hint, question = instruction.split('\nQuestion:')
question, choices = question.split('\nChoices:')
instruction = (
'Question:' + question + '\n' + hint + '\nChoices:' + choices
)
prompt += instruction
prompt += '<end_of_utterance>\nAssistant: Answer:'
return prompt, images
def build_prompt_mmmu(self, message):
replace_mapping = {
'Question:': '',
'Please select the correct answer from the options above.': 'Answer with the letter.',
'\nOptions:': '\nChoices:',
}
prompt, images, img_counter = 'User: Question: ', [], 1
for msg in message:
if msg['type'] == 'image':
prompt += f'<image {img_counter}>:<image>\n'
img_counter += 1
img_counter = 1
for msg in message:
if msg['type'] == 'image':
img = load_image(msg['value'])
images.append(img)
prompt += f' <image {img_counter}> '
img_counter += 1
elif msg['type'] == 'text':
instruction = msg['value'].strip()
for k, v in replace_mapping.items():
instruction = instruction.replace(k, v)
prompt += instruction.strip()
prompt += '<end_of_utterance>\nAssistant:'
if 'A.' in prompt and 'B.' in prompt:
prompt += ' Answer:'
return prompt, images
def build_prompt_mathvista(self, message):
replace_mapping = {
'(A) ': 'A. ',
'(B) ': 'B. ',
'(C) ': 'C. ',
'(D) ': 'D. ',
'(E) ': 'E. ',
'(F) ': 'F. ',
'(G) ': 'G. ',
'(H) ': 'H. ',
'\nOptions:': '\nChoices:',
'Hint: ': '',
}
prompt, images = 'User:', []
for msg in message:
if msg['type'] == 'image':
img = load_image(msg['value'])
images.append(img)
prompt += '<image>'
elif msg['type'] == 'text':
instruction = msg['value'].strip()
for k, v in replace_mapping.items():
instruction = instruction.replace(k, v)
prompt += instruction.strip()
if 'A.' in prompt and 'B.' in prompt:
prompt += '\nAnswer with the letter.'
prompt += '<end_of_utterance>\nAssistant:'
if 'A.' in prompt and 'B.' in prompt:
prompt += ' Answer:'
return prompt, images
def chat_inner(self, message, dataset=None):
formatted_messages, formatted_images = self.build_prompt_mt(message)
inputs = self._process(formatted_messages, formatted_images)
generated_ids = self.model.generate(**inputs, **self.kwargs)
generated_text = self.processor.batch_decode(
generated_ids[:, inputs['input_ids'].size(1):], skip_special_tokens=True
)[0]
response = generated_text.strip()
# print(dataset, " | ", formatted_messages.replace("\n", "\\n"), " | ", response.replace("\n", "\\n"))
return response
def generate_inner(self, message, dataset=None):
if dataset in [
'MMBench_DEV_EN', 'MMBench_DEV_EN_V11',
'MMBench_TEST_EN', 'MMBench_TEST_EN_V11',
'MMBench_DEV_CN', 'MMBench_DEV_CN_V11',
'MMBench_TEST_CN', 'MMBench_TEST_CN_V11',
'MMBench', 'MMBench_V11', 'MMBench_CN', 'MMBench_CN_V11'
]:
formatted_messages, formatted_images = self.build_prompt_mmbench(message)
elif dataset in ['MMMU_DEV_VAL', 'MMMU_TEST']:
formatted_messages, formatted_images = self.build_prompt_mmmu(message)
elif dataset in ['MathVista_MINI']:
formatted_messages, formatted_images = self.build_prompt_mathvista(message)
elif dataset in [
'MME',
'MMVet',
'OCRVQA_TEST',
'OCRVQA_TESTCORE',
'TextVQA_VAL',
'ChartQA_TEST',
'DocVQA_VAL',
'DocVQA_TEST',
'InfoVQA_VAL',
'InfoVQA_TEST',
]:
formatted_messages, formatted_images = self.build_prompt_default(
message, add_brief=True
)
elif dataset == 'HallusionBench':
formatted_messages, formatted_images = self.build_prompt_default(
message, add_yes_or_no=True
)
elif dataset in [
'MMStar',
'SEEDBench_IMG',
'AI2D_TEST',
'ScienceQA_VAL',
'ScienceQA_TEST',
]:
formatted_messages, formatted_images = self.build_prompt_puremcq(message)
elif listinstr(['MLVU','TempCompass','MVBench'], dataset):
formatted_messages, formatted_images = self.build_prompt_default(message, change_the_img_place=True)
else:
formatted_messages, formatted_images = self.build_prompt_default(message)
inputs = self._process(formatted_messages, formatted_images)
generated_ids = self.model.generate(**inputs, **self.kwargs)
generated_text = self.processor.batch_decode(
generated_ids[:, inputs['input_ids'].size(1):], skip_special_tokens=True
)[0]
response = generated_text.strip()
# print(dataset, " | ", formatted_messages.replace("\n", "\\n"), " | ", response.replace("\n", "\\n"))
return response
import torch
from PIL import Image
import os.path as osp
import sys
from .base import BaseModel
from ..smp import *
class InstructBLIP(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, name):
self.config_map = {
'instructblip_7b': 'misc/blip2_instruct_vicuna7b.yaml',
'instructblip_13b': 'misc/blip2_instruct_vicuna13b.yaml',
}
self.file_path = __file__
config_root = osp.dirname(self.file_path)
try:
from lavis.models import load_preprocess
from omegaconf import OmegaConf
from lavis.common.registry import registry
except Exception as e:
logging.critical('Please install lavis before using InstructBLIP. ')
raise e
assert name in self.config_map
cfg_path = osp.join(config_root, self.config_map[name])
cfg = OmegaConf.load(cfg_path)
model_cfg = cfg.model
assert osp.exists(model_cfg.llm_model) or splitlen(model_cfg.llm_model) == 2
model_cls = registry.get_model_class(name='blip2_vicuna_instruct')
model = model_cls.from_config(model_cfg)
model.eval()
self.device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'
device = self.device
model.to(device)
self.model = model
self.kwargs = {'max_length': 512}
preprocess_cfg = cfg.preprocess
vis_processors, _ = load_preprocess(preprocess_cfg)
self.vis_processors = vis_processors
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
vis_processors = self.vis_processors
raw_image = Image.open(image_path).convert('RGB')
image_tensor = vis_processors['eval'](raw_image).unsqueeze(0).to(self.device)
outputs = self.model.generate(dict(image=image_tensor, prompt=prompt))
return outputs[0]
from .internvl_chat import InternVLChat
__all__ = ['InternVLChat']
import math
import pandas as pd
import random
import re
import string
import torch
import torch.distributed as dist
import torchvision.transforms as T
import transformers
import warnings
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoTokenizer, AutoConfig, AutoModel, CLIPImageProcessor
from .utils import (build_multi_choice_prompt,
build_video_prompt,
build_mpo_prompt,
build_mcq_cot_prompt,
build_qa_cot_prompt,
mpo_post_processing,
reorganize_prompt,
split_model, load_image)
from .utils import mpo_prompt_with_final_answer, mpo_prompt_without_final_answer
from ..base import BaseModel
from ...dataset import DATASET_TYPE, DATASET_MODALITY
from ...smp import *
class InternVLChat(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self,
model_path='OpenGVLab/InternVL-Chat-V1-5',
load_in_8bit=False,
use_mpo_prompt=False,
version='V1.0',
**kwargs):
assert model_path is not None
assert version_cmp(transformers.__version__, '4.37.2', 'ge')
self.use_mpo_prompt = use_mpo_prompt
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
# Regular expression to match the pattern 'Image' followed by a number, e.g. Image1
self.pattern = r'Image(\d+)'
# Replacement pattern to insert a hyphen between 'Image' and the number, e.g. Image-1
self.replacement = r'Image-\1'
# Convert InternVL2 response to dataset format
# e.g. Image1 -> Image-1
# Regular expression to match the pattern 'Image-' followed by a number
self.reverse_pattern = r'Image-(\d+)'
# Replacement pattern to remove the hyphen (Image-1 -> Image1)
self.reverse_replacement = r'Image\1'
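# For illustration: re.sub(self.pattern, self.replacement, 'Image1 and Image2') -> 'Image-1 and Image-2',
# and re.sub(self.reverse_pattern, self.reverse_replacement, 'Image-1') -> 'Image1'.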
if auto_split_flag():
device_map, visible_devices = split_model(model_path=model_path)
self.device = visible_devices[0]
self.model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
load_in_8bit=load_in_8bit,
trust_remote_code=True,
low_cpu_mem_usage=True,
device_map=device_map).eval()
else:
self.model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
load_in_8bit=load_in_8bit,
trust_remote_code=True,
low_cpu_mem_usage=True).eval().cuda()
self.device = 'cuda'
self.image_size = self.model.config.vision_config.image_size
self.version = version
kwargs_default = dict(do_sample=False, max_new_tokens=4096, top_p=None)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
# For multi-turn datasets we don't have a custom prompt
return False
if DATASET_MODALITY(dataset) == 'VIDEO':
# For video benchmarks we don't have a custom prompt here
return False
else:
return True
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and DATASET_TYPE(dataset) == 'Y/N':
question = line['question']
if listinstr(['MME'], dataset):
prompt = question + ' Answer the question using a single word or phrase.'
elif listinstr(['HallusionBench', 'AMBER'], dataset):
prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
else:
prompt = question
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt = build_multi_choice_prompt(line, dataset)
if os.getenv('USE_COT') == '1':
prompt = build_mcq_cot_prompt(line, prompt)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
question = line['question']
if listinstr(['LLaVABench', 'WildVision'], dataset):
prompt = question + '\nAnswer this question in detail.'
elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA', 'InfoVQA', 'OCRBench',
'DUDE', 'SLIDEVQA', 'GQA', 'MMLongBench_DOC'], dataset):
prompt = question + '\nAnswer the question using a single word or phrase.'
elif listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse',
'MMDU', 'CRPE', 'MIA-Bench', 'MM-Math', 'DynaMath', 'QSpatial'], dataset):
prompt = question
if os.getenv('USE_COT') == '1':
prompt = build_qa_cot_prompt(line, prompt)
else:
prompt = question + '\nAnswer the question using a single word or phrase.'
else:
# VQA_ex_prompt: OlympiadBench, VizWiz
prompt = line['question']
if os.getenv('USE_COT') == '1':
prompt = build_qa_cot_prompt(line, prompt)
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
if self.use_mpo_prompt:
message = build_mpo_prompt(message, line, dataset)
return message
def set_max_num(self, dataset):
# The total limit on the number of images processed, set to avoid Out-of-Memory issues.
self.total_max_num = 64
if dataset is None:
self.max_num = 6
return None
res_12_datasets = ['ChartQA_TEST', 'MMMU_DEV_VAL', 'MMMU_TEST', 'MME-RealWorld',
'VCR_EN', 'VCR_ZH', 'OCRVQA']
res_18_datasets = ['DocVQA_VAL', 'DocVQA_TEST', 'DUDE', 'MMLongBench_DOC', 'SLIDEVQA']
res_24_datasets = ['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench', 'HRBench4K', 'HRBench8K']
if DATASET_MODALITY(dataset) == 'VIDEO':
self.max_num = 1
elif listinstr(res_12_datasets, dataset):
self.max_num = 12
elif listinstr(res_18_datasets, dataset):
self.max_num = 18
elif listinstr(res_24_datasets, dataset):
self.max_num = 24
else:
self.max_num = 6
def generate_v1_2(self, message, dataset=None):
self.INTERLEAVE = False
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
image = image.resize((self.image_size, self.image_size))
image_processor = CLIPImageProcessor.from_pretrained(self.model_path)
pixel_values = image_processor(images=image, return_tensors='pt').pixel_values
pixel_values = pixel_values.to(torch.bfloat16).to(self.device)
with torch.no_grad():
response = self.model.chat(self.tokenizer, pixel_values=pixel_values,
question=prompt, generation_config=self.kwargs)
return response
def generate_v1_5(self, message, dataset=None):
image_num = len([x for x in message if x['type'] == 'image'])
max_num = max(1, min(self.max_num, self.total_max_num // image_num))
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
if DATASET_MODALITY(dataset) == 'VIDEO':
prompt = build_video_prompt(prompt, dataset)
if image_num > 1:
image_path = [x['value'] for x in message if x['type'] == 'image']
pixel_values_list = []
for file_name in image_path:
pixel_values_list.append(load_image(file_name, max_num=max_num).to(self.device).to(torch.bfloat16))
pixel_values = torch.cat(pixel_values_list, dim=0)
elif image_num == 1:
image_path = [x['value'] for x in message if x['type'] == 'image'][0]
pixel_values = load_image(image_path, max_num=max_num).to(self.device).to(torch.bfloat16)
else:
pixel_values = None
with torch.no_grad():
response = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
question=prompt,
generation_config=self.kwargs,
verbose=True)
return response
def generate_v2(self, message, dataset=None):
image_num = len([x for x in message if x['type'] == 'image'])
max_num = max(1, min(self.max_num, self.total_max_num // image_num))
prompt = reorganize_prompt(message, image_num, dataset=dataset)
if dataset is not None and DATASET_MODALITY(dataset) == 'VIDEO':
prompt = build_video_prompt(prompt, dataset)
if image_num > 1:
image_path = [x['value'] for x in message if x['type'] == 'image']
num_patches_list, pixel_values_list = [], []
for image_idx, file_name in enumerate(image_path):
upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU'], dataset)
curr_pixel_values = load_image(
file_name, max_num=max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list.append(curr_pixel_values.size(0))
pixel_values_list.append(curr_pixel_values)
pixel_values = torch.cat(pixel_values_list, dim=0)
elif image_num == 1:
image_path = [x['value'] for x in message if x['type'] == 'image'][0]
upscale_flag = dataset is not None and listinstr(['MMMU'], dataset)
pixel_values = load_image(
image_path, max_num=max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list = [pixel_values.size(0)]
else:
pixel_values = None
num_patches_list = []
with torch.no_grad():
response = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
num_patches_list=num_patches_list,
question=prompt,
generation_config=self.kwargs,
verbose=True
)
if self.use_mpo_prompt:
response = mpo_post_processing(response, dataset)
return response
def generate_inner(self, message, dataset=None):
self.set_max_num(dataset)
print(f'InternVL model version: {self.version}')
if self.version in ['V1.1', 'V1.2']:
return self.generate_v1_2(message, dataset)
elif self.version == 'V1.5':
return self.generate_v1_5(message, dataset)
elif self.version == 'V2.0':
return self.generate_v2(message, dataset)
else:
raise ValueError(f'Unsupported version: {self.version}')
def build_history(self, message):
# Global Variables
image_path = []
image_cnt = 0
def concat_tilist(tilist):
nonlocal image_cnt # Declare image_cnt as nonlocal to modify it
prompt = ''
for item in tilist:
# Substitute the pattern in the text
if item['type'] == 'text':
prompt += re.sub(self.pattern, self.replacement, item['value'])
elif item['type'] == 'image':
image_cnt += 1
prompt += '<image>\n'
image_path.append(item['value'])
return prompt
# Only previous messages
assert len(message) % 2 == 0
history = []
for i in range(len(message) // 2):
m1, m2 = message[2 * i], message[2 * i + 1]
assert m1['role'] == 'user' and m2['role'] == 'assistant'
history.append((concat_tilist(m1['content']), concat_tilist(m2['content'])))
return history, image_path, image_cnt
def chat_inner_v2(self, message, dataset=None):
if len(message) > 1:
history, image_path, image_cnt = self.build_history(message[:-1])
else:
history, image_path, image_cnt = None, [], 1
current_msg = message[-1]
question = ''
# If message is just text in the conversation
if len(current_msg['content']) == 1 and current_msg['content'][0]['type'] == 'text':
question = current_msg['content'][0]['value']
question = re.sub(self.pattern, self.replacement, question) # Fix pattern as per InternVL
else:
for msg in current_msg['content']:
if msg['type'] == 'text':
question += re.sub(self.pattern, self.replacement, msg['value'])
elif msg['type'] == 'image':
image_cnt += 1
question += '<image>\n'
image_path.append(msg['value'])
if image_cnt > 1:
num_patches_list = []
pixel_values_list = []
for image_idx, file_name in enumerate(image_path):
upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
curr_pixel_values = load_image(
file_name, max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list.append(curr_pixel_values.size(0))
pixel_values_list.append(curr_pixel_values)
pixel_values = torch.cat(pixel_values_list, dim=0)
elif image_cnt == 1:
upscale_flag = listinstr(['MMMU_DEV_VAL'], dataset)
pixel_values = load_image(
image_path, max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list = [pixel_values.size(0)]
else:
pixel_values = None
num_patches_list = []
response, history = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
num_patches_list=num_patches_list,
question=question,
generation_config=self.kwargs,
history=history,
return_history=True
)
response = re.sub(self.reverse_pattern, self.reverse_replacement, response)
return response
def chat_inner(self, message, dataset=None):
self.set_max_num(dataset)
if self.version in ['V1.1', 'V1.2']:
raise ValueError(f'Unsupported version for Multi-Turn: {self.version}')
elif self.version == 'V1.5':
raise ValueError(f'Unsupported version for Multi-Turn: {self.version}')
elif self.version == 'V2.0':
kwargs_default = dict(do_sample=False, max_new_tokens=512, top_p=None, num_beams=1)
self.kwargs = kwargs_default
return self.chat_inner_v2(message, dataset)
else:
raise ValueError(f'Unsupported version for Multi-Turn: {self.version}')
import math
import pandas as pd
import random
import re
import string
import torch
import torch.distributed as dist
import torchvision.transforms as T
import transformers
import warnings
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoTokenizer, AutoConfig, AutoModel, CLIPImageProcessor
from ..base import BaseModel
from ...dataset import DATASET_TYPE, DATASET_MODALITY
from ...smp import *
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
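# Worked example (illustrative): a 1000x500 input with image_size=448 and max_num=6 has aspect ratio 2.0,
# so (2, 1) is the closest target ratio; the image is resized to 896x448 and cropped into two 448x448 tiles,
# plus a 448x448 thumbnail of the full image when use_thumbnail=True (three crops in total).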
def load_image(image_file, input_size=448, max_num=6, upscale=False):
image = Image.open(image_file).convert('RGB')
if upscale:
image = image.resize((image.width * 2, image.height * 2), Image.BILINEAR)
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
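# Usage sketch (hypothetical path): load_image('page.png', max_num=6) returns a tensor of shape
# (N, 3, 448, 448), where N is the number of tiles chosen by dynamic_preprocess plus one thumbnail
# whenever more than a single tile is produced.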
def get_local_rank_and_local_world_size():
if not dist.is_available():
return 0, 1
if not dist.is_initialized():
return 0, 1
if 'SLURM_LOCALID' in os.environ:
local_rank = int(os.environ['SLURM_LOCALID'])
local_world_size = int(os.environ['SLURM_NTASKS_PER_NODE'])
return local_rank, local_world_size
if 'LOCAL_RANK' in os.environ and 'LOCAL_WORLD_SIZE' in os.environ:
return int(os.environ['LOCAL_RANK']), int(os.environ['LOCAL_WORLD_SIZE'])
raise NotImplementedError(
"Failed to get local_rank and local_world_size! "
"Please ensure that you set the environment variables "
"`LOCAL_RANK` and `LOCAL_WORLD_SIZE`."
)
def split_model(model_path):
num_gpus_per_node = 8
rank, world_size = get_rank_and_world_size()
try:
local_rank, local_world_size = get_local_rank_and_local_world_size()
except Exception:
local_rank = rank
if 'GPUS_PER_PROCESS' in os.environ:
gpus_per_process = int(os.environ['GPUS_PER_PROCESS'])
else:
gpus_per_process = 8 # default to use 8 GPUs for one model
start_gpu = local_rank * gpus_per_process
end_gpu = start_gpu + gpus_per_process
assert end_gpu <= num_gpus_per_node, f"Process {local_rank} tries to access GPU {end_gpu}, " \
f"but only {num_gpus_per_node} GPUs are available per node."
visible_devices = list(range(start_gpu, end_gpu))
device_map = {}
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
num_gpus_for_vit = 0.5
num_layers = config.llm_config.num_hidden_layers
num_layers_per_gpu = math.ceil(num_layers / (len(visible_devices) - num_gpus_for_vit))
num_layers_per_gpu = [num_layers_per_gpu] * len(visible_devices)
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = visible_devices[i]
layer_cnt += 1
device_map['vision_model'] = visible_devices[0]
device_map['mlp1'] = visible_devices[0]
device_map['language_model.model.tok_embeddings'] = visible_devices[0]
device_map['language_model.model.embed_tokens'] = visible_devices[0]
device_map['language_model.output'] = visible_devices[0]
device_map['language_model.model.norm'] = visible_devices[0]
device_map['language_model.lm_head'] = visible_devices[0]
device_map[f'language_model.model.layers.{num_layers - 1}'] = visible_devices[0]
return device_map, visible_devices
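# Worked example (illustrative numbers): with 8 visible GPUs and a 48-layer language model,
# num_layers_per_gpu = ceil(48 / 7.5) = 7; the first GPU is then reduced to ceil(7 * 0.5) = 4 layers
# because it also hosts the vision tower, embeddings, and output head, while the other GPUs take 7 layers each.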
def split_model_old(model_name):
import math
device_map = {}
num_gpus = torch.cuda.device_count()
rank, world_size = get_rank_and_world_size()
num_gpus = num_gpus // world_size
num_layers_map = {
'InternVL2-8B': 32,
'InternVL2-26B': 48,
'InternVL2-40B': 60,
'InternVL2-Llama3-76B': 80
}
if model_name not in num_layers_map:
return 'cuda'
num_layers = num_layers_map[model_name]
# Since the first GPU will be used for ViT, treat it as 0.5 GPU.
num_layers_per_gpu = math.ceil(num_layers / (num_gpus - 0.5))
num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * i
layer_cnt += 1
device_map['vision_model'] = rank
device_map['mlp1'] = rank
device_map['language_model.model.tok_embeddings'] = rank
device_map['language_model.model.embed_tokens'] = rank
device_map['language_model.output'] = rank
device_map['language_model.model.norm'] = rank
device_map['language_model.lm_head'] = rank
device_map['language_model.model.rotary_emb'] = rank
device_map[f'language_model.model.layers.{num_layers - 1}'] = rank
return device_map
def build_mcq_cot_prompt(line, prompt):
cot_prompt = (
"Answer the preceding multiple choice question. The last line of your response should follow "
"this format: 'Answer: \\boxed{$LETTER}' (without quotes), where LETTER is one of the options. "
"If you are uncertain or the problem is too complex, make a reasoned guess based on the "
"information provided. Avoid repeating steps indefinitely—provide your best guess even if "
"unsure. Think step by step logically, considering all relevant information before answering."
)
prompt = prompt.replace("Answer with the option's letter from the given choices directly.", '').strip()
prompt = prompt + '\n' + cot_prompt
return prompt
def build_qa_cot_prompt(line, prompt):
cot_prompt = (
"Answer the preceding question. The last line of your response should follow this format: "
"'Answer: \\boxed{$FINAL_ANSWER}' (without quotes), where 'FINAL_ANSWER' is your conclusion "
"based on the reasoning provided. If you are uncertain or the problem is too complex, make "
"a reasoned guess based on the information provided. Avoid repeating steps indefinitely—"
"provide your best guess even if unsure. Think step by step logically, considering all "
"relevant information before answering."
)
prompt = prompt + '\n' + cot_prompt
return prompt
def build_multi_choice_prompt(line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
def build_video_prompt(prompt, dataset=None, max_frames=64):
for start in range(0, max_frames, 8):
images_to_remove = ''.join([f'<Image-{i}>' for i in range(start + 1, start + 9)])
prompt = prompt.replace(images_to_remove, '')
for i in range(max_frames):
prompt = prompt.replace(f'Image-{i + 1}', f'Frame-{i + 1}')
if listinstr(['MMBench-Video'], dataset):
prompt = prompt.replace('\nAnswer:', '')
elif listinstr(['Video-MME'], dataset):
prompt = prompt.replace('\nAnswer:', '')
prompt += "\nAnswer with the option's letter from the given choices directly."
elif listinstr(['MVBench'], dataset):
prompt = prompt.replace('Best option:(', '')
return prompt
def reorganize_prompt(message, image_num, dataset=None):
if dataset is not None and listinstr(['MUIRBench'], dataset):
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
images_to_remove = ' '.join(['<image>'] * image_num)
prompt = prompt.replace(images_to_remove, '')
for i in range(image_num):
prompt = prompt.replace('<image>', f'<Image-{i + 1}>', 1)
prompt = ''.join([f'Image-{i + 1}: <image>\n' for i in range(image_num)]) + prompt
elif image_num == 1:
prompt = '<image>\n' + '\n'.join([x['value'] for x in message if x['type'] == 'text'])
else:
prompt, image_idx = '', 1
for x in message:
if x['type'] == 'text':
prompt += x['value']
elif x['type'] == 'image':
prompt += f'<Image-{image_idx}>'
image_idx += 1
prompt = ''.join([f'Image-{i + 1}: <image>\n' for i in range(image_num)]) + prompt
images_to_remove = ''.join([f'<Image-{i + 1}>' for i in range(image_num)])
prompt = prompt.replace(images_to_remove, '')
return prompt
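# Example (illustrative): two adjacent image entries followed by the text 'Describe both.' become
#   'Image-1: <image>\nImage-2: <image>\nDescribe both.'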
mpo_prompt_with_final_answer = (
"Your task is to answer the question below. "
"Give step by step reasoning before you answer, and when you're ready to answer, "
"please use the format \"Final answer: ..\""
"\n\n"
"Question:"
"\n\n"
"{question}"
)
mpo_prompt_without_final_answer = (
"Your task is to answer the question below. "
"Give step by step reasoning. "
"\n\n"
"Question:"
"\n\n"
"{question}"
)
def mpo_post_processing(response, dataset):
def extract_answer(text):
match = re.search(r'(Final answer:|Answer:)\s*(.*)', text, re.IGNORECASE)
if match:
return match.group(2).strip()
return text
if dataset is not None and (DATASET_TYPE(dataset) in ['Y/N', 'MCQ'] or listinstr(['CRPE'], dataset)):
response = extract_answer(response).strip()
return response
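# For illustration: extract_answer('...reasoning...\nFinal answer: B') returns 'B', so Y/N and MCQ responses
# are reduced to the bare answer; responses for other datasets pass through unchanged.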
def build_mpo_prompt(message, line, dataset):
if not listinstr(['LLaVABench'], dataset):
if listinstr(['MMVet'], dataset):
cot_prompt = mpo_prompt_without_final_answer
else:
cot_prompt = mpo_prompt_with_final_answer
question_orig = line['question']
if listinstr(['MathVerse', 'MathVision'], dataset):
question_orig = question_orig.split('Question:', 1)[-1].strip()
question_orig = question_orig.replace('Choices:\n', '').strip()
prompt = cot_prompt.format(question=question_orig)
else:
prompt = line['question']
message[0]['value'] = prompt
return message
import sys
import torch
from transformers import AutoModelForCausalLM, AutoConfig
import warnings
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
class Janus(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def check_install(self):
try:
import janus
except Exception as e:
logging.critical(
'Please install janus from source: https://github.com/deepseek-ai/Janus')
raise e
def __init__(self, model_path='deepseek-ai/Janus-1.3B', **kwargs):
self.check_install()
assert model_path is not None
self.model_path = model_path
from janus.models import VLChatProcessor
self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
self.tokenizer = self.vl_chat_processor.tokenizer
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
self.model = model.to(torch.bfloat16).cuda().eval()
torch.cuda.empty_cache()
default_kwargs = dict(
max_new_tokens=512,
do_sample=False,
use_cache=True,
output_logits=False,
output_scores=False,
return_dict_in_generate=False)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def prepare_inputs(self, message):
def prepare_itlist(msgs):
content, images = '', []
for s in msgs:
if s['type'] == 'image':
images.append(s['value'])
content += '<image_placeholder>'
elif s['type'] == 'text':
content += s['value']
return content, images
conversation = []
if 'role' not in message[0]:
content, images = prepare_itlist(message)
conversation.append(dict(role='User', content=content, images=images))
else:
role_map = {'user': 'User', 'assistant': 'Assistant'}
for msgs in message:
role = role_map[msgs['role']]
content, images = prepare_itlist(msgs['content'])
conversation.append(dict(role=role, content=content, images=images))
conversation.append(dict(role='Assistant', content=''))
return conversation
def generate_inner(self, message, dataset=None):
if dataset is None or not ('MMVet' in dataset):
self.vl_chat_processor.system_prompt = ""
else:
self.vl_chat_processor.system_prompt = "You are a helpful assistant. Please answer truthfully and write out your thinking step by step to be sure you get the right answer." # noqa: E501
conversation = self.prepare_inputs(message)
from janus.utils.io import load_pil_images
pil_images = load_pil_images(conversation)
prepare_inputs = self.vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True)
prepare_inputs = prepare_inputs.to(self.model.device, dtype=torch.bfloat16)
inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs)
outputs = self.model.language_model.generate(
inputs_embeds=inputs_embeds,
attention_mask=prepare_inputs.attention_mask,
pad_token_id=self.tokenizer.eos_token_id,
bos_token_id=self.tokenizer.bos_token_id,
eos_token_id=self.tokenizer.eos_token_id,
**self.kwargs)
answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
return answer
def chat_inner(self, message, dataset=None):
return self.generate_inner(message, dataset=dataset)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
if DATASET_TYPE(dataset) == 'Y/N':
if dataset == 'POPE':
question = question.replace(" Please answer yes or no.", "")
prompt = '\n' + question + "\nAnswer the question using a single word or phrase."
elif DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'\nHint: {hint}\n' if hint is not None else '\n'
prompt += f'{question}\n'
prompt += (
f"{options_prompt}\nAnswer with the option's letter from the given choices directly."
if len(options) else 'Answer the question directly. '
)
elif dataset == 'MMVet':
prompt = '\n' + question
else:
raise NotImplementedError
message = [dict(type='image', value=s) for s in tgt_path]
message.extend([dict(type='text', value=prompt)])
return message
import torch
import re
from PIL import Image
from abc import abstractproperty
import sys
import os.path as osp
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import copy
class Kosmos2(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def __init__(self,
model_path='microsoft/kosmos-2-patch14-224',
**kwargs):
try:
from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
except Exception as e:
logging.critical("Please install Transformers version 4.45.1 by running: pip install transformers==4.45.1")
raise e
assert osp.exists(model_path) or splitlen(model_path) == 2
self.model = (
Kosmos2ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
.to(torch.device('cuda'))
)
self.processor = AutoProcessor.from_pretrained(model_path)
default_kwargs = dict(
max_new_tokens=512,
use_cache=True
)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
TASK_TOKEN = '<grounding> '
QUESTION_TOKEN = 'Question: '
ANSWER_TOKEN = 'Answer: '
images = []
prompt = ''
prompt += TASK_TOKEN
for s in message:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
prompt += QUESTION_TOKEN
prompt += s['value']
prompt += ANSWER_TOKEN
images = [Image.open(s) for s in images]
inputs = self.processor(text=prompt, images=images[0], return_tensors='pt').to(torch.device('cuda'))
generated_ids = self.model.generate(
pixel_values=inputs['pixel_values'],
input_ids=inputs['input_ids'],
attention_mask=inputs['attention_mask'],
image_embeds=None,
image_embeds_position_mask=inputs['image_embeds_position_mask'],
**self.kwargs
)
generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
processed_text = self.processor.post_process_generation(generated_text, cleanup_and_extract=True)[0]
cleaned_answer = re.sub(r'(Question:.*?Answer:|Question:.*)', '', processed_text).strip()
return cleaned_answer
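# The prompt assembled above follows the pattern '<grounding> Question: <text> Answer: ', with the first image
# handed to the processor separately; the final re.sub strips the echoed question/answer scaffold from the
# decoded output.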
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMMU'], dataset):
return False
if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
if dataset == 'MMVet':
prompt = question + '\nAnswer the question directly. '
elif DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'Hint: {hint}\n' if hint is not None else ''
prompt += f'{question}\n'
prompt += (
f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
if len(options) else 'Answer the question directly. '
)
else:
raise NotImplementedError
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
import torch
from PIL import Image
import os.path as osp
import sys
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
class llama_vision(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
# This function is used to split Llama-3.2-90B
def split_model(self):
import math
device_map = {}
num_gpus = torch.cuda.device_count()
rank, world_size = get_rank_and_world_size()
num_gpus = num_gpus // world_size
num_layers = 100
# GPU0: -5, GPU-1: -7
total_cost = num_layers + 5 + 7
# Since the first GPU will be used for ViT, treat it as 0.8 GPU.
num_layers_per_gpu = total_cost // num_gpus
num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
# The total number of GPUs might be odd
num_layers_per_gpu[-1] = total_cost - sum(num_layers_per_gpu[:-1])
num_layers_per_gpu[0] -= 5
num_layers_per_gpu[-1] -= 7
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * i
layer_cnt += 1
device_map['vision_model'] = rank
device_map['language_model.model.embed_tokens'] = rank
device_map['language_model.model.rotary_emb'] = rank
device_map['language_model.model.norm'] = rank + world_size * (num_gpus - 1)
device_map['language_model.lm_head'] = rank + world_size * (num_gpus - 1)
device_map['multi_modal_projector'] = rank + world_size * (num_gpus - 1)
return device_map
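# Worked example (illustrative, world_size == 1): with num_layers = 100 and 4 GPUs, total_cost = 112 and each
# GPU is budgeted 112 // 4 = 28 slots; GPU 0 keeps 28 - 5 = 23 decoder layers (it also hosts the vision model
# and embeddings) and the last GPU keeps 28 - 7 = 21 (it hosts the norm, lm_head and projector),
# i.e. 23 + 28 + 28 + 21 = 100.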
def __init__(self, model_path='meta-llama/Llama-3.2-11B-Vision-Instruct', **kwargs):
try:
from transformers import MllamaForConditionalGeneration, AutoProcessor
except Exception as e:
logging.critical('Please install transformers>=4.45.0 before using llama_vision.')
raise e
rank, world_size = get_rank_and_world_size()
if '11b' in model_path.lower() and auto_split_flag():
assert world_size == 1, 'We only support world_size == 1 when AUTO_SPLIT is set for Llama-3.2-11B'
logging.warning('Currently, we only support splitting the 11B model across all GPUs.')
self.model = MllamaForConditionalGeneration.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
device_map='auto',
).eval()
elif '90b' in model_path.lower():
device_map = self.split_model()
self.model = MllamaForConditionalGeneration.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
device_map=device_map,
).eval()
else:
self.model = MllamaForConditionalGeneration.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
device_map='cpu',
).cuda().eval()
self.device = 'cuda'
self.processor = AutoProcessor.from_pretrained(model_path)
if 'Instruct' in model_path:
kwargs_default = dict(do_sample=True, temperature=0.6, top_p=0.9)
else:
kwargs_default = dict(do_sample=False, max_new_tokens=512, temperature=0.0, top_p=None, num_beams=1)
kwargs.update(kwargs_default)
print(f'Following kwargs received: {kwargs}, will use as generation config. ')
self.kwargs = kwargs
self.model_name = model_path
def use_custom_prompt(self, dataset):
if dataset is None:
return False
if listinstr(['AI2D', 'MMMU', 'MathVista', 'ChartQA', 'DocVQA'], dataset):
# For certain datasets we use a custom prompt
return True
else:
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
if listinstr(['AI2D'], dataset):
self.kwargs['max_new_tokens'] = 400
for key, item in options.items():
question += f'\n{key}. {item}'
if '11B' in self.model_name:
prompt = (
f'Look at the scientific diagram carefully and answer the following question: {question}\n'
f'Think step by step and finally respond to the question '
f"with only the correct option number as \"FINAL ANSWER\"."
f"<cot_start>Let's think step by step."
)
elif '90B' in self.model_name:
prompt = (
f'Look at the scientific diagram carefully and answer the following question: {question}\n'
f'Respond only with the correct option digit.'
)
elif listinstr(['MMMU'], dataset):
self.kwargs['max_new_tokens'] = 2048
options = '\n'.join([f'{key}. {item}' for key, item in options.items()])
prompt = (
f'Look at the image carefully and solve the following question step-by-step. '
f'Question: {question} Options: {options} Indicate the correct answer at the end.'
)
for i in range(len(tgt_path)):
prompt = prompt.replace(f'<image {i+1}>', '')
elif listinstr(['MathVista'], dataset):
self.kwargs['max_new_tokens'] = 2048
prompt = f'{question}'
elif listinstr(['ChartQA'], dataset):
self.kwargs['max_new_tokens'] = 512
if '11B' in self.model_name:
prompt = (
f'You are provided a chart image and will be asked a question. '
f'You have to think through your answer and provide a step-by-step solution. '
f'Once you have the solution, write the final answer in at most a few words at the end '
f"with the phrase \"FINAL ANSWER:\". "
f"The question is: {question}<cot_start>Let's think step by step."
)
elif '90B' in self.model_name:
prompt = (
f'You are provided a chart image and will be asked a question. '
f'Follow these steps carefully:\n '
f'Step 1: Analyze the question to understand what specific data or information is being asked for. '
f'Focus on whether the question is asking for a specific number or category '
f'from the chart image.\n '
f'Step 2: Identify any numbers, categories, or groups mentioned in the question '
f'and take note of them. Focus on detecting and matching them directly to the image. \n'
f'Step 3: Study the image carefully and find the relevant data corresponding to the categories '
f'or numbers mentioned. Avoid unnecessary assumptions or calculations; '
f'simply read the correct data from the image.\n '
f'Step 4: Develop a clear plan to solve the question by locating the right data. '
f'Focus only on the specific category or group that matches the question. \n'
f'Step 5: Use step-by-step reasoning to ensure you are referencing the correct numbers '
f'or data points from the image, avoiding unnecessary extra steps or interpretations.\n '
f"Step 6: Provide the final answer, starting with \"FINAL ANSWER:\" "
f'and using as few words as possible, '
f'simply stating the number or data point requested. \n\n '
f"The question is: {question}<cot_start>Let's think step by step."
)
elif listinstr(['DocVQA'], dataset):
self.kwargs['max_new_tokens'] = 512
prompt = (
f'Read the text in the image carefully and answer the question '
f'with the text as seen exactly in the image. '
f'For yes/no questions, just respond Yes or No. '
f'If the answer is numeric, just respond with the number and nothing else. '
f'If the answer has multiple words, just respond with the words and absolutely nothing else. '
f'Never respond in a sentence or a phrase.\n Question: {question}'
)
else:
raise NotImplementedError(f'Dataset {dataset} not supported.')
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path)
messages = [
{'role': 'user', 'content': [
{'type': 'image'},
{'type': 'text', 'text': prompt}
]}
]
input_text = self.processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = self.processor(image, input_text, return_tensors='pt').to(self.device)
if not self.use_custom_prompt(dataset):
if dataset is not None and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']:
self.kwargs['max_new_tokens'] = 128
else:
self.kwargs['max_new_tokens'] = 512
output = self.model.generate(**inputs, **self.kwargs)
return self.processor.decode(output[0][inputs['input_ids'].shape[1]:]).replace('<|eot_id|>', '')
from .llava import LLaVA, LLaVA_Next, LLaVA_Next2, LLaVA_OneVision, LLaVA_OneVision_HF
from .llava_xtuner import LLaVA_XTuner
__all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner', 'LLaVA_Next2', 'LLaVA_OneVision', 'LLaVA_OneVision_HF']
import torch
from PIL import Image
from abc import abstractproperty
import sys
import os.path as osp
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE, DATASET_MODALITY
import copy
import requests
class LLaVA(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def __init__(self, model_path="liuhaotian/llava_v1.5_7b", **kwargs):
try:
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
except Exception as err:
logging.critical(
"Please install llava from https://github.com/haotian-liu/LLaVA"
)
raise err
assert osp.exists(model_path) or splitlen(model_path) == 2
self.system_prompt = (
"A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions. "
)
self.stop_str = "</s>"
if model_path == "Lin-Chen/ShareGPT4V-7B":
model_name = "llava-v1.5-7b"
elif model_path == "Lin-Chen/ShareGPT4V-13B":
model_name = "llava-v1.5-13b"
else:
model_name = get_model_name_from_path(model_path)
try:
self.tokenizer, self.model, self.image_processor, self.context_len = (
load_pretrained_model(
model_path=model_path,
model_base=None,
model_name=model_name,
device="cpu",
device_map="cpu",
)
)
except Exception as err:
if "ShareGPT4V" in model_path:
import llava
logging.critical(
"Please manually remove the encoder type check in "
f"{llava.__path__[0]}/model/multimodal_encoder/builder.py "
"Line 8 to use the ShareGPT4V model. "
)
else:
logging.critical("Unknown error when loading LLaVA model.")
raise err
self.model = self.model.cuda()
self.conv_mode = "llava_v1"
kwargs_default = dict(
do_sample=False,
temperature=0,
max_new_tokens=512,
top_p=None,
num_beams=1,
use_cache=True,
) # noqa E501
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(
f"Following kwargs received: {self.kwargs}, will use as generation config. "
)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == "MCQ":
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line["question"]
hint = line["hint"] if ("hint" in line and not pd.isna(line["hint"])) else None
if hint is not None:
question = hint + "\n" + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f"\n{key}. {item}"
prompt = question
if len(options):
prompt += (
"\n请直接回答选项字母。"
if cn_string(prompt)
else "\nAnswer with the option's letter from the given choices directly."
)
else:
prompt += (
"\n请直接回答问题。"
if cn_string(prompt)
else "\nAnswer the question directly."
)
message = [dict(type="image", value=s) for s in tgt_path]
message.append(dict(type="text", value=prompt))
return message
def concat_tilist(self, message):
text, images = "", []
for item in message:
if item["type"] == "text":
text += item["value"]
elif item["type"] == "image":
text += " <image> "
images.append(item["value"])
return text, images
def chat_inner(self, message, dataset=None):
from llava.mm_utils import (
process_images,
tokenizer_image_token,
KeywordsStoppingCriteria,
)
from llava.constants import IMAGE_TOKEN_INDEX
prompt = self.system_prompt
images = []
for utter in message:
prompt += "USER: " if utter["role"] == "user" else "ASSISTANT: "
content, images_sub = self.concat_tilist(utter["content"])
prompt += content
images.extend(images_sub)
prompt += " " if utter["role"] == "user" else self.stop_str
assert message[-1]["role"] == "user", message
prompt += "ASSISTANT: "
images = [Image.open(s).convert("RGB") for s in images]
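# NOTE: abstractproperty() is (ab)used here as a throwaway attribute container so that
# process_images() receives an object exposing `image_aspect_ratio`, standing in for the
# argparse namespace passed around in the upstream LLaVA code.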
args = abstractproperty()
args.image_aspect_ratio = "pad"
image_tensor = process_images(images, self.image_processor, args).to(
"cuda", dtype=torch.float16
)
input_ids = (
tokenizer_image_token(
prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
)
.unsqueeze(0)
.cuda()
)
keywords = [self.stop_str]
stopping_criteria = KeywordsStoppingCriteria(
keywords, self.tokenizer, input_ids
)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=image_tensor,
stopping_criteria=[stopping_criteria],
**self.kwargs,
)
output = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[
0
].strip()
return output
def generate_inner(self, message, dataset=None):
from llava.mm_utils import (
process_images,
tokenizer_image_token,
KeywordsStoppingCriteria,
)
from llava.constants import IMAGE_TOKEN_INDEX
# Support interleave text and image
content, images = self.concat_tilist(message)
images = [Image.open(s).convert("RGB") for s in images]
args = abstractproperty()
args.image_aspect_ratio = "pad"
if images:
image_tensor = process_images(images, self.image_processor, args).to(
"cuda", dtype=torch.float16
)
else:
image_tensor = None
prompt = self.system_prompt + "USER: " + content + " ASSISTANT: "
input_ids = (
tokenizer_image_token(
prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
)
.unsqueeze(0)
.cuda()
)
keywords = [self.stop_str]
stopping_criteria = KeywordsStoppingCriteria(
keywords, self.tokenizer, input_ids
)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=image_tensor,
stopping_criteria=[stopping_criteria],
**self.kwargs,
)
output = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[
0
].strip()
return output
class LLaVA_Next(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path="llava-hf/llava-v1.6-vicuna-7b-hf", **kwargs):
import transformers
from transformers import (
LlavaNextProcessor,
LlavaNextForConditionalGeneration,
AutoProcessor,
LlavaForConditionalGeneration,
)
self.model_path = model_path
if "34b" in model_path.lower():
self.processor = LlavaNextProcessor.from_pretrained(
self.model_path, use_fast=False
)
elif "interleave" in model_path.lower():
self.processor = AutoProcessor.from_pretrained(self.model_path)
else:
self.processor = LlavaNextProcessor.from_pretrained(self.model_path)
flash_attn_flag = False
try:
import flash_attn
flash_attn_flag = True
except ImportError:
pass
if flash_attn_flag:
if "interleave" in model_path.lower():
model = LlavaForConditionalGeneration.from_pretrained(
self.model_path,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
use_flash_attention_2=True,
)
else:
model = LlavaNextForConditionalGeneration.from_pretrained(
self.model_path,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
use_flash_attention_2=True,
)
else:
if "interleave" in model_path.lower():
model = LlavaForConditionalGeneration.from_pretrained(
self.model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
)
else:
model = LlavaNextForConditionalGeneration.from_pretrained(
self.model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
)
model = model.eval()
self.model = model.cuda()
kwargs_default = dict(
do_sample=False, temperature=0, max_new_tokens=512, top_p=None, num_beams=1
)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(
f"Following kwargs received: {self.kwargs}, will use as generation config. "
)
def apply_prompt_template(self, prompt):
model_path = self.model_path.lower()
if "mistral" in model_path:
template = "[INST] PLACEHOLDER [/INST]"
elif "vicuna" in model_path:
template = (
"A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions. "
"USER: PLACEHOLDER ASSISTANT:"
)
elif "34b" in model_path:
template = (
"<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\nPLACEHOLDER<|im_end|>"
"<|im_start|>assistant\n"
)
else:
raise NotImplementedError(
f"Prompt template for {model_path} not implemented."
)
prompt = template.replace("PLACEHOLDER", f"<image>\n{prompt}")
return prompt
def output_process(self, answer):
if "<s>" in answer:
answer = answer.replace("<s>", "").strip()
if "[/INST]" in answer:
answer = answer.split("[/INST]")[1].strip()
elif "ASSISTANT:" in answer:
answer = answer.split("ASSISTANT:")[1].strip()
elif "assistant\n" in answer:
answer = answer.split("assistant\n")[1].strip()
elif "<|end_header_id|>\n\n" in answer:
answer = answer.split("<|end_header_id|>\n\n")[2].strip()
if "</s>" in answer:
answer = answer.split("</s>")[0].strip()
elif "<|im_end|>" in answer:
answer = answer.split("<|im_end|>")[0].strip()
elif "<|eot_id|>" in answer:
answer = answer.split("<|eot_id|>")[0].strip()
return answer
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == "MCQ":
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line["question"]
hint = line["hint"] if ("hint" in line and not pd.isna(line["hint"])) else None
if hint is not None:
question = hint + "\n" + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f"\n{key}. {item}"
prompt = question
if len(options):
prompt += (
"\n请直接回答选项字母。"
if cn_string(prompt)
else "\nAnswer with the option's letter from the given choices directly."
)
else:
prompt += (
"\n请直接回答问题。"
if cn_string(prompt)
else "\nAnswer the question directly."
)
message = [dict(type="image", value=s) for s in tgt_path]
message.append(dict(type="text", value=prompt))
return message
def generate_inner(self, message, dataset=None):
content, images = [], []
for msg in message:
if msg["type"] == "text":
content.append({"type": msg["type"], "text": msg["value"]})
else:
content.append({"type": "image"})
images.append(Image.open(msg["value"]).convert("RGB"))
conversation = [
{
"role": "user",
"content": content,
}
]
prompt = self.processor.apply_chat_template(
conversation, add_generation_prompt=True
)
inputs = self.processor(prompt, images, return_tensors="pt").to(
"cuda", torch.float16
)
output = self.model.generate(**inputs, **self.kwargs)
answer = self.processor.decode(output[0], skip_special_tokens=True)
answer = self.output_process(answer)
return answer
class LLaVA_Next2(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
DEFAULT_IMAGE_TOKEN = "<image>"
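# IMAGE_TOKEN_INDEX (-200) is the sentinel token id that LLaVA's tokenizer_image_token()
# inserts in place of the <image> marker; the model later splices the projected vision
# features into the sequence at these positions.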
IMAGE_TOKEN_INDEX = -200
def __init__(self, model_path="lmms-lab/llama3-llava-next-8b", **kwargs):
assert model_path is not None
try:
from llava.model.builder import load_pretrained_model
from llava.conversation import conv_templates, SeparatorStyle
from llava.mm_utils import (
get_model_name_from_path,
tokenizer_image_token,
KeywordsStoppingCriteria,
)
except Exception as err:
logging.critical(
"Please `pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git`"
)
raise err
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, _ = load_pretrained_model(
model_path, None, model_name, device_map=None
)
model.cuda().eval()
model.tie_weights()
if "llama3" in model_path.lower():
conv_mode = "llava_llama_3"
elif "qwen" in model_path.lower():
conv_mode = "qwen_1_5"
self.conv_template = conv_mode
self.conv_templates = conv_templates
self.tokenizer = tokenizer
self.model = model
self.image_processor = image_processor
self.tokenizer_image_token = tokenizer_image_token
self.KeywordStoppingCriteria = KeywordsStoppingCriteria
self.SeparatorStyle = SeparatorStyle
def generate_inner(self, message, dataset=None):
content, images = "", []
for msg in message:
if msg["type"] == "text":
content += msg["value"]
else:
images.append(Image.open(msg["value"]).convert("RGB"))
content += self.DEFAULT_IMAGE_TOKEN + "\n"
preprocess = self.image_processor.preprocess
image_tokenizer = self.tokenizer_image_token
image_tensor = [
preprocess(f, return_tensors="pt")["pixel_values"][0].half().cuda()
for f in images
]
image_tensor = torch.stack(image_tensor)
conv = copy.deepcopy(self.conv_templates[self.conv_template])
conv.append_message(conv.roles[0], content)
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()
input_ids = image_tokenizer(
prompt_question, self.tokenizer, self.IMAGE_TOKEN_INDEX, return_tensors="pt"
)
input_ids = input_ids.unsqueeze(0).cuda()
stop_str = conv.sep if conv.sep_style != self.SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = self.KeywordStoppingCriteria(
keywords, self.tokenizer, input_ids
)
cont = self.model.generate(
input_ids,
images=image_tensor,
do_sample=False,
temperature=0,
max_new_tokens=512,
stopping_criteria=[stopping_criteria],
)
text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
return text_outputs
class LLaVA_OneVision(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
VIDEO_LLM = True
DEFAULT_IMAGE_TOKEN = "<image>"
IMAGE_TOKEN_INDEX = -200
# This function is used to split the 72B LLaVA-OneVision model across multiple GPUs
def split_model(self, model_path):
import math
device_map = {}
num_gpus = torch.cuda.device_count()
rank, world_size = get_rank_and_world_size()
num_gpus = num_gpus // world_size
if "72b" not in model_path.lower():
return None
# embed_tokens, vision_tower/resampler, mm_projector, norm, image_newline and lm_head are budgeted as 8 extra layers (hence 80 + 8)
num_layers = 80 + 8
num_layers_per_gpu = math.ceil(num_layers / num_gpus)
num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
num_layers_per_gpu[0] -= 6
num_layers_per_gpu[-1] -= 2
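# GPU `rank` (the first GPU assigned to this rank) also hosts the embeddings, norm,
# image_newline, vision tower, resampler and projector, and the last GPU hosts lm_head,
# so the first GPU is assigned 6 fewer decoder layers and the last GPU 2 fewer.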
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f"model.layers.{layer_cnt}"] = rank + world_size * i
layer_cnt += 1
last_gpu = rank + world_size * (num_gpus - 1)
device_map["model.image_newline"] = rank
device_map["model.embed_tokens"] = rank
device_map["model.norm"] = rank
device_map["model.vision_tower"] = rank
device_map["model.vision_resampler"] = rank
device_map["model.mm_projector"] = rank
device_map["lm_head"] = last_gpu
return device_map
def __init__(self, model_path="lmms-lab/llava-onevision-qwen2-7b-si", **kwargs):
assert model_path is not None
try:
from llava.model.builder import load_pretrained_model
from llava.conversation import conv_templates, SeparatorStyle
from llava.mm_utils import (
get_model_name_from_path,
process_images,
tokenizer_image_token,
KeywordsStoppingCriteria,
) # noqa: E501
except Exception as err:
logging.critical(
"Please `pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git`"
)
raise err
video_kwargs_default = dict(
overwrite=True, mm_spatial_pool_mode="average", force_sample=True
)
video_kwargs_default.update(kwargs)
self.video_kwargs = video_kwargs_default
overwrite_config = None
if "video" in model_path.lower():
if self.video_kwargs["overwrite"]:
overwrite_config = {}
overwrite_config["mm_spatial_pool_mode"] = self.video_kwargs[
"mm_spatial_pool_mode"
]
rank, world_size = get_rank_and_world_size()
model_name = get_model_name_from_path(model_path)
device_map = self.split_model(model_path)
if device_map is None:
if auto_split_flag():
assert world_size == 1, 'Only support world_size == 1 when AUTO_SPLIT set for non-72B LLaVA-OneVision'
logging.warning('Currently, we only support to split the non-72B model across all GPUs.')
tokenizer, model, image_processor, _ = load_pretrained_model(
model_path,
None,
model_name,
device_map="auto",
overwrite_config=overwrite_config,
)
else:
tokenizer, model, image_processor, _ = load_pretrained_model(
model_path,
None,
model_name,
device_map="cpu",
overwrite_config=overwrite_config,
)
model.cuda()
else:
tokenizer, model, image_processor, _ = load_pretrained_model(
model_path,
None,
model_name,
device_map=device_map,
overwrite_config=overwrite_config,
)
model.eval()
model.tie_weights()
if "llava" in model_path.lower():
conv_mode = "qwen_1_5"
if 'llava-video' in model_path.lower():
self.nframe = 64
else:
self.nframe = 16
if "72b" in model_path.lower():
self.nframe = 32
if "video" in model_path.lower():
self.force_sample = self.video_kwargs["force_sample"]
else:
self.force_sample = False
self.conv_template = conv_mode
self.conv_templates = conv_templates
self.tokenizer = tokenizer
self.model = model
self.image_processor = image_processor
self.tokenizer_image_token = tokenizer_image_token
self.process_images = (
process_images # Store process_images as a class attribute
)
self.KeywordStoppingCriteria = KeywordsStoppingCriteria
self.SeparatorStyle = SeparatorStyle
def generate_inner_image(self, message, dataset=None):
content, images = "", []
image_sizes = [] # Store image sizes
for msg in message:
if msg["type"] == "text":
content += msg["value"]
else:
img = Image.open(msg["value"]).convert("RGB")
images.append(img)
image_sizes.append(img.size) # Store the size of each image
content += self.DEFAULT_IMAGE_TOKEN + "\n"
# Process images using the class attribute self.process_images
image_tensor = self.process_images(
images, self.image_processor, self.model.config
)
image_tensor = [
_image.to(dtype=torch.float16, device="cuda") for _image in image_tensor
]
conv = copy.deepcopy(self.conv_templates[self.conv_template])
conv.append_message(conv.roles[0], content)
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()
input_ids = self.tokenizer_image_token(
prompt_question, self.tokenizer, self.IMAGE_TOKEN_INDEX, return_tensors="pt"
)
input_ids = input_ids.unsqueeze(0).cuda()
stop_str = conv.sep if conv.sep_style != self.SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = self.KeywordStoppingCriteria(
keywords, self.tokenizer, input_ids
)
# Pass image sizes along with other parameters
cont = self.model.generate(
input_ids,
images=image_tensor,
image_sizes=image_sizes, # Pass the image sizes here
do_sample=False,
temperature=0,
max_new_tokens=512,
stopping_criteria=[stopping_criteria],
)
text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
return text_outputs
def generate_inner_video(self, message, dataset=None):
content, text_content, visual_content, videos = "", "", "", []
for msg in message:
if msg["type"] == "text":
text_content += msg["value"]
else:
videos.append(msg["value"])
visual_content += self.DEFAULT_IMAGE_TOKEN + "\n"
if len(videos) > 1:
raise ValueError(
"LLaVA-OneVision does not support multiple videos as input."
)
video_frames, frame_time, video_time = self.load_video(
videos[0], self.nframe, self.force_sample
)
time_instruction = (
f"The video lasts for {video_time:.2f} seconds, "
f"and {len(video_frames)} frames are uniformly sampled from it. "
f"These frames are located at {frame_time}. "
f"Please answer the following questions related to this video.\n"
)
if self.force_sample:
content = visual_content + time_instruction + text_content
else:
content = visual_content + text_content
image_tensors = []
frames = (
self.image_processor.preprocess(video_frames, return_tensors="pt")[
"pixel_values"
]
.half()
.cuda()
)
image_tensors.append(frames)
conv = copy.deepcopy(self.conv_templates[self.conv_template])
conv.append_message(conv.roles[0], content)
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()
input_ids = self.tokenizer_image_token(
prompt_question, self.tokenizer, self.IMAGE_TOKEN_INDEX, return_tensors="pt"
)
input_ids = input_ids.unsqueeze(0).cuda()
image_sizes = [frame.size for frame in video_frames]
modalities = ["video"] * len(video_frames)
stop_str = conv.sep if conv.sep_style != self.SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = self.KeywordStoppingCriteria(
keywords, self.tokenizer, input_ids
)
# Pass image sizes along with other parameters
cont = self.model.generate(
input_ids,
images=image_tensors,
image_sizes=image_sizes, # Pass the image sizes here
do_sample=False,
temperature=0,
max_new_tokens=512,
modalities=modalities,
stopping_criteria=[stopping_criteria],
)
text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
return text_outputs
def load_video(self, video_path, max_frames_num, force_sample=False, fps=1):
from decord import VideoReader, cpu
import numpy as np
if max_frames_num == 0:
return np.zeros((1, 336, 336, 3))
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
total_frame_num = len(vr)
video_time = total_frame_num / vr.get_avg_fps()
fps = round(vr.get_avg_fps() / fps)
frame_idx = [i for i in range(0, len(vr), fps)]
frame_time = [i / fps for i in frame_idx]
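# Frames are first sampled at roughly `fps` frames per second; if that yields more than
# max_frames_num frames (or force_sample is set), exactly max_frames_num frames are
# re-sampled uniformly across the clip and the timestamps are recomputed accordingly.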
if len(frame_idx) > max_frames_num or force_sample:
sample_fps = max_frames_num
uniform_sampled_frames = np.linspace(
0, total_frame_num - 1, sample_fps, dtype=int
)
frame_idx = uniform_sampled_frames.tolist()
frame_time = [i / vr.get_avg_fps() for i in frame_idx]
frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
spare_frames = vr.get_batch(frame_idx).asnumpy()
return spare_frames, frame_time, video_time
def generate_inner(self, message, dataset=None):
if DATASET_MODALITY(dataset) == 'VIDEO':
return self.generate_inner_video(message, dataset)
else:
return self.generate_inner_image(message, dataset)
class LLaVA_OneVision_HF(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
VIDEO_LLM = True
DEFAULT_IMAGE_TOKEN = "<image>"
IMAGE_TOKEN_INDEX = -200
def __init__(self, model_path="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", **kwargs):
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
assert model_path is not None, "Model path must be provided."
self.model = LlavaOnevisionForConditionalGeneration.from_pretrained(
model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
).to('cuda')
self.processor = AutoProcessor.from_pretrained(model_path)
self.video_kwargs = kwargs.get("video_kwargs", {})
self.force_sample = self.video_kwargs.get("force_sample", False)
self.nframe = kwargs.get("nframe", 8)
self.fps = 1
self.model_path = model_path
def generate_inner_image(self, message, dataset=None):
content, images = "", []
image_sizes = []
for msg in message:
if msg["type"] == "text":
content += msg["value"]
elif msg["type"] == "image":
img = Image.open(msg["value"]).convert("RGB")
images.append(img)
image_sizes.append(img.size)
content += self.DEFAULT_IMAGE_TOKEN + "\n"
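# NOTE: split('\n', 1)[-1] below drops everything up to the first newline, i.e. the
# '<image>' marker when the image precedes the text; the processor's chat template
# supplies its own image placeholder instead.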
conversation = [
{
"role": "user",
"content": [
{"type": "text", "text": content.split("\n", 1)[-1]},
{"type": "image"},
],
}
]
prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = self.processor(images=images, text=prompt, return_tensors="pt").to('cuda', torch.float16)
output = self.model.generate(**inputs, max_new_tokens=512)
if self.model_path == "NCSOFT/VARCO-VISION-14B-HF":
return self.processor.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
return self.processor.decode(output[0], skip_special_tokens=True)
def generate_inner_video(self, message, dataset=None):
content, text_content, visual_content, videos = "", "", "", []
for msg in message:
if msg["type"] == "text":
text_content += msg["value"]
elif msg["type"] == "video":
videos.append(msg["value"])
visual_content += self.DEFAULT_IMAGE_TOKEN + "\n"
if len(videos) > 1:
raise ValueError("LLaVA-OneVision does not support multiple videos as input.")
video_frames, frame_time, video_time = self.load_video(
videos[0], self.nframe, fps=1, force_sample=self.force_sample
)
time_instruction = (
f"The video lasts for {video_time:.2f} seconds, "
f"and {len(video_frames)} frames are uniformly sampled from it. "
f"These frames are located at {frame_time}. "
f"Please answer the following questions related to this video.\n"
)
content = visual_content + time_instruction + text_content
conversation = [
{
"role": "user",
"content": [{"type": "text", "text": content}, {"type": "video"}],
}
]
prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = self.processor(videos=video_frames, text=prompt, return_tensors="pt").to('cuda', torch.float16)
output = self.model.generate(**inputs, max_new_tokens=512)
return self.processor.decode(output[0], skip_special_tokens=True)
def load_video(self, video_path, max_frames_num, fps=1, force_sample=False):
from decord import VideoReader, cpu
import numpy as np
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
total_frame_num = len(vr)
avg_fps = vr.get_avg_fps()
if avg_fps == 0:
raise ValueError(f"Video '{video_path}' has an average FPS of 0, which is invalid.")
if fps <= 0:
raise ValueError("FPS argument must be greater than 0.")
effective_fps = round(avg_fps / fps)
frame_idx = list(range(0, total_frame_num, effective_fps))
frame_time = [i / avg_fps for i in frame_idx]
if len(frame_idx) > max_frames_num or force_sample:
uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
frame_idx = uniform_sampled_frames.tolist()
frame_time = [i / avg_fps for i in frame_idx]
frame_time_str = ", ".join([f"{t:.2f}s" for t in frame_time])
video_frames = vr.get_batch(frame_idx).asnumpy()
video_time = total_frame_num / avg_fps
return video_frames, frame_time_str, video_time
def generate_inner(self, message, dataset=None):
if DATASET_MODALITY(dataset) == "VIDEO":
return self.generate_inner_video(message, dataset)
else:
return self.generate_inner_image(message, dataset)
import os
import os.path as osp
import string
import sys
import warnings
import pandas as pd
import torch
from huggingface_hub import snapshot_download
from PIL import Image
from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
CLIPImageProcessor, CLIPVisionModel,
GenerationConfig, StoppingCriteriaList)
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
class LLaVA_XTuner(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self,
llava_path,
llm_path=None,
visual_encoder_path='openai/clip-vit-large-patch14-336',
visual_select_layer=-2,
prompt_template=None,
stop_words=[],
torch_dtype=torch.float16):
try:
from peft import PeftModel
from xtuner.utils import PROMPT_TEMPLATE, StopWordStoppingCriteria
except Exception as err:
logging.critical(
'Please install xtuner with `pip install -U xtuner` before '
'using LLaVA_XTuner')
raise err
if not osp.isdir(llava_path):
cache_path = get_cache_path(llava_path)
if cache_path is not None:
llava_path = cache_path
else:
llava_path = snapshot_download(repo_id=llava_path)
assert osp.exists(llava_path) and osp.isdir(llava_path)
# build llm
if 'llm' in os.listdir(llava_path):
assert llm_path is None, (
"Please don't specify the `llm_path` since the passed "
'`llava_path` already contains an LLM!')
llm_path = osp.join(llava_path, 'llm')
else:
assert llm_path is not None, 'Please specify the `llm_path`!'
llm = AutoModelForCausalLM.from_pretrained(llm_path,
trust_remote_code=True,
torch_dtype=torch_dtype,
device_map='cpu')
tokenizer = AutoTokenizer.from_pretrained(llm_path,
trust_remote_code=True,
encode_special_tokens=True)
print(f'Load LLM from {llm_path}')
# build visual_encoder
if 'visual_encoder' in os.listdir(llava_path):
assert visual_encoder_path is None, (
"Please don't specify the `visual_encoder_path` since the passed "
'`llava_path` already contains a visual encoder!')
visual_encoder_path = osp.join(llava_path, 'visual_encoder')
else:
assert visual_encoder_path is not None, (
'Please specify the `visual_encoder_path`!')
visual_encoder = CLIPVisionModel.from_pretrained(
visual_encoder_path, torch_dtype=torch_dtype, device_map='cpu')
image_processor = CLIPImageProcessor.from_pretrained(
visual_encoder_path)
print(f'Load visual_encoder from {visual_encoder_path}')
# load adapter
if 'llm_adapter' in os.listdir(llava_path):
adapter_path = osp.join(llava_path, 'llm_adapter')
llm = PeftModel.from_pretrained(llm,
adapter_path,
trust_remote_code=True,
device_map='cpu')
print(f'Load LLM adapter from {llava_path}')
if 'visual_encoder_adapter' in os.listdir(llava_path):
adapter_path = osp.join(llava_path, 'visual_encoder_adapter')
visual_encoder = PeftModel.from_pretrained(visual_encoder,
adapter_path,
trust_remote_code=True,
device_map='cpu')
print(f'Load visual_encoder adapter from {llava_path}')
# build projector
projector_path = osp.join(llava_path, 'projector')
projector = AutoModel.from_pretrained(projector_path,
trust_remote_code=True,
torch_dtype=torch_dtype,
device_map='cpu')
print(f'Load projector from {llava_path}')
llm.eval()
visual_encoder.eval()
projector.eval()
self.llm = llm.cuda()
self.tokenizer = tokenizer
self.visual_encoder = visual_encoder.cuda()
self.image_processor = image_processor
self.projector = projector.cuda()
self.visual_select_layer = visual_select_layer
if prompt_template is not None:
# modified prompt template
if prompt_template == 'llama3_chat':
self.prompt_template = dict(
SYSTEM=('<|start_header_id|>system<|end_header_id|>\n\n'
'{system}<|eot_id|>'),
INSTRUCTION=(
'<|start_header_id|>user<|end_header_id|>\n\n{input}<|eot_id|>'
'<|start_header_id|>assistant<|end_header_id|>\n\n'),
SUFFIX='<|eot_id|>',
SUFFIX_AS_EOS=True,
STOP_WORDS=['<|eot_id|>'])
else:
self.prompt_template = PROMPT_TEMPLATE[prompt_template]
stop_words += self.prompt_template.get('STOP_WORDS', [])
else:
self.prompt_template = None
self.stop_criteria = StoppingCriteriaList()
for word in stop_words:
self.stop_criteria.append(
StopWordStoppingCriteria(self.tokenizer, word))
def build_gen_config(self, dataset):
gen_kwargs = dict(max_new_tokens=512,
do_sample=True,
temperature=1,
num_beams=5,
eos_token_id=self.tokenizer.eos_token_id,
pad_token_id=self.tokenizer.pad_token_id
if self.tokenizer.pad_token_id is not None else
self.tokenizer.eos_token_id)
# For single word generation
if (dataset is not None
and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']):
gen_kwargs.update(
dict(max_new_tokens=5, do_sample=False, num_beams=1))
return GenerationConfig(**gen_kwargs)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line['question']
hint = line['hint'] if ('hint' in line
and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
if not cn_string(question):
prompt = question + '\n' + ("Answer with the option's letter "
'from the given choices directly.')
else:
prompt = question + '\n' + '请直接回答选项字母。'
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def generate_inner(self, message, dataset=None):
from xtuner.dataset.utils import expand2square
from xtuner.model.utils import prepare_inputs_labels_for_multimodal
from xtuner.utils import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
prompt = prompt.replace('<image>', '')
image = Image.open(image_path).convert('RGB')
image = expand2square(
image,
tuple(int(x * 255) for x in self.image_processor.image_mean))
image = self.image_processor.preprocess(
image, return_tensors='pt')['pixel_values'][0]
image = image.cuda().unsqueeze(0)
visual_outputs = self.visual_encoder(image, output_hidden_states=True)
pixel_values = self.projector(
visual_outputs.hidden_states[self.visual_select_layer][:, 1:])
inputs = DEFAULT_IMAGE_TOKEN + '\n' + prompt
if self.prompt_template:
inputs = self.prompt_template['INSTRUCTION'].format(input=inputs)
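# Split the prompt at the <image> marker, tokenize each chunk separately (special tokens
# only for the first chunk) and rejoin them with IMAGE_TOKEN_INDEX between chunks as the
# placeholder for the image features.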
chunk_encode = []
for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)):
if idx == 0:
cur_encode = self.tokenizer(chunk)
else:
cur_encode = self.tokenizer(chunk, add_special_tokens=False)
chunk_encode.append(cur_encode)
assert len(chunk_encode) == 2
ids = []
for idx, cur_chunk_encode in enumerate(chunk_encode):
ids.extend(cur_chunk_encode['input_ids'])
if idx != len(chunk_encode) - 1:
ids.append(IMAGE_TOKEN_INDEX)
ids = torch.tensor(ids).cuda().unsqueeze(0)
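# prepare_inputs_labels_for_multimodal() splices the projected visual features
# (pixel_values) into the token-embedding sequence at the IMAGE_TOKEN_INDEX placeholder
# and returns embedding-level inputs consumed by llm.generate() below.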
mm_inputs = prepare_inputs_labels_for_multimodal(
llm=self.llm, input_ids=ids, pixel_values=pixel_values)
gen_config = self.build_gen_config(dataset)
generate_output = self.llm.generate(
**mm_inputs,
generation_config=gen_config,
streamer=None,
bos_token_id=self.tokenizer.bos_token_id,
stopping_criteria=self.stop_criteria)
predict = self.tokenizer.decode(generate_output[0],
skip_special_tokens=True).strip()
return predict
import torch
from PIL import Image
from abc import abstractproperty
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import warnings
class Mantis(BaseModel):
"""
Mantis Model
This implementation is adapted from the Llava model from llava.py and the Idefics model from idefics.py
"""
INSTALL_REQ = True
INTERLEAVE = True
DEFAULT_IMAGE_TOKEN = '<image>'
IMAGE_TOKEN_INDEX = -200
def __init__(self, model_path='TIGER-Lab/Mantis-8B-siglip-llama3', **kwargs):
assert model_path is not None
try:
from mantis.models.mllava import LlavaForConditionalGeneration, MLlavaProcessor
from mantis.models.mfuyu import MFuyuForCausalLM, MFuyuProcessor
from mantis.models.conversation import conv_mllava_v1 as default_conv, conv_templates
except Exception as e:
logging.critical(
"Mantis is not installed. Please install Mantis to use this model.Please use 'pip install "
"git+https://github.com/TIGER-AI-Lab/Mantis.git' to install"
)
raise e
try:
from transformers import AutoModelForVision2Seq, AutoProcessor
except Exception as e:
logging.critical(f'{type(e)}: {e}')
logging.critical("Upgrade transformers to use Mantis's idefics model.\nError: %s" % e)
# inference implementation for attention, can be "sdpa", "eager", "flash_attention_2".
# Seems FA2 is not effective during inference:
# https://discuss.huggingface.co/t/flash-attention-has-no-effect-on-inference/73453/5
# if is_flash_attn_2_available:
# best_fit_attn_implementation = "flash_attention_2"
# flash_attn has a bug that raises "Error: query and key must have the same dtype" during generation
try:
import flash_attn
best_fit_attn_implementation = 'flash_attention_2'
except ImportError:
best_fit_attn_implementation = 'eager'
self.model_path = model_path
attn_implementation = best_fit_attn_implementation
self._is_idefics = 'idefics' in model_path.lower()
# Here load the "non-idefics" Mantis model.
if not self._is_idefics:
if 'fuyu' in model_path.lower():
self.processor = MFuyuProcessor.from_pretrained(self.model_path)
model = MFuyuForCausalLM.from_pretrained(
self.model_path,
device_map='cuda',
attn_implementation=attn_implementation,
torch_dtype=torch.float16
)
else:
self.processor = MLlavaProcessor.from_pretrained(self.model_path)
model = LlavaForConditionalGeneration.from_pretrained(
self.model_path,
device_map='cuda',
attn_implementation=attn_implementation,
torch_dtype=torch.float16
)
else:
self.processor = AutoProcessor.from_pretrained(self.model_path)
model = AutoModelForVision2Seq.from_pretrained(
self.model_path,
device_map='cuda',
torch_dtype=torch.float16
)
model = model.eval()
self.model = model.cuda()
kwargs_default = dict(do_sample=False, temperature=0, max_new_tokens=1024, top_p=None, num_beams=1)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
self.tokenizer = self.processor.tokenizer
self.default_conv = default_conv
self.conv_templates = conv_templates
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += (
'\n请直接回答选项字母。' if cn_string(prompt) else
"\nAnswer with the option's letter from the given choices directly."
)
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
message = [dict(type='image', value=s) for s in tgt_path]
message.append(dict(type='text', value=prompt))
return message
def output_process(self, answer):
if '<s>' in answer:
answer = answer.replace('<s>', '').strip()
if '[/INST]' in answer:
answer = answer.split('[/INST]')[1].strip()
elif 'ASSISTANT:' in answer:
answer = answer.split('ASSISTANT:')[1].strip()
elif 'assistant\n' in answer:
answer = answer.split('assistant\n')[1].strip()
elif '<|end_header_id|>\n\n' in answer:
answer = answer.split('<|end_header_id|>\n\n')[2].strip()
if '</s>' in answer:
answer = answer.split('</s>')[0].strip()
elif '<|im_end|>' in answer:
answer = answer.split('<|im_end|>')[0].strip()
elif '<|eot_id|>' in answer:
answer = answer.split('<|eot_id|>')[0].strip()
elif '<end_of_utterance>' in answer:
answer = answer.split('<end_of_utterance>')[0].strip()
elif '|ENDOFTEXT|' in answer:
answer = answer.split('|ENDOFTEXT|')[0].strip()
return answer
def generate_inner(self, message, dataset=None):
content, images = '', []
ide_content, question = [], ''
for msg in message:
if msg['type'] == 'text':
content += msg['value']
question += msg['value']
else:
images.append(Image.open(msg['value']).convert('RGB'))
content += (self.DEFAULT_IMAGE_TOKEN + '\n')
ide_content.append({'type': 'image'})
if self._is_idefics:
# Follow the idefics implementation:
ide_content.append({'type': 'text', 'text': question})
prompt = [{'role': 'user', 'content': ide_content}]
prompt = self.processor.apply_chat_template(prompt, add_generation_prompt=True)
else:
# Follow the Mantis code base to make sure they are consistent:
# https://github.com/TIGER-AI-Lab/Mantis/blob/main/mantis/models/mllava/utils.py#L33
# Users don't need to define chat template as it is done here
if 'llama-3' in self.model.language_model.name_or_path.lower():
conv = self.conv_templates['llama_3']
terminators = [
self.processor.tokenizer.eos_token_id,
self.processor.tokenizer.convert_tokens_to_ids('<|eot_id|>')
]
else:
conv = self.default_conv
terminators = [self.processor.tokenizer.eos_token_id]
# Using EOT because end of *text* is more accurate for what we're doing than end of *sentence*
if 'eos_token_id' not in self.kwargs:
self.kwargs['eos_token_id'] = terminators
conv = conv.copy()
conv.append_message(conv.roles[0], content)
conv.append_message(conv.roles[1], '')
assert conv.messages[-1][0] == conv.roles[1] and conv.messages[-1][1] == '', 'Format check'
prompt = conv.get_prompt()
inputs = self.processor(prompt, images, return_tensors='pt', truncation=True)
# FIXME: Fuyu model would return a list instead of a pytorch tensor. This weird behavior needs fixing.
if 'image_patches' in inputs.keys():
inputs['image_patches'] = inputs['image_patches'][0]
inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
output = self.model.generate(**inputs, **self.kwargs)
output = output[0]
generated_ids = output[inputs['input_ids'].shape[-1]:]
answer = self.processor.decode(generated_ids, skip_special_tokens=True)
answer = self.output_process(answer)
return answer
import sys
import torch
import os.path as osp
import os
import warnings
from .base import BaseModel
from ..smp import *
from PIL import Image
'''
Please follow the instructions to download ckpt.
https://github.com/dvlab-research/MGM?tab=readme-ov-file#pretrained-weights
'''
class Mini_Gemini(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, model_path, root=None, conv_mode='llava_v1', **kwargs):
if root is None:
warnings.warn('Please set `root` to the Mini_Gemini code directory, '
'which is cloned from "https://github.com/dvlab-research/MGM?tab=readme-ov-file".')
raise ValueError
warnings.warn('Please follow the instructions of Mini_Gemini to put the ckpt file in the right place, '
'which can be found at https://github.com/dvlab-research/MGM?tab=readme-ov-file#structure')
assert model_path == 'YanweiLi/MGM-7B-HD', 'We only support MGM-7B-HD for now'
self.model_path = model_path
sys.path.append(root)
try:
from mgm.model.builder import load_pretrained_model
from mgm.mm_utils import get_model_name_from_path
except Exception as e:
logging.critical(
'Please first install Mini_Gemini and set the root path to use Mini_Gemini, '
'which is cloned from here: "https://github.com/dvlab-research/MGM?tab=readme-ov-file" '
)
raise e
VLMEvalKit_path = os.getcwd()
os.chdir(root)
warnings.warn('Please set `root` to the Mini_Gemini code directory, '
'which is cloned from "https://github.com/dvlab-research/MGM?tab=readme-ov-file".')
model_path = osp.join(root, 'work_dirs', 'MGM', 'MGM-7B-HD')
try:
model_name = get_model_name_from_path(model_path)
except Exception as e:
logging.critical(
'Please follow the instructions of Mini_Gemini to put the ckpt file in the right place, '
'which can be found at https://github.com/dvlab-research/MGM?tab=readme-ov-file#structure'
)
raise e
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name)
os.chdir(VLMEvalKit_path)
self.model = model
self.tokenizer = tokenizer
self.image_processor = image_processor
self.conv_mode = conv_mode
kwargs_default = dict(temperature=float(0), num_beams=1, top_p=None, max_new_tokens=1024, use_cache=True)
kwargs_default.update(kwargs)
do_sample = kwargs_default['temperature'] > 0
kwargs_default.update({'do_sample': do_sample})
self.kwargs = kwargs_default
def generate_inner(self, message, dataset=None):
try:
from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, \
DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from mgm.conversation import conv_templates
from mgm.mm_utils import tokenizer_image_token, process_images
except Exception as e:
logging.critical(
'Please first install Mini_Gemini and set the root path to use Mini_Gemini, '
'which is cloned from here: "https://github.com/dvlab-research/MGM?tab=readme-ov-file" '
)
raise e
prompt, image = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image)
prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt
conv = conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], prompt)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
input_ids = input_ids.unsqueeze(0).cuda()
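# When the config defines `image_size_aux`, the image is processed once at the auxiliary
# (high) resolution; that tensor feeds the auxiliary branch (images_aux), while a
# bilinear-resized copy at the base resolution (image_size_raw * image_grid) feeds the
# regular low-resolution branch.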
if hasattr(self.model.config, 'image_size_aux'):
if not hasattr(self.image_processor, 'image_size_raw'):
self.image_processor.image_size_raw = self.image_processor.crop_size.copy()
self.image_processor.crop_size['height'] = self.model.config.image_size_aux
self.image_processor.crop_size['width'] = self.model.config.image_size_aux
self.image_processor.size['shortest_edge'] = self.model.config.image_size_aux
image_tensor = process_images([image], self.image_processor, self.model.config)[0]
image_grid = getattr(self.model.config, 'image_grid', 1)
if hasattr(self.model.config, 'image_size_aux'):
raw_shape = [
self.image_processor.image_size_raw['height'] * image_grid,
self.image_processor.image_size_raw['width'] * image_grid
]
image_tensor_aux = image_tensor
image_tensor = torch.nn.functional.interpolate(
image_tensor[None],
size=raw_shape,
mode='bilinear',
align_corners=False
)[0]
else:
image_tensor_aux = []
if image_grid >= 2:
raw_image = image_tensor.reshape(
3, image_grid, self.image_processor.image_size_raw['height'],
image_grid, self.image_processor.image_size_raw['width']
)
raw_image = raw_image.permute(1, 3, 0, 2, 4)
raw_image = raw_image.reshape(
-1, 3, self.image_processor.image_size_raw['height'], self.image_processor.image_size_raw['width']
)
if getattr(self.model.config, 'image_global', False):
global_image = image_tensor
if len(global_image.shape) == 3:
global_image = global_image[None]
global_image = torch.nn.functional.interpolate(
global_image,
size=[
self.image_processor.image_size_raw['height'],
self.image_processor.image_size_raw['width']
],
mode='bilinear',
align_corners=False
)
# [image_crops, image_global]
raw_image = torch.cat([raw_image, global_image], dim=0)
image_tensor = raw_image.contiguous()
images = image_tensor[None].to(dtype=self.model.dtype, device='cuda', non_blocking=True)
if len(image_tensor_aux) > 0:
images_aux = image_tensor_aux[None].to(dtype=self.model.dtype, device='cuda', non_blocking=True)
else:
images_aux = None
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=images,
images_aux=images_aux,
# no_repeat_ngram_size=3,
bos_token_id=self.tokenizer.bos_token_id, # Begin of sequence token
eos_token_id=self.tokenizer.eos_token_id, # End of sequence token
pad_token_id=self.tokenizer.pad_token_id, # Pad token
**self.kwargs
)
outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
return outputs
import math
import torch
import random
import numpy as np
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE, DATASET_MODALITY
class MiniCPM_V(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='openbmb/MiniCPM-V', **kwargs):
assert model_path is not None
self.model_path = model_path
print(f'load from {self.model_path}')
self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
self.model = self.model.to(dtype=torch.bfloat16)
self.model.eval().cuda()
self.kwargs = kwargs
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
torch.cuda.empty_cache()
self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
# For multi-turn datasets we don't have a custom prompt
return False
return True
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'{question}\n'
if len(options):
prompt += options_prompt
prompt = ('Study the image carefully and pick the option associated with the correct answer. '
'Focus solely on selecting the option and avoid including any other content.\n') + prompt
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=p) for p in tgt_path])
return message
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
msgs = [{'role': 'user', 'content': prompt}]
if DATASET_TYPE(dataset) == 'MCQ':
max_new_tokens = 20
elif DATASET_TYPE(dataset) == 'Y/N':
max_new_tokens = 100
else:
max_new_tokens = 1024
default_kwargs = dict(
max_new_tokens=max_new_tokens,
sampling=False,
num_beams=self.num_beams
)
default_kwargs.update(self.kwargs)
res, _, _ = self.model.chat(
image=image,
msgs=msgs,
context=None,
tokenizer=self.tokenizer,
**default_kwargs
)
return res
class MiniCPM_Llama3_V(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='openbmb/MiniCPM-Llama3-V-2_5', **kwargs):
assert model_path is not None
self.model_path = model_path
print(f'load from {self.model_path}')
self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
self.model = self.model.to(dtype=torch.float16)
self.model.eval().cuda()
self.kwargs = kwargs
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
torch.cuda.empty_cache()
self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3
self.options_system_prompt = ('Carefully read the following question and select the letter corresponding '
'to the correct answer. Highlight the applicable choices without giving '
'explanations.')
self.wo_options_system_prompt = 'Carefully read the following question. Answer the question directly.'
self.detail_system_prompt = 'Answer this question in detail.'
self.vqa_prompt = 'Answer the question using a single word or phrase.'
def use_custom_prompt(self, dataset):
if listinstr(['MCQ', 'VQA'], DATASET_TYPE(dataset)):
return True
elif dataset is not None and listinstr(['HallusionBench'], dataset):
return True
return False
def build_prompt(self, line, dataset=None):
if isinstance(line, int):
line = self.data.iloc[line]
tgt_path = self.dump_image(line, dataset)
system_prompt = ''
question = line['question']
if DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
system_prompt = self.options_system_prompt + '\nPlease just indicate your choice.'
else:
system_prompt = self.wo_options_system_prompt
if 'MMMU' in dataset: # Corner Case
prompt = system_prompt + '\n' + prompt
system_prompt = ''
elif dataset is not None and listinstr(['HallusionBench'], dataset):
question = line['question'] + ' Yes or No?'
prompt = question
elif dataset is not None and listinstr(['MME'], dataset):
question = line['question'] + ' Yes or No?'
prompt = question
elif dataset is not None and listinstr(['OCRBench'], dataset):
system_prompt = self.vqa_prompt
question = line['question']
prompt = question
elif DATASET_TYPE(dataset) == 'VQA':
if listinstr(['LLaVABench', 'MMLongBench_DOC'], dataset):
system_prompt = ''
prompt = question
elif listinstr(['MMVet'], dataset):
system_prompt = self.detail_system_prompt
prompt = question
else:
system_prompt = self.vqa_prompt
prompt = question
msgs = []
if system_prompt:
msgs.append(dict(type='text', value=system_prompt))
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def generate_inner(self, message, dataset=None):
if DATASET_TYPE(dataset) == 'MCQ':
max_new_tokens = 200
elif DATASET_TYPE(dataset) == 'Y/N':
max_new_tokens = 3
else:
max_new_tokens = 1024
default_kwargs = dict(
max_new_tokens=max_new_tokens,
sampling=False,
num_beams=self.num_beams,
)
default_kwargs.update(self.kwargs)
content = []
for x in message:
if x['type'] == 'text':
content.append(x['value'])
elif x['type'] == 'image':
image = Image.open(x['value']).convert('RGB')
content.append(image)
msgs = [{'role': 'user', 'content': content}]
res = self.model.chat(
msgs=msgs,
context=None,
image=None,
tokenizer=self.tokenizer,
**default_kwargs
)
if isinstance(res, tuple) and len(res) > 0:
res = res[0]
return res
def chat_inner(self, message, dataset=None):
max_new_tokens = 1024
default_kwargs = dict(
max_new_tokens=max_new_tokens,
sampling=False,
num_beams=self.num_beams,
)
default_kwargs.update(self.kwargs)
msgs = []
for msg in message:
content = []
if len(msg['content']) == 1 and msg['content'][0]['type'] == 'text':
msg_new = {'role': msg['role'], 'content': msg['content'][0]['value']}
msgs.append(msg_new)
continue
for x in msg['content']:
if x['type'] == 'text':
content.append(x['value'])
elif x['type'] == 'image':
image = Image.open(x['value']).convert('RGB')
content.append(image)
msg_new = {'role': msg['role'], 'content': content}
msgs.append(msg_new)
res = self.model.chat(
msgs=msgs,
context=None,
image=None,
tokenizer=self.tokenizer,
**default_kwargs)
if isinstance(res, tuple) and len(res) > 0:
res = res[0]
return res
class MiniCPM_V_2_6(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='openbmb/MiniCPM-V', **kwargs):
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)
assert model_path is not None
self.model_path = model_path
print(f'load from path {self.model_path}')
self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
self.model = self.model.to(dtype=torch.bfloat16)
self.model.eval().cuda()
self.kwargs = kwargs
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
torch.cuda.empty_cache()
self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3
self.options_suffix_prompt = '''\nAnswer with the option's letter from the given choices directly.'''
self.wo_options_system_prompt = 'Carefully read the following question. Answer the question directly.'
self.detail_system_prompt = 'Answer this question in detail.'
self.vqa_prompt = 'Answer the question using a single word or phrase.'
self.multi_choice_cot_prompt = ('''Carefully read the following multichoice question, solve it step '''
'''by step and finally pick the option associated with the correct '''
'''answer in the format of "Answer: selected option".\n\n''')
self.short_ans_cot_prompt = ('''Read the following question carefully, solve it step by step, and '''
'''then output the final answer in the format of "Answer: single number '''
'''or single word or phrase".\n\n''')
def use_custom_prompt(self, dataset=None):
if dataset is None:
return False
if DATASET_TYPE(dataset) in ['MCQ', 'VQA', 'Y/N']:
return True
return False
def use_cot(self, dataset=None):
if dataset is None:
return False
if listinstr(['MMMU', 'HallusionBench', 'OCRBench', 'ChartQA'], dataset):
return True
elif listinstr(['MathVista', 'MMVet', 'MMBench', 'MMStar', 'AI2D', 'RealWorldQA',
'POPE', 'ScienceQA', 'TextVQA', 'DocVQA'], dataset):
return False
else:
return False
def use_upsize(self, dataset=None):
if dataset is None:
return False
if listinstr(['MMVet', 'MMBench', 'MMStar', 'AI2D', 'OCRBench'], dataset):
return True
else:
return False
def build_prompt(self, line, dataset=None):
if isinstance(line, int):
line = self.data.iloc[line]
tgt_path = self.dump_image(line, dataset)
system_prompt, prompt = '', ''
question = line['question']
if not self.use_cot(dataset):
if DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += self.options_suffix_prompt
else:
system_prompt = self.wo_options_system_prompt
if 'MMMU' in dataset:
if len(system_prompt) > 0:
prompt = system_prompt + '\n' + prompt
system_prompt = ''
elif dataset is not None and listinstr(['HallusionBench'], dataset):
question += ' Yes or No?'
prompt = question
elif dataset is not None and listinstr(['OCRBench'], dataset):
system_prompt = self.vqa_prompt
prompt = question
elif DATASET_TYPE(dataset) == 'VQA':
if listinstr(['LLaVABench'], dataset):
system_prompt = ''
elif listinstr(['MMVet'], dataset):
system_prompt = self.detail_system_prompt
else:
system_prompt = self.vqa_prompt
prompt = question
else:
prompt = question
else:
has_options = True
if DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'{question}\n'
if len(options):
prompt += options_prompt
else:
has_options = False
if 'MMMU' in dataset:
if len(system_prompt) > 0:
prompt = system_prompt + '\n' + prompt
system_prompt = ''
else:
prompt = question
if DATASET_TYPE(dataset) in ['MCQ', 'Y/N', 'VQA']:
if DATASET_TYPE(dataset) == 'MCQ':
if has_options:
prompt = self.multi_choice_cot_prompt + prompt
else:
prompt = self.short_ans_cot_prompt + prompt
elif DATASET_TYPE(dataset) == 'Y/N':
prompt = self.short_ans_cot_prompt + prompt
else:
prompt = self.short_ans_cot_prompt + prompt
msgs = []
if system_prompt:
msgs.append(dict(type='text', value=system_prompt))
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def generate_inner(self, message, dataset=None):
if DATASET_MODALITY(dataset) == 'VIDEO':
max_slice_nums = 1
use_image_id = False
max_inp_length = 2048 * 10
else:
max_slice_nums = None
use_image_id = True
max_inp_length = 8192
max_new_tokens = 2048
default_kwargs = dict(
max_new_tokens=max_new_tokens,
sampling=False,
num_beams=self.num_beams,
)
default_kwargs.update(self.kwargs)
content = []
for x in message:
if x['type'] == 'text':
content.append(x['value'])
elif x['type'] == 'image':
image = Image.open(x['value']).convert('RGB')
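# For benchmarks flagged by use_upsize(), images smaller than 1344 x 1344 pixels are
# randomly upscaled (keeping the aspect ratio) toward that pixel budget before being
# handed to the model.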
if not self.use_upsize(dataset):
content.append(image)
else:
img_width, img_height = image.width, image.height
if (img_width * img_height) >= (1344 * 1344):
content.append(image)
else:
ratio = math.sqrt((1344 * 1344) / (img_width * img_height))
max_img_width = int(img_width * ratio)
new_img_width = random.randint(img_width, max_img_width)
new_img_height = int(new_img_width / img_width * img_height)
resized_image = image.resize((new_img_width, new_img_height))
content.append(resized_image)
msgs = [{'role': 'user', 'content': content}]
res = self.model.chat(
image=None,
msgs=msgs,
context=None,
tokenizer=self.tokenizer,
max_inp_length=max_inp_length,
use_image_id=use_image_id,
max_slice_nums=max_slice_nums,
**default_kwargs
)
if isinstance(res, tuple) and len(res) > 0:
res = res[0]
return res
import torch
import sys
import os.path as osp
import warnings
from transformers import StoppingCriteriaList
from .base import BaseModel
class MiniGPT4(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self,
mode='v2',
root='/mnt/petrelfs/share_data/duanhaodong/MiniGPT-4/',
temperature=1,
max_out_len=512):
if root is None:
warnings.warn(
'Please set root to the directory of MiniGPT-4, which is cloned from here: '
'https://github.com/Vision-CAIR/MiniGPT-4. '
)
if mode == 'v2':
cfg = 'minigptv2_eval.yaml'
elif mode == 'v1_7b':
cfg = 'minigpt4_7b_eval.yaml'
elif mode == 'v1_13b':
cfg = 'minigpt4_13b_eval.yaml'
else:
raise NotImplementedError
self.mode = mode
self.temperature = temperature
self.max_out_len = max_out_len
self.root = root
this_dir = osp.dirname(__file__)
self.cfg = osp.join(this_dir, 'misc', cfg)
sys.path.append(self.root)
from omegaconf import OmegaConf
from minigpt4.common.registry import registry
from minigpt4.conversation.conversation import StoppingCriteriaSub, CONV_VISION_Vicuna0, CONV_VISION_minigptv2
device = torch.cuda.current_device()
self.device = device
cfg_path = self.cfg
cfg = OmegaConf.load(cfg_path)
model_cfg = cfg.model
model_cfg.device_8bit = device
model_cls = registry.get_model_class(model_cfg.arch)
model = model_cls.from_config(model_cfg)
model = model.to(device)
model.eval()
vis_processor_cfg = cfg.datasets.cc_sbu_align.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
self.model = model
self.vis_processor = vis_processor
self.CONV_VISION = CONV_VISION_minigptv2 if self.mode == 'v2' else CONV_VISION_Vicuna0
stop_words_ids = [[835], [2277, 29937]]
stop_words_ids = [torch.tensor(ids).to(device) for ids in stop_words_ids]
self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
def generate_inner(self, message, dataset=None):
from minigpt4.conversation.conversation import Chat
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if self.mode == 'v2':
chat = Chat(self.model, self.vis_processor, device=self.device)
else:
chat = Chat(self.model, self.vis_processor, device=self.device, stopping_criteria=self.stopping_criteria)
chat_state = self.CONV_VISION.copy()
img_list = []
_ = chat.upload_img(image_path, chat_state, img_list)
chat.encode_img(img_list)
chat.ask(prompt, chat_state)
with torch.inference_mode():
msg = chat.answer(conv=chat_state, img_list=img_list)[0]
return msg
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModel, CLIPImageProcessor
import warnings
from PIL import Image
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import pandas as pd
import string
import torch.distributed as dist
import torchvision.transforms as T
import transformers
from torchvision.transforms.functional import InterpolationMode
import re
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
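# Illustrative note: applying the transform returned above to any PIL image yields a float tensor
# of shape (3, input_size, input_size), resized with bicubic interpolation and normalized with the
# ImageNet mean/std defined at the top of this file.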
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
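# Worked example (illustrative): for an 800x600 image (aspect ratio ~1.33), image_size=448 and
# candidate grids [(1, 1), (1, 2), (2, 1), (2, 2)], both (1, 1) and (2, 2) have ratio 1.0 and the
# same difference of ~0.33; the tie-break prefers (2, 2) because 800*600 exceeds half the area of
# a 2x2 grid of 448x448 tiles, so the image is split more finely.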
def dynamic_preprocess(image, min_num=5, max_num=6, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
    # enumerate candidate tiling grids (i x j) with min_num <= i * j <= max_num
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images, target_aspect_ratio
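# Worked example (illustrative): a 1344x896 image with image_size=448, min_num=5, max_num=6 and
# use_thumbnail=True selects the (3, 2) grid (six 448x448 tiles covering a 1344x896 resize) and
# appends one 448x448 thumbnail, so the function returns 7 crops together with the (3, 2) ratio.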
def dynamic_preprocess2(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False, prior_aspect_ratio=None):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
    # enumerate candidate tiling grids (i x j) with min_num <= i * j <= max_num
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
new_target_ratios = []
if prior_aspect_ratio is not None:
for i in target_ratios:
if prior_aspect_ratio[0] % i[0] != 0 or prior_aspect_ratio[1] % i[1] != 0:
new_target_ratios.append(i)
else:
continue
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, new_target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
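# Note (descriptive): unlike dynamic_preprocess, this variant filters out grids whose row and
# column counts both divide the prior grid, so the second pass crops the image at a scale that is
# complementary to the first pass rather than a strict subdivision of it.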
def load_image(image_file, input_size=448, min_num=1, max_num=6):
image = Image.open(image_file).convert('RGB')
transform = build_transform(input_size=input_size)
images, target_aspect_ratio = dynamic_preprocess(
image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values, target_aspect_ratio
def load_image2(image_file, input_size=448, target_aspect_ratio=(1, 1), min_num=1, max_num=6):
image = Image.open(image_file).convert('RGB')
transform = build_transform(input_size=input_size)
images = dynamic_preprocess2(
image,
image_size=input_size,
prior_aspect_ratio=target_aspect_ratio,
use_thumbnail=True,
min_num=min_num,
max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
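# Illustrative usage sketch (not executed here; 'demo.jpg' is a hypothetical path): the two loaders
# are intended to be combined, as generate_v2 below does, e.g.
#   pv1, ar = load_image('demo.jpg', min_num=4, max_num=12)
#   pv2 = load_image2('demo.jpg', target_aspect_ratio=ar, min_num=3, max_num=7)
#   pixel_values = torch.cat((pv1[:-1], pv2[:-1], pv1[-1:]), 0)
# which interleaves the coarse and fine crops and keeps the single thumbnail as the last tile.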
# Build a manual device_map that shards InternVL2-Llama3-76B (and similarly sized checkpoints) across GPUs
def split_model(model_name):
import math
device_map = {}
num_gpus = torch.cuda.device_count()
rank, world_size = get_rank_and_world_size()
num_gpus = num_gpus // world_size
num_layers = {'InternVL2-8B': 32, 'InternVL2-26B': 48,
'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
# Since the first GPU will be used for ViT, treat it as 0.8 GPU.
num_layers_per_gpu = math.ceil(num_layers / (num_gpus - 0.2))
num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.8)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * i
layer_cnt += 1
device_map['vision_model'] = rank
device_map['mlp1'] = rank
device_map['language_model.model.tok_embeddings'] = rank
device_map['language_model.model.embed_tokens'] = rank
device_map['language_model.output'] = rank
device_map['language_model.model.norm'] = rank
device_map['language_model.lm_head'] = rank
device_map[f'language_model.model.layers.{num_layers - 1}'] = rank
return device_map
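# Illustrative trace (assuming 8 visible GPUs, world_size=1, rank=0) for 'InternVL2-Llama3-76B':
# 80 layers over 7.8 "effective" GPUs gives ceil(80 / 7.8) = 11 layers per GPU, reduced to
# ceil(11 * 0.8) = 9 on GPU 0; GPU 0 therefore hosts layers 0-8 plus the vision tower, mlp1,
# embeddings, norm, output head and (via the final override) the last decoder layer, while the
# remaining layers are spread across GPUs 1-7.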
class MiniMonkey(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='mx262/MiniMokney', load_in_8bit=False, **kwargs):
assert model_path is not None
assert version_cmp(transformers.__version__, '4.36.2', 'ge')
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
# Regular expression to match the pattern 'Image' followed by a number, e.g. Image1
self.pattern = r'Image(\d+)'
# Replacement pattern to insert a hyphen between 'Image' and the number, e.g. Image-1
self.replacement = r'Image-\1'
        # Convert the model's response back to the dataset format,
        # e.g. Image-1 -> Image1
# Regular expression to match the pattern 'Image-' followed by a number
self.reverse_pattern = r'Image-(\d+)'
# Replacement pattern to remove the hyphen (Image-1 -> Image1)
self.reverse_replacement = r'Image\1'
if listinstr(['InternVL2-Llama3-76B'], model_path):
device_map = split_model(model_path.split('/')[-1])
self.model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
load_in_8bit=load_in_8bit,
trust_remote_code=True,
low_cpu_mem_usage=True,
device_map=device_map).eval()
else:
device = torch.cuda.current_device()
self.device = device
self.model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
trust_remote_code=True,
load_in_8bit=load_in_8bit).eval()
if not load_in_8bit:
self.model = self.model.to(device)
self.image_size = self.model.config.vision_config.image_size
self.kwargs = kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def use_custom_prompt(self, dataset):
if dataset is None:
return False
if listinstr(['MMDU'], dataset):
            # Multi-turn datasets such as MMDU do not use a custom prompt
return False
else:
return True
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
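    # Illustrative example: for a line with question 'What is shown?', no hint, and options
    # A='cat', B='dog', build_multi_choice_prompt returns
    #   "What is shown?\nA. cat\nB. dog\nAnswer with the option's letter from the given choices directly."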
def build_video_prompt(self, prompt, dataset=None, max_nframe=64):
for start in range(0, max_nframe, 8):
images_to_remove = ''.join([f'<image-{i}>' for i in range(start + 1, start + 9)])
prompt = prompt.replace(images_to_remove, '')
for i in range(max_nframe):
prompt = prompt.replace(f'<image-{i + 1}>', f'Frame{i + 1}')
if listinstr(['MMBench-Video'], dataset):
prompt = prompt.replace('\nAnswer:', '')
prompt += '\nAnswer the question using a single word or phrase.'
elif listinstr(['Video-MME'], dataset):
prompt = prompt.replace('\nAnswer:', '')
prompt += "\nAnswer with the option's letter from the given choices directly."
return prompt
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
        # reset the generation config to evaluation defaults on every prompt build
        kwargs_default = dict(do_sample=False, max_new_tokens=512, top_p=None, num_beams=1)
        self.kwargs = kwargs_default
if dataset is not None and listinstr(['MME'], dataset):
question = line['question']
prompt = question + ' Answer the question using a single word or phrase.'
elif dataset is not None and listinstr(['HallusionBench'], dataset):
question = line['question']
prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
if listinstr(['MathVista', 'MathVision'], dataset):
prompt = line['question']
elif listinstr(['LLaVABench'], dataset):
question = line['question']
prompt = question + '\nAnswer this question in detail.'
elif listinstr(['MMVet'], dataset):
prompt = line['question']
else:
question = line['question']
prompt = question + '\nAnswer the question using a single word or phrase.'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
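    # Illustrative example: for an MME line with one dumped image, build_prompt returns
    #   [{'type': 'text', 'value': '<question> Answer the question using a single word or phrase.'},
    #    {'type': 'image', 'value': '<path to the dumped image>'}]
    # i.e. the text entry comes first, followed by one entry per image path.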
def set_max_num(self, dataset):
if dataset is None:
self.max_num = 12
self.max_num2 = 7
self.min_num = 4
self.min_num2 = 3
return
if dataset is not None and listinstr(['ChartQA_TEST'], dataset):
self.max_num = 12
self.max_num2 = 3
elif dataset is not None and listinstr(['DocVQA_VAL', 'DocVQA_TEST', 'TextVQA_VAL'], dataset):
self.max_num = 23
self.max_num2 = 15
self.min_num = 14
self.min_num2 = 5
elif dataset is not None and listinstr(['InfoVQA_VAL', 'InfoVQA_TEST', 'SEEDBench_IMG'], dataset):
self.max_num = 23
self.max_num2 = 5
self.min_num = 15
self.min_num2 = 3
elif dataset is not None and listinstr(['OCRBench', 'POPE'], dataset):
self.max_num = 24
self.max_num2 = 8
self.min_num = 9
self.min_num2 = 5
elif dataset is not None and listinstr(['HallusionBench'], dataset):
self.max_num = 11
self.max_num2 = 6
self.min_num = 4
self.min_num2 = 2
elif dataset is not None and listinstr(['MME'], dataset):
self.max_num = 11
self.max_num2 = 6
self.min_num = 5
self.min_num2 = 2
elif dataset is not None and listinstr(['AI2D_TEST'], dataset):
self.max_num = 12
self.max_num2 = 6
self.min_num = 5
self.min_num2 = 2
elif dataset is not None and listinstr(['CCBench'], dataset):
self.max_num = 24
self.max_num2 = 8
self.min_num = 9
self.min_num2 = 4
elif dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset):
self.max_num = 12
self.max_num2 = 7
self.min_num = 5
self.min_num2 = 3
else:
self.max_num = 12
self.max_num2 = 7
self.min_num = 4
self.min_num2 = 3
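    # Note (descriptive): max_num/min_num bound the coarse tiling grid used by load_image, while
    # max_num2/min_num2 bound the complementary finer grid used by load_image2; the per-dataset
    # values above appear to trade crop resolution against visual sequence length.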
def generate_v2(self, message, dataset=None):
image_num = len([x for x in message if x['type'] == 'image'])
if image_num == 1:
prompt = '<image>\n' + '\n'.join([x['value'] for x in message if x['type'] == 'text'])
else:
prompt, image_idx = '', 1
for x in message:
if x['type'] == 'text':
prompt += x['value']
elif x['type'] == 'image':
prompt += f'<image-{image_idx}>'
image_idx += 1
prompt = ' '.join([f'<image-{i + 1}>: <image>' for i in range(image_num)]) + '\n' + prompt
if dataset is not None and listinstr(['Video'], dataset):
prompt = self.build_video_prompt(prompt, dataset)
if image_num > 1:
image_path = [x['value'] for x in message if x['type'] == 'image']
num_patches_list = []
pixel_values_list = []
for image_idx, file_name in enumerate(image_path):
curr_pixel_values, target_aspect_ratio = load_image(
file_name, min_num=self.min_num, max_num=self.max_num)
curr_pixel_values = curr_pixel_values.cuda().to(torch.bfloat16)
curr_pixel_values2 = load_image2(
file_name, target_aspect_ratio=target_aspect_ratio, min_num=self.min_num2, max_num=self.max_num2)
curr_pixel_values2 = curr_pixel_values2.cuda().to(torch.bfloat16)
curr_pixel_values = torch.cat(
(curr_pixel_values[:-1], curr_pixel_values2[:-1], curr_pixel_values[-1:]), 0)
num_patches_list.append(curr_pixel_values.size(0))
pixel_values_list.append(curr_pixel_values)
pixel_values = torch.cat(pixel_values_list, dim=0)
elif image_num == 1:
image_path = [x['value'] for x in message if x['type'] == 'image'][0]
pixel_values, target_aspect_ratio = load_image(image_path, min_num=self.min_num, max_num=self.max_num)
pixel_values = pixel_values.cuda().to(torch.bfloat16)
pixel_values2 = load_image2(
image_path, target_aspect_ratio=target_aspect_ratio, min_num=self.min_num2, max_num=self.max_num2)
pixel_values2 = pixel_values2.cuda().to(torch.bfloat16)
pixel_values = torch.cat((pixel_values[:-1], pixel_values2[:-1], pixel_values[-1:]), 0)
num_patches_list = [pixel_values.size(0)]
else:
pixel_values = None
num_patches_list = []
with torch.no_grad():
response = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
target_aspect_ratio=(1, 1),
num_patches_list=num_patches_list,
question=prompt,
generation_config=self.kwargs,
verbose=False
)
return response
def generate_inner(self, message, dataset=None):
self.set_max_num(dataset)
return self.generate_v2(message, dataset)
def build_history(self, message):
        # accumulators shared with the nested helper below (closed over via `nonlocal`)
image_path = []
image_cnt = 0
def concat_tilist(tilist):
nonlocal image_cnt # Declare image_cnt as nonlocal to modify it
prompt = ''
for item in tilist:
# Substitute the pattern in the text
if item['type'] == 'text':
prompt += re.sub(self.pattern, self.replacement, item['value'])
elif item['type'] == 'image':
image_cnt += 1
prompt += '<image>\n'
image_path.append(item['value'])
return prompt
        # `message` holds only previous turns, so it must consist of (user, assistant) pairs
        assert len(message) % 2 == 0
history = []
for i in range(len(message) // 2):
m1, m2 = message[2 * i], message[2 * i + 1]
assert m1['role'] == 'user' and m2['role'] == 'assistant'
history.append((concat_tilist(m1['content']), concat_tilist(m2['content'])))
return history, image_path, image_cnt
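    # Illustrative example: build_history on one prior user/assistant pair
    #   [{'role': 'user', 'content': [{'type': 'image', 'value': 'a.jpg'}, {'type': 'text', 'value': 'Describe Image1'}]},
    #    {'role': 'assistant', 'content': [{'type': 'text', 'value': 'A cat.'}]}]
    # returns history=[('<image>\nDescribe Image-1', 'A cat.')], image_path=['a.jpg'], image_cnt=1.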
def chat_inner_v2(self, message, dataset=None):
image_cnt = 0
if len(message) > 1:
history, image_path, image_cnt = self.build_history(message[:-1])
else:
history, image_path, image_cnt = None, [], 1
current_msg = message[-1]
question = ''
        # The current turn contains only text
        if len(current_msg['content']) == 1 and current_msg['content'][0]['type'] == 'text':
            question = current_msg['content'][0]['value']
            # rewrite ImageN references to the Image-N form that InternVL expects
            question = re.sub(self.pattern, self.replacement, question)
else:
for msg in current_msg['content']:
if msg['type'] == 'text':
question += re.sub(self.pattern, self.replacement, msg['value'])
elif msg['type'] == 'image':
image_cnt += 1
question += '<image>\n'
image_path.append(msg['value'])
if image_cnt > 1:
num_patches_list = []
pixel_values_list = []
for image_idx, file_name in enumerate(image_path):
curr_pixel_values, target_aspect_ratio = load_image(
file_name, min_num=self.min_num, max_num=self.max_num)
curr_pixel_values = curr_pixel_values.cuda().to(torch.bfloat16)
curr_pixel_values2 = load_image2(
file_name, target_aspect_ratio=target_aspect_ratio, min_num=self.min_num2, max_num=self.max_num2)
curr_pixel_values2 = curr_pixel_values2.cuda().to(torch.bfloat16)
curr_pixel_values = torch.cat(
(curr_pixel_values[:-1], curr_pixel_values2[:-1], curr_pixel_values[-1:]), 0)
num_patches_list.append(curr_pixel_values.size(0))
pixel_values_list.append(curr_pixel_values)
pixel_values = torch.cat(pixel_values_list, dim=0)
        elif image_cnt == 1:
            # image_path is a list; unpack its single element before loading
            pixel_values, target_aspect_ratio = load_image(
                image_path[0], min_num=self.min_num, max_num=self.max_num)
            pixel_values = pixel_values.cuda().to(torch.bfloat16)
            pixel_values2 = load_image2(
                image_path[0], target_aspect_ratio=target_aspect_ratio,
                min_num=self.min_num2, max_num=self.max_num2)
            pixel_values2 = pixel_values2.cuda().to(torch.bfloat16)
            pixel_values = torch.cat((pixel_values[:-1], pixel_values2[:-1], pixel_values[-1:]), 0)
            num_patches_list = [pixel_values.size(0)]
        else:
            pixel_values = None
            num_patches_list = []
            # no image in the conversation: fall back to a unit aspect ratio so the
            # chat call below still receives a defined value
            target_aspect_ratio = (1, 1)
response, history = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
target_aspect_ratio=target_aspect_ratio,
num_patches_list=num_patches_list,
question=question,
generation_config=self.kwargs,
history=history,
return_history=True
)
response = re.sub(self.reverse_pattern, self.reverse_replacement, response)
return response
def chat_inner(self, message, dataset=None):
self.set_max_num(dataset)
kwargs_default = dict(do_sample=False, max_new_tokens=512, top_p=None, num_beams=1)
self.kwargs = kwargs_default
return self.chat_inner_v2(message, dataset)
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: instruct_vicuna13b
load_finetuned: False
load_pretrained: True
pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth"
finetuned: ""
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
# Q-Former
num_query_token: 32
# path to Vicuna checkpoint
llm_model: "Please set the path to your vicuna-13b-v1.1"
# generation configs
prompt: ""
preprocess:
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: instruct_vicuna7b
load_finetuned: False
load_pretrained: True
pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth"
finetuned: ""
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
# Q-Former
num_query_token: 32
# path to Vicuna checkpoint
llm_model: "Please set the path to your vicuna-7b-v1.1"
# generation configs
prompt: ""
preprocess:
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
model:
arch: minigpt4
model_type: pretrain_vicuna_7b
max_txt_len: 160
end_sym: "###"
low_resource: True
prompt_template: '###Human: {} ###Assistant: '
ckpt: "please set this value to the path of pretrained checkpoint"
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
freeze_qformer: True
# Q-Former
num_query_token: 32
# generation configs
prompt: ""
llama_model: "please set this value to the path of vicuna-13b-v0"
datasets:
cc_sbu_align:
vis_processor:
train:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
run:
task: image_text_pretrain