Commit bc5ebf0f authored by luopl

Initial commit

import torch
from transformers import AutoTokenizer, AutoConfig, AutoModel, CLIPImageProcessor
import warnings
from PIL import Image
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE, DATASET_MODALITY
import pandas as pd
import string
import torch.distributed as dist
import torchvision.transforms as T
import transformers
from torchvision.transforms.functional import InterpolationMode
import re
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
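# Added note: find_closest_aspect_ratio below picks the candidate tiling grid whose aspect
# ratio is nearest to the input image's; ties are broken in favour of grids with more tiles
# whenever the original image covers more than half of that grid's pixel budget.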
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def dynamic_preprocess(image, min_num=1, max_num=4, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# enumerate candidate tiling grids (columns x rows) whose tile count lies in [min_num, max_num]
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
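# Worked example (added for illustration, not part of the original code): a 1024x512 image
# with image_size=448 and max_num=4 has aspect ratio 2.0, so the closest grid is (2, 1).
# The image is resized to 896x448 and split into two 448x448 tiles; with use_thumbnail=True
# a 448x448 thumbnail of the whole image is appended, giving 3 crops in total.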
def load_image(image_file, input_size=448, max_num=6, upscale=False):
image = Image.open(image_file).convert('RGB')
if upscale:
image = image.resize((image.width * 2, image.height * 2), Image.BILINEAR)
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
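# Usage sketch (added; 'example.jpg' is a placeholder path): load_image stacks the transformed
# crops into a float tensor of shape (num_tiles, 3, input_size, input_size). For the 1024x512
# example above it would return a tensor of shape (3, 3, 448, 448).
#
#   pixel_values = load_image('example.jpg', input_size=448, max_num=6)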
class VinternChat(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='5CD-AI/Vintern-3B-beta', load_in_8bit=False, **kwargs):
assert model_path is not None
assert version_cmp(transformers.__version__, '4.36.2', 'ge')
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
# Regular expression to match the pattern 'Image' followed by a number, e.g. Image1
self.pattern = r'Image(\d+)'
# Replacement pattern to insert a hyphen between 'Image' and the number, e.g. Image-1
self.replacement = r'Image-\1'
# Convert InternVL2 response to dataset format
# e.g. Image1 -> Image-1
# Regular expression to match the pattern 'Image-' followed by a number
self.reverse_pattern = r'Image-(\d+)'
# Replacement pattern to remove the hyphen (Image-1 -> Image1)
self.reverse_replacement = r'Image\1'
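# Illustration (added): re.sub(self.pattern, self.replacement, 'Image1 and Image2')
# returns 'Image-1 and Image-2'; the reverse pattern/replacement pair maps the model's
# 'Image-1' style output back to the dataset's 'Image1' style.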
device = torch.cuda.current_device()
self.device = device
self.model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
trust_remote_code=True,
load_in_8bit=load_in_8bit).eval()
if not load_in_8bit:
self.model = self.model.to(device)
self.image_size = self.model.config.vision_config.image_size
kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=3)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will be used as the generation config.')
def use_custom_prompt(self, dataset):
if dataset is None:
return False
if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
# For multi-turn datasets we don't have a custom prompt
return False
if DATASET_MODALITY(dataset) == 'VIDEO':
# For video benchmarks we don't have a custom prompt here
return False
else:
return True
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
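# Example of the final MCQ prompt (added, with illustrative values):
#   "What is shown in the image?\nA. a cat\nB. a dog\nAnswer with the option's letter from the given choices directly."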
def build_video_prompt(self, prompt, dataset=None, max_frames=64):
for start in range(0, max_frames, 8):
images_to_remove = ''.join([f'<Image-{i}>' for i in range(start + 1, start + 9)])
prompt = prompt.replace(images_to_remove, '')
for i in range(max_frames):
prompt = prompt.replace(f'Image-{i + 1}', f'Frame-{i + 1}')
if listinstr(['MMBench-Video'], dataset):
prompt = prompt.replace('\nAnswer:', '')
elif listinstr(['Video-MME'], dataset):
prompt = prompt.replace('\nAnswer:', '')
prompt += "\nAnswer with the option's letter from the given choices directly."
elif listinstr(['MVBench'], dataset):
prompt = prompt.replace('Best option:(', '')
return prompt
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=3)
if listinstr(['MTVQA'], dataset):
kwargs_default["max_new_tokens"] = 256
if listinstr(['MMMU_DEV_VAL','MMMU_TEST'], dataset):
kwargs_default["num_beams"] = 1
self.kwargs = kwargs_default
if dataset is not None and DATASET_TYPE(dataset) == 'Y/N':
question = line['question']
if listinstr(['MME'], dataset):
prompt = question + ' Answer the question using a single word or phrase.'
elif listinstr(['HallusionBench'], dataset):
prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
else:
prompt = line['question']
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
question = line['question']
if listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse'], dataset):
prompt = question
elif listinstr(['LLaVABench'], dataset):
prompt = question + '\nAnswer this question in detail.'
else:
prompt = question + '\nAnswer the question using a single word or phrase.'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def set_max_num(self, dataset):
if dataset is None:
self.max_num = 1
return
# res_1_datasets = ['MMBench-Video', 'Video-MME', 'MVBench', 'Video']
res_12_datasets = ['ChartQA_TEST', 'MMMU_DEV_VAL', 'MMMU_TEST', 'MME-RealWorld',
'MME-RealWorld-CN', 'VCR_EN', 'VCR_ZH']
res_18_datasets = ['DocVQA_VAL', 'DocVQA_TEST']
res_24_datasets = ['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench', 'HRBench4K', 'HRBench8K']
if DATASET_MODALITY(dataset) == 'VIDEO':
self.max_num = 1
elif listinstr(res_12_datasets, dataset):
self.max_num = 6 # 12
elif listinstr(res_18_datasets, dataset):
self.max_num = 6 # 18
elif listinstr(res_24_datasets, dataset):
self.max_num = 6 # 24
elif listinstr(["MME"], dataset):
self.max_num = 6 # 24
else:
self.max_num = 6 # 6
def generate_v2(self, message, dataset=None):
image_num = len([x for x in message if x['type'] == 'image'])
if image_num == 1:
prompt = '<image>\n' + '\n'.join([x['value'] for x in message if x['type'] == 'text'])
else:
prompt, image_idx = '', 1
for x in message:
if x['type'] == 'text':
prompt += x['value']
elif x['type'] == 'image':
prompt += f'<Image-{image_idx}>'
image_idx += 1
prompt = '\n'.join([f'Image-{i + 1}: <image>' for i in range(image_num)]) + '\n' + prompt
if dataset is not None and DATASET_MODALITY(dataset) == 'VIDEO':
prompt = self.build_video_prompt(prompt, dataset)
if image_num > 1:
image_path = [x['value'] for x in message if x['type'] == 'image']
num_patches_list = []
pixel_values_list = []
for image_idx, file_name in enumerate(image_path):
upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
curr_pixel_values = load_image(
file_name, max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list.append(curr_pixel_values.size(0))
pixel_values_list.append(curr_pixel_values)
pixel_values = torch.cat(pixel_values_list, dim=0)
elif image_num == 1:
image_path = [x['value'] for x in message if x['type'] == 'image'][0]
upscale_flag = dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
pixel_values = load_image(
image_path, max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list = [pixel_values.size(0)]
else:
pixel_values = None
num_patches_list = []
with torch.no_grad():
response = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
num_patches_list=num_patches_list,
question=prompt,
generation_config=self.kwargs,
verbose=False
)
return response
def generate_inner(self, message, dataset=None):
self.set_max_num(dataset)
return self.generate_v2(message, dataset)
def build_history(self, message):
# Global Variables
image_path = []
image_cnt = 0
def concat_tilist(tilist):
nonlocal image_cnt # Declare image_cnt as nonlocal to modify it
prompt = ''
for item in tilist:
# Substitute the pattern in the text
if item['type'] == 'text':
prompt += re.sub(self.pattern, self.replacement, item['value'])
elif item['type'] == 'image':
image_cnt += 1
prompt += '<image>\n'
image_path.append(item['value'])
return prompt
# Only previous messages
assert len(message) % 2 == 0
history = []
for i in range(len(message) // 2):
m1, m2 = message[2 * i], message[2 * i + 1]
assert m1['role'] == 'user' and m2['role'] == 'assistant'
history.append((concat_tilist(m1['content']), concat_tilist(m2['content'])))
return history, image_path, image_cnt
def chat_inner_v2(self, message, dataset=None):
image_cnt = 0
if len(message) > 1:
history, image_path, image_cnt = self.build_history(message[:-1])
else:
history, image_path, image_cnt = None, [], 1
current_msg = message[-1]
question = ''
# If message is just text in the conversation
if len(current_msg['content']) == 1 and current_msg['content'][0]['type'] == 'text':
question = current_msg['content'][0]['value']
question = re.sub(self.pattern, self.replacement, question) # Fix pattern as per InternVL
else:
for msg in current_msg['content']:
if msg['type'] == 'text':
question += re.sub(self.pattern, self.replacement, msg['value'])
elif msg['type'] == 'image':
image_cnt += 1
question += '<image>\n'
image_path.append(msg['value'])
if image_cnt > 1:
num_patches_list = []
pixel_values_list = []
for image_idx, file_name in enumerate(image_path):
upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
curr_pixel_values = load_image(
file_name, max_num=1, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list.append(curr_pixel_values.size(0))
pixel_values_list.append(curr_pixel_values)
pixel_values = torch.cat(pixel_values_list, dim=0)
elif image_cnt == 1:
upscale_flag = dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
# image_path holds a single entry here; index it before loading
pixel_values = load_image(
image_path[0], max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
num_patches_list = [pixel_values.size(0)]
else:
pixel_values = None
num_patches_list = []
response, history = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
num_patches_list=num_patches_list,
question=question,
generation_config=self.kwargs,
history=history,
return_history=True
)
response = re.sub(self.reverse_pattern, self.reverse_replacement, response)
return response
def chat_inner(self, message, dataset=None):
self.set_max_num(dataset)
kwargs_default = dict(do_sample=False, max_new_tokens=512, top_p=None, num_beams=3)
self.kwargs = kwargs_default
return self.chat_inner_v2(message, dataset)
import warnings
from .base import BaseModel
from ..smp import *
class VisualGLM(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='THUDM/visualglm-6b', **kwargs):
try:
import sat
except Exception as err:
logging.critical('Please install SwissArmyTransformer to use VisualGLM')
raise err
assert model_path is not None
self.model_path = model_path
from transformers import AutoModel
from transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
self.model = model
self.kwargs = kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will be used as the generation config.')
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
output, _ = self.model.chat(
image_path=image_path,
tokenizer=self.tokenizer,
query=prompt,
history=[],
**self.kwargs
)
return output
import torch
import sys
import os.path as osp
import warnings
from .base import BaseModel
from transformers import StoppingCriteriaList
from PIL import Image
from huggingface_hub import snapshot_download
from vlmeval.smp import *
model_cfgs = {
'XVERSE-V-13B': {
'arch': 'vxverse',
'model_type': 'pretrain_xverse13b-chat',
'max_txt_len': 512,
'end_sym': '<|endoftext|>',
'low_resource': False,
'prompt_template': 'Human: {}\nAssistant: ',
'ckpt': 'xverse/XVERSE-V-13B',
'lora_r': 128,
'lora_alpha': 256,
'lora_dropout': 0.05,
'lora_target_modules': 'all_linear',
'has_qformer': False,
'n_proj_layers': 2,
'vit_model': 'openai/clip-vit-large-patch14',
'vit_path': 'openai/clip-vit-large-patch14',
'image_size': 224,
'drop_path_rate': 0,
'vit_precision': 'fp16',
'llama_model': 'xverse/XVERSE-13B-Chat',
}
}
class VXVERSE(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, model_name='XVERSE-V-13B', root=None, **kwargs):
from omegaconf import OmegaConf
if root is None:
warnings.warn('Please set root to the directory of vxverse.')
if model_name == 'XVERSE-V-13B':
cfg = model_cfgs['XVERSE-V-13B']
else:
raise NotImplementedError
ckpt_dir = cfg['ckpt']
if not osp.isdir(ckpt_dir):
cache_path = get_cache_path(ckpt_dir)
if cache_path is not None:
ckpt_dir = cache_path
else:
ckpt_dir = snapshot_download(repo_id=ckpt_dir)
assert osp.exists(ckpt_dir) and osp.isdir(ckpt_dir)
ckpt = osp.join(ckpt_dir, 'adapter_and_lora.bin')
cfg['ckpt'] = ckpt
model_cfg = OmegaConf.create(cfg)
self.model_name = model_name
self.root = root
sys.path.append(self.root)
from vxverse.common.registry import registry
from vxverse.conversation.conversation import CONV_VISION_XVERSE
device = torch.cuda.current_device()
self.device = device
model_cls = registry.get_model_class(model_cfg.arch)
model = model_cls.from_config(model_cfg)
model = model.to(device)
model.eval()
vis_processor_cfg = OmegaConf.create(dict(name='hd_image_train', image_size=224))
vis_processor = registry.get_processor_class(
vis_processor_cfg.name
).from_config(vis_processor_cfg)
self.model = model
self.vis_processor = vis_processor
self.vis_processor_cfg = vis_processor_cfg
self.CONV_VISION = CONV_VISION_XVERSE
self.CONV_VISION.system = ''
stop_words_ids = [[835], [2277, 29937]]
self.stop_words_ids = stop_words_ids
default_kwargs = dict(max_new_tokens=512)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
image = self.vis_processor(image)
if self.vis_processor_cfg.name == 'hd_image_train':
patches_per_image = [[image.shape[0]]]
image = [image]
else:
patches_per_image = None
image = image.unsqueeze(0)
chat_state = self.CONV_VISION.copy()
texts = self.prepare_texts([prompt], chat_state)
texts = [text.lstrip() for text in texts]
answers = self.model.generate(
image,
texts,
patches_per_images=patches_per_image,
do_sample=False,
stop_words_ids=self.stop_words_ids,
**self.kwargs
)
return answers[0]
def prepare_texts(self, texts, conv_temp):
convs = [conv_temp.copy() for _ in range(len(texts))]
[
conv.append_message(conv.roles[0], '<ImageHere>\n{}'.format(text))
for conv, text in zip(convs, texts)
]
[conv.append_message(conv.roles[1], None) for conv in convs]
texts = [conv.get_prompt() for conv in convs]
return texts
import torch
from PIL import Image
import sys
from ..smp import *
from .base import BaseModel
from ..dataset import DATASET_TYPE
from transformers import AutoModel, GenerationConfig
class WeMM(BaseModel):
def __init__(self, model_path='feipengma/WeMM', **kwargs):
self.wemm = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
self.wemm.cuda()
self.wemm.eval()
torch.cuda.empty_cache()
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += (
'\n请直接回答选项字母。' if cn_string(prompt) else
"\nAnswer with the option's letter from the given choices directly."
)
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=p) for p in tgt_path])
return message
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset == 'HallusionBench':
prompt = prompt + ' Please answer yes or no. Answer the question using a single word or phrase.'
gen_config = None
if dataset == 'MMVet':
gen_config = GenerationConfig(
max_new_tokens=512,
do_sample=True,
temperature=0.7,
num_beams=3,
eos_token_id=self.wemm.tokenizer.eos_token_id,
pad_token_id=self.wemm.tokenizer.pad_token_id
if self.wemm.tokenizer.pad_token_id is not None else self.wemm.tokenizer.eos_token_id,
)
pred = self.wemm.mm_generate(image_path, prompt, gen_config)
return pred
from .sharecaptioner import ShareCaptioner
from .xcomposer import XComposer
from .xcomposer2 import XComposer2
from .xcomposer2_4KHD import XComposer2_4KHD
from .xcomposer2d5 import XComposer2d5
__all__ = ['ShareCaptioner', 'XComposer', 'XComposer2', 'XComposer2_4KHD', 'XComposer2d5']
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
class ShareCaptioner(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='Lin-Chen/ShareCaptioner', **kwargs):
assert model_path is not None
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(
model_path, device_map='cuda', trust_remote_code=True).eval()
self.model.tokenizer = tokenizer
self.model.cuda()
self.model.half()
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
option_candidate = string.ascii_uppercase
options = {
cand: line[cand]
for cand in option_candidate
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if not cn_string(prompt):
prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly."
else:
prompt = prompt + '\n' + '请直接回答选项字母。'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
seg1 = '<|User|>:'
seg2 = f'{prompt}{self.model.eoh}\n<|Bot|>:'
self.seg_emb1 = self.model.encode_text(seg1, add_special_tokens=True)
self.seg_emb2 = self.model.encode_text(seg2, add_special_tokens=False)
image = Image.open(image_path).convert('RGB')
image = self.model.vis_processor(image).unsqueeze(0)
image = image.to(self.model.device)
tmp_bs = image.shape[0]
tmp_seg_emb1 = self.seg_emb1.repeat(tmp_bs, 1, 1)
tmp_seg_emb2 = self.seg_emb2.repeat(tmp_bs, 1, 1)
with torch.cuda.amp.autocast():
with torch.no_grad():
image = self.model.encode_img(image)
input_emb = torch.cat(
[tmp_seg_emb1, image, tmp_seg_emb2], dim=1)
out_embeds = self.model.internlm_model.generate(
inputs_embeds=input_emb,
max_length=500,
num_beams=3,
min_length=1,
do_sample=True,
repetition_penalty=1.5,
length_penalty=1.0,
temperature=1.,
eos_token_id=self.model.tokenizer.eos_token_id,
num_return_sequences=1)
for j, out in enumerate(out_embeds):
out[out == -1] = 2
response = self.model.decode_text([out])
return response
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import StoppingCriteria, StoppingCriteriaList
from PIL import Image
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
class StoppingCriteriaSub(StoppingCriteria):
def __init__(self, stops=[], encounters=1):
super().__init__()
self.stops = stops
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
for stop in self.stops:
if torch.all((stop == input_ids[0][-len(stop):])).item():
return True
return False
class XComposer(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='internlm/internlm-xcomposer-vl-7b', **kwargs):
assert model_path is not None
self.model_path = model_path
model = AutoModel.from_pretrained(self.model_path, device_map='cpu', trust_remote_code=True).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.internlm_model.model.embed_tokens.weight.device
self.eoh = '<TOKENS_UNUSED_0>'
self.eoa = '<TOKENS_UNUSED_1>'
stop_words_ids = [
torch.tensor([103027]).to(self.device), # end of human
torch.tensor([103028]).to(self.device), # end of bot
]
default_kwargs = {
'max_new_tokens': 512, 'num_beams': 5, 'do_sample': False,
'min_length': 1, 'repetition_penalty': 1.5, 'length_penalty': 1.0
}
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
def generate_inner(self, message, dataset=None):
if len(message) == 2:
if message[0]['type'] == 'text' and message[1]['type'] == 'image':
message = [message[1], message[0]]
kwargs = cp.deepcopy(self.kwargs)
if dataset is not None:
if DATASET_TYPE(dataset) == 'MCQ':
kwargs['max_new_tokens'] = 5
kwargs['num_beams'] = 5
with torch.cuda.amp.autocast():
with torch.no_grad():
prompt_embs = self.message_to_prompt_embs(message, dataset)
outputs = self.model.internlm_model.generate(
inputs_embeds=prompt_embs,
stopping_criteria=self.stopping_criteria,
**kwargs
)
output_token = outputs[0]
if output_token[0] == 0:
output_token = output_token[1:]
if output_token[0] == 1:
output_token = output_token[1:]
output_text = self.model.tokenizer.decode(output_token, add_special_tokens=False)
output_text = output_text.split(self.model.eoa)[0]
output_text = output_text.split('<|Bot|>')[-1].strip()
return output_text
def message_to_prompt_embs(self, message, dataset=None):
assert isinstance(message, list)
img_embeds = []
prompt_full = '<|User|>: '
for msg in message:
if msg['type'] == 'text':
prompt_full += msg['value']
elif msg['type'] == 'image':
image = Image.open(msg['value']).convert('RGB')
image = self.model.vis_processor(image).unsqueeze(0).to(self.device)
img_embeds.append(self.model.encode_img(image))
prompt_full += '<ImageHere>'
prompt_full += self.model.eoh + ' <|Bot|>: '
if dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt_full += 'Answer: The answer is '
elif dataset is not None and DATASET_TYPE(dataset) in ['VQA', 'QA', 'Y/N']:
prompt_full += 'Answer: '
prompt_segs = prompt_full.split('<ImageHere>')
assert len(prompt_segs) == len(img_embeds) + 1
prompt_seg_tokens = [
self.model.tokenizer(seg, return_tensors='pt', add_special_tokens=(i == 0)).to(self.device).input_ids.long()
for i, seg in enumerate(prompt_segs)
]
prompt_seg_embs = [self.model.internlm_model.model.embed_tokens(seg) for seg in prompt_seg_tokens]
all_embeddings = []
for i in range(len(img_embeds)):
all_embeddings.extend([prompt_seg_embs[i], img_embeds[i]])
all_embeddings.append(prompt_seg_embs[-1])
prompt_embs = torch.cat(all_embeddings, dim=1)
return prompt_embs
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Context: ' + context + '\nQuestion: ' + question
if len(options_prompt):
mid_prompt += '\nOptions: ' + options_prompt
if len(options):
txt_prompt = 'Please answer this question by choosing the correct choice.'
else:
txt_prompt = 'Please answer this question directly. '
prompt = txt_prompt + mid_prompt
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
import torch
import torchvision
from transformers import AutoModel, AutoTokenizer
from PIL import Image
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
import re
pattern = re.compile(r'[A-Z]')
def __padding__(image):
width, height = image.size
tar = max(width, height)
top_padding = int((tar - height) / 2)
bottom_padding = tar - height - top_padding
left_padding = int((tar - width) / 2)
right_padding = tar - width - left_padding
image = torchvision.transforms.functional.pad(image, [left_padding, top_padding, right_padding, bottom_padding])
return image
meta_instruction = """
You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).
- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed by
Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language
chosen by the user such as English and 中文.
- InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses effectively
based on the provided image.
"""
def model_gen(model, text, images, need_bos=True, padding=False, beams=3, max_token=500):
pt1 = 0
embeds = []
im_mask = []
images = [images]
images_loc = [0]
for i, pts in enumerate(images_loc + [len(text)]):
subtext = text[pt1:pts]
if need_bos or len(subtext) > 0:
text_embeds = model.encode_text(subtext, add_special_tokens=need_bos)
embeds.append(text_embeds)
im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
need_bos = False
if i < len(images):
try:
image = Image.open(images[i]).convert('RGB')
except:
image = images[i].convert('RGB')
if padding:
image = __padding__(image)
image = model.vis_processor(image).unsqueeze(0).cuda()
image_embeds = model.encode_img(image)
embeds.append(image_embeds)
im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
pt1 = pts
embeds = torch.cat(embeds, dim=1)
im_mask = torch.cat(im_mask, dim=1)
im_mask = im_mask.bool()
outputs = model.generate(
inputs_embeds=embeds,
im_mask=im_mask,
temperature=1.0,
max_new_tokens=max_token,
num_beams=beams,
do_sample=False,
repetition_penalty=1.0)
output_token = outputs[0]
if output_token[0] == 0 or output_token[0] == 1:
output_token = output_token[1:]
output_text = model.tokenizer.decode(output_token, add_special_tokens=False)
output_text = output_text.split('[UNUSED_TOKEN_145]')[0].strip()
return output_text
class XComposer2(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='internlm/internlm-xcomposer2-vl-7b', **kwargs):
assert model_path is not None
self.model_path = model_path
model = AutoModel.from_pretrained(self.model_path, device_map='cpu', trust_remote_code=True).cuda().eval()
model.half()
tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.model.tok_embeddings.weight.device
def generate_mme(self, image_path, text):
text = text.split('Please answer')[0].strip()
text = f'{text} Answer this question briefly'
text = f'[UNUSED_TOKEN_146]user\n{text}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
return model_gen(self.model, text, image_path, need_bos=True, padding=True, beams=5)
def generate_multichoice(self, image_path, text, dataset):
out = model_gen(self.model, text, image_path, need_bos=True, padding=False, beams=5, max_token=5)
if 'mmmu' in dataset.lower():
return out
res = pattern.findall(out)
if len(res) == 0:
print('Error:', out)
res = 'Z'
return res[0]
def generate_vqa(self, image_path, text):
out = model_gen(self.model, text, image_path, need_bos=True)
return out
def generate_vanilla(self, image_path, text):
text = (
'[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n{}'
'Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(meta_instruction, text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=500)
return out
def generate_brief(self, image_path, text):
text = (
'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{}'
'[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
).format(text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=10)
return out
def generate_driectly(self, image_path, text):
text = '[UNUSED_TOKEN_146]user\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'.format(text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=500)
return out
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
with torch.cuda.amp.autocast():
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if dataset == 'MME':
return self.generate_mme(image_path, prompt)
elif listinstr(['hallu'], dataset.lower()):
return self.generate_brief(image_path, prompt)
elif listinstr(['llava'], dataset.lower()):
return self.generate_vanilla(image_path, prompt)
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
return self.generate_multichoice(image_path, prompt, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
return self.generate_vqa(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
return True
return False
def build_mcqa(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
img_prompt = '[UNUSED_TOKEN_146]user\n'
if len(options):
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item} '
options_prompt = options_prompt.strip()
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Question: ' + question + '\nContext: ' + context + '\nOptions: ' + options_prompt
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is'
prompt = img_prompt + mid_prompt + ans_prompt
else:
mid_prompt = f'Answer the question using a single word or phrase.{question}'
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
prompt = img_prompt + mid_prompt + ans_prompt
return prompt
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_mcqa(line)
elif DATASET_TYPE(dataset) == 'VQA':
if 'mathvista' in dataset.lower():
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
else:
q = line['question']
prompt = (
f'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{q}'
'[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
)
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
import numpy as np
import torchvision.transforms as transforms
import re
pattern = re.compile(r'[A-Z]')
def padding_336(b):
width, height = b.size
tar = int(np.ceil(height / 336) * 336)
top_padding = int((tar - height) / 2)
bottom_padding = tar - height - top_padding
left_padding = 0
right_padding = 0
b = transforms.functional.pad(b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255, 255, 255])
return b
def HD_transform(img, im_num=16):
width, height = img.size
trans = False
if width < height:
img = img.transpose(Image.TRANSPOSE)
trans = True
width, height = img.size
ratio = (width / height)
scale = 1
while scale * np.ceil(scale / ratio) <= im_num:
scale += 1
scale -= 1
new_w = int(scale * 336)
new_h = int(new_w / ratio)
img = transforms.functional.resize(img, [new_h, new_w],)
img = padding_336(img)
width, height = img.size
assert width * height <= im_num * 336 * 336
if trans:
img = img.transpose(Image.TRANSPOSE)
return img
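# Walk-through (added for illustration): with im_num=16, a 1280x720 image has ratio ~1.78,
# so the largest scale with scale * ceil(scale / ratio) <= 16 is 5. The image is resized to
# 1680x945 and padded to 1680x1008, i.e. a 5x3 grid of 336-pixel patches (portrait images
# are transposed first and transposed back at the end).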
meta_instruction = """You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).
- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed\
by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen by\
the user such as English and 中文.
- InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses\
effectively based on the provided image."""
def model_gen(model, text, images, need_bos=True, padding=False, beams=3, max_token=500):
pt1 = 0
embeds = []
im_mask = []
images = [images]
images_loc = [0]
for i, pts in enumerate(images_loc + [len(text)]):
subtext = text[pt1:pts]
if need_bos or len(subtext) > 0:
text_embeds = model.encode_text(subtext, add_special_tokens=need_bos)
embeds.append(text_embeds)
im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
need_bos = False
if i < len(images):
try:
image = Image.open(images[i]).convert('RGB')
except:
image = images[i].convert('RGB')
image = HD_transform(image, im_num=model.hd_num)
image = model.vis_processor(image).unsqueeze(0).cuda()
image_embeds = model.encode_img(image)
embeds.append(image_embeds)
im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
pt1 = pts
embeds = torch.cat(embeds, dim=1)
im_mask = torch.cat(im_mask, dim=1)
im_mask = im_mask.bool()
outputs = model.generate(inputs_embeds=embeds, im_mask=im_mask,
temperature=1.0, max_new_tokens=max_token, num_beams=beams,
do_sample=False, repetition_penalty=1.0)
output_token = outputs[0]
if output_token[0] == 0 or output_token[0] == 1:
output_token = output_token[1:]
output_text = model.tokenizer.decode(output_token, add_special_tokens=False)
output_text = output_text.split('[UNUSED_TOKEN_145]')[0].strip()
return output_text
class XComposer2_4KHD(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='internlm/internlm-xcomposer2-4khd-7b', **kwargs):
assert model_path is not None
self.model_path = model_path
model = AutoModel.from_pretrained(self.model_path, device_map='cpu', trust_remote_code=True).cuda().eval()
model.half()
tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.model.tok_embeddings.weight.device
self.model.hd_num = 25
def generate_mme(self, image_path, text):
text = text.split('Please answer')[0].strip()
text = f'{text} Answer this question briefly'
text = f'[UNUSED_TOKEN_146]user\n{text}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
return model_gen(self.model, text, image_path, need_bos=True, padding=True, beams=5)
def generate_multichoice(self, image_path, text, dataset):
out = model_gen(self.model, text, image_path, need_bos=True, padding=False, beams=5, max_token=5)
if 'mmmu' in dataset.lower():
return out
res = pattern.findall(out)
if len(res) == 0:
print('Error:', out)
res = 'Z'
return res[0]
def generate_vqa(self, image_path, text):
out = model_gen(self.model, text, image_path, need_bos=True, max_token=100)
return out
def generate_vanilla(self, image_path, text):
out = model_gen(self.model, text, image_path, need_bos=True, max_token=500)
return out
def generate_brief(self, image_path, text):
text = '[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{}\
[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'.format(text)
out = model_gen(self.model, text, image_path, need_bos=True, max_token=10)
return out
def generate(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if listinstr(['docvqa_test', 'infovqa_test'], dataset.lower()):
self.model.hd_num = 65
elif listinstr(['docvqa_val', 'infovqa_val', 'ocrbench'], dataset.lower()):
self.model.hd_num = 55
elif listinstr(['mmlongbench_doc'], dataset.lower()):
self.model.hd_num = 45
elif listinstr(['mmmu', 'mmbench', 'mmvet'], dataset.lower()):
self.model.hd_num = 16
else:
self.model.hd_num = 25
with torch.cuda.amp.autocast():
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if dataset == 'MME':
return self.generate_mme(image_path, prompt)
elif listinstr(['hallu'], dataset.lower()):
return self.generate_brief(image_path, prompt)
elif listinstr(['llava', 'mmvet'], dataset.lower()):
return self.generate_vanilla(image_path, prompt)
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
return self.generate_multichoice(image_path, prompt, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
return self.generate_vqa(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
return True
return False
def build_mcqa(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
img_prompt = '[UNUSED_TOKEN_146]user\n'
if len(options):
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item} '
options_prompt = options_prompt.strip()
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Question: ' + question + '\nContext: ' + context + '\nOptions: ' + options_prompt
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is'
prompt = img_prompt + mid_prompt + ans_prompt
else:
mid_prompt = f'Answer the question using a single word or phrase.{question}'
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
prompt = img_prompt + mid_prompt + ans_prompt
return prompt
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_mcqa(line)
elif DATASET_TYPE(dataset) == 'VQA':
if 'mathvista' in dataset.lower():
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
elif listinstr(['llava', 'mmvet'], dataset.lower()):
q = line['question']
prompt = '[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n{}\
Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]\
assistant\n'.format(meta_instruction, q)
elif listinstr(['mmlongbench_doc'], dataset.lower()):
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
else:
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.\
{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
ret = [dict(type='text', value=prompt)]
ret.extend([dict(type='image', value=s) for s in tgt_path])
return ret
import re
import numpy as np
import torch
import torchvision.transforms as transforms
from PIL import Image, ImageDraw, ImageFont
from transformers import AutoModel, AutoTokenizer
from ...dataset import DATASET_TYPE
from ...smp import *
from ..base import BaseModel
pattern = re.compile(r'[A-Z]')
conv_pattern = '\\[UNUSED_TOKEN_146\\]user\\\n|\\[UNUSED_TOKEN_146\\]assistant\\\n|\\[UNUSED_TOKEN_145\\]'
def get_font():
try:
truetype_url = "http://opencompass.openxlab.space/utils/Fonts/SimHei.ttf"
ff = urlopen(truetype_url)
# ff = '/fs-computility/mllm/shared/dongxiaoyi/share_data/SimHei.ttf'
font = ImageFont.truetype(ff, size=40)
except Exception as e:
logging.warning(f'{type(e)}: {e}')
logging.warning("Fail to download the font. Use the default one.")
font = ImageFont.load_default(size=40)
return font
def padding_560(b):
width, height = b.size
tar = int(np.ceil(height / 560) * 560)
top_padding = int((tar - height) / 2)
bottom_padding = tar - height - top_padding
left_padding = 0
right_padding = 0
b = transforms.functional.pad(
b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255, 255, 255])
return b
def Identity_transform(img, hd_num=25):
width, height = img.size
trans = False
if width < height:
img = img.transpose(Image.TRANSPOSE)
trans = True
width, height = img.size
ratio = (width / height)
scale = 1
new_h = int(scale * 560)
new_w = int(new_h * ratio)
# print (new_h, new_w)
img = transforms.functional.resize(img, [new_h, new_w],)
img = img.transpose(Image.TRANSPOSE)
img = padding_560(img)
width, height = img.size
if not trans:
img = img.transpose(Image.TRANSPOSE)
return img
def HD_transform(img, im_num=36, id_scale=1.5):
width, height = img.size
trans = False
if width < height:
img = img.transpose(Image.TRANSPOSE)
trans = True
width, height = img.size
ratio = (width / height)
scale = 1
while scale * np.ceil(scale / ratio) <= im_num:
scale += 1
scale -= 1
scale = min(np.ceil(width * id_scale / 560), scale)
new_w = int(scale * 560)
new_h = int(new_w / ratio)
img = transforms.functional.resize(img, [new_h, new_w],)
img = padding_560(img)
width, height = img.size
assert width * height <= im_num * 560 * 560
if trans:
img = img.transpose(Image.TRANSPOSE)
return img
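# Added note: img_process below stitches a list of video frames into one canvas. Each frame
# is resized so its longer side is 1120 px, the frames are stacked vertically (landscape
# input) or horizontally (portrait input), and every frame is labelled '<IMAGE idx>' with a
# separator line, so a whole clip can be passed to the model as a single image.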
def img_process(imgs):
new_imgs = []
for img in imgs:
w, h = img.size
scale = w / h
if w > h:
new_w = 560 * 2
new_h = int(560 * 2 / scale)
else:
new_w = int(560 * 2 * scale)
new_h = 560 * 2
img = transforms.functional.resize(img, [new_h, new_w],)
new_imgs.append(img)
imgs = new_imgs
new_w = 0
new_h = 0
pad = 40
if w > h:
for im in imgs:
w,h = im.size
new_w = max(new_w, w)
new_h += h + 10 + pad
font = get_font()
new_img = Image.new('RGB', (new_w, new_h), 'white')
draw = ImageDraw.Draw(new_img)
curr_h = 0
for idx, im in enumerate(imgs):
w,h = im.size
new_img.paste(im, (0, pad + curr_h))
draw.text((0, curr_h), f'<IMAGE {idx}>', font=font, fill='black')
if idx + 1 < len(imgs):
draw.line([(0, pad + curr_h + h + 5), (new_w, pad + curr_h + h + 5)], fill='black', width=2)
curr_h += h + 10 + pad
# print (new_w, new_h)
else:
for im in imgs:
w,h = im.size
new_w += w + 10
new_h = max(new_h, h)
new_h += pad
font = get_font()
new_img = Image.new('RGB', (new_w, new_h), 'white')
draw = ImageDraw.Draw(new_img)
curr_w = 0
for idx, im in enumerate(imgs):
w,h = im.size
new_img.paste(im, (curr_w, pad))
draw.text((curr_w, 0), f'<IMAGE {idx}>', font=font, fill='black')
if idx + 1 < len(imgs):
draw.line([(curr_w + w + 5, 0), (curr_w + w + 5, new_h)], fill='black', width=2)
curr_w += w + 10
return new_img
meta_instruction = """You are an AI assistant whose name is InternLM (书生·浦语).\n" + "- InternLM (书生·浦语) \
is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室).
It is designed to be helpful, honest, and harmless.\n"+"- InternLM (书生·浦语) \
can understand and communicate fluently in the language chosen by the user such as English and 中文."""
def model_gen(model, text, images, need_bos=True, padding=False, beams=3, max_token=500, video_input=False):
embeds = []
im_mask = []
# print(text)
im_idx = 0
sub_q = text.split('<IM_POS>')
add_im = len(sub_q) - 1
for subtext in sub_q:
if need_bos or len(subtext) > 0:
text_embeds = model.encode_text(
subtext, add_special_tokens=need_bos)
embeds.append(text_embeds)
im_mask.append(torch.zeros(text_embeds.shape[:2]).to(model.device))
need_bos = False
if im_idx < len(images) and add_im:
image = images[im_idx]
if video_input:
image = Identity_transform(image)
else:
if len(images) > 1:
image = HD_transform(image, im_num=model.hd_num // len(images), id_scale=model.id_scale)
else:
image = HD_transform(
image, im_num=model.hd_num, id_scale=model.id_scale)
# print(image.size)
image = model.vis_processor(image).unsqueeze(0).to(model.device)
image_embeds = model.encode_img(image)
im_idx += 1
add_im -= 1
embeds.append(image_embeds)
im_mask.append(torch.ones(
image_embeds.shape[:2], dtype=torch.long).to(model.device))
embeds = torch.cat(embeds, dim=1)
im_mask = torch.cat(im_mask, dim=1)
im_mask = im_mask.bool()
outputs = model.generate(inputs_embeds=embeds, im_mask=im_mask,
temperature=1.0, max_new_tokens=max_token, num_beams=beams,
do_sample=False, repetition_penalty=1.0)
output_token = outputs[0]
if output_token[0] == 0 or output_token[0] == 1:
output_token = output_token[1:]
output_text = model.tokenizer.decode(output_token, add_special_tokens=False)
output_text = output_text.split('[UNUSED_TOKEN_145]')[0].strip().split('<|im_end|>')[0].strip().split('The answer is')[-1].strip() # noqa
# print(output_text)
return output_text
class XComposer2d5(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='internlm/internlm-xcomposer2d5-7b', id_scale=1.5, beam=3, **kwargs):
assert model_path is not None
self.model_path = model_path
self.id_scale = id_scale
self.beam = beam
model = AutoModel.from_pretrained(
self.model_path, device_map='cpu', trust_remote_code=True, local_files_only=True).cuda().eval()
model.half()
tokenizer = AutoTokenizer.from_pretrained(
self.model_path, trust_remote_code=True)
model.tokenizer = tokenizer
self.model = model
self.device = self.model.model.tok_embeddings.weight.device
self.model.hd_num = 36
self.model.id_scale = self.id_scale
def message_to_promptimg(self, message, dataset=None, video_input=False):
num_images = len([x for x in message if x['type'] == 'image'])
if num_images == 0:
prompt = '\n'.join([x['value']
for x in message if x['type'] == 'text'])
image = None
else:
image = [Image.open(x['value']).convert('RGB') for x in message if x['type'] == 'image']
if video_input:
im_prompt = '<IM_POS>Here are some frames of a video.'
if len(image) > 64:
step = len(image) / 64
image = [image[int(i * step)] for i in range(64)]
image = [img_process(image)]
else:
if len(image) > 1:
im_prompt = ' '.join([
f'Image{im_idx+1}: <IM_POS>;' for im_idx in range(len(image))])
else:
im_prompt = '<IM_POS>'
prompt = ''
for x in message:
if x['type'] == 'text' and x.get('role', '') != 'system':
prompt += x['value']
sp = [i for i in re.split(conv_pattern, prompt) if i != '' and i != '\n']
assert len(sp) <= 2
q = sp[0]
prompt = f'[UNUSED_TOKEN_146]user\n{im_prompt}{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
for idx in range(10):
idx = chr(65 + idx)
prompt = prompt.replace(f'({idx})', f'{idx}.')
return prompt, image
def generate_mme(self, image_path, text):
text = text.split('Please answer')[0].strip()
text = f'{text} Answer this question briefly'
text = f'[UNUSED_TOKEN_146]user\n{text}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
return model_gen(self.model, text, image_path, need_bos=True, padding=True, beams=self.beam)
def generate_multichoice(self, image_path, text, dataset):
out = model_gen(self.model, text, image_path,
need_bos=True, padding=False, beams=self.beam, max_token=5)
if 'mmmu' in dataset.lower():
return out
res = pattern.findall(out)
if len(res) == 0:
print('Error:', out)
res = 'Z'
return res[0]
def generate_vqa(self, image_path, text):
out = model_gen(self.model, text, image_path, beams=self.beam,
need_bos=True, max_token=100)
return out
def generate_vanilla(self, image_path, text):
out = model_gen(self.model, text, image_path, beams=self.beam,
need_bos=True, max_token=500)
return out
def generate_brief(self, image_path, text):
text = '[UNUSED_TOKEN_146]user\nAnswer the question using a single word or phrase.{}\
[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'.format(text)
out = model_gen(self.model, text, image_path, beams=self.beam,
need_bos=True, max_token=10)
return out
def generate_video(self, image_path, text):
out = model_gen(
self.model, text, image_path, beams=1, # self.beam,
need_bos=True, max_token=100, video_input=True)
return out
def set_max_num(self, dataset):
if dataset is not None and listinstr(['MME-RealWorld', 'MME-RealWorld-CN'], dataset):
self.model.hd_num = 25
def generate_inner(self, message, dataset=None):
self.set_max_num(dataset)
with torch.cuda.amp.autocast():
if dataset is None:
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if listinstr(['video', 'mvbench'], dataset.lower()):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset, video_input=True)
return self.generate_video(image_path, prompt)
else:
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset == 'MME':
return self.generate_mme(image_path, prompt)
elif listinstr(['hallu', 'pope'], dataset.lower()):
return self.generate_brief(image_path, prompt)
elif listinstr(['llava', 'mmvet'], dataset.lower()):
return self.generate_vanilla(image_path, prompt)
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
return self.generate_multichoice(image_path, prompt, dataset)
elif listinstr(['MME-RealWorld', 'MME-RealWorld-CN'], dataset):
return self.generate_multichoice(image_path, prompt, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
return self.generate_vqa(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
return True
return False
def build_mcqa(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
img_prompt = '[UNUSED_TOKEN_146]user\n'
if len(options):
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item} '
options_prompt = options_prompt.strip()
hint = line['hint'] if (
'hint' in line and not pd.isna(line['hint'])) else None
context = 'N/A' if hint is None else hint
mid_prompt = 'Question: ' + question + '\nContext: ' + \
context + '\nOptions: ' + options_prompt
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\nThe answer is'
prompt = img_prompt + mid_prompt + ans_prompt
else:
mid_prompt = f'Answer the question using a single word or phrase.{question}'
ans_prompt = '[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
prompt = img_prompt + mid_prompt + ans_prompt
return prompt
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_mcqa(line)
elif DATASET_TYPE(dataset) == 'VQA':
if 'mathvista' in dataset.lower():
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
elif listinstr(['llava', 'mmvet'], dataset.lower()):
q = line['question']
prompt = '[UNUSED_TOKEN_146]system\n{}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]user\n{}\
Answer this question in detail.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]\
assistant\n'.format(meta_instruction, q)
elif listinstr(['mmlongbench_doc', 'dude', 'slidevqa'], dataset.lower()):
q = line['question']
prompt = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
else:
q = line['question']
prefix = 'Answer the question using a single word or phrase.'
prompt = f'[UNUSED_TOKEN_146]user\n{prefix}{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
ret = [dict(type='text', value=prompt)]
ret.extend([dict(type='image', value=s) for s in tgt_path])
return ret
from PIL import Image
import torch
from .base import BaseModel
from ..smp import *
class XGenMM(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5', **kwargs):
try:
from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoImageProcessor
except Exception as err:
logging.critical('Please install the latest version of transformers.')
raise err
model = AutoModelForVision2Seq.from_pretrained(
model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto'
).eval()
tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, use_fast=False, legacy=False
)
tokenizer = model.update_special_tokens(tokenizer)
tokenizer.eos_token = '<|end|>'
tokenizer.padding_side = 'left'
image_processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code=True)
self.model = model
self.image_processor = image_processor
self.tokenizer = tokenizer
self.kwargs = kwargs
def apply_prompt_template(self, query):
s = (
'<|system|>\nA chat between a curious user and an artificial intelligence assistant. '
"The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n"
f'<|user|>\n{query}<|end|>\n<|assistant|>\n'
)
return s
def generate_inner(self, message, dataset=None):
content, images, image_sizes = '', [], []
for msg in message:
if msg['type'] == 'text':
content += msg['value']
elif msg['type'] == 'image':
image = Image.open(msg['value']).convert('RGB')
images.append(self.image_processor([image], image_aspect_ratio='anyres')['pixel_values'].to('cuda'))
image_sizes.append(image.size)
content += '<image> '
inputs = {'pixel_values': [images]}
prompt = self.apply_prompt_template(content)
language_inputs = self.tokenizer([prompt], return_tensors='pt').to('cuda')
inputs.update(language_inputs)
generation_args = {
'max_new_tokens': 1024,
'temperature': 0.0,
'do_sample': False,
'top_p': None,
'num_beams': 1
}
generation_args.update(self.kwargs)
generate_ids = self.model.generate(
**inputs, image_size=[image_sizes],
pad_token_id=self.tokenizer.pad_token_id,
eos_token_id=self.tokenizer.eos_token_id,
**generation_args
)
# decode the generated ids and truncate at the end-of-turn marker
response = self.tokenizer.decode(generate_ids[0], skip_special_tokens=True).split('<|end|>')[0]
return response
import torch
import sys
import os.path as osp
import warnings
from PIL import Image
from vlmeval.smp import get_cache_path, load, dump, splitlen
from huggingface_hub import snapshot_download
from .base import BaseModel
"""
You can run inference with Yi-VL through the following steps:
1. clone the repo https://github.com/01-ai/Yi to path-to-Yi
2. set up the environment and install the required packages in path-to-Yi/VL/requirements.txt
3. set Yi_ROOT in vlmeval/config.py
Yi_ROOT = path-to-Yi
You are all set now! To run a demo for Yi-VL:
```python
from vlmeval import *
model = supported_VLM['Yi_VL_6B']()
model.generate('apple.jpg', 'What is in this image?')
```
To run evaluation for Yi-VL, use `python run.py --model Yi_VL_6B --data {dataset_list}`
"""
def edit_config(repo_id):
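# Rewrite the relative './vit/...' mm_vision_tower entry in config.json to an absolute path under the local checkpoint directory.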
if not osp.exists(repo_id):
root = get_cache_path(repo_id)
else:
root = repo_id
assert root is not None and osp.exists(root)
cfg = osp.join(root, 'config.json')
data = load(cfg)
mm_vision_tower = data['mm_vision_tower']
if mm_vision_tower.startswith('./vit/'):
data['mm_vision_tower'] = osp.join(root, mm_vision_tower)
assert osp.exists(data['mm_vision_tower'])
dump(data, cfg)
def disable_torch_init():
"""
Disable the redundant torch default initialization to accelerate model creation.
"""
import torch
setattr(torch.nn.Linear, 'reset_parameters', lambda self: None)
setattr(torch.nn.LayerNorm, 'reset_parameters', lambda self: None)
class Yi_VL(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self,
model_path='01-ai/Yi-VL-6B',
root=None,
**kwargs):
if root is None:
raise ValueError(
'Please set `root` to the local directory of the Yi repository, '
'cloned from https://github.com/01-ai/Yi.'
)
self.root = osp.join(root, 'VL')
sys.path.append(self.root)
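# Download the checkpoint from the Hugging Face Hub if it is not cached locally, then patch its config.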
if splitlen(model_path, '/') == 2 and not osp.exists(model_path):
if get_cache_path(model_path) is None:
snapshot_download(repo_id=model_path)
edit_config(model_path)
elif osp.exists(model_path):
edit_config(model_path)
from llava.mm_utils import get_model_name_from_path, load_pretrained_model
from llava.model.constants import key_info
disable_torch_init()
key_info['model_path'] = model_path
get_model_name_from_path(model_path)
self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
model_path,
device_map='cpu')
self.model = self.model.cuda()
self.conv_mode = 'mm_default'
kwargs_default = dict(temperature=0.2,
num_beams=1,
do_sample=False,
max_new_tokens=1024,
top_p=None)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def generate_inner(self, message, dataset=None):
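# Build a LLaVA-style conversation prompt with the default image token, then generate an answer for the single image.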
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
from llava.conversation import conv_templates
from llava.model.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from llava.mm_utils import KeywordsStoppingCriteria, expand2square, tokenizer_image_token
qs = DEFAULT_IMAGE_TOKEN + '\n' + prompt
conv = conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = (
tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
.unsqueeze(0)
.cuda()
)
image = Image.open(image_path)
if getattr(self.model.config, 'image_aspect_ratio', None) == 'pad':
if image.mode == 'L':
background_color = int(sum([int(x * 255) for x in self.image_processor.image_mean]) / 3)
else:
background_color = tuple(int(x * 255) for x in self.image_processor.image_mean)
image = expand2square(image, background_color)
image_tensor = self.image_processor.preprocess(image, return_tensors='pt')[
'pixel_values'
][0]
stop_str = conv.sep
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)
self.model = self.model.to(dtype=torch.bfloat16)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=image_tensor.unsqueeze(0).to(dtype=torch.bfloat16).cuda(),
stopping_criteria=[stopping_criteria],
use_cache=True,
**self.kwargs)
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(
f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids'
)
outputs = self.tokenizer.batch_decode(
output_ids[:, input_token_len:], skip_special_tokens=True
)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[: -len(stop_str)]
outputs = outputs.strip()
return outputs
from transformers import AutoProcessor
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
MODEL_PATH = 'Qwen/Qwen2-VL-7B-Instruct'
def main():
# Configure multi-GPU inference
llm = LLM(
model=MODEL_PATH,
limit_mm_per_prompt={"image": 10, "video": 10},
tensor_parallel_size=4,  # set this to the number of GPUs you want to use
trust_remote_code=True,
gpu_memory_utilization=0.95,
dtype="float16",
enforce_eager=True
)
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
max_tokens=256,
stop_token_ids=[],
)
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png",
"min_pixels": 224 * 224,
"max_pixels": 1280 * 28 * 28,
},
{"type": "text", "text": "What is the text in the illustrate?"},
],
},
]
processor = AutoProcessor.from_pretrained(MODEL_PATH)
prompt = processor.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
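# process_vision_info loads the image/video referenced in the messages (applying the min_pixels/max_pixels constraints) so it can be passed to vLLM as multi-modal data.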
image_inputs, video_inputs = process_vision_info(messages)
mm_data = {}
if image_inputs is not None:
mm_data["image"] = image_inputs
if video_inputs is not None:
mm_data["video"] = video_inputs
llm_inputs = {
"prompt": prompt,
"multi_modal_data": mm_data,
}
# Run multi-GPU inference
outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
generated_text = outputs[0].outputs[0].text
print(generated_text)
if __name__ == '__main__':
main()
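# Single-process variant of the example above: same message construction and vision
# preprocessing, but default device placement and near-greedy sampling parameters.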
from transformers import AutoProcessor
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct"
llm = LLM(
model=MODEL_PATH,
limit_mm_per_prompt={"image": 10, "video": 10},
)
sampling_params = SamplingParams(
temperature=0.1,
top_p=0.001,
repetition_penalty=1.05,
max_tokens=256,
stop_token_ids=[],
)
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png",
"min_pixels": 224 * 224,
"max_pixels": 1280 * 28 * 28,
},
{"type": "text", "text": "What is the text in the illustrate?"},
],
},
]
# For video input, you can pass the following values instead:
# "type": "video",
# "video": "<video URL>",
processor = AutoProcessor.from_pretrained(MODEL_PATH)
prompt = processor.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
image_inputs, video_inputs = process_vision_info(messages)
mm_data = {}
if image_inputs is not None:
mm_data["image"] = image_inputs
if video_inputs is not None:
mm_data["video"] = video_inputs
llm_inputs = {
"prompt": prompt,
"multi_modal_data": mm_data,
}
outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
generated_text = outputs[0].outputs[0].text
print(generated_text)
# Unique model identifier
modelCode=1199
# Model name
modelName=qwen2_vl_pytorch
# Model description
modelDescription=Qwen2-VL is built on Qwen2. A major architectural improvement is full support for native dynamic resolution, so unlike the previous generation it can handle image inputs of arbitrary resolution.
# Application scenarios
appScenario=Inference,Training,Conversational QA,Research,Education,Government,Finance
# Framework type
frameType=Pytorch