Unverified commit 3a232db4 authored by Haodong Duan, committed by GitHub

[Deprecate] Remove multi-modal related stuff (#1072)



* Remove MultiModal

* update index.rst

* update README

* remove mmbench codes

* update news

---------
Co-authored-by: Leymore <zfz-960727@163.com>
parent f1ee11de
from typing import TYPE_CHECKING
from transformers.utils import (OptionalDependencyNotAvailable,
is_torch_available)
if TYPE_CHECKING:
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
from .otter import Otter
from .post_processor import OTTERMMBenchPostProcessor
from .prompt_constructor import OTTERMMBenchPromptConstructor
__all__ = [
'Otter', 'OTTERMMBenchPromptConstructor', 'OTTERMMBenchPostProcessor'
]
import importlib
import mmengine
import torch
import torch.nn as nn
from mmengine.device import get_device
from opencompass.registry import MM_MODELS
@MM_MODELS.register_module('otter-9b')
class Otter(nn.Module):
"""Inference code of OTTER.
Model details:
OTTER: a multi-modal model based on OpenFlamingo
(open-sourced version of DeepMind's Flamingo)
https://github.com/Luodian/Otter
Args:
model_path (str): The path of OTTER model
in Huggingface model hub format.
        load_bit (str): The precision to load the OTTER model in,
            either "fp32" or "bf16".
mode (str): The mode of inference. Defaults to 'generation'.
"""
def __init__(self,
model_path,
load_bit,
prompt_constructor,
post_processor,
mode='generation') -> None:
super().__init__()
torch_dtype = torch.bfloat16 if load_bit == 'bf16' else torch.float32
otter_ai = importlib.import_module('otter_ai')
self.model = otter_ai.OtterForConditionalGeneration.from_pretrained(
model_path, torch_dtype=torch_dtype, device_map=get_device())
self.tokenizer = self.model.text_tokenizer
self.tokenizer.padding_side = 'left'
self.model_dtype = next(self.model.parameters()).dtype
self.prompt_constructor = mmengine.registry.build_from_cfg(
prompt_constructor, MM_MODELS)
if post_processor is not None:
self.post_processor = mmengine.registry.build_from_cfg(
post_processor, MM_MODELS)
self.mode = mode
def forward(self, batch):
if self.mode == 'generation':
return self.generate(batch)
elif self.mode == 'loss':
return self.loss(batch)
else:
raise RuntimeError(f'Invalid mode "{self.mode}".')
def generate(self, batch):
inputs = self.prompt_constructor(batch)
image = inputs['image']
prompt = inputs['prompt']
data_samples = inputs['data_samples']
vision_x = image.unsqueeze(1).unsqueeze(0).to(dtype=self.model_dtype)
lang_x = self.model.text_tokenizer([prompt], return_tensors='pt')
bad_words_id = self.model.text_tokenizer(['User:', 'GPT:']).input_ids
generated_text = self.model.generate(
vision_x=vision_x.to(self.model.device),
lang_x=lang_x['input_ids'].to(self.model.device),
attention_mask=lang_x['attention_mask'].to(self.model.device),
do_sample=False,
max_new_tokens=512,
num_beams=3,
bad_words_ids=bad_words_id,
no_repeat_ngram_size=3,
)
for i, data_sample in enumerate(data_samples):
output_text = self.post_processor(generated_text[i],
self.model.text_tokenizer)
data_sample.pred_answer = output_text
data_samples[i] = data_sample
return data_samples
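# A minimal sketch of the kind of config dict that MM_MODELS would consume to
# build the wrapper above. The checkpoint path is a placeholder, and the
# prompt-constructor / post-processor entries are sketched with string type
# names; the original configs may have passed the classes directly instead.
example_otter_cfg = dict(
    type='otter-9b',  # name registered via @MM_MODELS.register_module above
    model_path='path/to/OTTER-Image-MPT7B',  # placeholder HF-format checkpoint
    load_bit='bf16',  # 'bf16' or 'fp32', per the docstring
    prompt_constructor=dict(type='OTTERMMBenchPromptConstructor'),
    post_processor=dict(type='OTTERMMBenchPostProcessor'),
    mode='generation',
)
# Building would then look like: model = MM_MODELS.build(example_otter_cfg)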
import random
import re
import torch
class OTTERMMBenchPostProcessor:
""""Post processor for OTTER on MMBench."""
def __init__(self) -> None:
pass
def __call__(self, output_token: torch.tensor, tokenizer) -> str:
if output_token[0] == 0:
output_token = output_token[1:]
if output_token[0] == 1:
output_token = output_token[1:]
output_text = tokenizer.decode(output_token,
add_special_tokens=False) # noqa
output_text = self._extract_key_words(output_text)
return output_text
def _extract_key_words(self, output_text: str) -> str:
output_text = (output_text.split('<answer>')[-1].lstrip().rstrip().
split('<|endofchunk|>')[0].lstrip().rstrip())
pattern = re.compile(r'([A-Z]\.)')
res = pattern.findall(output_text)
if len(res) > 0:
output_text = res[0][:-1]
return output_text
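# Quick standalone check of the answer-letter extraction above: the regex
# keeps the first capital letter written as "X." between the <answer> tag and
# <|endofchunk|>; otherwise the cleaned text is returned unchanged. The
# example strings below are made up.
import re

def _demo_extract(output_text: str) -> str:
    output_text = (output_text.split('<answer>')[-1].strip().
                   split('<|endofchunk|>')[0].strip())
    res = re.findall(r'([A-Z]\.)', output_text)
    return res[0][:-1] if res else output_text

assert _demo_extract('... <answer> B. a red car <|endofchunk|>') == 'B'
assert _demo_extract('<answer> a red car') == 'a red car'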
class OTTERCOCOCaptionPostProcessor:
""""Post processor for OTTER on COCO Caption."""
def __init__(self) -> None:
pass
def __call__(self, output_token: torch.tensor, tokenizer) -> str:
if output_token[0] == 0:
output_token = output_token[1:]
if output_token[0] == 1:
output_token = output_token[1:]
output_text = tokenizer.decode(output_token,
add_special_tokens=False) # noqa
output_text = (output_text.split('<answer>')[-1].lstrip().rstrip().
split('<|endofchunk|>')[0].lstrip().rstrip())
pattern = re.compile(r'([A-Z]\.)')
res = pattern.findall(output_text)
if len(res) > 0:
output_text = res[0][:-1]
return output_text
class OTTERScienceQAPostProcessor:
""""Post processor for OTTER on ScienceQA."""
def __init__(self) -> None:
pass
def __call__(self, output_token: torch.tensor, tokenizer) -> str:
if output_token[0] == 0:
output_token = output_token[1:]
if output_token[0] == 1:
output_token = output_token[1:]
output_text = tokenizer.decode(output_token,
add_special_tokens=False) # noqa
output_text = (output_text.split('<answer>')[-1].lstrip().rstrip().
split('<|endofchunk|>')[0].lstrip().rstrip())
pattern = re.compile(r'\(([A-Z])\)')
output_text = pattern.findall(output_text)
if len(output_text) == 0:
output_text = random.choice(['A', 'B', 'C', 'D'])
else:
output_text = output_text[0]
return output_text
class OTTERVQAPostProcessor:
""""Post processor for OTTER on VQA."""
def __init__(self) -> None:
pass
def __call__(self, output_token: torch.tensor, tokenizer) -> str:
if output_token[0] == 0:
output_token = output_token[1:]
if output_token[0] == 1:
output_token = output_token[1:]
output_text = tokenizer.decode(output_token,
add_special_tokens=False) # noqa
output_text = (output_text.split('<answer>')[-1].lstrip().rstrip().
split('<|endofchunk|>')[0].lstrip().rstrip())
return output_text
class OTTERVSRPostProcessor:
""""Post processor for OTTER on VSR."""
def __init__(self) -> None:
pass
def __call__(self, output_token: torch.tensor, tokenizer) -> str:
if output_token[0] == 0:
output_token = output_token[1:]
if output_token[0] == 1:
output_token = output_token[1:]
output_text = tokenizer.decode(output_token, add_special_tokens=False)
pattern = r'yes|no|Yes|No'
output_text = re.findall(pattern, output_text)
if len(output_text) > 0:
output_text = output_text[0].lower()
return output_text
class OTTERMMEPostProcessor(OTTERMMBenchPostProcessor):
""""Post processor for OTTER on MME."""
def __init__(self) -> None:
super().__init__()
def __call__(self, output_token: torch.tensor, tokenizer) -> str:
response = super().__call__(output_token, tokenizer)
# extract yes or no, copy from MME official evaluation script
prefix_pred_ans = response[:4].lower()
if 'yes' in prefix_pred_ans:
pred_label = 'yes'
elif 'no' in prefix_pred_ans:
pred_label = 'no'
else:
pred_label = 'other'
return pred_label
from typing import List
import torch
from mmpretrain.structures import DataSample
class OTTERMMBenchPromptConstructor:
"""Prompt constructor for OTTER on MMBench.
Args:
        user_label (str): Label prepended to the user turn. Defaults to `''`.
        model_label (str): Label prepended to the model turn. Defaults to `''`.
"""
def __init__(self, user_label: str = '', model_label: str = '') -> None:
self.image_token = '<image>'
self.reply_token = '<answer>'
self.user_label = user_label
self.model_label = model_label
def __call__(self, inputs: dict) -> dict:
"""Construct prompt.
Args:
inputs (dict): Input data containing image and data_samples.
Returns:
dict: A dict containing prompt, images and data_samples.
"""
images = [image.unsqueeze(0) for image in inputs['inputs']]
data_samples = [data_sample for data_sample in inputs['data_samples']]
images = torch.cat(images, dim=0)
inputs = {'image': images, 'data_samples': data_samples}
data_samples = inputs['data_samples']
prompt = self._process(data_samples)
inputs.update({'prompt': prompt})
return inputs
def _process(self, data_samples: List[DataSample]) -> str:
"""Process data sample to prompt.
Args:
data_samples (List[DataSample]): A list of data_samples.
Returns:
str: Prompt.
"""
assert len(data_samples) == 1, 'Only support batch size 1.'
data_sample = data_samples[0]
question = data_sample.get('question')
options = data_sample.get('options')
context = data_sample.get('context')
# e.g. <image>User: What is the color of the sky? A: Blue B: Red C: Green D: Yellow GPT:<answer> # noqa
if context is not None:
prompt = f'{self.image_token}{self.user_label} {context} {question} {options} {self.model_label}:{self.reply_token}' # noqa
else:
prompt = f'{self.image_token}{self.user_label} {question} {options} {self.model_label}:{self.reply_token}' # noqa
return prompt
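# Illustration only: with user_label='User:' and model_label='GPT' (made-up
# values, the defaults are empty strings), _process above yields a prompt
# shaped like the inline comment in the code:
_q = 'What is the color of the sky?'
_opts = 'A: Blue B: Red C: Green D: Yellow'
_example = f'<image>User: {_q} {_opts} GPT:<answer>'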
class OTTERCOCOCaotionPromptConstructor(OTTERMMBenchPromptConstructor):
"""Prompt constructor for OTTER on COCO Caption."""
def _process(self, data_samples: List[DataSample]) -> str:
# e.g. <image>User: a photo of GPT:<answer> # noqa
prompt = f'{self.image_token}{self.user_label} a photo of {self.model_label}:{self.reply_token}' # noqa
return prompt
class OTTERScienceQAPromptConstructor(OTTERMMBenchPromptConstructor):
"""Prompt constructor for OTTER on ScienceQA."""
choice_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F'}
def _process(self, data_samples: List[DataSample]) -> str:
assert len(data_samples) == 1, 'Only support batch size 1.'
questions = [
'Question: ' + data_sample.get('question') + '\n'
for data_sample in data_samples
] # noqa
choices = [data_sample.get('choices') for data_sample in data_samples]
choices = [[
f'({self.choice_mapping[i]}) ' + item
for i, item in enumerate(choice)
] for choice in choices]
choices = [
'Choices: ' + ' '.join(choice) + '\n' for choice in choices
] # noqa
contexts = [
'Context: ' + data_sample.get('hint') + '\n'
for data_sample in data_samples
] # noqa
question = questions[0]
choice = choices[0]
context = contexts[0]
prompt = f'{self.image_token}{self.user_label} {context} {question} {choice} The answer is {self.model_label}:{self.reply_token}' # noqa
return prompt
class OTTERVQAPromptConstructor(OTTERMMBenchPromptConstructor):
"""Prompt constructor for OTTER on VQA."""
def _process(self, data_samples: List[DataSample]) -> str:
assert len(data_samples) == 1, 'Only support batch size 1.'
questions = [
data_sample.get('question') for data_sample in data_samples
]
question = questions[0]
        prompt = f'{self.image_token}{self.user_label} {question}. Answer it with a few words. {self.model_label}:{self.reply_token}'  # noqa
return prompt
class OTTERVSRPromptConstructor(OTTERMMBenchPromptConstructor):
"""Prompt constructor for OTTER on VSR."""
def _process(self, data_samples: List[DataSample]) -> str:
assert len(data_samples) == 1, 'Only support batch size 1.'
questions = [
data_sample.get('question') for data_sample in data_samples
]
question = questions[0]
prompt = f'{self.image_token}{self.user_label} {question}. Is the above description correct? Answer yes or no. {self.model_label}:{self.reply_token}' # noqa
return prompt
class OTTERSEEDBenchPromptConstructor(OTTERMMBenchPromptConstructor):
def _process(self, data_samples: List[DataSample]) -> str:
"""Process data sample to prompt.
Args:
data_samples (List[DataSample]): A list of data_samples.
Returns:
str: Prompt.
"""
assert len(data_samples) == 1, 'Only support batch size 1.'
questions = [
data_sample.get('question') for data_sample in data_samples
]
question = questions[0]
prompt = f'{self.image_token}{self.user_label} {question} {self.model_label}:{self.reply_token}' # noqa
return prompt
class OTTERMMEPromptConstructor(OTTERMMBenchPromptConstructor):
"""Prompt constructor for OTTER on MME.
Args:
        user_label (str): Label prepended to the user turn. Defaults to `''`.
        model_label (str): Label prepended to the model turn. Defaults to `''`.
"""
def _process(self, data_samples: List[DataSample]) -> str:
"""Process data sample to prompt.
Args:
data_samples (List[DataSample]): A list of data_samples.
Returns:
str: Prompt.
"""
assert len(data_samples) == 1, 'Only support batch size 1.'
question = data_samples[0].get('question')
prompt = f'{self.image_token}{self.user_label} {question} {self.model_label}:{self.reply_token}' # noqa
return prompt
from .post_processor import QwenVLBasePostProcessor, QwenVLChatVSRPostProcessor
from .prompt_constructor import (QwenVLChatPromptConstructor,
QwenVLChatScienceQAPromptConstructor,
QwenVLChatVQAPromptConstructor,
QwenVLMMBenchPromptConstructor)
from .qwen import QwenVLBase, QwenVLChat
__all__ = [
'QwenVLBase', 'QwenVLChat', 'QwenVLBasePostProcessor',
'QwenVLMMBenchPromptConstructor', 'QwenVLChatPromptConstructor',
'QwenVLChatVQAPromptConstructor', 'QwenVLChatVSRPostProcessor',
'QwenVLChatScienceQAPromptConstructor'
]
# Copyright (c) Alibaba Cloud.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Generation support."""
from typing import List, Tuple, Union
import torch
from transformers import PreTrainedTokenizer
# Types.
HistoryType = List[Tuple[str, str]]
TokensType = List[int]
BatchTokensType = List[List[int]]
def pad_batch(batch: BatchTokensType, pad_id: int,
seq_length: int) -> BatchTokensType:
for tokens in batch:
context_length = len(tokens)
if context_length < seq_length:
tokens.extend([pad_id] * (seq_length - context_length))
return batch
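# Tiny usage check for pad_batch (arbitrary values): each token list is
# right-padded in place to seq_length with pad_id and the batch is returned.
assert pad_batch([[1, 2, 3], [4]], pad_id=0, seq_length=5) == \
    [[1, 2, 3, 0, 0], [4, 0, 0, 0, 0]]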
def get_ltor_masks_and_position_ids(
data: torch.Tensor,
eod_token: int,
reset_position_ids: bool,
reset_attention_mask: bool,
eod_mask_loss: bool,
):
"""Build masks and position id for left to right model."""
# Extract batch size and sequence length.
micro_batch_size, seq_length = data.size()
# Attention mask (lower triangular).
if reset_attention_mask:
att_mask_batch = micro_batch_size
else:
att_mask_batch = 1
attention_mask = torch.tril(
torch.ones((att_mask_batch, seq_length, seq_length),
device=data.device)).view(att_mask_batch, 1, seq_length,
seq_length)
# Loss mask.
loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
if eod_mask_loss:
loss_mask[data == eod_token] = 0.0
# Position ids.
position_ids = torch.arange(seq_length,
dtype=torch.long,
device=data.device)
position_ids = position_ids.unsqueeze(0).expand_as(data)
# We need to clone as the ids will be modified based on batch index.
if reset_position_ids:
position_ids = position_ids.clone()
if reset_position_ids or reset_attention_mask:
# Loop through the batches:
for b in range(micro_batch_size):
# Find indices where EOD token is.
eod_index = position_ids[b, data[b] == eod_token]
# Detach indices from positions if going to modify positions.
if reset_position_ids:
eod_index = eod_index.clone()
# Loop through EOD indices:
prev_index = 0
for j in range(eod_index.size()[0]):
i = eod_index[j]
# Mask attention loss.
if reset_attention_mask:
attention_mask[b, 0, (i + 1):, :(i + 1)] = 0
# Reset positions.
if reset_position_ids:
position_ids[b, (i + 1):] -= i + 1 - prev_index
prev_index = i + 1
# Convert attention mask to binary:
attention_mask = attention_mask < 0.5
return attention_mask, loss_mask, position_ids
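# Shape sketch for get_ltor_masks_and_position_ids with all flags False, as
# get_batch below uses it (toy tensor; eod_token=0 is arbitrary here): the
# returned attention mask is (1, 1, seq, seq) and boolean (True marks blocked
# positions after the `< 0.5` inversion), the loss mask is all ones with the
# data's shape, and the position ids are a broadcast arange.
_toy = torch.arange(8).view(2, 4)
_mask, _loss, _pos = get_ltor_masks_and_position_ids(
    _toy, eod_token=0, reset_position_ids=False,
    reset_attention_mask=False, eod_mask_loss=False)
assert _mask.shape == (1, 1, 4, 4)
assert _loss.shape == (2, 4) and _pos.shape == (2, 4)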
def get_batch(context_tokens: torch.LongTensor, eod_id: int):
"""Generate batch from context tokens."""
# Move to GPU.
tokens = context_tokens.contiguous().to(context_tokens.device)
# Get the attention mask and position ids.
attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
tokens,
eod_id,
reset_position_ids=False,
reset_attention_mask=False,
eod_mask_loss=False,
)
return tokens, attention_mask, position_ids
def get_stop_words_ids(chat_format: str, tokenizer: PreTrainedTokenizer):
if chat_format == 'raw':
stop_words_ids = [tokenizer.encode('Human:'), [tokenizer.eod_id]]
elif chat_format == 'chatml':
stop_words_ids = [[tokenizer.im_end_id], [tokenizer.im_start_id]]
else:
raise NotImplementedError(f'Unknown chat format {chat_format!r}')
return stop_words_ids
def make_context(
tokenizer: PreTrainedTokenizer,
query: str,
history: List[Tuple[str, str]] = None,
system: str = '',
max_window_size: int = 6144,
chat_format: str = 'chatml',
):
if history is None:
history = []
if chat_format == 'chatml':
im_start, im_end = '<|im_start|>', '<|im_end|>'
im_start_tokens = [tokenizer.im_start_id]
im_end_tokens = [tokenizer.im_end_id]
nl_tokens = tokenizer.encode('\n')
def _tokenize_str(role, content):
return f'{role}\n{content}', tokenizer.encode(
role, allowed_special=set(
tokenizer.IMAGE_ST)) + nl_tokens + tokenizer.encode(
content, allowed_special=set(tokenizer.IMAGE_ST))
system_text, system_tokens_part = _tokenize_str('system', system)
system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
raw_text = ''
context_tokens = []
for turn_query, turn_response in reversed(history):
query_text, query_tokens_part = _tokenize_str('user', turn_query)
query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
if turn_response is not None:
response_text, response_tokens_part = _tokenize_str(
'assistant', turn_response)
response_tokens = im_start_tokens + response_tokens_part + im_end_tokens # noqa
next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens # noqa
prev_chat = (
f'\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}' # noqa
)
else:
next_context_tokens = nl_tokens + query_tokens + nl_tokens
prev_chat = f'\n{im_start}{query_text}{im_end}\n'
current_context_size = (len(system_tokens) +
len(next_context_tokens) +
len(context_tokens))
if current_context_size < max_window_size:
context_tokens = next_context_tokens + context_tokens
raw_text = prev_chat + raw_text
else:
break
context_tokens = system_tokens + context_tokens
raw_text = f'{im_start}{system_text}{im_end}' + raw_text
context_tokens += (nl_tokens + im_start_tokens +
_tokenize_str('user', query)[1] + im_end_tokens +
nl_tokens + im_start_tokens +
tokenizer.encode('assistant') + nl_tokens)
raw_text += f'\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n'
elif chat_format == 'raw':
raw_text = query
context_tokens = tokenizer.encode(raw_text)
else:
raise NotImplementedError(f'Unknown chat format {chat_format!r}')
return raw_text, context_tokens
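# Tokenisation aside, the ChatML text that make_context returns as raw_text
# (with empty history) has the layout below; the system and user strings here
# are placeholders and no Qwen tokenizer is required for this illustration.
_im_start, _im_end = '<|im_start|>', '<|im_end|>'
_example_raw_text = (f'{_im_start}system\nYou are a helpful assistant.{_im_end}'
                     f'\n{_im_start}user\nDescribe the image.{_im_end}'
                     f'\n{_im_start}assistant\n')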
def _decode_default(
tokens: List[int],
*,
stop_words: List[str],
eod_words: List[str],
tokenizer: PreTrainedTokenizer,
raw_text_len: int,
verbose: bool = False,
return_end_reason: bool = False,
errors: str = 'replace',
):
trim_decode_tokens = tokenizer.decode(tokens, errors=errors)[raw_text_len:]
if verbose:
print('\nRaw Generate: ', trim_decode_tokens)
end_reason = f'Gen length {len(tokens)}'
for stop_word in stop_words:
trim_decode_tokens = trim_decode_tokens.replace(stop_word, '').strip()
for eod_word in eod_words:
if eod_word in trim_decode_tokens:
end_reason = f'Gen {eod_word!r}'
trim_decode_tokens = trim_decode_tokens.split(eod_word)[0]
trim_decode_tokens = trim_decode_tokens.strip()
if verbose:
print('\nEnd Reason:', end_reason)
print('\nGenerate: ', trim_decode_tokens)
if return_end_reason:
return trim_decode_tokens, end_reason
else:
return trim_decode_tokens
def _decode_chatml(tokens: List[int],
*,
stop_words: List[str],
eod_token_ids: List[int],
tokenizer: PreTrainedTokenizer,
raw_text_len: int,
context_length: int,
verbose: bool = False,
return_end_reason: bool = False,
errors: str = 'replace'):
end_reason = f'Gen length {len(tokens)}'
eod_token_idx = context_length
for eod_token_idx in range(context_length, len(tokens)):
if tokens[eod_token_idx] in eod_token_ids:
end_reason = f'Gen {tokenizer.decode([tokens[eod_token_idx]])!r}'
break
trim_decode_tokens = tokenizer.decode(tokens[:eod_token_idx],
errors=errors)[raw_text_len:]
if verbose:
print('\nRaw Generate w/o EOD:',
tokenizer.decode(tokens, errors=errors)[raw_text_len:])
print('\nRaw Generate:', trim_decode_tokens)
print('\nEnd Reason:', end_reason)
for stop_word in stop_words:
trim_decode_tokens = trim_decode_tokens.replace(stop_word, '').strip()
trim_decode_tokens = trim_decode_tokens.strip()
if verbose:
print('\nGenerate:', trim_decode_tokens)
if return_end_reason:
return trim_decode_tokens, end_reason
else:
return trim_decode_tokens
def decode_tokens(
tokens: Union[torch.LongTensor, TokensType],
tokenizer: PreTrainedTokenizer,
raw_text_len: int,
context_length: int,
chat_format: str,
verbose: bool = False,
return_end_reason: bool = False,
errors: str = 'replace',
) -> str:
if torch.is_tensor(tokens):
tokens = tokens.cpu().numpy().tolist()
if chat_format == 'chatml':
return _decode_chatml(
tokens,
stop_words=[],
eod_token_ids=[tokenizer.im_start_id, tokenizer.im_end_id],
tokenizer=tokenizer,
raw_text_len=raw_text_len,
context_length=context_length,
verbose=verbose,
return_end_reason=return_end_reason,
errors=errors,
)
elif chat_format == 'raw':
return _decode_default(
tokens,
stop_words=['<|endoftext|>'],
eod_words=['<|endoftext|>'],
tokenizer=tokenizer,
raw_text_len=raw_text_len,
verbose=verbose,
return_end_reason=return_end_reason,
errors=errors,
)
else:
raise NotImplementedError(f'Unknown chat format {chat_format!r}')
from typing import Any
import torch
class QwenVLBasePostProcessor:
"""Post processor for Qwen-VL-Base."""
def __init__(self) -> None:
pass
def __call__(self, pred: torch.tensor, tokenizer: Any,
input_len: int) -> str:
        response = tokenizer.decode(pred)[input_len:]
response = response.replace('<|endoftext|>', '').strip()
return response
class QwenVLChatVSRPostProcessor:
"""VSR post processor for Qwen-VL-Chat."""
def __init__(self) -> None:
pass
def __call__(self, response: str) -> str:
if 'yes' in response.lower():
return 'yes'
elif 'no' in response.lower():
return 'no'
else:
return 'unknown'
class QwenVLMMBenchPromptConstructor:
"""MMBench prompt constructor for Qwen-VL.
    The output is a list of dicts following the input format of the
    Qwen-VL tokenizer.
"""
def __init__(self) -> None:
pass
def __call__(self, inputs: dict) -> list:
data_samples = inputs['data_samples']
assert len(data_samples) == 1
data_sample = data_samples[0]
question = data_sample.get('question')
options = data_sample.get('options')
context = data_sample.get('context')
if context is not None:
prompt = context + ' ' + question + ' ' + options
else:
prompt = question + ' ' + options
format_input = [
{
'image': 'This_is_path_to_an_image.'
}, # Just placeholder for Image Tokens
{
'text': prompt
},
]
return format_input
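# Smoke test for the constructor above with a hand-made sample; a plain dict
# stands in for the DataSample object, since only .get() is used here and all
# field values are made up.
_fake_batch = {'data_samples': [{
    'question': 'What is the main object?',
    'options': 'A. cat B. dog',
    'context': None,
}]}
# Expected: [{'image': 'This_is_path_to_an_image.'},
#            {'text': 'What is the main object? A. cat B. dog'}]
print(QwenVLMMBenchPromptConstructor()(_fake_batch))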
class QwenVLChatPromptConstructor:
"""Prompt constructorfor Qwen-VL-Chat."""
def __init__(self, prompt='') -> None:
self.prompt = prompt
def __call__(self, inputs: dict) -> list:
assert len(inputs['data_samples']) == 1
format_input = [
{
'image': 'This_is_path_to_an_image.'
}, # Just placeholder for Image Tokens
{
'text': self.prompt
},
]
return format_input
class QwenVLChatVQAPromptConstructor:
"""VQA prompt constructor for Qwen-VL-Chat."""
def __init__(self, prompt='') -> None:
self.prompt = prompt
def __call__(self, inputs: dict) -> list:
data_samples = inputs['data_samples']
assert len(data_samples) == 1
data_sample = data_samples[0]
question = data_sample.get('question')
format_input = [
{
'image': 'This_is_path_to_an_image.'
}, # Just placeholder for Image Tokens
{
'text': question + self.prompt
},
]
return format_input
class QwenVLChatScienceQAPromptConstructor:
"""ScienceQA prompt constructor for Qwen-VL-Chat."""
choice_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F'}
def __init__(self, prompt='') -> None:
self.prompt = prompt
def __call__(self, inputs: dict) -> list:
data_samples = inputs['data_samples']
assert len(data_samples) == 1
data_sample = data_samples[0]
question = data_sample.get('question')
choices = data_sample.get('choices')
choices = [
f'({self.choice_mapping[i]}) ' + item
for i, item in enumerate(choices)
]
choices = 'Choices: ' + ' '.join(choices) + '\n'
contexts = 'Context: ' + data_sample.get('hint')
format_input = [
{
'image': 'This_is_path_to_an_image.'
}, # Just placeholder for Image Tokens
{
'text': contexts + question + choices + self.prompt
},
]
return format_input
import types
from typing import Optional, Tuple
import mmengine
import torch
import torch.nn as nn
from mmengine.device import get_device
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
from transformers.modeling_outputs import BaseModelOutputWithPast
from opencompass.registry import MM_MODELS
from .generation_utils import decode_tokens, make_context
@MM_MODELS.register_module('qwen-vl-base')
class QwenVLBase(nn.Module):
"""Inference code of Qwen-VL.
We load the Qwen model via Huggingface.
Args:
pretrained_path (str): Path to Qwen checkpoint or repo id.
prompt_constructor (dict): The config of prompt constructor.
post_processor (dict): The config of post processor.
is_caption_task (bool): Whether the task is caption task.
Defaults to False.
commit_id (str): Use given version of Qwen-VL.
Warning: the latest version may have some conflicts.
Recommend to use the given default version.
"""
def __init__(
self,
pretrained_path: str,
prompt_constructor: dict = None,
post_processor: dict = None,
is_caption_task: bool = False,
commit_id: str = '548275c8b99de56dec203c0e793be18e030f2f4c'
) -> None:
super().__init__()
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_path,
trust_remote_code=True,
revision=commit_id)
self.model = AutoModelForCausalLM.from_pretrained(
pretrained_path,
device_map=get_device(),
trust_remote_code=True,
revision=commit_id)
self.model.generation_config = GenerationConfig.from_pretrained(
pretrained_path, trust_remote_code=True, revision=commit_id)
if prompt_constructor is not None:
self.prompt_constructor = mmengine.registry.build_from_cfg(
prompt_constructor, MM_MODELS)
if post_processor is not None:
self.post_processor = mmengine.registry.build_from_cfg(
post_processor, MM_MODELS)
else:
self.post_processor = None
self.is_caption_task = is_caption_task
self.model.transformer.forward = types.MethodType(
forward_hack, self.model.transformer)
def _build_embeds(self, images, input_ids):
# encode image
images = self.model.transformer.visual(images)
# compute image position
bos_pos = torch.where(input_ids == self.model.transformer.config.
visual['image_start_id'])
eos_pos = torch.where(
input_ids ==
self.model.transformer.config.visual['image_start_id'] + 1)
assert (bos_pos[0] == eos_pos[0]).all()
img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1)
# embed words
inputs_embeds = self.model.transformer.wte(input_ids)
# embed image tokens
for idx, (i, a, b) in enumerate(img_pos):
inputs_embeds[i][a + 1:b] = images[idx]
return inputs_embeds
def generate(self, batch):
images = batch.pop('inputs')
images = torch.stack(images, dim=0)
format_input = self.prompt_constructor(batch)
query = self.tokenizer.from_list_format(format_input)
inputs = self.tokenizer(query, return_tensors='pt')
inputs = inputs.to(get_device())
input_ids, token_type_ids, attention_mask = inputs[
'input_ids'], inputs['token_type_ids'], inputs['attention_mask']
inputs_embeds = self._build_embeds(images, input_ids)
pred = self.model.generate(input_ids=input_ids,
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
token_type_ids=token_type_ids)
response = self.post_processor(pred.cpu()[0])
data_sample = batch['data_samples'][0]
if self.is_caption_task:
data_sample.pred_caption = response
else:
data_sample.pred_answer = response
return data_sample
def forward(self, batch):
return self.generate(batch)
@MM_MODELS.register_module('qwen-vl-chat')
class QwenVLChat(QwenVLBase):
"""Inference code of Qwen-VL-Chat.
We load the Qwen model via Huggingface.
Args:
pretrained_path (str): Path to Qwen checkpoint or repo id.
prompt_constructor (dict): The config of prompt constructor.
post_processor (dict): The config of post processor.
is_caption_task (bool): Whether the task is caption task.
Defaults to False.
"""
def __init__(self,
pretrained_path: str,
prompt_constructor: dict = None,
post_processor: dict = None,
is_caption_task: bool = False) -> None:
super().__init__(pretrained_path, prompt_constructor, post_processor,
is_caption_task)
def generate(self, batch):
images = batch.pop('inputs')
images = torch.stack(images, dim=0)
format_input = self.prompt_constructor(batch)
query = self.tokenizer.from_list_format(format_input)
raw_text, context_tokens = make_context(
self.tokenizer,
query,
system='You are a helpful assistant.',
chat_format=self.model.generation_config.chat_format,
)
input_ids = torch.tensor([context_tokens]).to(get_device())
inputs_embeds = self._build_embeds(images, input_ids)
pred = self.model.generate(input_ids=input_ids,
inputs_embeds=inputs_embeds)
response = decode_tokens(
pred[0],
self.tokenizer,
raw_text_len=len(raw_text),
context_length=len(context_tokens),
chat_format=self.model.generation_config.chat_format,
verbose=False,
errors='replace')
if self.post_processor:
response = self.post_processor(response)
data_sample = batch['data_samples'][0]
if self.is_caption_task:
data_sample.pred_caption = response
else:
data_sample.pred_answer = response
return data_sample
def forward_hack(self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None):
if past_key_values is None and input_ids is not None and torch.any(
input_ids == self.config.visual['image_start_id']):
bos_pos = torch.where(
input_ids == self.config.visual['image_start_id'])
eos_pos = torch.where(
input_ids == self.config.visual['image_start_id'] + 1)
assert (bos_pos[0] == eos_pos[0]).all()
img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1)
images = []
for i, a, b in img_pos:
image = input_ids[i][a + 1:b - 1].tolist()
image = image[:image.index(self.config.visual['image_start_id'] +
2)]
images.append(bytes(image).decode('utf-8'))
images = self.visual.encode(images)
assert images.shape[0] == len(images)
else:
images = None
output_attentions = (output_attentions if output_attentions is not None
else self.config.output_attentions)
output_hidden_states = (output_hidden_states if output_hidden_states
is not None else self.config.output_hidden_states)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = (return_dict
if return_dict is not None else self.config.use_return_dict)
if input_ids is not None and inputs_embeds is not None:
raise ValueError(
'You cannot specify both input_ids and inputs_embeds at the same time' # noqa
)
elif input_ids is not None:
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_shape[-1])
batch_size = input_ids.shape[0]
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
batch_size = inputs_embeds.shape[0]
else:
raise ValueError(
'You have to specify either input_ids or inputs_embeds')
device = input_ids.device if input_ids is not None else inputs_embeds.device # noqa
if token_type_ids is not None:
token_type_ids = token_type_ids.view(-1, input_shape[-1])
if position_ids is not None:
position_ids = position_ids.view(-1, input_shape[-1])
if past_key_values is None:
past_length = 0
past_key_values = tuple([None] * len(self.h))
else:
past_length = past_key_values[0][0].size(-2)
if position_ids is None:
position_ids = torch.arange(
past_length,
input_shape[-1] + past_length,
dtype=torch.long,
device=device,
)
position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
encoder_attention_mask = None
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
if inputs_embeds is None:
inputs_embeds = self.wte(input_ids)
if batch_size <= 0:
raise ValueError('batch_size has to be defined and > 0')
attention_mask = self._prepare_decoder_attention_mask(
attention_mask, input_shape, inputs_embeds, past_length)
hidden_states = inputs_embeds
hidden_states = self.drop(hidden_states)
if images is not None:
for idx, (i, a, b) in enumerate(img_pos):
hidden_states[i][a + 1:b] = images[idx]
output_shape = input_shape + (hidden_states.size(-1), )
presents = () if use_cache else None
all_self_attentions = () if output_attentions else None
all_hidden_states = () if output_hidden_states else None
for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states, )
if self.gradient_checkpointing and self.training:
def create_custom_forward(module):
def custom_forward(*inputs):
# None for past_key_value
return module(*inputs, use_cache, output_attentions)
return custom_forward
outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(block),
hidden_states,
None,
attention_mask,
head_mask[i],
encoder_hidden_states,
encoder_attention_mask,
)
else:
outputs = block(
hidden_states,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask[i],
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
hidden_states = outputs[0]
if use_cache is True:
presents = presents + (outputs[2 if output_attentions else 1], )
if output_attentions:
all_self_attentions = all_self_attentions + (outputs[1], )
hidden_states = self.ln_f(hidden_states)
hidden_states = hidden_states.view(output_shape)
# Add last hidden state
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states, )
if not return_dict:
return tuple(v for v in [hidden_states, presents, all_hidden_states]
if v is not None)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=presents,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
from .post_processor import (VisualGLMBasePostProcessor,
VisualGLMVSRPostProcessor)
from .prompt_constructor import (VisualGLMBasePromptConstructor,
VisualGLMIconQAPromptConstructor,
VisualGLMMMBenchPromptConstructor,
VisualGLMScienceQAPromptConstructor,
VisualGLMVQAPromptConstructor)
from .visualglm import VisualGLM
__all__ = [
'VisualGLM', 'VisualGLMBasePostProcessor', 'VisualGLMVSRPostProcessor',
'VisualGLMBasePromptConstructor', 'VisualGLMMMBenchPromptConstructor',
'VisualGLMVQAPromptConstructor', 'VisualGLMScienceQAPromptConstructor',
'VisualGLMIconQAPromptConstructor'
]
from typing import Any
import torch
class VisualGLMBasePostProcessor:
"""Base post processor for VisualGLM."""
def __init__(self) -> None:
pass
def __call__(self, output_token: torch.tensor, tokenizer: Any) -> str:
return tokenizer.decode(output_token)
class VisualGLMVSRPostProcessor(VisualGLMBasePostProcessor):
"""VSR post processor for VisualGLM."""
def __init__(self) -> None:
super().__init__()
def __call__(self, output_token: torch.tensor, tokenizer: Any) -> str:
output_text = tokenizer.decode(output_token)
if 'yes' in output_text.lower():
return 'yes'
elif 'no' in output_text.lower():
return 'no'
else:
return 'unknown'
class VisualGLMMMBenchPromptConstructor:
"""MMBench prompt constructor for VisualGLM.
Args:
system_prompt (str): System prompt. (Default: '')
human_prompt (str): Human prompt. (Default: 'Q:')
assistant_prompt (str): Assistant prompt. (Default: 'A:')
"""
def __init__(self,
system_prompt: str = '',
human_prompt: str = 'Q:',
assistant_prompt: str = 'A:') -> None:
self.system_prompt = system_prompt
self.human_prompt = human_prompt
self.assistant_prompt = assistant_prompt
def __call__(self, batch: dict) -> tuple:
"""Construct prompt.
Args:
batch (dict): Input data containing image and data_samples.
Returns:
A tuple containing images, prompt, data_samples and image_position.
"""
assert len(batch['inputs']) == 1
image = batch.pop('inputs')[0].unsqueeze(0)
data_sample = batch.pop('data_samples')[0]
img_prompt = '<img></img>'
if data_sample.get('context') is not None:
prompt = img_prompt + self.system_prompt + self.human_prompt + data_sample.context + ' ' + data_sample.question + ' ' + data_sample.options # noqa
else:
prompt = img_prompt + self.system_prompt + self.human_prompt + data_sample.question + ' ' + data_sample.options # noqa
prompt += self.assistant_prompt
image_position = prompt.rfind('<img>') + 5
return image, prompt, data_sample, image_position
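# Layout illustration only (no DataSample object needed): with the default
# system/human/assistant prompts above and no context field, the constructor
# assembles a prompt like the one below, and image_position points just past
# the '<img>' tag where VisualGLM splices in the image embedding. The question
# and options are made up.
_question, _options = 'What is shown in the image?', 'A. cat B. dog'
_prompt = '<img></img>' + '' + 'Q:' + _question + ' ' + _options + 'A:'
_image_position = _prompt.rfind('<img>') + 5
assert _image_position == 5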
class VisualGLMBasePromptConstructor:
"""Base prompt constructor for VisualGLM.
The prompt will concat <img> and the given system prompt.
Args:
system_prompt (str): System prompt. (Default: '')
human_prompt (str): Human prompt. (Default: 'Q:')
assistant_prompt (str): Assistant prompt. (Default: 'A:')
"""
def __init__(self,
system_prompt: str = '',
human_prompt: str = 'Q:',
assistant_prompt: str = 'A:') -> None:
self.prompt = system_prompt
self.human_prompt = human_prompt
self.assistant_prompt = assistant_prompt
def __call__(self, batch: dict) -> tuple:
"""Construct prompt.
Args:
batch (dict): Input data containing image and data_samples.
Returns:
A tuple containing images, prompt, data_samples and image_position.
"""
assert len(batch['inputs']) == 1
image = batch.pop('inputs')[0].unsqueeze(0)
data_sample = batch.pop('data_samples')[0]
# generate text prompt
prompt = '<img></img>' + self.human_prompt + self.prompt + self.assistant_prompt # noqa
image_position = prompt.rfind('<img>') + 5
return image, prompt, data_sample, image_position
class VisualGLMVQAPromptConstructor(VisualGLMBasePromptConstructor):
"""VQA prompt constructor for VisualGLM.
The prompt will concat <img>, the question and the system prompt.
Args:
system_prompt (str): System prompt. (Default: '')
human_prompt (str): Human prompt. (Default: 'Q:')
assistant_prompt (str): Assistant prompt. (Default: 'A:')
"""
def __init__(self,
system_prompt='',
human_prompt: str = 'Q:',
assistant_prompt: str = 'A:') -> None:
super().__init__(system_prompt, human_prompt, assistant_prompt)
def __call__(self, batch: dict) -> tuple:
"""Construct prompt.
Args:
batch (dict): Input data containing image and data_samples.
Returns:
A tuple containing images, prompt, data_samples and image_position.
"""
assert len(batch['inputs']) == 1
image = batch.pop('inputs')[0].unsqueeze(0)
data_sample = batch.pop('data_samples')[0]
# generate text prompt
question = data_sample.get('question')
prompt = '<img></img>' + self.human_prompt + question + self.prompt
prompt += '\n' + self.assistant_prompt
image_position = prompt.rfind('<img>') + 5
return image, prompt, data_sample, image_position
class VisualGLMScienceQAPromptConstructor(VisualGLMBasePromptConstructor):
"""ScienceQA prompt constructor for VisualGLM.
The prompt will concat image and all terms in a question.
Args:
system_prompt (str): System prompt. (Default: '')
human_prompt (str): Human prompt. (Default: 'Q:')
assistant_prompt (str): Assistant prompt. (Default: 'A:')
"""
choice_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F'}
def __init__(self,
system_prompt='',
human_prompt: str = 'Q:',
assistant_prompt: str = 'A:') -> None:
super().__init__(system_prompt, human_prompt, assistant_prompt)
def __call__(self, batch: dict) -> tuple:
"""Construct prompt.
Args:
batch (dict): Input data containing image and data_samples.
Returns:
A tuple containing images, prompt, data_samples and image_position.
"""
assert len(batch['inputs']) == 1
image = batch.pop('inputs')[0].unsqueeze(0)
data_sample = batch.pop('data_samples')[0]
questions = 'Question: ' + data_sample.get('question')
choices = data_sample.get('choices')
choices = [
f'({self.choice_mapping[i]}) ' + item
for i, item in enumerate(choices)
]
choices = 'Choices: ' + ' '.join(choices) + '\n'
contexts = 'Context: ' + data_sample.get('hint') + '\n'
# generate text prompt
prompt = '<img></img>' + self.human_prompt + contexts + questions + choices + self.prompt + self.assistant_prompt # noqa
image_position = prompt.rfind('<img>') + 5
return image, prompt, data_sample, image_position
class VisualGLMIconQAPromptConstructor(VisualGLMBasePromptConstructor):
"""IconQA prompt constructor for VisualGLM.
The prompt will concat <img>, the question and the system prompt.
Args:
system_prompt (str): System prompt. (Default: '')
human_prompt (str): Human prompt. (Default: 'Q:')
assistant_prompt (str): Assistant prompt. (Default: 'A:')
"""
def __init__(self,
system_prompt='',
human_prompt: str = 'Q:',
assistant_prompt: str = 'A:') -> None:
super().__init__(system_prompt, human_prompt, assistant_prompt)
def __call__(self, batch: dict) -> tuple:
"""Construct prompt.
Args:
batch (dict): Input data containing image and data_samples.
Returns:
A tuple containing images, prompt, data_samples and image_position.
"""
assert len(batch['inputs']) == 1
image = batch.pop('inputs')[0].unsqueeze(0)
data_sample = batch.pop('data_samples')[0]
questions = data_sample.get('question') + '\n'
choices = data_sample.get('choices')
choices = 'Options: ' + ', '.join(choices) + '.\n'
# generate text prompt
prompt = '<img></img>' + self.human_prompt + questions + choices + self.prompt + self.assistant_prompt # noqa
image_position = prompt.rfind('<img>') + 5
return image, prompt, data_sample, image_position
from typing import Optional
import mmengine
import torch
import torch.nn as nn
from mmengine.device import get_device
from transformers import AutoModel, AutoTokenizer
from opencompass.registry import MM_MODELS
@MM_MODELS.register_module('visualglm')
class VisualGLM(nn.Module):
"""Inference code of VisualGLM.
We load the visualGLM model via Huggingface.
Args:
pretrained_path (str): Path to visualGLM checkpoint or repo id.
prompt_constructor (dict): The config of prompt constructor.
post_processor (dict): The config of post processor.
is_caption_task (bool): Whether the task is caption task.
Defaults to False.
gen_kwargs (dict): Customize generate function arguments.
Defaults to None.
"""
def __init__(self,
pretrained_path: str,
prompt_constructor: dict,
post_processor: dict,
is_caption_task: bool = False,
gen_kwargs: Optional[dict] = None) -> None:
super().__init__()
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_path,
trust_remote_code=True)
self.model = AutoModel.from_pretrained(pretrained_path,
trust_remote_code=True).half()
self.prompt_constructor = mmengine.registry.build_from_cfg(
prompt_constructor, MM_MODELS)
self.post_processor = mmengine.registry.build_from_cfg(
post_processor, MM_MODELS)
if gen_kwargs:
self.gen_kwargs = gen_kwargs
else:
self.gen_kwargs = dict(max_length=1024,
min_length=100,
do_sample=True,
temperature=0.8,
top_p=0.4,
top_k=100,
repetition_penalty=1.2)
self.is_caption_task = is_caption_task
def encode_by_tokenizer(self, prompt, image_position):
input0 = self.tokenizer.encode(prompt[:image_position],
add_special_tokens=False)
input1 = [self.tokenizer.unk_token_id] * self.model.image_length
input2 = self.tokenizer.encode(prompt[image_position:],
add_special_tokens=False)
input_all = sum([input0, input1, input2], [])
input_all = self.tokenizer.build_inputs_with_special_tokens(input_all)
input_all = torch.tensor(input_all, dtype=torch.long).to(get_device())
input_all = input_all.unsqueeze(0)
pre_image_len = len(input0)
return input_all, pre_image_len
def generate(self, batch):
# process input
image, prompt, data_sample, image_position = self.prompt_constructor(
batch)
image = image.to(self.model.dtype).to(get_device())
# tokenize
input_all, pre_image_len = self.encode_by_tokenizer(
prompt, image_position)
# build input param
inputs = {
'input_ids': input_all,
'pre_image_length': pre_image_len,
'images': image
}
# generate answer
outputs = self.model.generate(**inputs, **self.gen_kwargs)
# format output
outputs = outputs.tolist()[0][input_all.shape[1]:]
answer = self.post_processor(outputs, self.tokenizer)
if self.is_caption_task:
data_sample.pred_caption = answer
else:
data_sample.pred_answer = answer
return data_sample
def forward(self, batch):
return self.generate(batch)
from .mm_naive import * # noqa: F401, F403
from .naive import * # noqa: F401, F403
from .num_worker import * # noqa: F401, F403
from .size import * # noqa: F401, F403
from copy import deepcopy
from typing import Dict, List
from mmengine.config import Config, ConfigDict
from opencompass.registry import PARTITIONERS
from .base import BasePartitioner
@PARTITIONERS.register_module()
class MultimodalNaivePartitioner(BasePartitioner):
"""Multimodal naive task partitioner.
This partitioner will generate a task for each
model-dataset-evaluator pair.
Args:
config (ConfigDict): The full config dict.
"""
def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
evaluators: List[ConfigDict], load_froms: List[ConfigDict],
work_dir: str, num_gpus: int, num_procs: int,
launcher: str) -> List[Dict]:
"""Partition model-dataset pairs into tasks. Each task is defined as a
dict and will run independently as a unit. Its structure is as follows:
.. code-block:: python
{
'models': [], # a list of model configs
'datasets': [], # a list of dataset configs
'evaluators': [], # a list of evaluator configs
'load_froms': [], # a list of load_from paths
'work_dir': '', # the work dir
'num_gpus': int, # integer, number of gpus for each task
'num_procs': int, # integer, number of gpus on single machine
'launcher': str, # string, how to launch distributed training
}
Args:
models (List[ConfigDict]): A list of model configs.
datasets (List[ConfigDict]): A list of dataset configs.
evaluators (List[ConfigDict]): A list of evaluator configs.
load_froms (List[ConfigDict]): A list of load_from paths.
work_dir (str): The work dir for the task.
num_gpus (int): Number of gpus for each task.
num_procs (int): Number of gpus on single machine.
launcher (str): How to launch distributed training.
Only `slurm`, `pytorch` and `mpi` are available.
Returns:
List[Dict]: A list of tasks.
"""
tasks = []
for model, dataset, evaluator, load_from in zip(
models, datasets, evaluators, load_froms):
task = Config({
'model': model,
'dataset': dataset,
'evaluator': evaluator,
'load_from': load_from,
'work_dir': work_dir,
'num_gpus': num_gpus,
'num_procs': num_procs,
'launcher': launcher
})
tasks.append(task)
return tasks
def __call__(self, cfg: ConfigDict) -> List[Dict]:
"""Generate tasks from config. Each task is defined as a
dict and will run independently as a unit. Its structure is as
follows:
.. code-block:: python
{
'models': [], # a list of model configs
'datasets': [], # a list of dataset configs
'evaluators': [], # a list of evaluator configs
'load_froms': [], # a list of load_from paths
'work_dir': '', # the work dir
'num_gpus': int, # integer, number of gpus for each task
'num_procs': int, # integer, number of gpus on single machine
}
Args:
cfg (ConfigDict): The config dict, containing "models", "dataset"
and "work_dir" keys.
Returns:
List[Dict]: A list of tasks.
"""
cfg = deepcopy(cfg)
models = cfg['models']
datasets = cfg['datasets']
evaluators = cfg['evaluators']
load_froms = cfg['load_froms']
work_dir = cfg['work_dir']
num_gpus = cfg['num_gpus']
num_procs = cfg['num_procs']
launcher = cfg['launcher']
tasks = self.partition(models, datasets, evaluators, load_froms,
work_dir, num_gpus, num_procs, launcher)
self.logger.info(f'Partitioned into {len(tasks)} tasks.')
for i, task in enumerate(tasks):
model_name = task['model']['type']
dataset_name = task['dataset']['dataset']['type']
evaluator_name = task['evaluator'][0]['type']
self.logger.debug(
f'Task {i}: {model_name}-{dataset_name}-{evaluator_name}')
return tasks
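# Rough shape of the config dict this partitioner consumed; every entry below
# is an illustrative placeholder rather than a real config from the repo.
_example_partition_cfg = dict(
    models=[dict(type='visualglm')],
    datasets=[dict(dataset=dict(type='SomeMMDataset'))],
    evaluators=[[dict(type='SomeMetric')]],  # one evaluator list per dataset
    load_froms=[None],
    work_dir='work_dirs/mm_demo',
    num_gpus=1,
    num_procs=1,
    launcher='pytorch',
)
# The partitioner's __call__ turns such a config into one task per
# (model, dataset, evaluator, load_from) tuple.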
from typing import Callable, List, Optional, Type, Union
from mmengine.registry import DATASETS as MMENGINE_DATASETS
from mmengine.registry import METRICS as MMENGINE_METRICS
from mmengine.registry import MODELS as MMENGINE_MODELS
from mmengine.registry import Registry as OriginalRegistry
@@ -39,15 +37,9 @@ ICL_PROMPT_TEMPLATES = Registry(
locations=['opencompass.openicl.icl_prompt_template'])
ICL_EVALUATORS = Registry('icl_evaluators',
locations=['opencompass.openicl.icl_evaluator'])
DATASETS = Registry('mm_datasets',
parent=MMENGINE_DATASETS,
locations=['opencompass.multimodal.datasets'])
METRICS = Registry('metric',
parent=MMENGINE_METRICS,
locations=['opencompass.metrics'])
MM_MODELS = Registry('mm_model',
parent=MMENGINE_MODELS,
locations=['opencompass.multimodal.models'])
TOT_WRAPPER = Registry('tot_wrapper', locations=['opencompass.datasets'])
from .mm_infer import * # noqa: F401, F403
from .openicl_attack import * # noqa: F401, F403
from .openicl_eval import * # noqa: F401, F403
from .openicl_infer import * # noqa: F401, F403
import argparse
import json
import os
import os.path as osp
import random
import time
from typing import List, Sequence
import mmengine
import torch
import torch.distributed as dist
from mmengine.config import Config, ConfigDict
from mmengine.device import get_device
from mmengine.dist import init_dist
from mmengine.evaluator import Evaluator
from mmengine.logging import print_log
from mmengine.model.wrappers import MMDistributedDataParallel
from mmengine.utils import track_iter_progress
from opencompass.registry import MM_MODELS, TASKS
from opencompass.utils import get_logger
def build_model(cfg):
model = MM_MODELS.build(cfg['model'])
load_from = cfg.get('load_from', None)
if load_from is not None:
state_dict = torch.load(cfg['load_from'], map_location='cpu')
if 'model' in state_dict:
state_dict = state_dict['model']
elif 'state_dict' in state_dict:
state_dict = state_dict['state_dict']
msg = model.load_state_dict(state_dict, strict=False)
print_log(msg)
model.to(get_device())
if dist.is_initialized():
model = MMDistributedDataParallel(
model,
device_ids=[int(os.environ['LOCAL_RANK'])],
broadcast_buffers=False)
return model
@TASKS.register_module(force=(__name__ == '__main__')) # A hack for script run
class MultimodalInferTask:
"""Multimodal Inference Task.
This task is used to run the inference process.
"""
def __init__(self, cfg: ConfigDict):
self.num_gpus = cfg.get('num_gpus', 0)
self.num_procs = cfg.get('num_procs', 1)
self.dataloader = cfg.get('dataset')
self.model = cfg.get('model')
self.evaluator = cfg.get('evaluator')
self.cfg = cfg
self.logger = get_logger()
@property
def name(self) -> str:
model_name = self.model['type']
dataset_name = self.dataloader['dataset']['type']
evaluator_name = self.evaluator[0]['type']
return f'{model_name}-{dataset_name}-{evaluator_name}'
def get_log_path(self, file_extension: str = 'json') -> str:
"""Get the path to the log file.
Args:
file_extension (str): The file extension of the log file.
Default: 'json'.
"""
model_name = self.model['type']
dataset_name = self.dataloader['dataset']['type']
evaluator_name = self.evaluator[0]['type']
return osp.join(self.cfg.work_dir, model_name, dataset_name,
f'{evaluator_name}.{file_extension}')
def get_output_paths(self, file_extension: str = 'json') -> List[str]:
"""Get the path to the output file.
Args:
file_extension (str): The file extension of the log file.
Default: 'json'.
"""
model_name = self.model['type']
dataset_name = self.dataloader['dataset']['type']
evaluator_name = self.evaluator[0]['type']
return [
osp.join(self.cfg.work_dir, model_name, dataset_name,
f'{evaluator_name}.{file_extension}')
]
def get_command(self, cfg_path, template):
"""Get the command template for the task.
Args:
cfg_path (str): The path to the config file of the task.
template (str): The template which have '{task_cmd}' to format
the command.
"""
script_path = __file__
if self.num_gpus > 0:
port = random.randint(12000, 32000)
command = (f'torchrun --master_port={port} '
f'--nproc_per_node {self.num_procs} '
f'{script_path} {cfg_path}')
else:
command = f'python {script_path} {cfg_path}'
return template.format(task_cmd=command)
def run(self):
from mmengine.runner import Runner
# only support slurm, pytorch, mpi
init_dist(self.cfg.launcher)
self.logger.info(f'Task {self.name}')
# build dataloader
dataloader = Runner.build_dataloader(self.dataloader)
# build model
model = build_model(self.cfg)
model.eval()
# build evaluator
evaluator = Evaluator(self.evaluator)
for batch in track_iter_progress(dataloader):
if dist.is_initialized():
data_samples = model.module.forward(batch)
else:
data_samples = model.forward(batch)
if not isinstance(data_samples, Sequence):
data_samples = [data_samples]
evaluator.process(data_samples)
metrics = evaluator.evaluate(len(dataloader.dataset))
metrics_file = self.get_output_paths()[0]
mmengine.mkdir_or_exist(osp.split(metrics_file)[0])
with open(metrics_file, 'w') as f:
json.dump(metrics, f)
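# For reference, when num_gpus > 0 the get_command method above renders a
# torchrun invocation roughly like the string below (the port is drawn at
# random per call and both paths are placeholders); template.format then wraps
# it into the launcher-specific command.
_example_task_cmd = ('torchrun --master_port=20123 --nproc_per_node 1 '
                     'path/to/mm_infer.py path/to/task_config.py')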
def parse_args():
parser = argparse.ArgumentParser(description='Model Inferencer')
parser.add_argument('config', help='Config file path')
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
cfg = Config.fromfile(args.config)
start_time = time.time()
inferencer = MultimodalInferTask(cfg)
inferencer.run()
end_time = time.time()
get_logger().info(f'time elapsed: {end_time - start_time:.2f}s')
@@ -270,27 +270,6 @@ def change_accelerator(models, accelerator):
return model_accels
def exec_mm_infer_runner(tasks, args, cfg):
"""execute multimodal infer runner according to args."""
if args.slurm:
runner = SlurmRunner(dict(type='MultimodalInferTask'),
max_num_workers=args.max_num_workers,
partition=args.partition,
quotatype=args.quotatype,
retry=args.retry,
debug=args.debug,
lark_bot_url=cfg['lark_bot_url'])
elif args.dlc:
raise NotImplementedError('Currently, we do not support evaluating \
multimodal models on dlc.')
else:
runner = LocalRunner(task=dict(type='MultimodalInferTask'),
max_num_workers=args.max_num_workers,
debug=args.debug,
lark_bot_url=cfg['lark_bot_url'])
runner(tasks)
def get_config_type(obj) -> str:
return f'{obj.__module__}.{obj.__name__}'
# Usage: python eval_mmbench.py mmbench_dev_inference_result.xlsx
import argparse
import json
import os.path as osp
import pickle
import random as rd
import string
from collections import defaultdict
import numpy as np
import pandas as pd
from tqdm import tqdm
from opencompass.models import OpenAI
fout = None
# Utils
def double_log(msg, fout=None):
print(msg)
if fout is not None:
fout.write(str(msg) + '\n')
fout.flush()
def dump(data, f):
def dump_pkl(data, pth):
pickle.dump(data, open(pth, 'wb'))
def dump_json(data, pth):
json.dump(data, open(pth, 'w'))
def dump_jsonl(data, f):
lines = [json.dumps(x, ensure_ascii=False) for x in data]
with open(f, 'w', encoding='utf8') as fout:
fout.write('\n'.join(lines))
def dump_xlsx(data, f):
data.to_excel(f, index=False)
def dump_csv(data, f):
data.to_csv(f, index=False)
def dump_tsv(data, f):
data.to_csv(f, sep='\t', index=False)
handlers = dict(pkl=dump_pkl,
json=dump_json,
jsonl=dump_jsonl,
xlsx=dump_xlsx,
csv=dump_csv,
tsv=dump_tsv)
suffix = f.split('.')[-1]
return handlers[suffix](data, f)
def load(f):
def load_pkl(pth):
return pickle.load(open(pth, 'rb'))
def load_json(pth):
return json.load(open(pth, 'r', encoding='utf-8'))
def load_jsonl(f):
lines = open(f, encoding='utf-8').readlines()
lines = [x.strip() for x in lines]
if lines[-1] == '':
lines = lines[:-1]
data = [json.loads(x) for x in lines]
return data
def load_xlsx(f):
return pd.read_excel(f)
def load_csv(f):
return pd.read_csv(f)
def load_tsv(f):
return pd.read_csv(f, sep='\t')
handlers = dict(pkl=load_pkl,
json=load_json,
jsonl=load_jsonl,
xlsx=load_xlsx,
csv=load_csv,
tsv=load_tsv)
suffix = f.split('.')[-1]
return handlers[suffix](f)
# Accuracy Report
def report_acc(df, group='category'):
assert 'split' in df
assert group in [None, 'category', 'l2-category']
res = defaultdict(list)
res['split'] = ['full', 'dev', 'test']
if group is None:
res['overall'] = [
np.mean(df['hit']),
np.mean(df[df['split'] == 'dev']['hit']),
np.mean(df[df['split'] == 'test']['hit'])
]
return pd.DataFrame(res)
elif group in df:
abilities = list(set(df[group]))
abilities.sort()
for ab in abilities:
sub_df = df[df[group] == ab]
res[ab] = [
np.mean(sub_df['hit']),
np.mean(sub_df[sub_df['split'] == 'dev']['hit']),
np.mean(sub_df[sub_df['split'] == 'test']['hit'])
]
return pd.DataFrame(res)
# Prompt Building
def build_option_str(option_list):
chars = string.ascii_uppercase
s = 'There are several options: \n'
for c, opt in zip(chars, option_list):
if not pd.isna(opt):
s += f'{c}. {opt}\n'
else:
return s
return s
def extract_options(item):
options = []
for c in 'ABCD':
if c in item and not pd.isna(item[c]):
options.append(item[c])
else:
return options
return options
def build_choices(item):
ret = {}
for ch in 'ABCD':
if not pd.isna(item[ch]):
ret[ch] = item[ch]
return ret
def build_prompt(question, options, prediction):
tmpl = (
'You are an AI assistant who will help me to match an answer '
'with several options of a single-choice question. '
'You are provided with a question, several options, and an answer, '
'and you need to find which option is most similar to the answer. '
'If the meaning of all options are significantly different '
'from the answer, output E. '
        'You should output a single uppercase character in A, B, C, D '
'(if they are valid options), and E. \n'
'Example 1: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear '
'B. rabbit C. cat D. dog\nAnswer: a cute teddy bear\nYour output: A\n'
'Example 2: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear '
'B. rabbit C. cat D. dog\nAnswer: Spider\nYour output: E\n'
'Example 3: \n'
'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: ')
return tmpl.format(question, options, prediction)
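# Quick look at the judge prompt assembled above, with made-up inputs; only
# the tail of the template changes per item.
_demo_prompt = build_prompt(
    'What is the main object in image',
    'There are several options: \nA. teddy bear\nB. rabbit\n',
    'a small stuffed bear')
# _demo_prompt ends with:
#   'Question: What is the main object in image?\nOptions: There are several
#    options: ...\nAnswer: a small stuffed bear\nYour output: '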
# Prefetch Answers
def can_infer_option(answer, num_choice=5):
choices = string.ascii_uppercase[:num_choice]
if 'Failed to obtain answer via API' in answer:
return False
def count(splits, choices='ABCD', prefix='', suffix=''):
cnt = 0
for c in choices:
if prefix + c + suffix in splits:
cnt += 1
return cnt
splits = [x.strip() for x in answer.split()]
if count(splits, choices) == 1:
for ch in choices:
if 'A' in splits and len(splits) > 3:
double_log(
f'A might be a quantifier in the string: {answer}. ', fout)
break
if ch in splits:
return ch
tups = [('', '.'), ('', ','), ('', ':'), ('', ')'), ('', ').'), ('(', ')'),
('(', ').'), (':', ''), (':', ','), (':', '.'), (':', ')'),
(':', ').')]
for tup in tups:
if count(splits, choices, prefix=tup[0], suffix=tup[1]) == 1:
for ch in choices:
if tup[0] + ch + tup[1] in splits:
return ch
return False
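# Behaviour sketch for can_infer_option with made-up answers: a single option
# letter, possibly wrapped in punctuation such as '(C)', is returned as that
# letter; anything ambiguous falls through to False.
assert can_infer_option('The answer is B.') == 'B'
assert can_infer_option('(C)') == 'C'
assert can_infer_option('It could be a cat or a dog.') is False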
def can_infer_text(answer, choices):
answer = answer.lower()
assert isinstance(choices, dict)
for k in choices:
assert k in 'ABCD'
choices[k] = str(choices[k]).lower()
cands = []
for k in choices:
if choices[k] in answer:
cands.append(k)
if len(cands) == 1:
return cands[0]
return False
def can_infer(answer, choices):
copt = can_infer_option(answer)
return copt if copt else can_infer_text(answer, choices)
def prefetch_answer(item):
choices = build_choices(item)
return can_infer(item['prediction'], choices)
# Extract answer from a single record
def extract_answer_from_item(model, item):
# It will return: (pred, raw, llm_time)
options = extract_options(item)
option_str = build_option_str(options)
prompt = build_prompt(item['question'], option_str, item['prediction'])
retry = 3
choices = build_choices(item)
ret = can_infer(item['prediction'], choices)
if ret:
return ret, item['prediction']
while retry:
ans = model.generate([prompt])[0]
if 'Failed to obtain answer via API' in ans:
msg = 'GPT API failed to answer. '
double_log(msg, fout)
retry -= 1
else:
ret = can_infer(ans, choices)
if ret:
return ret, ans
else:
double_log(
f'GPT output includes 0 / >1 letter in "ABCD": {ans}',
fout)
retry -= 1
if retry == 0:
num_options = sum([ch in item for ch in 'ABCD'])
if num_options >= 2:
chars = string.ascii_uppercase[:num_options]
chars = chars + 'E'
num_options += 1
tmp = rd.randint(0, num_options - 1)
return chars[
tmp], 'Failed to predict, thus randomly generate one. '
# Extract answer from multiple rolling records
def eval_sub_data(model, sub_data, answer_map):
lt = len(sub_data)
GT, PRED = [], []
for i in range(lt):
item = sub_data.iloc[i]
idx = item['index']
GT.append(answer_map[idx])
PRED.append(prefetch_answer(item))
if PRED[-1] and (GT[-1] != PRED[-1]):
return 0
for i in range(lt):
if PRED[i]:
continue
else:
ret, _ = extract_answer_from_item(model, sub_data.iloc[i])
PRED[i] = ret
if PRED[i] != GT[i]:
return 0
return 1
# Evaluate Results
def eval_result(eval_file, eval_method, meta_file):
rd.seed(2680)
assert eval_method == 'openai'
# Set a large retry number to avoid failure
model = OpenAI('gpt-3.5-turbo-0613', retry=99)
double_log(f'Evaluating {eval_file}', fout)
result_file = eval_file.replace('.xlsx', f'_{eval_method}_result.pkl')
result = {}
if osp.exists(result_file):
result = load(result_file)
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
for k in data.keys():
data[k.lower() if k not in 'ABCD' else k] = data.pop(k)
meta = load(meta_file)
data_main = data[data['index'] < int(1e6)]
cate_map = {i: c for i, c in zip(meta['index'], meta['category'])}
l2_cate_map = {i: c for i, c in zip(meta['index'], meta['l2-category'])}
split_map = {i: c for i, c in zip(meta['index'], meta['split'])}
answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])}
lt = len(data_main)
hit, tot = 0, 0
for i in tqdm(range(lt)):
# Dealing with the normal part
item_main = data_main.iloc[i]
idx = item_main['index']
if idx in result:
correct = result[idx]
assert correct in [0, 1]
hit += correct
tot += 1
continue
sub_data = data[data['index'] % int(1e6) == idx]
ret = eval_sub_data(model, sub_data, answer_map)
result[idx] = ret
hit += ret
tot += 1
dump(result, result_file)
if (i + 1) % 10 == 0:
double_log((f'Evaluating {eval_file}: {i + 1}/{lt}, '
f'Acc: {hit / tot * 100: .2f}%. '), fout)
dump(data_main, 'tmp.xlsx')
data_main = load('tmp.xlsx')
res = load(result_file)
indices = data_main['index']
data_main['hit'] = [res[i] for i in indices]
data_main['split'] = [split_map[i] for i in indices]
main_idx = data_main['index']
data_main['category'] = [cate_map[i] for i in main_idx]
data_main['l2-category'] = [l2_cate_map[i] for i in main_idx]
# load split
dump(data_main, eval_file.replace('.xlsx', f'_{eval_method}_result.xlsx'))
data_main = load(eval_file.replace('.xlsx', f'_{eval_method}_result.xlsx'))
overall = report_acc(data_main, None)
dump(overall, eval_file.replace('.xlsx', '_overall.csv'))
double_log(overall)
l2 = report_acc(data_main, 'l2-category')
dump(l2, eval_file.replace('.xlsx', '_l2.csv'))
double_log(l2)
leaf = report_acc(data_main, 'category')
dump(leaf, eval_file.replace('.xlsx', '_leaf.csv'))
double_log(leaf)
if fout is not None:
fout.close()
return overall, l2, leaf
def parse_args():
parser = argparse.ArgumentParser(
description='Evaluate Inference Results of MMBench-DEV SPLIT. ')
parser.add_argument('result',
type=str,
help='The path to your inference result. ')
parser.add_argument('--meta',
type=str,
default='data/mmbench_dev_20230712.tsv',
help=('The path to your meta file (dev). '
'Downloaded from MMBench website. '))
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
log_pth = args.result.replace('.xlsx', '_openai_eval.log')
fout = open(log_pth, 'a')
acc, l2, leaf = eval_result(args.result, 'openai', args.meta)