Unverified commit 3a232db4 authored by Haodong Duan, committed by GitHub

[Deprecate] Remove multi-modal related stuff (#1072)



* Remove MultiModal

* update index.rst

* update README

* remove mmbench codes

* update news

---------
Co-authored-by: Leymore <zfz-960727@163.com>
parent f1ee11de
from typing import TYPE_CHECKING
from transformers.utils import (OptionalDependencyNotAvailable,
is_torch_available)
if TYPE_CHECKING:
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
from .otter import Otter
from .post_processor import OTTERMMBenchPostProcessor
from .prompt_constructor import OTTERMMBenchPromptConstructor
__all__ = [
'Otter', 'OTTERMMBenchPromptConstructor', 'OTTERMMBenchPostProcessor'
]
import importlib
import mmengine
import torch
import torch.nn as nn
from mmengine.device import get_device
from opencompass.registry import MM_MODELS
@MM_MODELS.register_module('otter-9b')
class Otter(nn.Module):
"""Inference code of OTTER.
Model details:
OTTER: a multi-modal model based on OpenFlamingo
(open-sourced version of DeepMind's Flamingo)
https://github.com/Luodian/Otter
Args:
model_path (str): The path of OTTER model
in Huggingface model hub format.
        load_bit (str): The precision to load the OTTER model in,
            either "fp32" or "bf16".
mode (str): The mode of inference. Defaults to 'generation'.
"""
def __init__(self,
model_path,
load_bit,
prompt_constructor,
post_processor,
mode='generation') -> None:
super().__init__()
torch_dtype = torch.bfloat16 if load_bit == 'bf16' else torch.float32
otter_ai = importlib.import_module('otter_ai')
self.model = otter_ai.OtterForConditionalGeneration.from_pretrained(
model_path, torch_dtype=torch_dtype, device_map=get_device())
self.tokenizer = self.model.text_tokenizer
self.tokenizer.padding_side = 'left'
self.model_dtype = next(self.model.parameters()).dtype
self.prompt_constructor = mmengine.registry.build_from_cfg(
prompt_constructor, MM_MODELS)
if post_processor is not None:
self.post_processor = mmengine.registry.build_from_cfg(
post_processor, MM_MODELS)
self.mode = mode
def forward(self, batch):
if self.mode == 'generation':
return self.generate(batch)
elif self.mode == 'loss':
return self.loss(batch)
else:
raise RuntimeError(f'Invalid mode "{self.mode}".')
def generate(self, batch):
inputs = self.prompt_constructor(batch)
image = inputs['image']
prompt = inputs['prompt']
data_samples = inputs['data_samples']
vision_x = image.unsqueeze(1).unsqueeze(0).to(dtype=self.model_dtype)
lang_x = self.model.text_tokenizer([prompt], return_tensors='pt')
bad_words_id = self.model.text_tokenizer(['User:', 'GPT:']).input_ids
generated_text = self.model.generate(
vision_x=vision_x.to(self.model.device),
lang_x=lang_x['input_ids'].to(self.model.device),
attention_mask=lang_x['attention_mask'].to(self.model.device),
do_sample=False,
max_new_tokens=512,
num_beams=3,
bad_words_ids=bad_words_id,
no_repeat_ngram_size=3,
)
for i, data_sample in enumerate(data_samples):
output_text = self.post_processor(generated_text[i],
self.model.text_tokenizer)
data_sample.pred_answer = output_text
data_samples[i] = data_sample
return data_samples
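# A minimal sketch of the kind of config dict that MM_MODELS would consume to
# build the wrapper above. The checkpoint path is a placeholder, and the
# prompt-constructor / post-processor entries are sketched with string type
# names; the original configs may have passed the classes directly instead.
example_otter_cfg = dict(
    type='otter-9b',  # name registered via @MM_MODELS.register_module above
    model_path='path/to/OTTER-Image-MPT7B',  # placeholder HF-format checkpoint
    load_bit='bf16',  # 'bf16' or 'fp32', per the docstring
    prompt_constructor=dict(type='OTTERMMBenchPromptConstructor'),
    post_processor=dict(type='OTTERMMBenchPostProcessor'),
    mode='generation',
)
# Building would then look like: model = MM_MODELS.build(example_otter_cfg)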
import random
import re
import torch
class OTTERMMBenchPostProcessor:
""""Post processor for OTTER on MMBench."""
def __init__(self) -> None:
pass
def __call__(self, output_token: torch.tensor, tokenizer) -> str:
if output_token[0] == 0:
output_token = output_token[1:]
if output_token[0] == 1:
output_token = output_token[1:]
output_text = tokenizer.decode(output_token,
add_special_tokens=False) # noqa
output_text = self._extract_key_words(output_text)
return output_text
def _extract_key_words(self, output_text: str) -> str:
output_text = (output_text.split('<answer>')[-1].lstrip().rstrip().
split('<|endofchunk|>')[0].lstrip().rstrip())
pattern = re.compile(r'([A-Z]\.)')
res = pattern.findall(output_text)
if len(res) > 0:
output_text = res[0][:-1]
return output_text
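# Quick standalone check of the answer-letter extraction above: the regex
# keeps the first capital letter written as "X." between the <answer> tag and
# <|endofchunk|>; otherwise the cleaned text is returned unchanged. The
# example strings below are made up.
import re

def _demo_extract(output_text: str) -> str:
    output_text = (output_text.split('<answer>')[-1].strip().
                   split('<|endofchunk|>')[0].strip())
    res = re.findall(r'([A-Z]\.)', output_text)
    return res[0][:-1] if res else output_text

assert _demo_extract('... <answer> B. a red car <|endofchunk|>') == 'B'
assert _demo_extract('<answer> a red car') == 'a red car'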
class OTTERCOCOCaptionPostProcessor:
""""Post processor for OTTER on COCO Caption."""
def __init__(self) -> None:
pass
def __call__(self, output_token: torch.tensor, tokenizer) -> str:
if output_token[0] == 0:
output_token = output_token[1:]
if output_token[0] == 1:
output_token = output_token[1:]
output_text = tokenizer.decode(output_token,
add_special_tokens=False) # noqa
output_text = (output_text.split('<answer>')[-1].lstrip().rstrip().
split('<|endofchunk|>')[0].lstrip().rstrip())
pattern = re.compile(r'([A-Z]\.)')
res = pattern.findall(output_text)
if len(res) > 0:
output_text = res[0][:-1]
return output_text
class OTTERScienceQAPostProcessor:
""""Post processor for OTTER on ScienceQA."""
def __init__(self) -> None:
pass
def __call__(self, output_token: torch.tensor, tokenizer) -> str:
if output_token[0] == 0:
output_token = output_token[1:]
if output_token[0] == 1:
output_token = output_token[1:]
output_text = tokenizer.decode(output_token,
add_special_tokens=False) # noqa
output_text = (output_text.split('<answer>')[-1].lstrip().rstrip().
split('<|endofchunk|>')[0].lstrip().rstrip())
pattern = re.compile(r'\(([A-Z])\)')
output_text = pattern.findall(output_text)
if len(output_text) == 0:
output_text = random.choice(['A', 'B', 'C', 'D'])
else:
output_text = output_text[0]
return output_text
class OTTERVQAPostProcessor:
""""Post processor for OTTER on VQA."""
def __init__(self) -> None:
pass
def __call__(self, output_token: torch.tensor, tokenizer) -> str:
if output_token[0] == 0:
output_token = output_token[1:]
if output_token[0] == 1:
output_token = output_token[1:]
output_text = tokenizer.decode(output_token,
add_special_tokens=False) # noqa
output_text = (output_text.split('<answer>')[-1].lstrip().rstrip().
split('<|endofchunk|>')[0].lstrip().rstrip())
return output_text
class OTTERVSRPostProcessor:
""""Post processor for OTTER on VSR."""
def __init__(self) -> None:
pass
def __call__(self, output_token: torch.tensor, tokenizer) -> str:
if output_token[0] == 0:
output_token = output_token[1:]
if output_token[0] == 1:
output_token = output_token[1:]
output_text = tokenizer.decode(output_token, add_special_tokens=False)
pattern = r'yes|no|Yes|No'
output_text = re.findall(pattern, output_text)
if len(output_text) > 0:
output_text = output_text[0].lower()
return output_text
class OTTERMMEPostProcessor(OTTERMMBenchPostProcessor):
""""Post processor for OTTER on MME."""
def __init__(self) -> None:
super().__init__()
def __call__(self, output_token: torch.tensor, tokenizer) -> str:
response = super().__call__(output_token, tokenizer)
# extract yes or no, copy from MME official evaluation script
prefix_pred_ans = response[:4].lower()
if 'yes' in prefix_pred_ans:
pred_label = 'yes'
elif 'no' in prefix_pred_ans:
pred_label = 'no'
else:
pred_label = 'other'
return pred_label
from typing import List
import torch
from mmpretrain.structures import DataSample
class OTTERMMBenchPromptConstructor:
"""Prompt constructor for OTTER on MMBench.
Args:
        user_label (str): Label prepended to the user turn. Defaults to `''`.
        model_label (str): Label prepended to the model turn. Defaults to `''`.
"""
def __init__(self, user_label: str = '', model_label: str = '') -> None:
self.image_token = '<image>'
self.reply_token = '<answer>'
self.user_label = user_label
self.model_label = model_label
def __call__(self, inputs: dict) -> dict:
"""Construct prompt.
Args:
inputs (dict): Input data containing image and data_samples.
Returns:
dict: A dict containing prompt, images and data_samples.
"""
images = [image.unsqueeze(0) for image in inputs['inputs']]
data_samples = [data_sample for data_sample in inputs['data_samples']]
images = torch.cat(images, dim=0)
inputs = {'image': images, 'data_samples': data_samples}
data_samples = inputs['data_samples']
prompt = self._process(data_samples)
inputs.update({'prompt': prompt})
return inputs
def _process(self, data_samples: List[DataSample]) -> str:
"""Process data sample to prompt.
Args:
data_samples (List[DataSample]): A list of data_samples.
Returns:
str: Prompt.
"""
assert len(data_samples) == 1, 'Only support batch size 1.'
data_sample = data_samples[0]
question = data_sample.get('question')
options = data_sample.get('options')
context = data_sample.get('context')
# e.g. <image>User: What is the color of the sky? A: Blue B: Red C: Green D: Yellow GPT:<answer> # noqa
if context is not None:
prompt = f'{self.image_token}{self.user_label} {context} {question} {options} {self.model_label}:{self.reply_token}' # noqa
else:
prompt = f'{self.image_token}{self.user_label} {question} {options} {self.model_label}:{self.reply_token}' # noqa
return prompt
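# Illustration only: with user_label='User:' and model_label='GPT' (made-up
# values, the defaults are empty strings), _process above yields a prompt
# shaped like the inline comment in the code:
_q = 'What is the color of the sky?'
_opts = 'A: Blue B: Red C: Green D: Yellow'
_example = f'<image>User: {_q} {_opts} GPT:<answer>'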
class OTTERCOCOCaotionPromptConstructor(OTTERMMBenchPromptConstructor):
"""Prompt constructor for OTTER on COCO Caption."""
def _process(self, data_samples: List[DataSample]) -> str:
# e.g. <image>User: a photo of GPT:<answer> # noqa
prompt = f'{self.image_token}{self.user_label} a photo of {self.model_label}:{self.reply_token}' # noqa
return prompt
class OTTERScienceQAPromptConstructor(OTTERMMBenchPromptConstructor):
"""Prompt constructor for OTTER on ScienceQA."""
choice_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F'}
def _process(self, data_samples: List[DataSample]) -> str:
assert len(data_samples) == 1, 'Only support batch size 1.'
questions = [
'Question: ' + data_sample.get('question') + '\n'
for data_sample in data_samples
] # noqa
choices = [data_sample.get('choices') for data_sample in data_samples]
choices = [[
f'({self.choice_mapping[i]}) ' + item
for i, item in enumerate(choice)
] for choice in choices]
choices = [
'Choices: ' + ' '.join(choice) + '\n' for choice in choices
] # noqa
contexts = [
'Context: ' + data_sample.get('hint') + '\n'
for data_sample in data_samples
] # noqa
question = questions[0]
choice = choices[0]
context = contexts[0]
prompt = f'{self.image_token}{self.user_label} {context} {question} {choice} The answer is {self.model_label}:{self.reply_token}' # noqa
return prompt
class OTTERVQAPromptConstructor(OTTERMMBenchPromptConstructor):
"""Prompt constructor for OTTER on VQA."""
def _process(self, data_samples: List[DataSample]) -> str:
assert len(data_samples) == 1, 'Only support batch size 1.'
questions = [
data_sample.get('question') for data_sample in data_samples
]
question = questions[0]
        prompt = f'{self.image_token}{self.user_label} {question}. Answer it with a few words. {self.model_label}:{self.reply_token}'  # noqa
return prompt
class OTTERVSRPromptConstructor(OTTERMMBenchPromptConstructor):
"""Prompt constructor for OTTER on VSR."""
def _process(self, data_samples: List[DataSample]) -> str:
assert len(data_samples) == 1, 'Only support batch size 1.'
questions = [
data_sample.get('question') for data_sample in data_samples
]
question = questions[0]
prompt = f'{self.image_token}{self.user_label} {question}. Is the above description correct? Answer yes or no. {self.model_label}:{self.reply_token}' # noqa
return prompt
class OTTERSEEDBenchPromptConstructor(OTTERMMBenchPromptConstructor):
def _process(self, data_samples: List[DataSample]) -> str:
"""Process data sample to prompt.
Args:
data_samples (List[DataSample]): A list of data_samples.
Returns:
str: Prompt.
"""
assert len(data_samples) == 1, 'Only support batch size 1.'
questions = [
data_sample.get('question') for data_sample in data_samples
]
question = questions[0]
prompt = f'{self.image_token}{self.user_label} {question} {self.model_label}:{self.reply_token}' # noqa
return prompt
class OTTERMMEPromptConstructor(OTTERMMBenchPromptConstructor):
"""Prompt constructor for OTTER on MME.
Args:
        user_label (str): Label prepended to the user turn. Defaults to `''`.
        model_label (str): Label prepended to the model turn. Defaults to `''`.
"""
def _process(self, data_samples: List[DataSample]) -> str:
"""Process data sample to prompt.
Args:
data_samples (List[DataSample]): A list of data_samples.
Returns:
str: Prompt.
"""
assert len(data_samples) == 1, 'Only support batch size 1.'
question = data_samples[0].get('question')
prompt = f'{self.image_token}{self.user_label} {question} {self.model_label}:{self.reply_token}' # noqa
return prompt
from .post_processor import QwenVLBasePostProcessor, QwenVLChatVSRPostProcessor
from .prompt_constructor import (QwenVLChatPromptConstructor,
QwenVLChatScienceQAPromptConstructor,
QwenVLChatVQAPromptConstructor,
QwenVLMMBenchPromptConstructor)
from .qwen import QwenVLBase, QwenVLChat
__all__ = [
'QwenVLBase', 'QwenVLChat', 'QwenVLBasePostProcessor',
'QwenVLMMBenchPromptConstructor', 'QwenVLChatPromptConstructor',
'QwenVLChatVQAPromptConstructor', 'QwenVLChatVSRPostProcessor',
'QwenVLChatScienceQAPromptConstructor'
]
# Copyright (c) Alibaba Cloud.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Generation support."""
from typing import List, Tuple, Union
import torch
from transformers import PreTrainedTokenizer
# Types.
HistoryType = List[Tuple[str, str]]
TokensType = List[int]
BatchTokensType = List[List[int]]
def pad_batch(batch: BatchTokensType, pad_id: int,
seq_length: int) -> BatchTokensType:
for tokens in batch:
context_length = len(tokens)
if context_length < seq_length:
tokens.extend([pad_id] * (seq_length - context_length))
return batch
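# Tiny usage check for pad_batch (arbitrary values): each token list is
# right-padded in place to seq_length with pad_id and the batch is returned.
assert pad_batch([[1, 2, 3], [4]], pad_id=0, seq_length=5) == \
    [[1, 2, 3, 0, 0], [4, 0, 0, 0, 0]]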
def get_ltor_masks_and_position_ids(
data: torch.Tensor,
eod_token: int,
reset_position_ids: bool,
reset_attention_mask: bool,
eod_mask_loss: bool,
):
"""Build masks and position id for left to right model."""
# Extract batch size and sequence length.
micro_batch_size, seq_length = data.size()
# Attention mask (lower triangular).
if reset_attention_mask:
att_mask_batch = micro_batch_size
else:
att_mask_batch = 1
attention_mask = torch.tril(
torch.ones((att_mask_batch, seq_length, seq_length),
device=data.device)).view(att_mask_batch, 1, seq_length,
seq_length)
# Loss mask.
loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
if eod_mask_loss:
loss_mask[data == eod_token] = 0.0
# Position ids.
position_ids = torch.arange(seq_length,
dtype=torch.long,
device=data.device)
position_ids = position_ids.unsqueeze(0).expand_as(data)
# We need to clone as the ids will be modified based on batch index.
if reset_position_ids:
position_ids = position_ids.clone()
if reset_position_ids or reset_attention_mask:
# Loop through the batches:
for b in range(micro_batch_size):
# Find indices where EOD token is.
eod_index = position_ids[b, data[b] == eod_token]
# Detach indices from positions if going to modify positions.
if reset_position_ids:
eod_index = eod_index.clone()
# Loop through EOD indices:
prev_index = 0
for j in range(eod_index.size()[0]):
i = eod_index[j]
# Mask attention loss.
if reset_attention_mask:
attention_mask[b, 0, (i + 1):, :(i + 1)] = 0
# Reset positions.
if reset_position_ids:
position_ids[b, (i + 1):] -= i + 1 - prev_index
prev_index = i + 1
# Convert attention mask to binary:
attention_mask = attention_mask < 0.5
return attention_mask, loss_mask, position_ids
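# Shape sketch for get_ltor_masks_and_position_ids with all flags False, as
# get_batch below uses it (toy tensor; eod_token=0 is arbitrary here): the
# returned attention mask is (1, 1, seq, seq) and boolean (True marks blocked
# positions after the `< 0.5` inversion), the loss mask is all ones with the
# data's shape, and the position ids are a broadcast arange.
_toy = torch.arange(8).view(2, 4)
_mask, _loss, _pos = get_ltor_masks_and_position_ids(
    _toy, eod_token=0, reset_position_ids=False,
    reset_attention_mask=False, eod_mask_loss=False)
assert _mask.shape == (1, 1, 4, 4)
assert _loss.shape == (2, 4) and _pos.shape == (2, 4)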
def get_batch(context_tokens: torch.LongTensor, eod_id: int):
"""Generate batch from context tokens."""
# Move to GPU.
tokens = context_tokens.contiguous().to(context_tokens.device)
# Get the attention mask and position ids.
attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
tokens,
eod_id,
reset_position_ids=False,
reset_attention_mask=False,
eod_mask_loss=False,
)
return tokens, attention_mask, position_ids
def get_stop_words_ids(chat_format: str, tokenizer: PreTrainedTokenizer):
if chat_format == 'raw':
stop_words_ids = [tokenizer.encode('Human:'), [tokenizer.eod_id]]
elif chat_format == 'chatml':
stop_words_ids = [[tokenizer.im_end_id], [tokenizer.im_start_id]]
else:
raise NotImplementedError(f'Unknown chat format {chat_format!r}')
return stop_words_ids
def make_context(
tokenizer: PreTrainedTokenizer,
query: str,
history: List[Tuple[str, str]] = None,
system: str = '',
max_window_size: int = 6144,
chat_format: str = 'chatml',
):
if history is None:
history = []
if chat_format == 'chatml':
im_start, im_end = '<|im_start|>', '<|im_end|>'
im_start_tokens = [tokenizer.im_start_id]
im_end_tokens = [tokenizer.im_end_id]
nl_tokens = tokenizer.encode('\n')
def _tokenize_str(role, content):
return f'{role}\n{content}', tokenizer.encode(
role, allowed_special=set(
tokenizer.IMAGE_ST)) + nl_tokens + tokenizer.encode(
content, allowed_special=set(tokenizer.IMAGE_ST))
system_text, system_tokens_part = _tokenize_str('system', system)
system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
raw_text = ''
context_tokens = []
for turn_query, turn_response in reversed(history):
query_text, query_tokens_part = _tokenize_str('user', turn_query)
query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
if turn_response is not None:
response_text, response_tokens_part = _tokenize_str(
'assistant', turn_response)
response_tokens = im_start_tokens + response_tokens_part + im_end_tokens # noqa
next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens # noqa
prev_chat = (
f'\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}' # noqa
)
else:
next_context_tokens = nl_tokens + query_tokens + nl_tokens
prev_chat = f'\n{im_start}{query_text}{im_end}\n'
current_context_size = (len(system_tokens) +
len(next_context_tokens) +
len(context_tokens))
if current_context_size < max_window_size:
context_tokens = next_context_tokens + context_tokens
raw_text = prev_chat + raw_text
else:
break
context_tokens = system_tokens + context_tokens
raw_text = f'{im_start}{system_text}{im_end}' + raw_text
context_tokens += (nl_tokens + im_start_tokens +
_tokenize_str('user', query)[1] + im_end_tokens +
nl_tokens + im_start_tokens +
tokenizer.encode('assistant') + nl_tokens)
raw_text += f'\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n'
elif chat_format == 'raw':
raw_text = query
context_tokens = tokenizer.encode(raw_text)
else:
raise NotImplementedError(f'Unknown chat format {chat_format!r}')
return raw_text, context_tokens
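# Tokenisation aside, the ChatML text that make_context returns as raw_text
# (with empty history) has the layout below; the system and user strings here
# are placeholders and no Qwen tokenizer is required for this illustration.
_im_start, _im_end = '<|im_start|>', '<|im_end|>'
_example_raw_text = (f'{_im_start}system\nYou are a helpful assistant.{_im_end}'
                     f'\n{_im_start}user\nDescribe the image.{_im_end}'
                     f'\n{_im_start}assistant\n')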
def _decode_default(
tokens: List[int],
*,
stop_words: List[str],
eod_words: List[str],
tokenizer: PreTrainedTokenizer,
raw_text_len: int,
verbose: bool = False,
return_end_reason: bool = False,
errors: str = 'replace',
):
trim_decode_tokens = tokenizer.decode(tokens, errors=errors)[raw_text_len:]
if verbose:
print('\nRaw Generate: ', trim_decode_tokens)
end_reason = f'Gen length {len(tokens)}'
for stop_word in stop_words:
trim_decode_tokens = trim_decode_tokens.replace(stop_word, '').strip()
for eod_word in eod_words:
if eod_word in trim_decode_tokens:
end_reason = f'Gen {eod_word!r}'
trim_decode_tokens = trim_decode_tokens.split(eod_word)[0]
trim_decode_tokens = trim_decode_tokens.strip()
if verbose:
print('\nEnd Reason:', end_reason)
print('\nGenerate: ', trim_decode_tokens)
if return_end_reason:
return trim_decode_tokens, end_reason
else:
return trim_decode_tokens
def _decode_chatml(tokens: List[int],
*,
stop_words: List[str],
eod_token_ids: List[int],
tokenizer: PreTrainedTokenizer,
raw_text_len: int,
context_length: int,
verbose: bool = False,
return_end_reason: bool = False,
errors: str = 'replace'):
end_reason = f'Gen length {len(tokens)}'
eod_token_idx = context_length
for eod_token_idx in range(context_length, len(tokens)):
if tokens[eod_token_idx] in eod_token_ids:
end_reason = f'Gen {tokenizer.decode([tokens[eod_token_idx]])!r}'
break
trim_decode_tokens = tokenizer.decode(tokens[:eod_token_idx],
errors=errors)[raw_text_len:]
if verbose:
print('\nRaw Generate w/o EOD:',
tokenizer.decode(tokens, errors=errors)[raw_text_len:])
print('\nRaw Generate:', trim_decode_tokens)
print('\nEnd Reason:', end_reason)
for stop_word in stop_words:
trim_decode_tokens = trim_decode_tokens.replace(stop_word, '').strip()
trim_decode_tokens = trim_decode_tokens.strip()
if verbose:
print('\nGenerate:', trim_decode_tokens)
if return_end_reason:
return trim_decode_tokens, end_reason
else:
return trim_decode_tokens
def decode_tokens(
tokens: Union[torch.LongTensor, TokensType],
tokenizer: PreTrainedTokenizer,
raw_text_len: int,
context_length: int,
chat_format: str,
verbose: bool = False,
return_end_reason: bool = False,
errors: str = 'replace',
) -> str:
if torch.is_tensor(tokens):
tokens = tokens.cpu().numpy().tolist()
if chat_format == 'chatml':
return _decode_chatml(
tokens,
stop_words=[],
eod_token_ids=[tokenizer.im_start_id, tokenizer.im_end_id],
tokenizer=tokenizer,
raw_text_len=raw_text_len,
context_length=context_length,
verbose=verbose,
return_end_reason=return_end_reason,
errors=errors,
)
elif chat_format == 'raw':
return _decode_default(
tokens,
stop_words=['<|endoftext|>'],
eod_words=['<|endoftext|>'],
tokenizer=tokenizer,
raw_text_len=raw_text_len,
verbose=verbose,
return_end_reason=return_end_reason,
errors=errors,
)
else:
raise NotImplementedError(f'Unknown chat format {chat_format!r}')
from typing import Any
import torch
class QwenVLBasePostProcessor:
"""Post processor for Qwen-VL-Base."""
def __init__(self) -> None:
pass
def __call__(self, pred: torch.tensor, tokenizer: Any,
input_len: int) -> str:
        response = tokenizer.decode(pred)[input_len:]
response = response.replace('<|endoftext|>', '').strip()
return response
class QwenVLChatVSRPostProcessor:
"""VSR post processor for Qwen-VL-Chat."""
def __init__(self) -> None:
pass
def __call__(self, response: str) -> str:
if 'yes' in response.lower():
return 'yes'
elif 'no' in response.lower():
return 'no'
else:
return 'unknown'
class QwenVLMMBenchPromptConstructor:
"""MMBench prompt constructor for Qwen-VL.
    The output is a list of dicts following the input format of the
    Qwen-VL tokenizer.
"""
def __init__(self) -> None:
pass
def __call__(self, inputs: dict) -> list:
data_samples = inputs['data_samples']
assert len(data_samples) == 1
data_sample = data_samples[0]
question = data_sample.get('question')
options = data_sample.get('options')
context = data_sample.get('context')
if context is not None:
prompt = context + ' ' + question + ' ' + options
else:
prompt = question + ' ' + options
format_input = [
{
'image': 'This_is_path_to_an_image.'
}, # Just placeholder for Image Tokens
{
'text': prompt
},
]
return format_input
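# Smoke test for the constructor above with a hand-made sample; a plain dict
# stands in for the DataSample object, since only .get() is used here and all
# field values are made up.
_fake_batch = {'data_samples': [{
    'question': 'What is the main object?',
    'options': 'A. cat B. dog',
    'context': None,
}]}
# Expected: [{'image': 'This_is_path_to_an_image.'},
#            {'text': 'What is the main object? A. cat B. dog'}]
print(QwenVLMMBenchPromptConstructor()(_fake_batch))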
class QwenVLChatPromptConstructor:
"""Prompt constructorfor Qwen-VL-Chat."""
def __init__(self, prompt='') -> None:
self.prompt = prompt
def __call__(self, inputs: dict) -> list:
assert len(inputs['data_samples']) == 1
format_input = [
{
'image': 'This_is_path_to_an_image.'
}, # Just placeholder for Image Tokens
{
'text': self.prompt
},
]
return format_input
class QwenVLChatVQAPromptConstructor:
"""VQA prompt constructor for Qwen-VL-Chat."""
def __init__(self, prompt='') -> None:
self.prompt = prompt
def __call__(self, inputs: dict) -> list:
data_samples = inputs['data_samples']
assert len(data_samples) == 1
data_sample = data_samples[0]
question = data_sample.get('question')
format_input = [
{
'image': 'This_is_path_to_an_image.'
}, # Just placeholder for Image Tokens
{
'text': question + self.prompt
},
]
return format_input
class QwenVLChatScienceQAPromptConstructor:
"""ScienceQA prompt constructor for Qwen-VL-Chat."""
choice_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F'}
def __init__(self, prompt='') -> None:
self.prompt = prompt
def __call__(self, inputs: dict) -> list:
data_samples = inputs['data_samples']
assert len(data_samples) == 1
data_sample = data_samples[0]
question = data_sample.get('question')
choices = data_sample.get('choices')
choices = [
f'({self.choice_mapping[i]}) ' + item
for i, item in enumerate(choices)
]
choices = 'Choices: ' + ' '.join(choices) + '\n'
contexts = 'Context: ' + data_sample.get('hint')
format_input = [
{
'image': 'This_is_path_to_an_image.'
}, # Just placeholder for Image Tokens
{
'text': contexts + question + choices + self.prompt
},
]
return format_input
import types
from typing import Optional, Tuple
import mmengine
import torch
import torch.nn as nn
from mmengine.device import get_device
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
from transformers.modeling_outputs import BaseModelOutputWithPast
from opencompass.registry import MM_MODELS
from .generation_utils import decode_tokens, make_context
@MM_MODELS.register_module('qwen-vl-base')
class QwenVLBase(nn.Module):
"""Inference code of Qwen-VL.
We load the Qwen model via Huggingface.
Args:
pretrained_path (str): Path to Qwen checkpoint or repo id.
prompt_constructor (dict): The config of prompt constructor.
post_processor (dict): The config of post processor.
is_caption_task (bool): Whether the task is caption task.
Defaults to False.
commit_id (str): Use given version of Qwen-VL.
Warning: the latest version may have some conflicts.
Recommend to use the given default version.
"""
def __init__(
self,
pretrained_path: str,
prompt_constructor: dict = None,
post_processor: dict = None,
is_caption_task: bool = False,
commit_id: str = '548275c8b99de56dec203c0e793be18e030f2f4c'
) -> None:
super().__init__()
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_path,
trust_remote_code=True,
revision=commit_id)
self.model = AutoModelForCausalLM.from_pretrained(
pretrained_path,
device_map=get_device(),
trust_remote_code=True,
revision=commit_id)
self.model.generation_config = GenerationConfig.from_pretrained(
pretrained_path, trust_remote_code=True, revision=commit_id)
if prompt_constructor is not None:
self.prompt_constructor = mmengine.registry.build_from_cfg(
prompt_constructor, MM_MODELS)
if post_processor is not None:
self.post_processor = mmengine.registry.build_from_cfg(
post_processor, MM_MODELS)
else:
self.post_processor = None
self.is_caption_task = is_caption_task
self.model.transformer.forward = types.MethodType(
forward_hack, self.model.transformer)
def _build_embeds(self, images, input_ids):
# encode image
images = self.model.transformer.visual(images)
# compute image position
bos_pos = torch.where(input_ids == self.model.transformer.config.
visual['image_start_id'])
eos_pos = torch.where(
input_ids ==
self.model.transformer.config.visual['image_start_id'] + 1)
assert (bos_pos[0] == eos_pos[0]).all()
img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1)
# embed words
inputs_embeds = self.model.transformer.wte(input_ids)
# embed image tokens
for idx, (i, a, b) in enumerate(img_pos):
inputs_embeds[i][a + 1:b] = images[idx]
return inputs_embeds
def generate(self, batch):
images = batch.pop('inputs')
images = torch.stack(images, dim=0)
format_input = self.prompt_constructor(batch)
query = self.tokenizer.from_list_format(format_input)
inputs = self.tokenizer(query, return_tensors='pt')
inputs = inputs.to(get_device())
input_ids, token_type_ids, attention_mask = inputs[
'input_ids'], inputs['token_type_ids'], inputs['attention_mask']
inputs_embeds = self._build_embeds(images, input_ids)
pred = self.model.generate(input_ids=input_ids,
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
token_type_ids=token_type_ids)
response = self.post_processor(pred.cpu()[0])
data_sample = batch['data_samples'][0]
if self.is_caption_task:
data_sample.pred_caption = response
else:
data_sample.pred_answer = response
return data_sample
def forward(self, batch):
return self.generate(batch)
@MM_MODELS.register_module('qwen-vl-chat')
class QwenVLChat(QwenVLBase):
"""Inference code of Qwen-VL-Chat.
We load the Qwen model via Huggingface.
Args:
pretrained_path (str): Path to Qwen checkpoint or repo id.
prompt_constructor (dict): The config of prompt constructor.
post_processor (dict): The config of post processor.
is_caption_task (bool): Whether the task is caption task.
Defaults to False.
"""
def __init__(self,
pretrained_path: str,
prompt_constructor: dict = None,
post_processor: dict = None,
is_caption_task: bool = False) -> None:
super().__init__(pretrained_path, prompt_constructor, post_processor,
is_caption_task)
def generate(self, batch):
images = batch.pop('inputs')
images = torch.stack(images, dim=0)
format_input = self.prompt_constructor(batch)
query = self.tokenizer.from_list_format(format_input)
raw_text, context_tokens = make_context(
self.tokenizer,
query,
system='You are a helpful assistant.',
chat_format=self.model.generation_config.chat_format,
)
input_ids = torch.tensor([context_tokens]).to(get_device())
inputs_embeds = self._build_embeds(images, input_ids)
pred = self.model.generate(input_ids=input_ids,
inputs_embeds=inputs_embeds)
response = decode_tokens(
pred[0],
self.tokenizer,
raw_text_len=len(raw_text),
context_length=len(context_tokens),
chat_format=self.model.generation_config.chat_format,
verbose=False,
errors='replace')
if self.post_processor:
response = self.post_processor(response)
data_sample = batch['data_samples'][0]
if self.is_caption_task:
data_sample.pred_caption = response
else:
data_sample.pred_answer = response
return data_sample
def forward_hack(self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None):
if past_key_values is None and input_ids is not None and torch.any(
input_ids == self.config.visual['image_start_id']):
bos_pos = torch.where(
input_ids == self.config.visual['image_start_id'])
eos_pos = torch.where(
input_ids == self.config.visual['image_start_id'] + 1)
assert (bos_pos[0] == eos_pos[0]).all()
img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1)
images = []
for i, a, b in img_pos:
image = input_ids[i][a + 1:b - 1].tolist()
image = image[:image.index(self.config.visual['image_start_id'] +
2)]
images.append(bytes(image).decode('utf-8'))
images = self.visual.encode(images)
assert images.shape[0] == len(images)
else:
images = None
output_attentions = (output_attentions if output_attentions is not None
else self.config.output_attentions)
output_hidden_states = (output_hidden_states if output_hidden_states
is not None else self.config.output_hidden_states)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = (return_dict
if return_dict is not None else self.config.use_return_dict)
if input_ids is not None and inputs_embeds is not None:
raise ValueError(
'You cannot specify both input_ids and inputs_embeds at the same time' # noqa
)
elif input_ids is not None:
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_shape[-1])
batch_size = input_ids.shape[0]
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
batch_size = inputs_embeds.shape[0]
else:
raise ValueError(
'You have to specify either input_ids or inputs_embeds')
device = input_ids.device if input_ids is not None else inputs_embeds.device # noqa
if token_type_ids is not None:
token_type_ids = token_type_ids.view(-1, input_shape[-1])
if position_ids is not None:
position_ids = position_ids.view(-1, input_shape[-1])
if past_key_values is None:
past_length = 0
past_key_values = tuple([None] * len(self.h))
else:
past_length = past_key_values[0][0].size(-2)
if position_ids is None:
position_ids = torch.arange(
past_length,
input_shape[-1] + past_length,
dtype=torch.long,
device=device,
)
position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
encoder_attention_mask = None
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
if inputs_embeds is None:
inputs_embeds = self.wte(input_ids)
if batch_size <= 0:
raise ValueError('batch_size has to be defined and > 0')
attention_mask = self._prepare_decoder_attention_mask(
attention_mask, input_shape, inputs_embeds, past_length)
hidden_states = inputs_embeds
hidden_states = self.drop(hidden_states)
if images is not None:
for idx, (i, a, b) in enumerate(img_pos):
hidden_states[i][a + 1:b] = images[idx]
output_shape = input_shape + (hidden_states.size(-1), )
presents = () if use_cache else None
all_self_attentions = () if output_attentions else None
all_hidden_states = () if output_hidden_states else None
for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states, )
if self.gradient_checkpointing and self.training:
def create_custom_forward(module):
def custom_forward(*inputs):
# None for past_key_value
return module(*inputs, use_cache, output_attentions)
return custom_forward
outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(block),
hidden_states,
None,
attention_mask,
head_mask[i],
encoder_hidden_states,
encoder_attention_mask,
)
else:
outputs = block(
hidden_states,
layer_past=layer_past,
attention_mask=attention_mask,
head_mask=head_mask[i],
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
hidden_states = outputs[0]
if use_cache is True:
presents = presents + (outputs[2 if output_attentions else 1], )
if output_attentions:
all_self_attentions = all_self_attentions + (outputs[1], )
hidden_states = self.ln_f(hidden_states)
hidden_states = hidden_states.view(output_shape)
# Add last hidden state
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states, )
if not return_dict:
return tuple(v for v in [hidden_states, presents, all_hidden_states]
if v is not None)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=presents,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
from .post_processor import (VisualGLMBasePostProcessor,
VisualGLMVSRPostProcessor)
from .prompt_constructor import (VisualGLMBasePromptConstructor,
VisualGLMIconQAPromptConstructor,
VisualGLMMMBenchPromptConstructor,
VisualGLMScienceQAPromptConstructor,
VisualGLMVQAPromptConstructor)
from .visualglm import VisualGLM
__all__ = [
'VisualGLM', 'VisualGLMBasePostProcessor', 'VisualGLMVSRPostProcessor',
'VisualGLMBasePromptConstructor', 'VisualGLMMMBenchPromptConstructor',
'VisualGLMVQAPromptConstructor', 'VisualGLMScienceQAPromptConstructor',
'VisualGLMIconQAPromptConstructor'
]
from typing import Any
import torch
class VisualGLMBasePostProcessor:
"""Base post processor for VisualGLM."""
def __init__(self) -> None:
pass
def __call__(self, output_token: torch.tensor, tokenizer: Any) -> str:
return tokenizer.decode(output_token)
class VisualGLMVSRPostProcessor(VisualGLMBasePostProcessor):
"""VSR post processor for VisualGLM."""
def __init__(self) -> None:
super().__init__()
def __call__(self, output_token: torch.tensor, tokenizer: Any) -> str:
output_text = tokenizer.decode(output_token)
if 'yes' in output_text.lower():
return 'yes'
elif 'no' in output_text.lower():
return 'no'
else:
return 'unknown'
class VisualGLMMMBenchPromptConstructor:
"""MMBench prompt constructor for VisualGLM.
Args:
system_prompt (str): System prompt. (Default: '')
human_prompt (str): Human prompt. (Default: 'Q:')
assistant_prompt (str): Assistant prompt. (Default: 'A:')
"""
def __init__(self,
system_prompt: str = '',
human_prompt: str = 'Q:',
assistant_prompt: str = 'A:') -> None:
self.system_prompt = system_prompt
self.human_prompt = human_prompt
self.assistant_prompt = assistant_prompt
def __call__(self, batch: dict) -> tuple:
"""Construct prompt.
Args:
batch (dict): Input data containing image and data_samples.
Returns:
A tuple containing images, prompt, data_samples and image_position.
"""
assert len(batch['inputs']) == 1
image = batch.pop('inputs')[0].unsqueeze(0)
data_sample = batch.pop('data_samples')[0]
img_prompt = '<img></img>'
if data_sample.get('context') is not None:
prompt = img_prompt + self.system_prompt + self.human_prompt + data_sample.context + ' ' + data_sample.question + ' ' + data_sample.options # noqa
else:
prompt = img_prompt + self.system_prompt + self.human_prompt + data_sample.question + ' ' + data_sample.options # noqa
prompt += self.assistant_prompt
image_position = prompt.rfind('<img>') + 5
return image, prompt, data_sample, image_position
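# Layout illustration only (no DataSample object needed): with the default
# system/human/assistant prompts above and no context field, the constructor
# assembles a prompt like the one below, and image_position points just past
# the '<img>' tag where VisualGLM splices in the image embedding. The question
# and options are made up.
_question, _options = 'What is shown in the image?', 'A. cat B. dog'
_prompt = '<img></img>' + '' + 'Q:' + _question + ' ' + _options + 'A:'
_image_position = _prompt.rfind('<img>') + 5
assert _image_position == 5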
class VisualGLMBasePromptConstructor:
"""Base prompt constructor for VisualGLM.
The prompt will concat <img> and the given system prompt.
Args:
system_prompt (str): System prompt. (Default: '')
human_prompt (str): Human prompt. (Default: 'Q:')
assistant_prompt (str): Assistant prompt. (Default: 'A:')
"""
def __init__(self,
system_prompt: str = '',
human_prompt: str = 'Q:',
assistant_prompt: str = 'A:') -> None:
self.prompt = system_prompt
self.human_prompt = human_prompt
self.assistant_prompt = assistant_prompt
def __call__(self, batch: dict) -> tuple:
"""Construct prompt.
Args:
batch (dict): Input data containing image and data_samples.
Returns:
A tuple containing images, prompt, data_samples and image_position.
"""
assert len(batch['inputs']) == 1
image = batch.pop('inputs')[0].unsqueeze(0)
data_sample = batch.pop('data_samples')[0]
# generate text prompt
prompt = '<img></img>' + self.human_prompt + self.prompt + self.assistant_prompt # noqa
image_position = prompt.rfind('<img>') + 5
return image, prompt, data_sample, image_position
class VisualGLMVQAPromptConstructor(VisualGLMBasePromptConstructor):
"""VQA prompt constructor for VisualGLM.
The prompt will concat <img>, the question and the system prompt.
Args:
system_prompt (str): System prompt. (Default: '')
human_prompt (str): Human prompt. (Default: 'Q:')
assistant_prompt (str): Assistant prompt. (Default: 'A:')
"""
def __init__(self,
system_prompt='',
human_prompt: str = 'Q:',
assistant_prompt: str = 'A:') -> None:
super().__init__(system_prompt, human_prompt, assistant_prompt)
def __call__(self, batch: dict) -> tuple:
"""Construct prompt.
Args:
batch (dict): Input data containing image and data_samples.
Returns:
A tuple containing images, prompt, data_samples and image_position.
"""
assert len(batch['inputs']) == 1
image = batch.pop('inputs')[0].unsqueeze(0)
data_sample = batch.pop('data_samples')[0]
# generate text prompt
question = data_sample.get('question')
prompt = '<img></img>' + self.human_prompt + question + self.prompt
prompt += '\n' + self.assistant_prompt
image_position = prompt.rfind('<img>') + 5
return image, prompt, data_sample, image_position
class VisualGLMScienceQAPromptConstructor(VisualGLMBasePromptConstructor):
"""ScienceQA prompt constructor for VisualGLM.
The prompt will concat image and all terms in a question.
Args:
system_prompt (str): System prompt. (Default: '')
human_prompt (str): Human prompt. (Default: 'Q:')
assistant_prompt (str): Assistant prompt. (Default: 'A:')
"""
choice_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F'}
def __init__(self,
system_prompt='',
human_prompt: str = 'Q:',
assistant_prompt: str = 'A:') -> None:
super().__init__(system_prompt, human_prompt, assistant_prompt)
def __call__(self, batch: dict) -> tuple:
"""Construct prompt.
Args:
batch (dict): Input data containing image and data_samples.
Returns:
A tuple containing images, prompt, data_samples and image_position.
"""
assert len(batch['inputs']) == 1
image = batch.pop('inputs')[0].unsqueeze(0)
data_sample = batch.pop('data_samples')[0]
questions = 'Question: ' + data_sample.get('question')
choices = data_sample.get('choices')
choices = [
f'({self.choice_mapping[i]}) ' + item
for i, item in enumerate(choices)
]
choices = 'Choices: ' + ' '.join(choices) + '\n'
contexts = 'Context: ' + data_sample.get('hint') + '\n'
# generate text prompt
prompt = '<img></img>' + self.human_prompt + contexts + questions + choices + self.prompt + self.assistant_prompt # noqa
image_position = prompt.rfind('<img>') + 5
return image, prompt, data_sample, image_position
class VisualGLMIconQAPromptConstructor(VisualGLMBasePromptConstructor):
"""IconQA prompt constructor for VisualGLM.
The prompt will concat <img>, the question and the system prompt.
Args:
system_prompt (str): System prompt. (Default: '')
human_prompt (str): Human prompt. (Default: 'Q:')
assistant_prompt (str): Assistant prompt. (Default: 'A:')
"""
def __init__(self,
system_prompt='',
human_prompt: str = 'Q:',
assistant_prompt: str = 'A:') -> None:
super().__init__(system_prompt, human_prompt, assistant_prompt)
def __call__(self, batch: dict) -> tuple:
"""Construct prompt.
Args:
batch (dict): Input data containing image and data_samples.
Returns:
A tuple containing images, prompt, data_samples and image_position.
"""
assert len(batch['inputs']) == 1
image = batch.pop('inputs')[0].unsqueeze(0)
data_sample = batch.pop('data_samples')[0]
questions = data_sample.get('question') + '\n'
choices = data_sample.get('choices')
choices = 'Options: ' + ', '.join(choices) + '.\n'
# generate text prompt
prompt = '<img></img>' + self.human_prompt + questions + choices + self.prompt + self.assistant_prompt # noqa
image_position = prompt.rfind('<img>') + 5
return image, prompt, data_sample, image_position
from typing import Optional
import mmengine
import torch
import torch.nn as nn
from mmengine.device import get_device
from transformers import AutoModel, AutoTokenizer
from opencompass.registry import MM_MODELS
@MM_MODELS.register_module('visualglm')
class VisualGLM(nn.Module):
"""Inference code of VisualGLM.
We load the visualGLM model via Huggingface.
Args:
pretrained_path (str): Path to visualGLM checkpoint or repo id.
prompt_constructor (dict): The config of prompt constructor.
post_processor (dict): The config of post processor.
is_caption_task (bool): Whether the task is caption task.
Defaults to False.
gen_kwargs (dict): Customize generate function arguments.
Defaults to None.
"""
def __init__(self,
pretrained_path: str,
prompt_constructor: dict,
post_processor: dict,
is_caption_task: bool = False,
gen_kwargs: Optional[dict] = None) -> None:
super().__init__()
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_path,
trust_remote_code=True)
self.model = AutoModel.from_pretrained(pretrained_path,
trust_remote_code=True).half()
self.prompt_constructor = mmengine.registry.build_from_cfg(
prompt_constructor, MM_MODELS)
self.post_processor = mmengine.registry.build_from_cfg(
post_processor, MM_MODELS)
if gen_kwargs:
self.gen_kwargs = gen_kwargs
else:
self.gen_kwargs = dict(max_length=1024,
min_length=100,
do_sample=True,
temperature=0.8,
top_p=0.4,
top_k=100,
repetition_penalty=1.2)
self.is_caption_task = is_caption_task
def encode_by_tokenizer(self, prompt, image_position):
input0 = self.tokenizer.encode(prompt[:image_position],
add_special_tokens=False)
input1 = [self.tokenizer.unk_token_id] * self.model.image_length
input2 = self.tokenizer.encode(prompt[image_position:],
add_special_tokens=False)
input_all = sum([input0, input1, input2], [])
input_all = self.tokenizer.build_inputs_with_special_tokens(input_all)
input_all = torch.tensor(input_all, dtype=torch.long).to(get_device())
input_all = input_all.unsqueeze(0)
pre_image_len = len(input0)
return input_all, pre_image_len
def generate(self, batch):
# process input
image, prompt, data_sample, image_position = self.prompt_constructor(
batch)
image = image.to(self.model.dtype).to(get_device())
# tokenize
input_all, pre_image_len = self.encode_by_tokenizer(
prompt, image_position)
# build input param
inputs = {
'input_ids': input_all,
'pre_image_length': pre_image_len,
'images': image
}
# generate answer
outputs = self.model.generate(**inputs, **self.gen_kwargs)
# format output
outputs = outputs.tolist()[0][input_all.shape[1]:]
answer = self.post_processor(outputs, self.tokenizer)
if self.is_caption_task:
data_sample.pred_caption = answer
else:
data_sample.pred_answer = answer
return data_sample
def forward(self, batch):
return self.generate(batch)
from .mm_naive import * # noqa: F401, F403
from .naive import * # noqa: F401, F403
from .num_worker import * # noqa: F401, F403
from .size import * # noqa: F401, F403
from copy import deepcopy
from typing import Dict, List
from mmengine.config import Config, ConfigDict
from opencompass.registry import PARTITIONERS
from .base import BasePartitioner
@PARTITIONERS.register_module()
class MultimodalNaivePartitioner(BasePartitioner):
"""Multimodal naive task partitioner.
This partitioner will generate a task for each
model-dataset-evaluator pair.
Args:
config (ConfigDict): The full config dict.
"""
def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
evaluators: List[ConfigDict], load_froms: List[ConfigDict],
work_dir: str, num_gpus: int, num_procs: int,
launcher: str) -> List[Dict]:
"""Partition model-dataset pairs into tasks. Each task is defined as a
dict and will run independently as a unit. Its structure is as follows:
.. code-block:: python
{
'models': [], # a list of model configs
'datasets': [], # a list of dataset configs
'evaluators': [], # a list of evaluator configs
'load_froms': [], # a list of load_from paths
'work_dir': '', # the work dir
'num_gpus': int, # integer, number of gpus for each task
'num_procs': int, # integer, number of gpus on single machine
'launcher': str, # string, how to launch distributed training
}
Args:
models (List[ConfigDict]): A list of model configs.
datasets (List[ConfigDict]): A list of dataset configs.
evaluators (List[ConfigDict]): A list of evaluator configs.
load_froms (List[ConfigDict]): A list of load_from paths.
work_dir (str): The work dir for the task.
num_gpus (int): Number of gpus for each task.
num_procs (int): Number of gpus on single machine.
launcher (str): How to launch distributed training.
Only `slurm`, `pytorch` and `mpi` are available.
Returns:
List[Dict]: A list of tasks.
"""
tasks = []
for model, dataset, evaluator, load_from in zip(
models, datasets, evaluators, load_froms):
task = Config({
'model': model,
'dataset': dataset,
'evaluator': evaluator,
'load_from': load_from,
'work_dir': work_dir,
'num_gpus': num_gpus,
'num_procs': num_procs,
'launcher': launcher
})
tasks.append(task)
return tasks
def __call__(self, cfg: ConfigDict) -> List[Dict]:
"""Generate tasks from config. Each task is defined as a
dict and will run independently as a unit. Its structure is as
follows:
.. code-block:: python
{
'models': [], # a list of model configs
'datasets': [], # a list of dataset configs
'evaluators': [], # a list of evaluator configs
'load_froms': [], # a list of load_from paths
'work_dir': '', # the work dir
'num_gpus': int, # integer, number of gpus for each task
'num_procs': int, # integer, number of gpus on single machine
}
Args:
cfg (ConfigDict): The config dict, containing "models", "dataset"
and "work_dir" keys.
Returns:
List[Dict]: A list of tasks.
"""
cfg = deepcopy(cfg)
models = cfg['models']
datasets = cfg['datasets']
evaluators = cfg['evaluators']
load_froms = cfg['load_froms']
work_dir = cfg['work_dir']
num_gpus = cfg['num_gpus']
num_procs = cfg['num_procs']
launcher = cfg['launcher']
tasks = self.partition(models, datasets, evaluators, load_froms,
work_dir, num_gpus, num_procs, launcher)
self.logger.info(f'Partitioned into {len(tasks)} tasks.')
for i, task in enumerate(tasks):
model_name = task['model']['type']
dataset_name = task['dataset']['dataset']['type']
evaluator_name = task['evaluator'][0]['type']
self.logger.debug(
f'Task {i}: {model_name}-{dataset_name}-{evaluator_name}')
return tasks
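# Rough shape of the config dict this partitioner consumed; every entry below
# is an illustrative placeholder rather than a real config from the repo.
_example_partition_cfg = dict(
    models=[dict(type='visualglm')],
    datasets=[dict(dataset=dict(type='SomeMMDataset'))],
    evaluators=[[dict(type='SomeMetric')]],  # one evaluator list per dataset
    load_froms=[None],
    work_dir='work_dirs/mm_demo',
    num_gpus=1,
    num_procs=1,
    launcher='pytorch',
)
# The partitioner's __call__ turns such a config into one task per
# (model, dataset, evaluator, load_from) tuple.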
from typing import Callable, List, Optional, Type, Union
from mmengine.registry import DATASETS as MMENGINE_DATASETS
from mmengine.registry import METRICS as MMENGINE_METRICS
from mmengine.registry import MODELS as MMENGINE_MODELS
from mmengine.registry import Registry as OriginalRegistry
@@ -39,15 +37,9 @@ ICL_PROMPT_TEMPLATES = Registry(
locations=['opencompass.openicl.icl_prompt_template'])
ICL_EVALUATORS = Registry('icl_evaluators',
locations=['opencompass.openicl.icl_evaluator'])
DATASETS = Registry('mm_datasets',
parent=MMENGINE_DATASETS,
locations=['opencompass.multimodal.datasets'])
METRICS = Registry('metric',
parent=MMENGINE_METRICS,
locations=['opencompass.metrics'])
MM_MODELS = Registry('mm_model',
parent=MMENGINE_MODELS,
locations=['opencompass.multimodal.models'])
TOT_WRAPPER = Registry('tot_wrapper', locations=['opencompass.datasets'])
from .mm_infer import * # noqa: F401, F403
from .openicl_attack import * # noqa: F401, F403
from .openicl_eval import * # noqa: F401, F403
from .openicl_infer import * # noqa: F401, F403
import argparse
import json
import os
import os.path as osp
import random
import time
from typing import List, Sequence
import mmengine
import torch
import torch.distributed as dist
from mmengine.config import Config, ConfigDict
from mmengine.device import get_device
from mmengine.dist import init_dist
from mmengine.evaluator import Evaluator
from mmengine.logging import print_log
from mmengine.model.wrappers import MMDistributedDataParallel
from mmengine.utils import track_iter_progress
from opencompass.registry import MM_MODELS, TASKS
from opencompass.utils import get_logger
def build_model(cfg):
model = MM_MODELS.build(cfg['model'])
load_from = cfg.get('load_from', None)
if load_from is not None:
state_dict = torch.load(cfg['load_from'], map_location='cpu')
if 'model' in state_dict:
state_dict = state_dict['model']
elif 'state_dict' in state_dict:
state_dict = state_dict['state_dict']
msg = model.load_state_dict(state_dict, strict=False)
print_log(msg)
model.to(get_device())
if dist.is_initialized():
model = MMDistributedDataParallel(
model,
device_ids=[int(os.environ['LOCAL_RANK'])],
broadcast_buffers=False)
return model
@TASKS.register_module(force=(__name__ == '__main__')) # A hack for script run
class MultimodalInferTask:
"""Multimodal Inference Task.
This task is used to run the inference process.
"""
def __init__(self, cfg: ConfigDict):
self.num_gpus = cfg.get('num_gpus', 0)
self.num_procs = cfg.get('num_procs', 1)
self.dataloader = cfg.get('dataset')
self.model = cfg.get('model')
self.evaluator = cfg.get('evaluator')
self.cfg = cfg
self.logger = get_logger()
@property
def name(self) -> str:
model_name = self.model['type']
dataset_name = self.dataloader['dataset']['type']
evaluator_name = self.evaluator[0]['type']
return f'{model_name}-{dataset_name}-{evaluator_name}'
def get_log_path(self, file_extension: str = 'json') -> str:
"""Get the path to the log file.
Args:
file_extension (str): The file extension of the log file.
Default: 'json'.
"""
model_name = self.model['type']
dataset_name = self.dataloader['dataset']['type']
evaluator_name = self.evaluator[0]['type']
return osp.join(self.cfg.work_dir, model_name, dataset_name,
f'{evaluator_name}.{file_extension}')
def get_output_paths(self, file_extension: str = 'json') -> List[str]:
"""Get the path to the output file.
Args:
file_extension (str): The file extension of the log file.
Default: 'json'.
"""
model_name = self.model['type']
dataset_name = self.dataloader['dataset']['type']
evaluator_name = self.evaluator[0]['type']
return [
osp.join(self.cfg.work_dir, model_name, dataset_name,
f'{evaluator_name}.{file_extension}')
]
def get_command(self, cfg_path, template):
"""Get the command template for the task.
Args:
cfg_path (str): The path to the config file of the task.
template (str): The template which have '{task_cmd}' to format
the command.
"""
script_path = __file__
if self.num_gpus > 0:
port = random.randint(12000, 32000)
command = (f'torchrun --master_port={port} '
f'--nproc_per_node {self.num_procs} '
f'{script_path} {cfg_path}')
else:
command = f'python {script_path} {cfg_path}'
return template.format(task_cmd=command)
def run(self):
from mmengine.runner import Runner
# only support slurm, pytorch, mpi
init_dist(self.cfg.launcher)
self.logger.info(f'Task {self.name}')
# build dataloader
dataloader = Runner.build_dataloader(self.dataloader)
# build model
model = build_model(self.cfg)
model.eval()
# build evaluator
evaluator = Evaluator(self.evaluator)
for batch in track_iter_progress(dataloader):
if dist.is_initialized():
data_samples = model.module.forward(batch)
else:
data_samples = model.forward(batch)
if not isinstance(data_samples, Sequence):
data_samples = [data_samples]
evaluator.process(data_samples)
metrics = evaluator.evaluate(len(dataloader.dataset))
metrics_file = self.get_output_paths()[0]
mmengine.mkdir_or_exist(osp.split(metrics_file)[0])
with open(metrics_file, 'w') as f:
json.dump(metrics, f)
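# For reference, when num_gpus > 0 the get_command method above renders a
# torchrun invocation roughly like the string below (the port is drawn at
# random per call and both paths are placeholders); template.format then wraps
# it into the launcher-specific command.
_example_task_cmd = ('torchrun --master_port=20123 --nproc_per_node 1 '
                     'path/to/mm_infer.py path/to/task_config.py')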
def parse_args():
parser = argparse.ArgumentParser(description='Model Inferencer')
parser.add_argument('config', help='Config file path')
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
cfg = Config.fromfile(args.config)
start_time = time.time()
inferencer = MultimodalInferTask(cfg)
inferencer.run()
end_time = time.time()
get_logger().info(f'time elapsed: {end_time - start_time:.2f}s')
@@ -270,27 +270,6 @@ def change_accelerator(models, accelerator):
return model_accels
def exec_mm_infer_runner(tasks, args, cfg):
"""execute multimodal infer runner according to args."""
if args.slurm:
runner = SlurmRunner(dict(type='MultimodalInferTask'),
max_num_workers=args.max_num_workers,
partition=args.partition,
quotatype=args.quotatype,
retry=args.retry,
debug=args.debug,
lark_bot_url=cfg['lark_bot_url'])
elif args.dlc:
raise NotImplementedError('Currently, we do not support evaluating \
multimodal models on dlc.')
else:
runner = LocalRunner(task=dict(type='MultimodalInferTask'),
max_num_workers=args.max_num_workers,
debug=args.debug,
lark_bot_url=cfg['lark_bot_url'])
runner(tasks)
def get_config_type(obj) -> str:
return f'{obj.__module__}.{obj.__name__}'
# Usage: python eval_mmbench.py mmbench_dev_inference_result.xlsx
import argparse
import json
import os.path as osp
import pickle
import random as rd
import string
from collections import defaultdict
import numpy as np
import pandas as pd
from tqdm import tqdm
from opencompass.models import OpenAI
fout = None
# Utils
def double_log(msg, fout=None):
print(msg)
if fout is not None:
fout.write(str(msg) + '\n')
fout.flush()
def dump(data, f):
def dump_pkl(data, pth):
pickle.dump(data, open(pth, 'wb'))
def dump_json(data, pth):
json.dump(data, open(pth, 'w'))
def dump_jsonl(data, f):
lines = [json.dumps(x, ensure_ascii=False) for x in data]
with open(f, 'w', encoding='utf8') as fout:
fout.write('\n'.join(lines))
def dump_xlsx(data, f):
data.to_excel(f, index=False)
def dump_csv(data, f):
data.to_csv(f, index=False)
def dump_tsv(data, f):
data.to_csv(f, sep='\t', index=False)
handlers = dict(pkl=dump_pkl,
json=dump_json,
jsonl=dump_jsonl,
xlsx=dump_xlsx,
csv=dump_csv,
tsv=dump_tsv)
suffix = f.split('.')[-1]
return handlers[suffix](data, f)
def load(f):
def load_pkl(pth):
return pickle.load(open(pth, 'rb'))
def load_json(pth):
return json.load(open(pth, 'r', encoding='utf-8'))
def load_jsonl(f):
lines = open(f, encoding='utf-8').readlines()
lines = [x.strip() for x in lines]
if lines[-1] == '':
lines = lines[:-1]
data = [json.loads(x) for x in lines]
return data
def load_xlsx(f):
return pd.read_excel(f)
def load_csv(f):
return pd.read_csv(f)
def load_tsv(f):
return pd.read_csv(f, sep='\t')
handlers = dict(pkl=load_pkl,
json=load_json,
jsonl=load_jsonl,
xlsx=load_xlsx,
csv=load_csv,
tsv=load_tsv)
suffix = f.split('.')[-1]
return handlers[suffix](f)
# Accuracy Report
def report_acc(df, group='category'):
assert 'split' in df
assert group in [None, 'category', 'l2-category']
res = defaultdict(list)
res['split'] = ['full', 'dev', 'test']
if group is None:
res['overall'] = [
np.mean(df['hit']),
np.mean(df[df['split'] == 'dev']['hit']),
np.mean(df[df['split'] == 'test']['hit'])
]
return pd.DataFrame(res)
elif group in df:
abilities = list(set(df[group]))
abilities.sort()
for ab in abilities:
sub_df = df[df[group] == ab]
res[ab] = [
np.mean(sub_df['hit']),
np.mean(sub_df[sub_df['split'] == 'dev']['hit']),
np.mean(sub_df[sub_df['split'] == 'test']['hit'])
]
return pd.DataFrame(res)
# Prompt Building
def build_option_str(option_list):
chars = string.ascii_uppercase
s = 'There are several options: \n'
for c, opt in zip(chars, option_list):
if not pd.isna(opt):
s += f'{c}. {opt}\n'
else:
return s
return s
def extract_options(item):
options = []
for c in 'ABCD':
if c in item and not pd.isna(item[c]):
options.append(item[c])
else:
return options
return options
def build_choices(item):
ret = {}
for ch in 'ABCD':
if not pd.isna(item[ch]):
ret[ch] = item[ch]
return ret
def build_prompt(question, options, prediction):
tmpl = (
'You are an AI assistant who will help me to match an answer '
'with several options of a single-choice question. '
'You are provided with a question, several options, and an answer, '
'and you need to find which option is most similar to the answer. '
'If the meaning of all options are significantly different '
'from the answer, output E. '
        'You should output a single uppercase character in A, B, C, D '
'(if they are valid options), and E. \n'
'Example 1: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear '
'B. rabbit C. cat D. dog\nAnswer: a cute teddy bear\nYour output: A\n'
'Example 2: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear '
'B. rabbit C. cat D. dog\nAnswer: Spider\nYour output: E\n'
'Example 3: \n'
'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: ')
return tmpl.format(question, options, prediction)
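# Quick look at the judge prompt assembled above, with made-up inputs; only
# the tail of the template changes per item.
_demo_prompt = build_prompt(
    'What is the main object in image',
    'There are several options: \nA. teddy bear\nB. rabbit\n',
    'a small stuffed bear')
# _demo_prompt ends with:
#   'Question: What is the main object in image?\nOptions: There are several
#    options: ...\nAnswer: a small stuffed bear\nYour output: '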
# Prefetch Answers
def can_infer_option(answer, num_choice=5):
choices = string.ascii_uppercase[:num_choice]
if 'Failed to obtain answer via API' in answer:
return False
def count(splits, choices='ABCD', prefix='', suffix=''):
cnt = 0
for c in choices:
if prefix + c + suffix in splits:
cnt += 1
return cnt
splits = [x.strip() for x in answer.split()]
if count(splits, choices) == 1:
for ch in choices:
if 'A' in splits and len(splits) > 3:
double_log(
f'A might be a quantifier in the string: {answer}. ', fout)
break
if ch in splits:
return ch
tups = [('', '.'), ('', ','), ('', ':'), ('', ')'), ('', ').'), ('(', ')'),
('(', ').'), (':', ''), (':', ','), (':', '.'), (':', ')'),
(':', ').')]
for tup in tups:
if count(splits, choices, prefix=tup[0], suffix=tup[1]) == 1:
for ch in choices:
if tup[0] + ch + tup[1] in splits:
return ch
return False
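# Behaviour sketch for can_infer_option with made-up answers: a single option
# letter, possibly wrapped in punctuation such as '(C)', is returned as that
# letter; anything ambiguous falls through to False.
assert can_infer_option('The answer is B.') == 'B'
assert can_infer_option('(C)') == 'C'
assert can_infer_option('It could be a cat or a dog.') is False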
def can_infer_text(answer, choices):
answer = answer.lower()
assert isinstance(choices, dict)
for k in choices:
assert k in 'ABCD'
choices[k] = str(choices[k]).lower()
cands = []
for k in choices:
if choices[k] in answer:
cands.append(k)
if len(cands) == 1:
return cands[0]
return False
def can_infer(answer, choices):
copt = can_infer_option(answer)
return copt if copt else can_infer_text(answer, choices)
def prefetch_answer(item):
choices = build_choices(item)
return can_infer(item['prediction'], choices)
# Extract answer from a single record
def extract_answer_from_item(model, item):
# It will return: (pred, raw, llm_time)
options = extract_options(item)
option_str = build_option_str(options)
prompt = build_prompt(item['question'], option_str, item['prediction'])
retry = 3
choices = build_choices(item)
ret = can_infer(item['prediction'], choices)
if ret:
return ret, item['prediction']
while retry:
ans = model.generate([prompt])[0]
if 'Failed to obtain answer via API' in ans:
msg = 'GPT API failed to answer. '
double_log(msg, fout)
retry -= 1
else:
ret = can_infer(ans, choices)
if ret:
return ret, ans
else:
double_log(
f'GPT output includes 0 / >1 letter in "ABCD": {ans}',
fout)
retry -= 1
if retry == 0:
num_options = sum([ch in item for ch in 'ABCD'])
if num_options >= 2:
chars = string.ascii_uppercase[:num_options]
chars = chars + 'E'
num_options += 1
tmp = rd.randint(0, num_options - 1)
return chars[
tmp], 'Failed to predict, thus randomly generate one. '
# Extract answer from multiple rolling records
def eval_sub_data(model, sub_data, answer_map):
lt = len(sub_data)
GT, PRED = [], []
for i in range(lt):
item = sub_data.iloc[i]
idx = item['index']
GT.append(answer_map[idx])
PRED.append(prefetch_answer(item))
if PRED[-1] and (GT[-1] != PRED[-1]):
return 0
for i in range(lt):
if PRED[i]:
continue
else:
ret, _ = extract_answer_from_item(model, sub_data.iloc[i])
PRED[i] = ret
if PRED[i] != GT[i]:
return 0
return 1
# Evaluate Results
def eval_result(eval_file, eval_method, meta_file):
rd.seed(2680)
assert eval_method == 'openai'
# Set a large retry number to avoid failure
model = OpenAI('gpt-3.5-turbo-0613', retry=99)
double_log(f'Evaluating {eval_file}', fout)
result_file = eval_file.replace('.xlsx', f'_{eval_method}_result.pkl')
result = {}
if osp.exists(result_file):
result = load(result_file)
data = load(eval_file)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
for k in data.keys():
data[k.lower() if k not in 'ABCD' else k] = data.pop(k)
meta = load(meta_file)
data_main = data[data['index'] < int(1e6)]
cate_map = {i: c for i, c in zip(meta['index'], meta['category'])}
l2_cate_map = {i: c for i, c in zip(meta['index'], meta['l2-category'])}
split_map = {i: c for i, c in zip(meta['index'], meta['split'])}
answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])}
lt = len(data_main)
hit, tot = 0, 0
for i in tqdm(range(lt)):
# Dealing with the normal part
item_main = data_main.iloc[i]
idx = item_main['index']
if idx in result:
correct = result[idx]
assert correct in [0, 1]
hit += correct
tot += 1
continue
sub_data = data[data['index'] % int(1e6) == idx]
ret = eval_sub_data(model, sub_data, answer_map)
result[idx] = ret
hit += ret
tot += 1
dump(result, result_file)
if (i + 1) % 10 == 0:
double_log((f'Evaluating {eval_file}: {i + 1}/{lt}, '
f'Acc: {hit / tot * 100: .2f}%. '), fout)
dump(data_main, 'tmp.xlsx')
data_main = load('tmp.xlsx')
res = load(result_file)
indices = data_main['index']
data_main['hit'] = [res[i] for i in indices]
data_main['split'] = [split_map[i] for i in indices]
main_idx = data_main['index']
data_main['category'] = [cate_map[i] for i in main_idx]
data_main['l2-category'] = [l2_cate_map[i] for i in main_idx]
# load split
dump(data_main, eval_file.replace('.xlsx', f'_{eval_method}_result.xlsx'))
data_main = load(eval_file.replace('.xlsx', f'_{eval_method}_result.xlsx'))
overall = report_acc(data_main, None)
dump(overall, eval_file.replace('.xlsx', '_overall.csv'))
double_log(overall)
l2 = report_acc(data_main, 'l2-category')
dump(l2, eval_file.replace('.xlsx', '_l2.csv'))
double_log(l2)
leaf = report_acc(data_main, 'category')
dump(leaf, eval_file.replace('.xlsx', '_leaf.csv'))
double_log(leaf)
if fout is not None:
fout.close()
return overall, l2, leaf
def parse_args():
parser = argparse.ArgumentParser(
description='Evaluate Inference Results of MMBench-DEV SPLIT. ')
parser.add_argument('result',
type=str,
help='The path to your inference result. ')
parser.add_argument('--meta',
type=str,
default='data/mmbench_dev_20230712.tsv',
help=('The path to your meta file (dev). '
'Downloaded from MMBench website. '))
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
log_pth = args.result.replace('.xlsx', '_openai_eval.log')
fout = open(log_pth, 'a')
acc, l2, leaf = eval_result(args.result, 'openai', args.meta)