Commit 7d346000 authored by gaotongxiao

initial commit
import contextlib
import io
import re
import signal
from datasets import DatasetDict, load_dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class MBPPDataset(BaseDataset):
@staticmethod
def load(path: str):
def processing_test(example):
example['test_case'] = example['test_list']
example['test_list'] = '\n'.join(example['test_list'])
return example
train = load_dataset(
'json', data_files=path, split='train[:10]').map(processing_test)
test = load_dataset(
'json', data_files=path,
split='train[10:510]').map(processing_test)
return DatasetDict({'train': train, 'test': test})
class TimeOutException(Exception):
pass
@ICL_EVALUATORS.register_module()
class MBPPEvaluator(BaseEvaluator):
def score(self, predictions, references):
assert len(predictions) == len(references)
predictions = [self._process_answer(pred) for pred in predictions]
result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
for test_case, pred in zip(references, predictions):
programs = self._process_test(test_case, pred)
try:
with self.swallow_io():
with self.time_limit(2):
exec(programs)
result['pass'] += 1
except TimeOutException:
result['timeout'] += 1
except AssertionError:
result['wrong_answer'] += 1
except BaseException:
result['failed'] += 1
result['score'] = result['pass'] / len(predictions) * 100
return result
def _process_answer(self, text):
text = text.strip()
match = re.search(r"('\s*|)(\[DONE\]|DONE)", text)
if match:
text = text[:match.start()]
match = re.search(r"(\[BEGIN\]|BEGIN)('\s*|)", text)
if match:
text = text[match.end():]
text = text.strip()
if text.startswith("'"):
text = text[1:]
if text.endswith("'"):
text = text[:-1]
return text
def _process_test(self, test_case, pred):
formatted = pred + '\n'
formatted += test_case
return formatted
@contextlib.contextmanager
def swallow_io(self):
stream = self.WriteOnlyStringIO()
with contextlib.redirect_stdout(stream):
with contextlib.redirect_stderr(stream):
with self.redirect_stdin(stream):
yield
@contextlib.contextmanager
def time_limit(self, seconds: float):
def signal_handler(signum, frame):
raise TimeOutException('Time out!')
signal.setitimer(signal.ITIMER_REAL, seconds)
signal.signal(signal.SIGALRM, signal_handler)
try:
yield
finally:
signal.setitimer(signal.ITIMER_REAL, 0)
class WriteOnlyStringIO(io.StringIO):
"""StringIO that throws an exception when it's read from."""
def read(self, *args, **kwargs):
raise IOError
def readline(self, *args, **kwargs):
raise IOError
def readlines(self, *args, **kwargs):
raise IOError
def readable(self, *args, **kwargs):
"""Returns True if the IO object can be read."""
return False
class redirect_stdin(contextlib._RedirectStream): # type: ignore
_stream = 'stdin'
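As a reading aid, here is a small usage sketch (not part of the commit) showing how MBPPEvaluator cleans a raw completion and assembles the program that score() will exec under its I/O and time guards; the completion string below is made up:

# Hypothetical illustration of MBPPEvaluator's answer cleanup and scoring.
evaluator = MBPPEvaluator()
raw = "[BEGIN] 'def add(a, b):\n    return a + b' [DONE]"
cleaned = evaluator._process_answer(raw)
# cleaned == 'def add(a, b):\n    return a + b'
program = evaluator._process_test('assert add(1, 2) == 3', cleaned)
# score() exec()s `program` inside swallow_io() and time_limit(2); an
# AssertionError is counted as wrong_answer, a TimeOutException as timeout.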
import csv
import os.path as osp
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class MMLUDataset(BaseDataset):
@staticmethod
def load(path: str, name: str):
dataset = DatasetDict()
for split in ['dev', 'test']:
raw_data = []
filename = osp.join(path, split, f'{name}_{split}.csv')
with open(filename) as f:
reader = csv.reader(f)
for row in reader:
assert len(row) == 6
raw_data.append({
'input': row[0],
'A': row[1],
'B': row[2],
'C': row[3],
'D': row[4],
'target': row[5],
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class MultiRCDataset(BaseDataset):
@staticmethod
def load(path: str):
with open(path, 'r', errors='ignore') as in_f:
rows = []
for line in in_f:
sample = json.loads(line.strip())
passage = sample['passage']
text = passage['text']
questions = passage['questions']
for question_dict in questions:
question = question_dict['question']
answers = question_dict['answers']
for answer_dict in answers:
answer = answer_dict['text']
label = answer_dict['label']
rows.append({
'text': text,
'question': question,
'answer': answer,
'label': label
})
dataset = Dataset.from_dict({
'text': [row['text'] for row in rows],
'question': [row['question'] for row in rows],
'answer': [row['answer'] for row in rows],
'label': [row['label'] for row in rows]
})
return dataset
@LOAD_DATASET.register_module()
class MultiRCDataset_V2(BaseDataset):
@staticmethod
def load(path: str):
with open(path, 'r', errors='ignore') as in_f:
rows = []
for line in in_f:
sample = json.loads(line.strip())
text = sample['passage']['text']
for question_dict in sample['passage']['questions']:
question = question_dict['question']
answers = question_dict['answers']
for answer in answers:
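# 'BA'[label] maps the boolean label to an option letter: 0/False -> 'B', 1/True -> 'A'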
rows.append({
'text': text,
'question': question,
'answer': answer['text'],
'label': 'BA'[answer['label']]
})
return Dataset.from_list(rows)
import csv
import os.path as osp
import re
from datasets import Dataset, DatasetDict
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils.text_postprocessors import general_postprocess
from .base import BaseDataset
@LOAD_DATASET.register_module()
class NaturalQuestionDataset(BaseDataset):
@staticmethod
def load(path: str):
dataset = DatasetDict()
for split in ['dev', 'test']:
filename = osp.join(path, f'nq-{split}.qa.csv')
with open(filename) as f:
reader = csv.reader(f, delimiter='\t')
raw_data = []
for row in reader:
assert len(row) == 2
question = row[0]
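# row[1] is expected to hold a Python list literal of reference answers, hence eval()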
answers = eval(row[1])
if split == 'dev':
answers = answers[0]
raw_data.append({'question': question, 'answer': answers})
dataset[split] = Dataset.from_list(raw_data)
return dataset
@ICL_EVALUATORS.register_module()
class NQEvaluator(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
predictions = [
re.split(r'[\n]', prediction, 1)[0].lower()
for prediction in predictions
]
processed_answers = [[general_postprocess(j).lower() for j in i]
for i in references]
cnt = 0
for pred, cand_ans in zip(predictions, processed_answers):
cnt += int(any([cand in pred for cand in cand_ans]))
score = cnt / len(predictions) * 100
return {'score': score}
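A brief sketch (not part of the commit) of how NQEvaluator scores predictions: only the first line of each prediction is kept, and a sample counts as correct when any post-processed gold answer appears in it as a substring. The example below is invented:

# Hypothetical example of NQEvaluator's substring-match scoring.
evaluator = NQEvaluator()
predictions = ['The Eiffel Tower is in Paris, France.\nIt opened in 1889.']
references = [['Paris']]
print(evaluator.score(predictions, references))
# expected: {'score': 100.0}, since 'paris' occurs in the lowercased first line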
from datasets import load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class RaceDataset(BaseDataset):
@staticmethod
def load(path: str, name: str):
dataset = load_dataset(path, name)
def preprocess(x):
for ans, option in zip(['A', 'B', 'C', 'D'], x['options']):
x[ans] = option
del x['options']
return x
return dataset.map(preprocess)
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from .base import BaseDataset
@LOAD_DATASET.register_module()
class ReCoRDDataset(BaseDataset):
@staticmethod
def load(path: str):
with open(path, 'r', errors='ignore') as in_f:
rows = []
for i, line in enumerate(in_f):
sample = json.loads(line.strip())
passage = sample['passage']
text = passage['text']
text = text.replace('@highlight', '')
qas = sample['qas']
for qas_dict in qas:
query = qas_dict['query']
query = query.replace('@placeholder', '____')
answers = qas_dict['answers']
answers_temp = []
for answer_dict in answers:
answer = answer_dict['text']
answers_temp.append(answer)
rows.append({
'text': text,
'question': query,
'answers': answers_temp
})
dataset = Dataset.from_dict({
'text': [row['text'] for row in rows],
'question': [row['question'] for row in rows],
'answers': [row['answers'] for row in rows]
})
return dataset
@TEXT_POSTPROCESSORS.register_module('ReCoRD')
def ReCoRD_postprocess(text: str) -> str:
text = text.strip().split('\n')[0].replace('Answer: ', '').strip()
return text
from datasets import DatasetDict, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class storyclozeDataset(BaseDataset):
@staticmethod
def load(**kwargs):
# special process: merge the train and eval splits into a single set
dataset = load_dataset(**kwargs, split='train+eval')
def preprocess(example):
example['context'] = ' '.join([
example['input_sentence_1'], example['input_sentence_2'],
example['input_sentence_3'], example['input_sentence_4']
])
return example
dataset = dataset.map(preprocess)
return DatasetDict({'test': dataset})
@LOAD_DATASET.register_module()
class storyclozeDataset_V2(BaseDataset):
@staticmethod
def load(**kwargs):
# special process: merge the train and eval splits into a single set
dataset = load_dataset(**kwargs, split='train+eval')
def preprocess(example):
example['context'] = ' '.join([
example['input_sentence_1'], example['input_sentence_2'],
example['input_sentence_3'], example['input_sentence_4']
])
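# answer_right_ending is 1 or 2; ' AB'[...] maps it to 'A' or 'B'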
example['answer_right_ending'] = ' AB'[
example['answer_right_ending']]
return example
dataset = dataset.map(preprocess)
return dataset
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class SummeditsDataset_V2(BaseDataset):
@staticmethod
def load(path: str):
dataset = []
with open(path, 'r') as f:
for line in f:
line = json.loads(line)
line['label'] = 'BA'[line['label']]
dataset.append(line)
return Dataset.from_list(dataset)
import json
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class TNewsDataset(BaseDataset):
@staticmethod
def load(**kwargs):
tnews_targets = {
'news_agriculture': '农业新闻',
'news_travel': '旅游新闻',
'news_game': '游戏新闻',
'news_tech': '科技类别公司新闻',
'news_sports': '体育类别新闻',
'news_edu': '初升高教育新闻',
'news_entertainment': '娱乐圈新闻',
'news_finance': '投资资讯',
'news_military': '军事类别常识',
'news_car': '车辆新闻',
'news_house': '楼市新闻',
'news_world': '环球不含中国类别新闻',
'news_culture': '书籍文化历史类别新闻',
'news_story': '故事类别新闻',
'news_stock': '股票市场类别新闻',
}
dataset = load_dataset(**kwargs)
def preprocess(example):
label_desc = example['label_desc']
label_desc2 = tnews_targets[label_desc]
example['label_desc2'] = label_desc2
return example
dataset = dataset.map(preprocess)
return dataset
@LOAD_DATASET.register_module()
class TNewsDataset_V2(BaseDataset):
@staticmethod
def load(path):
tnews_targets = {
'news_agriculture': 'A',
'news_travel': 'B',
'news_game': 'C',
'news_tech': 'D',
'news_sports': 'E',
'news_edu': 'F',
'news_entertainment': 'G',
'news_finance': 'H',
'news_military': 'I',
'news_car': 'J',
'news_house': 'K',
'news_world': 'L',
'news_culture': 'M',
'news_story': 'N',
'news_stock': 'O',
}
data = []
with open(path, 'r') as f:
for line in f:
line = json.loads(line)
item = {
'sentence': line['sentence'],
'label_desc2': tnews_targets[line['label_desc']],
}
data.append(item)
return Dataset.from_list(data)
import csv
import os.path as osp
import re
from datasets import Dataset, DatasetDict
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils.text_postprocessors import general_postprocess
from .base import BaseDataset
@LOAD_DATASET.register_module()
class TriviaQADataset(BaseDataset):
@staticmethod
def load(path: str):
dataset = DatasetDict()
for split in ['dev', 'test']:
filename = osp.join(path, f'trivia-{split}.qa.csv')
with open(filename) as f:
reader = csv.reader(f, delimiter='\t')
raw_data = []
for row in reader:
assert len(row) == 2
question = row[0]
answers = eval(row[1])
if split == 'test':
answers = answers[0]
raw_data.append({'question': question, 'answer': answers})
dataset[split] = Dataset.from_list(raw_data)
return dataset
@ICL_EVALUATORS.register_module()
class TriviaQAEvaluator(BaseEvaluator):
def __init__(self) -> None:
super().__init__()
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
predictions = [
re.split(r'[\n]', prediction, 1)[0].lower()
for prediction in predictions
]
processed_answers = [[general_postprocess(j).lower() for j in i]
for i in references]
cnt = 0
for pred, cand_ans in zip(predictions, processed_answers):
cnt += int(any([cand in pred for cand in cand_ans]))
score = cnt / len(predictions) * 100
return {'score': score}
import os
import time
import evaluate
import numpy as np
from datasets import load_dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class TruthfulQADataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
def preprocess(example):
example['reference'] = dict(answers=dict(
best_answer=example.pop('best_answer'),
correct_answers=example.pop('correct_answers'),
incorrect_answers=example.pop('incorrect_answers')),
question=example.get('question'))
return example
dataset = dataset.map(preprocess)
return dataset
@ICL_EVALUATORS.register_module()
class TruthfulQAEvaluator(BaseEvaluator):
"""TruthfulQA generation mode evaluator.
Args:
truth_model (str): Truth model name. See "notes" for details.
info_model (str): Informativeness model name. See "notes" for details.
metrics (tuple): Computing needed metrics for truthfulqa dataset.
Supported metrics are `bleurt`, `rouge`, `bleu`, `truth`, `info`.
key (str): Corresponding API key. If set to `ENV`, find it in
environment variables. Defaults to 'ENV'
Notes: `bleurt`, `rouge` and `bleu` are used to compare the model's
answer to each of the true and false reference answers. There are
three kinds of sub-metrics:
- 'max', the max similarity to a true reference answer;
- 'diff', the difference between 'max' score and max similarity
to a false reference answer;
- 'acc', whether the 'diff' score is greater than 0.
The final score is given by 'diff' score.
Notes: `truth` and `info` are trained end-to-end API metrics that predict
human evaluations of truthfulness and informativeness. They require an
OPENAI_API_KEY and the corresponding fine-tuned models to evaluate. See
https://github.com/sylinrl/TruthfulQA/tree/main for more details.
"""
# used to calculate the basic metrics with the `evaluate` library
SCORE_KEY = {
'bleurt': 'scores',
'rouge': 'rouge1',
'bleu': 'bleu',
}
def __init__(
self,
truth_model: str, # noqa
info_model: str, # noqa
metrics=('bleurt', 'rouge', 'bleu', 'truth', 'info'),
key='ENV'):
self.API_MODEL = {
'truth': truth_model,
'info': info_model,
}
all_metrics = set(self.SCORE_KEY.keys()) | set(self.API_MODEL.keys())
assert set(metrics).issubset(all_metrics)
self.metrics = list()
self.api_metrics = list()
for metric in metrics:
if metric in self.SCORE_KEY.keys():
self.metrics.append(metric)
if metric in self.API_MODEL.keys():
self.api_metrics.append(metric)
if self.api_metrics:
try:
api_key = os.environ['OPENAI_API_KEY'] if key == 'ENV' else key
except KeyError:
raise KeyError(
'Please set `OPENAI_API_KEY` in environment variables or '
'set in `TruthfulQAEvaluator` in data config file.')
else:
import openai
self.openai = openai
self.openai.api_key = api_key
super().__init__()
def score(self, predictions, references):
assert len(predictions) == len(references)
results = dict()
if self.metrics:
results = self.basic_score(predictions, references)
if self.api_metrics:
results = {**results, **self.api_score(predictions, references)}
return results
def basic_score(self, predictions, references):
# prepare all metrics
metrics = {key: evaluate.load(key) for key in self.metrics}
# create empty scores list
scores = {
key: dict(max=list(), diff=list(), acc=list())
for key in self.metrics
}
for pred, refer in zip(predictions, references):
# cannot compare empty string with only space
if not pred.strip():
continue
refer = refer['answers']
cor_ans = refer['correct_answers']
incor_ans = refer['incorrect_answers']
# For generation metrics, check if "I have no comment"
# is listed as a correct answer. Add it if not.
if 'I have no comment.' not in cor_ans:
cor_ans.append('I have no comment.')
# For each metric, calculate three scores
for key, metric in metrics.items():
if key == 'bleurt':
cor_scores = metric.compute(
predictions=[pred] * len(cor_ans),
references=cor_ans)[self.SCORE_KEY[key]]
incor_scores = metric.compute(
predictions=[pred] * len(incor_ans),
references=incor_ans)[self.SCORE_KEY[key]]
else:
cor_scores = [
metric.compute(predictions=[pred],
references=[ans])[self.SCORE_KEY[key]]
for ans in cor_ans if ans
]
incor_scores = [
metric.compute(predictions=[pred],
references=[ans])[self.SCORE_KEY[key]]
for ans in incor_ans if ans
]
scores[key]['max'].append(max(cor_scores))
scores[key]['diff'].append(max(cor_scores) - max(incor_scores))
scores[key]['acc'].append(
int(max(cor_scores) > max(incor_scores)))
def mean(scores):
return round(sum(scores) / len(scores), 4)
results = dict()
for key, single_scores in scores.items():
for subkey, score in single_scores.items():
results['_'.join([key, subkey])] = mean(score)
return results
# This part should be moved to a postprocessing step
def prompt(self, pred, refer, metric):
if metric == 'info':
return 'Q: {0}\nA: {1}\nHelpful:'.format(refer, pred)
elif metric == 'truth':
return 'Q: {0}\nA: {1}\nTrue:'.format(refer, pred)
def api_score(self, predictions, references):
results = dict()
for metric in self.api_metrics:
scores = []
for pred, refer in zip(predictions, references):
refer = refer['question']
prompt = self.prompt(pred, refer, metric)
response = self.openai.Completion.create(
model=self.API_MODEL[metric],
prompt=prompt,
temperature=0,
max_tokens=1,
stop=None,
echo=False,
logprobs=2)
time.sleep(0.1) # avoid OpenAI's max calls limit
logprobs = response['choices'][0]['logprobs']
output_dict = logprobs['top_logprobs'][0]
if ' yes' in output_dict:
# TODO: add threshold
scores.append(np.exp(output_dict[' yes']) > 0.5)
else:
scores.append(False)
results[metric] = round(sum(scores) / len(scores), 4)
return results
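To make the sub-metric bookkeeping in basic_score() concrete, here is a toy calculation (not part of the commit) with invented similarity scores for a single prediction:

# Hypothetical similarity scores of one prediction against the reference answers.
cor_scores = [0.62, 0.81]    # similarity to each correct answer
incor_scores = [0.40, 0.55]  # similarity to each incorrect answer
sub_max = max(cor_scores)                           # 0.81
sub_diff = max(cor_scores) - max(incor_scores)      # 0.26
sub_acc = int(max(cor_scores) > max(incor_scores))  # 1
# basic_score() averages these per-sample values over the dataset and reports
# them as '<metric>_max', '<metric>_diff' and '<metric>_acc'.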
from datasets import load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class winogradDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
def pre_process(example):
example['prompt'] = example.pop('text')
example['opt1'] = example['options'][0]
example['opt2'] = example['options'][1]
return example
dataset = dataset.map(pre_process).remove_columns(
['options', 'source'])
return dataset
from abc import abstractmethod
from copy import deepcopy
from typing import Dict, List, Optional, Tuple, Union
from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
class BaseModel:
"""Base class for model wrapper.
Args:
path (str): The path to the model.
max_seq_len (int): The maximum sequence length of the model. Defaults
to 2048.
tokenizer_only (bool): If True, only the tokenizer will be initialized.
Defaults to False.
meta_template (Dict, optional): The model's meta prompt template,
if needed, e.g. when meta instructions must be injected into or
wrapped around the prompt.
"""
is_api: bool = False
def __init__(self,
path: str,
max_seq_len: int = 2048,
tokenizer_only: bool = False,
meta_template: Optional[Dict] = None):
self.path = path
self.max_seq_len = max_seq_len
self.tokenizer_only = tokenizer_only
# meta template
self.template_parser = LMTemplateParser(meta_template)
self.eos_token_id = None
if meta_template and 'eos_token_id' in meta_template:
self.eos_token_id = meta_template['eos_token_id']
@abstractmethod
def generate(self, inputs: List[str], max_out_len: int) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[str]): A list of strings.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
@abstractmethod
def get_ppl(self,
inputs: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get perplexity scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if the advanced features of PPLInferencer
are not needed.
Returns:
List[float]: A list of perplexity scores.
"""
@abstractmethod
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized strings.
Args:
prompt (str): Input string.
Returns:
int: Length of the input tokens
"""
def parse_template(self, prompt_template: PromptType, mode: str) -> str:
"""Parse a prompt template, and wrap it with meta template if
applicable.
Args:
prompt_template (List[str or PromptList]): A prompt
template (potentially before being wrapped by meta template).
mode (str): Parsing mode. Choices are 'ppl' and 'gen'.
Returns:
str: The final string.
"""
return self.template_parser.parse_template(prompt_template, mode)
def get_ppl_from_template(self,
templates: List[PromptType],
mask_length=None):
"""Get perplexity given a list of templates.
Args:
templates (List[PromptType]): A list of templates.
mask_length (List[int]): A list of mask lengths. If provided, the
perplexity will be calculated only on the unmasked tokens.
"""
inputs = self.parse_template(templates, mode='ppl')
return self.get_ppl(inputs, mask_length)
def generate_from_template(self, templates: List[PromptType],
max_out_len: int):
"""Generate completion from a list of templates.
Args:
templates (List[PromptType]): A list of templates.
max_out_len (int): The maximum length of the output.
"""
inputs = self.parse_template(templates, mode='gen')
return self.generate(inputs, max_out_len=max_out_len)
def get_token_len_from_template(
self,
templates: Union[PromptType, List[PromptType]],
mode: str = 'ppl') -> Union[List[int], int]:
"""Get lengths given a list of templates.
Args:
templates (Union[List[str], str]): Input template(s).
mode (str): Parsing mode. Choices are 'ppl' and 'gen'.
Returns:
Union[List[int], int]: Length(s) of the input tokens. If the input
is a list, a list of lengths will be returned. Otherwise, an int
will be returned.
"""
prompts = self.parse_template(templates, mode=mode)
assert isinstance(prompts, (list, str)), 'tokens must be list or str'
is_batched = isinstance(prompts,
list) and not isinstance(prompts, PromptList)
if not is_batched:
prompts = [prompts]
prompts = [str(prompt) for prompt in prompts]
token_lens = [self.get_token_len(prompt) for prompt in prompts]
return token_lens[0] if not is_batched else token_lens
def to(self, device):
self.model.to(device)
class LMTemplateParser:
"""Intermidate prompt template parser, specifically for language models.
Args:
meta_template (Dict): The meta template for the model.
"""
def __init__(self, meta_template: Optional[Dict] = None):
self.meta_template = meta_template
if meta_template:
assert 'round' in meta_template, 'round is required in meta' \
' template'
assert isinstance(meta_template['round'], list)
keys_to_check = ['round']
if 'reserved_roles' in meta_template:
assert isinstance(meta_template['reserved_roles'], list)
keys_to_check.append('reserved_roles')
self.roles: Dict[str, dict] = dict() # maps role name to config
for meta_key in keys_to_check:
for item in meta_template[meta_key]:
assert isinstance(item, (str, dict))
if isinstance(item, dict):
assert item['role'] not in self.roles, \
'role in meta prompt must be unique!'
self.roles[item['role']] = item.copy()
# convert lists of strings/ints into a raw string
# to ease future prompt processing
for key in ['begin', 'end']:
value = self.roles[item['role']].get(key, '')
if isinstance(value, list):
self.roles[item['role']][
key] = self._encode_speical_tokens(value)
def parse_template(self, prompt_template: PromptType, mode: str) -> str:
"""Parse a prompt template, and wrap it with meta template if
applicable.
Args:
prompt_template (List[str or PromptList]): A prompt
template (potentially before being wrapped by meta template).
mode (str): Parsing mode. Choices are 'ppl' and 'gen'.
Returns:
str: The final string.
"""
assert isinstance(prompt_template, (str, list, PromptList))
if not isinstance(prompt_template, (str, PromptList)):
return [self.parse_template(p, mode=mode) for p in prompt_template]
assert mode in ['ppl', 'gen']
if isinstance(prompt_template, str):
return prompt_template
if self.meta_template:
prompt = ''
# Whether to keep generating the prompt
generate = True
section_stack = [] # stores tuples: (section_name, start_idx)
for i, item in enumerate(prompt_template):
if not generate:
break
if isinstance(item, str):
prompt += item
elif isinstance(item, dict) and 'section' in item:
if item['pos'] == 'end':
section_name, start_idx = section_stack.pop(-1)
assert section_name == item['section']
if section_name in ['round', 'ice']:
dialogue = prompt_template[start_idx:i]
round_ranges = self._split_rounds(
dialogue, self.meta_template['round'])
# Consider inserting multiple round examples into
# template
for i in range(len(round_ranges) - 1):
start = round_ranges[i]
end = round_ranges[i + 1]
round_template = dialogue[start:end]
role_dict = self._update_role_dict(
round_template)
new_str, generate = self._prompt2str(
self.meta_template['round'],
role_dict,
# Start generating only when the mode is in
# generation and the template reaches the
# last round
for_gen=mode == 'gen'
and i == len(round_ranges) - 2
and section_name == 'round')
prompt += new_str
elif item['pos'] == 'begin':
assert item['section'] in [
'begin', 'round', 'end', 'ice'
]
section_stack.append((item['section'], i + 1))
else:
raise ValueError(f'Invalid pos {item["pos"]}')
elif section_stack[-1][0] in ['begin', 'end']:
role_dict = self._update_role_dict(item)
new_str, generate = self._prompt2str(item,
role_dict,
for_gen=mode == 'gen')
prompt += new_str
prompt = self.meta_template.get('begin', '') + prompt
if generate:
prompt += self.meta_template.get('end', '')
else:
# in case the model does not have any meta template
prompt = ''
last_sep = ''
for item in prompt_template:
if isinstance(item, dict) and set(['section', 'pos']) == set(
item.keys()):
continue
if isinstance(item, str):
if item:
prompt += last_sep + item
elif item.get('prompt', ''): # it's a dict
prompt += last_sep + item.get('prompt', '')
last_sep = '\n'
return prompt
def _split_rounds(
self, prompt_template: List[Union[str, Dict]],
single_round_template: List[Union[str, Dict]]) -> List[int]:
"""Split the prompt template into rounds, based on single round
template.
Return the index ranges of each round. Specifically,
prompt_template[res[i]:res[i+1]] represents the i-th round in the
template.
"""
role_idxs = {
role_cfg['role']: i
for i, role_cfg in enumerate(single_round_template)
if not isinstance(role_cfg, str)
}
last_role_idx = -1
cutoff_idxs = [0]
for idx, template in enumerate(prompt_template):
if isinstance(template, str):
continue
role_idx = role_idxs[template['role']]
if role_idx <= last_role_idx:
cutoff_idxs.append(idx)
last_role_idx = role_idx
cutoff_idxs.append(len(prompt_template))
return cutoff_idxs
def _update_role_dict(self, prompt: Union[List, str,
Dict]) -> Dict[str, Dict]:
"""Update the default role dict with the given prompt(s)."""
assert isinstance(prompt, (str, list, dict))
role_dict = deepcopy(self.roles)
if isinstance(prompt, str):
return role_dict
if isinstance(prompt, dict):
prompt = [prompt]
for p in prompt:
if isinstance(p, dict):
role = p['role']
if role not in self.roles:
role = p.get('fallback_role', None)
if not role:
print(f'{p} neither has an appropriate role nor '
'a fallback role.')
role_dict[role].update(p)
return role_dict
def _prompt2str(self,
prompt: Union[List, str, Dict],
role_dict: Dict[str, Dict],
for_gen: bool = False) -> Tuple[str, bool]:
"""Convert the prompts to a string, given an updated role_dict.
Args:
prompts (Union[List, str, dict]): The prompt(s) to be converted.
role_dict (Dict[str, Dict]): The updated role dict.
for_gen (bool): If True, the prompts will be converted for
generation tasks. The conversion stops before the first
role whose "generate" is set to True.
Returns:
Tuple[str, bool]: The converted string, and whether the follow-up
conversion should proceed.
"""
assert isinstance(prompt, (list, str, dict))
if isinstance(prompt, str):
return prompt, True
if isinstance(prompt, dict):
return self._role2str(prompt, role_dict, for_gen)
res = ''
for p in prompt:
new_str, cont = self._prompt2str(p, role_dict, for_gen)
res += new_str
if not cont:
break
return res, cont
def _role2str(self,
role_prompt: Dict,
role_dict: Dict[str, Dict],
for_gen: bool = False) -> Tuple[str, bool]:
"""Convert a role prompt to a string, given an updated role_dict.
Args:
role_prompt (Dict): The role prompt to be converted.
role_dict (Dict[str, Dict]): The updated role dict.
for_gen (bool): If True, the prompts will be converted for
generation tasks. The conversion stops before the first
role whose "generate" is set to True.
Returns:
Tuple[str, bool]: The converted string, and whether the follow-up
conversion should proceed.
"""
merged_prompt = role_dict.get(
role_prompt['role'],
role_dict.get(role_prompt.get('fallback_role')))
res = merged_prompt.get('begin', '')
if for_gen and merged_prompt.get('generate', False):
return res, False
# res += merged_prompt.get('prompt', '') + merged_prompt.get('end', '')
res += merged_prompt.get('prompt', '') + merged_prompt.get('end', '')
return res, True
def _encode_speical_tokens(self, prompt: List[Union[str, int]]) -> str:
"""Encode the special tokens in the prompt.
This is left for future work.
"""
raise NotImplementedError('Using List[str|int] as the begin or end '
'of a prompt is not supported yet.')
res = ''
for item in prompt:
if isinstance(item, str):
res += item
else:
res += f'<META_TOKEN_{item}>'
return res
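As a reading aid (not part of the commit), here is a minimal sketch of a meta_template that LMTemplateParser would accept; the role names and markers are invented for illustration:

# Hypothetical meta template: a 'round' list is required, other keys are optional.
meta_template = dict(
    begin='<bos>',
    round=[
        dict(role='HUMAN', begin='<|user|>: ', end='\n'),
        dict(role='BOT', begin='<|bot|>: ', end='\n', generate=True),
    ],
    end='<eos>',
    eos_token_id=2,
)
# parse_template() wraps each dialogue round with the per-role begin/end markers;
# in 'gen' mode it emits the generating role's begin marker and then stops, so
# the model continues from there.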
import re
import threading
import warnings
from abc import abstractmethod
from copy import deepcopy
from time import sleep
from typing import Dict, List, Optional, Tuple, Union
from opencompass.utils import get_logger
from opencompass.utils.prompt import PromptList
from .base import BaseModel
PromptType = Union[PromptList, str]
class BaseAPIModel(BaseModel):
"""Base class for API model wrapper.
Args:
path (str): The path to the model.
query_per_second (int): The maximum number of queries allowed per
second; consecutive API calls are rate-limited accordingly.
Defaults to 1.
retry (int): Number of retries if the API call fails. Defaults to 2.
max_seq_len (int): The maximum sequence length of the model. Defaults
to 2048.
meta_template (Dict, optional): The model's meta prompt template,
if needed, e.g. when meta instructions must be injected into or
wrapped around the prompt.
"""
is_api: bool = True
def __init__(self,
path: str,
query_per_second: int = 1,
retry: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None):
self.path = path
self.max_seq_len = max_seq_len
self.meta_template = meta_template
self.retry = retry
self.query_per_second = query_per_second
self.token_bucket = TokenBucket(query_per_second)
self.template_parser = APITemplateParser(meta_template)
self.logger = get_logger()
@abstractmethod
def generate(self, inputs: List[PromptType],
max_out_len: int) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[str or PromptList]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
@abstractmethod
def get_ppl(self,
inputs: List[PromptType],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get perplexity scores given a list of inputs.
Args:
inputs (List[str or PromptList]): A list of strings.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if the advanced features of PPLInferencer
are not needed.
Returns:
List[float]: A list of perplexity scores.
"""
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized string. Only English and Chinese
characters are counted for now. Users are encouraged to override this
method if a more accurate length is needed.
Args:
prompt (str): Input string.
Returns:
int: Length of the input tokens
"""
english_parts = re.findall(r'[A-Za-z0-9]+', prompt)
chinese_parts = re.findall(r'[\u4e00-\u9FFF]+', prompt)
# Count English words
english_count = sum(len(part.split()) for part in english_parts)
# Count Chinese characters
chinese_count = sum(len(part) for part in chinese_parts)
return english_count + chinese_count
def wait(self):
"""Wait till the next query can be sent.
Applicable in both single-thread and multi-thread environments.
"""
return self.token_bucket.get_token()
def to(self, device):
pass
class APITemplateParser:
"""Intermidate prompt template parser, specifically for API models.
Args:
meta_template (Dict): The meta template for the model.
"""
def __init__(self, meta_template: Optional[Dict] = None):
self.meta_template = meta_template
# Check meta template
if meta_template:
assert 'round' in meta_template, 'round is required in meta' \
' template'
assert isinstance(meta_template['round'], list)
keys_to_check = ['round']
if 'reserved_roles' in meta_template:
assert isinstance(meta_template['reserved_roles'], list)
keys_to_check.append('reserved_roles')
self.roles: Dict[str, dict] = dict() # maps role name to config
for meta_key in keys_to_check:
for item in meta_template[meta_key]:
assert isinstance(item, (str, dict))
if isinstance(item, dict):
assert item['role'] not in self.roles, \
'role in meta prompt must be unique!'
self.roles[item['role']] = item.copy()
def parse_template(self, prompt_template: PromptType,
mode: str) -> PromptType:
"""Parse the intermidate prompt template, and wrap it with meta
template if applicable. When the meta template is set and the input is
a PromptList, the return value will be a PromptList containing the full
conversation history. Each item looks like:
.. code-block:: python
{'role': 'user', 'prompt': '...'}
Args:
prompt_template (List[str or PromptList]): An intermediate prompt
template (potentially before being wrapped by meta template).
mode (str): Parsing mode. Choices are 'ppl' and 'gen'.
Returns:
List[str or PromptList]: The finalized prompt or a conversation.
"""
assert isinstance(prompt_template, (str, list, PromptList))
if not isinstance(prompt_template, (str, PromptList)):
return [self.parse_template(p, mode=mode) for p in prompt_template]
assert mode in ['ppl', 'gen']
if isinstance(prompt_template, str):
return prompt_template
if self.meta_template:
prompt = PromptList()
# Whether to keep generating the prompt
generate = True
section_stack = [] # stores tuples: (section_name, start_idx)
for i, item in enumerate(prompt_template):
if not generate:
break
if isinstance(item, str):
if item.strip():
# TODO: logger
warnings.warn('Non-empty string in prompt template '
'will be ignored in API models.')
elif isinstance(item, dict) and 'section' in item:
if item['pos'] == 'end':
section_name, start_idx = section_stack.pop(-1)
assert section_name == item['section']
if section_name in ['round', 'ice']:
dialogue = prompt_template[start_idx:i]
round_ranges = self._split_rounds(
dialogue, self.meta_template['round'])
# Consider inserting multiple round examples into
# template
for i in range(len(round_ranges) - 1):
start = round_ranges[i]
end = round_ranges[i + 1]
round_template = dialogue[start:end]
role_dict = self._update_role_dict(
round_template)
api_prompts, generate = self._prompt2api(
self.meta_template['round'],
role_dict,
# Start generating only when the mode is in
# generation and the template reaches the
# last round
for_gen=mode == 'gen'
and section_name == 'round'
and i == len(round_ranges) - 2)
prompt += api_prompts
elif item['pos'] == 'begin':
assert item['section'] in [
'begin', 'round', 'end', 'ice'
]
section_stack.append((item['section'], i + 1))
else:
raise ValueError(f'Invalid pos {item["pos"]}')
elif section_stack[-1][0] in ['begin', 'end']:
role_dict = self._update_role_dict(item)
api_prompts, generate = self._prompt2api(
item, role_dict, for_gen=mode == 'gen')
prompt.append(api_prompts)
# merge the consecutive prompts assigned to the same role
new_prompt = PromptList([prompt[0]])
last_role = prompt[0]['role']
for item in prompt[1:]:
if item['role'] == last_role:
new_prompt[-1]['prompt'] += '\n' + item['prompt']
else:
last_role = item['role']
new_prompt.append(item)
prompt = new_prompt
else:
# in case the model does not have any meta template
prompt = ''
last_sep = ''
for item in prompt_template:
if isinstance(item, dict) and set(['section', 'pos']) == set(
item.keys()):
continue
if isinstance(item, str):
if item:
prompt += last_sep + item
elif item.get('prompt', ''):
prompt += last_sep + item.get('prompt', '')
last_sep = '\n'
return prompt
def _update_role_dict(self, prompts: Union[List, str]) -> Dict[str, Dict]:
"""Update the default role dict with the given prompts."""
role_dict = deepcopy(self.roles)
if isinstance(prompts, str):
return role_dict
elif isinstance(prompts, dict):
prompts = [prompts]
for prompt in prompts:
if isinstance(prompt, dict):
role = prompt['role']
if role not in self.roles:
role = prompt.get('fallback_role', None)
if not role:
print(f'{prompt} neither has an appropriate role nor '
'a fallback role.')
role_dict[role].update(prompt)
return role_dict
def _split_rounds(
self, prompt_template: List[Union[str, Dict]],
single_round_template: List[Union[str, Dict]]) -> List[int]:
"""Split the prompt template into rounds, based on single round
template.
Return the index ranges of each round. Specifically,
prompt_template[res[i]:res[i+1]] represents the i-th round in the
template.
"""
role_idxs = {
role_cfg['role']: i
for i, role_cfg in enumerate(single_round_template)
if not isinstance(role_cfg, str)
}
last_role_idx = -1
cutoff_idxs = [0]
for idx, template in enumerate(prompt_template):
if isinstance(template, str):
continue
role_idx = role_idxs.get(template['role'], None)
if role_idx is None:
try:
role_idx = role_idxs[template['fallback_role']]
except KeyError:
raise KeyError(f'{template} neither has an appropriate '
'role nor a fallback role.')
if role_idx <= last_role_idx:
cutoff_idxs.append(idx)
last_role_idx = role_idx
cutoff_idxs.append(len(prompt_template))
return cutoff_idxs
def _prompt2api(self,
prompts: Union[List, str],
role_dict: Dict[str, Dict],
for_gen: bool = False) -> Tuple[str, bool]:
"""Convert the prompts to a API-style prompts, given an updated
role_dict.
Args:
prompts (Union[List, str]): The prompts to be converted.
role_dict (Dict[str, Dict]): The updated role dict.
for_gen (bool): If True, the prompts will be converted for
generation tasks. The conversion stops before the first
role whose "generate" is set to True.
Returns:
Tuple[str, bool]: The converted string, and whether the follow-up
conversion should proceed.
"""
cont = True
if isinstance(prompts, str):
return prompts, cont
elif isinstance(prompts, dict):
api_role, cont = self._role2api_role(prompts, role_dict, for_gen)
return api_role, cont
res = []
for prompt in prompts:
if isinstance(prompt, str):
raise TypeError('Mixing str without explicit role is not '
'allowed in API models!')
else:
api_role, cont = self._role2api_role(prompt, role_dict,
for_gen)
if api_role:
res.append(api_role)
if not cont:
break
return res, cont
def _role2api_role(self,
role_prompt: Dict,
role_dict: Dict[str, Dict],
for_gen: bool = False) -> Tuple[str, bool]:
"""Convert a role prompt to a string, given an updated role_dict.
Args:
role_prompt (Dict): The role prompt to be converted.
role_dict (Dict[str, Dict]): The updated role dict.
for_gen (bool): If True, the prompts will be converted for
generation tasks. The conversion stops before the first
role whose "generate" is set to True.
Returns:
Tuple[str, bool]: The converted string, and whether the follow-up
conversion should proceed.
"""
merged_prompt = role_dict.get(
role_prompt['role'],
role_dict.get(role_prompt.get('fallback_role')))
# res_api_prompt = dict(type='', )
if for_gen and merged_prompt.get('generate', False):
return None, False
res = {}
res['role'] = merged_prompt['api_role']
res['prompt'] = merged_prompt.get('begin', '')
res['prompt'] += merged_prompt.get('prompt', '')
res['prompt'] += merged_prompt.get('end', '')
return res, True
class TokenBucket:
"""A token bucket for rate limiting.
Args:
query_per_second (float): The rate of the token bucket.
"""
def __init__(self, rate):
self._rate = rate
self._tokens = threading.Semaphore(0)
self.started = False
def _add_tokens(self):
"""Add tokens to the bucket."""
while True:
if self._tokens._value < self._rate:
self._tokens.release()
sleep(1 / self._rate)
def get_token(self):
"""Get a token from the bucket."""
if not self.started:
self.started = True
threading.Thread(target=self._add_tokens, daemon=True).start()
self._tokens.acquire()
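For reference (not part of the commit), the rough token-length heuristic in BaseAPIModel.get_token_len counts English words plus Chinese characters; a standalone re-implementation for illustration:

# Re-implementation of the heuristic, for illustration only.
import re

def rough_token_len(prompt: str) -> int:
    english_parts = re.findall(r'[A-Za-z0-9]+', prompt)
    chinese_parts = re.findall(r'[\u4e00-\u9FFF]+', prompt)
    return sum(len(part.split()) for part in english_parts) + \
        sum(len(part) for part in chinese_parts)

print(rough_token_len('OpenCompass 评测 LLM'))  # 2 English words + 2 Chinese characters = 4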
import re
from functools import partial
from typing import Dict, List, Optional, Union
import numpy as np
import torch
from opencompass.models.base import BaseModel, LMTemplateParser
from opencompass.registry import MODELS
from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
@MODELS.register_module(name=['GLM-130B'])
class GLM130B(BaseModel):
def __init__(self,
pkg_root: str,
ckpt_path: str,
tokenizer_only: bool = False,
meta_template: Optional[Dict] = None,
**kwargs):
assert not tokenizer_only, 'GLM-130B does not support tokenizer only mode'
self.pkg_root = pkg_root
self.ckpt_path = ckpt_path
self._load_model(**kwargs)
self.template_parser = LMTemplateParser(meta_template)
self.eos_token_id = None
if meta_template and 'eos_token_id' in meta_template:
self.eos_token_id = meta_template['eos_token_id']
def _load_model(self, **kwargs):
import sys
sys.path.insert(0, self.pkg_root)
from argparse import Namespace
from evaluation.model import ModelForEvaluation, batch_filling_sequence
from generate import get_masks_and_position_ids
from generation import BaseStrategy, BeamSearchStrategy
from initialize import initialize_model_and_tokenizer
from SwissArmyTransformer import get_args
self.get_masks_and_position_ids = get_masks_and_position_ids
self.batch_filling_sequence = batch_filling_sequence
kwargs = {
'bminf': False,
'bminf_memory_limit': 20,
'quantization_bit_width': None,
'from_quantized_checkpoint': False,
'sequential_initialization': False,
'sampling_strategy': 'BaseStrategy',
'min_gen_length': 0,
'print_all_beams': False,
**kwargs,
}
args_list = [
['--seed', '1234'],
['--mode', 'inference'],
['--out-seq-length', '256'],
['--num-beams', '4'],
['--length-penalty', '1.0'],
['--no-repeat-ngram-size', '3'],
['--temperature', '1.0'],
['--top_k', '0'],
['--top_p', '0'],
['--output-path', 'samples'],
['--model-parallel-size', '8'],
['--num-layers', '70'],
['--hidden-size', '12288'],
['--inner-hidden-size', '32768'],
['--vocab-size', '150528'],
['--num-attention-heads', '96'],
['--max-sequence-length', '2048'],
['--tokenizer-type', 'icetk-glm-130B'],
['--layernorm-order', 'post'],
['--load', self.ckpt_path],
['--skip-init'],
['--fp16'],
['--input-source', 'interactive'],
] # Come from the default initialize arguments of official repo
args = get_args(sum(args_list, []))
args = Namespace(**vars(args), **kwargs)
args.do_train = False
self.args = args
model, tokenizer = initialize_model_and_tokenizer(args)
self.model = model
self.model_for_eval = ModelForEvaluation(model)
self.tokenizer = tokenizer
self.device = args.device
end_tokens = [
tokenizer.get_command('eop'),
tokenizer.get_command('eos')
]
if args.sampling_strategy == 'BaseStrategy':
self.strategy = BaseStrategy(batch_size=1,
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
end_tokens=end_tokens)
elif args.sampling_strategy == 'BeamSearchStrategy':
self.strategy = BeamSearchStrategy(
1,
args.num_beams,
length_penalty=args.length_penalty,
consider_end=True,
end_tokens=end_tokens,
no_repeat_ngram_size=args.no_repeat_ngram_size,
min_gen_length=args.min_gen_length,
)
else:
raise ValueError(f'unknown strategy {args.sampling_strategy}')
sys.path.pop(0)
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized strings.
Args:
prompt (str): Input string.
Returns:
int: Length of the input tokens
"""
return len(self.tokenizer.tokenize(prompt))
def choice(self, inputs, choices):
import sys
sys.path.insert(0, self.pkg_root)
from unittest.mock import MagicMock
from evaluation.dataset import MultiChoiceTaskDataset
sys.path.pop(0)
choice_tokens = [self.tokenizer.tokenize(item) for item in choices]
is_single_token = all(len(token) == 1 for token in choice_tokens)
data_items = []
mock_dataset = MagicMock(is_single_token=is_single_token)
from mmengine.dist import is_main_process
for text in inputs:
if is_main_process():
print(f"\033[92m'text'\033[0m: {text}")
data_item = MultiChoiceTaskDataset.build_multiple_choice_sample(
text=self.tokenizer.tokenize(text),
# text=self.tokenizer.tokenize(text) + [20019],
choices=[self.tokenizer.tokenize(item) for item in choices],
is_single_token=is_single_token,
)
data_items.append(data_item)
batch = MultiChoiceTaskDataset.collate_fn(mock_dataset, data_items)
log_probs = self.model_for_eval.cond_log_prob(batch)
answers = []
for log_prob in zip(log_probs):
answers.append(choices[np.argmax(log_prob).item()])
return answers
def generate(self, inputs: List[str], max_out_len: int) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[str]): A list of strings.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
if isinstance(inputs, list):
return sum((self.generate(raw_text, max_out_len)
for raw_text in inputs), [])
else:
raw_text = inputs
from mmengine.dist import is_main_process
if is_main_process():
print(f"\033[92m'raw_text'\033[0m: \n{raw_text}")
# add MASK
generation_mask = '[gMASK]'
if '[MASK]' in raw_text:
generation_mask = '[MASK]'
elif '[sMASK]' in raw_text:
generation_mask = '[sMASK]'
use_gmask = '[MASK]' not in raw_text and '[sMASK]' not in raw_text
mask_pattern = r'\[[sg]?MASK\]'
text_list = re.split(mask_pattern, raw_text)
pattern_list = re.compile(mask_pattern).findall(raw_text)
seq = []
for i in range(len(pattern_list)):
pattern = pattern_list[i]
sub_text = text_list[i]
seq.extend(self.tokenizer.tokenize(sub_text))
seq.append(self.tokenizer.get_command(pattern))
seq.extend(self.tokenizer.tokenize(text_list[-1]))
prompt_token_length = len(seq)
if 'MASK]' not in raw_text:
seq += [self.tokenizer.get_command(generation_mask)]
raw_text += ' ' + generation_mask
if not raw_text.endswith('MASK]'):
seq = seq + [self.tokenizer.get_command('eos')]
if len(seq) > self.args.max_sequence_length:
raise ValueError('text too long.')
# generation
output_list = [seq]
if self.args.sampling_strategy == 'BeamSearchStrategy':
num_output = self.args.num_beams
else:
num_output = 1
last_pos = [0] * num_output
# repeatedly detect the first mask position
while True:
seq = output_list[0]
# detect mask position
mask_token = self.tokenizer.get_command(generation_mask)
if mask_token not in seq:
break
mask_position = seq.index(mask_token)
output_list = []
input_seq = torch.cuda.LongTensor(
[seq + [self.tokenizer.get_command('sop')]],
device=self.device,
)
output, _ = self.batch_filling_sequence(
self.model,
input_seq,
torch.cuda.LongTensor([input_seq.shape[-1]],
device=self.device),
strategy=self.strategy,
get_masks_and_position_ids=partial(
self.get_masks_and_position_ids,
mask_position=mask_position,
max_gen_length=max_out_len,
gmask=use_gmask,
),
)
if isinstance(output, torch.Tensor): # different strategies
output = output.tolist()
output = output[0] # batch_size = 1
output_list.extend(output)
# clip trailing -1s and splice the generated tokens back into seq
for i in range(len(output_list)):
output = output_list[i].tolist() if isinstance(
output_list[i], torch.Tensor) else output_list[i]
try:
unfinished = output.index(-1)
except ValueError:
unfinished = len(output)
if output[unfinished - 1] in self.strategy.end_tokens:
unfinished -= 1
bog = output.index(self.tokenizer.get_command('sop'))
last_pos[i] = mask_position + unfinished - (bog + 1)
output_list[i] = output[:mask_position] + output[
bog + 1:unfinished] + output[mask_position + 1:bog]
# Select the best answer
output = output_list[0]
if output[-1] == self.tokenizer.get_command('eos'):
output = output[:-1]
# Avoid generating out-of-range ids; replace them with unk
output = np.array(output)
output[output < 20000] = 20000
output = output.tolist()
answer = self.tokenizer.detokenize(output[prompt_token_length:])
if is_main_process():
print(f"\033[92m'answer'\033[0m: \n{answer}")
return [answer]
def get_logits(self, inputs: List[str]):
mask_id = self.tokenizer.get_command('[MASK]')
sop_id = self.tokenizer.get_command('sop')
tokens = []
targets = []
position_ids = []
attn_masks = []
from mmengine.dist import is_main_process
for raw_text in inputs:
mask_pattern = r'\[MASK\]'
text_list = re.split(mask_pattern, raw_text, 1)
token = sum([
self.tokenizer.tokenize(text_list[0]),
[mask_id, sop_id],
self.tokenizer.tokenize(text_list[1]),
], [])[:-1]
target = sum([
self.tokenizer.tokenize(text_list[0]),
[mask_id],
self.tokenizer.tokenize(text_list[1]),
], [])
if is_main_process():
print(f"\033[92m'raw_text'\033[0m: {raw_text}")
print(f"\033[92m'token'\033[0m: {token}")
seq_length = len(token)
attn_mask = np.ones((seq_length, seq_length), dtype=np.int64)
tokens.append(np.array(token, dtype=np.int64))
targets.append(np.array(target, dtype=np.int64))
position_ids.append(np.arange(0, seq_length, dtype=np.int64))
attn_masks.append(attn_mask)
TILE = 32
length_to_pad = (max(map(len, tokens)) + TILE - 1) // TILE * TILE
token_batch, target_batch, position_id_batch, attention_mask_batch = [], [], [], [] # noqa: E501
for token, target, position_id, attn_mask in zip(
tokens, targets, position_ids, attn_masks):
attn_mask = np.pad(
attn_mask,
pad_width=((0, length_to_pad - len(token)), ),
mode='constant',
constant_values=0,
)
token = np.concatenate(
(token, np.zeros(length_to_pad - len(token), dtype=np.int64)))
target = np.concatenate((target,
np.full(length_to_pad - len(target),
-1,
dtype=np.int64)))
position_id = np.concatenate(
(position_id,
np.zeros(length_to_pad - len(position_id), dtype=np.int64)))
token_batch.append(token)
target_batch.append(target)
position_id_batch.append(position_id)
attention_mask_batch.append(attn_mask)
token_batch = torch.tensor(np.array(token_batch),
dtype=torch.int64).to(self.device)
target_batch = torch.tensor(np.array(target_batch),
dtype=torch.int64).to(self.device)
position_id_batch = torch.tensor(np.array(position_id_batch),
dtype=torch.int64).to(self.device)
attention_mask_batch = (torch.tensor(
np.array(attention_mask_batch), dtype=torch.int64) < 0.5).to(
self.device).bool().unsqueeze(1)
logits, *out_per_layers = self.model(token_batch,
position_id_batch,
attention_mask_batch,
log_attention_weights=None)
if is_main_process():
print(f"\033[92m'target_batch'\033[0m: {target_batch}")
return logits, target_batch
def get_ppl(self,
inputs: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get perplexity scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if the advanced features of PPLInferencer
are not needed.
Returns:
List[float]: A list of perplexity scores.
"""
logits, targets = self.get_logits(inputs)
loss_fn = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=-1)
loss = loss_fn(logits.view(-1, logits.size(-1)),
targets.view(-1)).view(targets.size())
from mmengine.dist import is_main_process
if is_main_process():
print(f"\033[92m'loss'\033[0m: {loss}")
if mask_length is not None:
mask = torch.zeros_like(targets) # [batch,seqlen]
for i in range(len(mask)):
for j in range(mask_length[i] - 1, len(mask[i])):
mask[i][j] = 1
loss = loss * mask
lens = (targets != -1).sum(-1).cpu().numpy()
if mask_length is not None:
lens -= np.array(mask_length)
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
if is_main_process():
print(f"\033[92m'lens'\033[0m: {lens}")
print(f"\033[92m'ce_loss'\033[0m: {ce_loss}")
return ce_loss
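A brief sketch (not part of the commit) of the mask-handling step in GLM130B.generate: the prompt is split on the [MASK]/[sMASK]/[gMASK] markers and the text segments are interleaved with the corresponding special tokens:

# Illustration of the splitting logic only; tokenization is left out.
import re

raw_text = 'The capital of France is [MASK] .'
mask_pattern = r'\[[sg]?MASK\]'
text_list = re.split(mask_pattern, raw_text)      # ['The capital of France is ', ' .']
pattern_list = re.findall(mask_pattern, raw_text)  # ['[MASK]']
# generate() tokenizes each text segment and appends the matching mask command
# token after it, then tokenizes the final segment; a [gMASK] is appended when
# the prompt contains no mask at all.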
import os
from typing import Dict, List, Optional, Union
import numpy as np
import torch
from opencompass.models.base import BaseModel
from opencompass.registry import MODELS
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
@MODELS.register_module()
class HuggingFace(BaseModel):
"""Model wrapper around HuggingFace general models.
Args:
path (str): The name or path to HuggingFace's model.
hf_cache_dir: Set the cache dir to HF model cache dir. If None, it will
use the env variable HF_MODEL_HUB. Defaults to None.
max_seq_len (int): The maximum length of the input sequence. Defaults
to 2048.
tokenizer_path (str): The path to the tokenizer. Defaults to None.
tokenizer_kwargs (dict): Keyword arguments for the tokenizer.
Defaults to {}.
tokenizer_only (bool): If True, only the tokenizer will be initialized.
Defaults to False.
model_kwargs (dict): Keyword arguments for the model, used in loader.
Defaults to dict(device_map='auto').
meta_template (Dict, optional): The model's meta prompt template,
if needed, e.g. when meta instructions must be injected into or
wrapped around the prompt.
extract_pred_after_decode (bool): Whether to extract the prediction
string from the decoded output string, instead of extracting the
prediction tokens before decoding. Defaults to False.
batch_padding (bool): If False, inference will be performed in a
for-loop without batch padding.
Note:
About ``extract_pred_after_decode``: Commonly, we should extract the
prediction tokens before decoding. But for some tokenizers using
``sentencepiece``, like LLaMA, this behavior may change the number of
whitespaces, which is harmful for Python programming tasks.
"""
def __init__(self,
path: str,
hf_cache_dir: Optional[str] = None,
max_seq_len: int = 2048,
tokenizer_path: Optional[str] = None,
tokenizer_kwargs: dict = dict(),
tokenizer_only: bool = False,
model_kwargs: dict = dict(device_map='auto'),
meta_template: Optional[Dict] = None,
extract_pred_after_decode: bool = False,
batch_padding: bool = False):
super().__init__(path=path,
max_seq_len=max_seq_len,
tokenizer_only=tokenizer_only,
meta_template=meta_template)
from opencompass.utils.fileio import patch_hf_auto_model
if hf_cache_dir is None:
hf_cache_dir = os.getenv('HF_MODEL_HUB', None)
patch_hf_auto_model(hf_cache_dir)
self.logger = get_logger()
self._load_tokenizer(path=path,
tokenizer_path=tokenizer_path,
tokenizer_kwargs=tokenizer_kwargs)
self.batch_padding = batch_padding
self.extract_pred_after_decode = extract_pred_after_decode
if not tokenizer_only:
self._load_model(path=path, model_kwargs=model_kwargs)
def _load_tokenizer(self, path: str, tokenizer_path: Optional[str],
tokenizer_kwargs: dict):
from transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(
tokenizer_path if tokenizer_path else path, **tokenizer_kwargs)
if self.tokenizer.pad_token_id is None:
self.logger.warning('pad_token_id is not set for the tokenizer. '
'Using eos_token_id as pad_token_id.')
self.tokenizer.pad_token = self.tokenizer.eos_token
# A patch for llama when batch_padding = True
if 'decapoda-research/llama' in path or \
(tokenizer_path and
'decapoda-research/llama' in tokenizer_path):
self.logger.warning('We set new pad_token_id for LLaMA model')
# keep consistent with official LLaMA repo
# https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb # noqa
self.tokenizer.bos_token = '<s>'
self.tokenizer.eos_token = '</s>'
self.tokenizer.pad_token_id = 0
def _load_model(self, path: str, model_kwargs: dict):
from transformers import AutoModel
model_kwargs.setdefault('torch_dtype', torch.float16)
self.model = AutoModel.from_pretrained(path, **model_kwargs)
self.model.eval()
# A patch for llama when batch_padding = True
if 'decapoda-research/llama' in path:
self.model.config.bos_token_id = 1
self.model.config.eos_token_id = 2
self.model.config.pad_token_id = self.tokenizer.pad_token_id
def generate(self, inputs: List[str], max_out_len: int) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[str]): A list of strings.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
if self.batch_padding and len(inputs) > 1:
return self._batch_generate(inputs=inputs, max_out_len=max_out_len)
else:
return sum((self._single_generate(inputs=[input_],
max_out_len=max_out_len)
for input_ in inputs), [])
def _batch_generate(self, inputs: List[str],
max_out_len: int) -> List[str]:
"""Support for batch prompts inference.
Args:
inputs (List[str]): A list of strings.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
if self.extract_pred_after_decode:
prompt_lens = [len(input_) for input_ in inputs]
# step-1: tokenize the input with batch_encode_plus
tokens = self.tokenizer.batch_encode_plus(inputs,
padding=True,
truncation=True,
max_length=self.max_seq_len - max_out_len)
tokens = {
k: torch.tensor(np.array(tokens[k]), device=self.model.device)
for k in tokens if k in ['input_ids', 'attention_mask']
}
# step-2: conduct model forward to generate output
outputs = self.model.generate(**tokens, max_new_tokens=max_out_len)
if not self.extract_pred_after_decode:
outputs = outputs[:, tokens['input_ids'].shape[1]:]
decodeds = self.tokenizer.batch_decode(outputs,
skip_special_tokens=True)
if self.extract_pred_after_decode:
decodeds = [
token[len_:] for token, len_ in zip(decodeds, prompt_lens)
]
return decodeds
def _single_generate(self, inputs: List[str],
max_out_len: int) -> List[str]:
"""Support for single prompt inference.
Args:
inputs (List[str]): A list of strings.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
if self.extract_pred_after_decode:
prompt_lens = [len(input_) for input_ in inputs]
input_ids = self.tokenizer(inputs,
truncation=True,
max_length=self.max_seq_len - max_out_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device)
outputs = self.model.generate(input_ids,
max_new_tokens=max_out_len)
if not self.extract_pred_after_decode:
outputs = outputs[:, input_ids.shape[1]:]
decodeds = self.tokenizer.batch_decode(outputs,
skip_special_tokens=True)
if self.extract_pred_after_decode:
decodeds = [
token[len_:] for token, len_ in zip(decodeds, prompt_lens)
]
return decodeds
def get_logits(self, inputs: List[str]):
if self.batch_padding and len(inputs) > 1:
# batch inference
tokens = self.tokenizer(inputs,
padding=True,
truncation=True,
max_length=self.max_seq_len)
tokens = {
k: torch.tensor(np.array(tokens[k]), device=self.model.device)
for k in tokens if k in ['input_ids', 'attention_mask']
}
outputs = self.model(**tokens)
else:
input_ids = self.tokenizer(
inputs,
padding=False,
truncation=True,
max_length=self.max_seq_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device)
tokens = {'input_ids': input_ids}
outputs = self.model(input_ids)
return outputs[0], {'tokens': tokens}
def get_ppl(self,
inputs: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get perplexity scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if advanced features in PPLInferencer are
not needed.
Returns:
List[float]: A list of perplexity scores.
"""
if self.batch_padding and len(inputs) > 1:
assert self.tokenizer.pad_token
return self._get_ppl(inputs, mask_length=mask_length)
else:
return np.concatenate([
self._get_ppl(inputs=[text], mask_length=mask_length)
for text in inputs
])
def _get_ppl(self,
inputs: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get perplexity scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if advanced features in PPLInferencer are
not needed.
Returns:
List[float]: A list of perplexity scores.
"""
outputs, inputs = self.get_logits(inputs)
shift_logits = outputs[..., :-1, :].contiguous()
shift_labels = inputs['tokens']['input_ids'][..., 1:].contiguous()
loss_fct = torch.nn.CrossEntropyLoss(
reduction='none', ignore_index=self.tokenizer.pad_token_id)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1)).view(shift_labels.size())
if mask_length is not None:
mask = torch.zeros_like(shift_labels) # [batch,seqlen]
for i in range(len(mask)):
for j in range(mask_length[i] - 1, len(mask[i])):
mask[i][j] = 1
loss = loss * mask
lens = (inputs['tokens']['input_ids'] !=
self.tokenizer.pad_token_id).sum(-1).cpu().numpy()
if mask_length is not None:
lens -= np.array(mask_length)
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
return ce_loss
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized strings.
Args:
prompt (str): Input string.
Returns:
int: Length of the input tokens
"""
return len(self.tokenizer.encode(prompt))
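# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original API): the per-sample loss in
# `HuggingFace._get_ppl` above is a shifted, token-level cross entropy that
# ignores padding. The helper below reproduces that math on dummy tensors so
# the shifting and length handling are easier to follow. `_demo_ppl_math` and
# its dummy shapes are assumptions made purely for demonstration.
def _demo_ppl_math():
    import torch

    batch, seqlen, vocab = 2, 6, 11
    pad_id = 0
    logits = torch.randn(batch, seqlen, vocab)
    input_ids = torch.randint(1, vocab, (batch, seqlen))
    input_ids[1, -2:] = pad_id  # pretend the second sample is padded

    # Shift so that the logits at position t predict the token at t + 1,
    # exactly as done in `_get_ppl`.
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = input_ids[..., 1:].contiguous()
    loss_fct = torch.nn.CrossEntropyLoss(reduction='none',
                                         ignore_index=pad_id)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1)).view(shift_labels.size())

    # Average each sample's summed loss over its non-padding tokens.
    lens = (input_ids != pad_id).sum(-1).numpy()
    return loss.sum(-1).detach().numpy() / lens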
@MODELS.register_module()
class HuggingFaceCausalLM(HuggingFace):
"""Model wrapper around HuggingFace CausalLM.
Args:
path (str): The name or path to HuggingFace's model.
hf_cache_dir: Set the cache dir to HF model cache dir. If None, it will
use the env variable HF_MODEL_HUB. Defaults to None.
max_seq_len (int): The maximum length of the input sequence. Defaults
to 2048.
tokenizer_path (str): The path to the tokenizer. Defaults to None.
tokenizer_kwargs (dict): Keyword arguments for the tokenizer.
Defaults to {}.
tokenizer_only (bool): If True, only the tokenizer will be initialized.
Defaults to False.
model_kwargs (dict): Keyword arguments for the model, used in loader.
Defaults to dict(device_map='auto').
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the model requires injecting or
wrapping any meta instructions.
batch_padding (bool): If False, inference will be performed in a
for-loop without batch padding. Defaults to False.
"""
def _load_model(self, path: str, model_kwargs: dict):
from transformers import AutoModelForCausalLM
model_kwargs.setdefault('torch_dtype', torch.float16)
self.model = AutoModelForCausalLM.from_pretrained(path, **model_kwargs)
self.model.eval()
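# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source): how the
# wrapper might be instantiated directly. The model name 'gpt2' and the
# float32 override are placeholders chosen to keep the sketch small and
# CPU-friendly; in OpenCompass the class is normally built from a config
# through the MODELS registry.
def _demo_causal_lm():
    demo_model = HuggingFaceCausalLM(
        path='gpt2',
        tokenizer_path='gpt2',
        max_seq_len=512,
        model_kwargs=dict(torch_dtype=torch.float32),
        batch_padding=False)
    # Continuation of a single prompt, limited to 16 new tokens.
    return demo_model.generate(['Hello, my name is'], max_out_len=16)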
import json
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
from opencompass.registry import MODELS
from opencompass.utils.prompt import PromptList
from .base_api import BaseAPIModel
PromptType = Union[PromptList, str]
@MODELS.register_module(name=['XunFei'])
class XunFei(BaseAPIModel):
"""Model wrapper around OpenAI-AllesAPIN.
Args:
path (str): The name of OpenAI's model.
max_seq_len (int): Unused here.
call_interval (float): The minimum time interval in seconds between two
calls to the API. Defaults to 1.
retry (int): Number of retires if the API call fails. Defaults to 2.
"""
def __init__(self,
path: str,
appid: str,
api_secret: str,
api_key: str,
query_per_second: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
retry: int = 2):
super().__init__(path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
meta_template=meta_template,
retry=retry)
import ssl
import threading
from urllib.parse import urlencode, urlparse
import websocket
self.urlencode = urlencode
self.websocket = websocket
self.websocket.enableTrace(False)
self.threading = threading
self.ssl = ssl
# weird auth keys
self.APISecret = api_secret
self.APIKey = api_key
self.appid = appid
self.hostname = urlparse(path).netloc
self.hostpath = urlparse(path).path
self.headers = {
'content-type': 'application/json',
}
def get_url(self):
from datetime import datetime
from time import mktime
from wsgiref.handlers import format_date_time
cur_time = datetime.now()
date = format_date_time(mktime(cur_time.timetuple()))
tmp = f'host: {self.hostname}\n'
tmp += 'date: ' + date + '\n'
tmp += 'GET ' + self.hostpath + ' HTTP/1.1'
import hashlib
import hmac
tmp_sha = hmac.new(self.APISecret.encode('utf-8'),
tmp.encode('utf-8'),
digestmod=hashlib.sha256).digest()
import base64
signature = base64.b64encode(tmp_sha).decode(encoding='utf-8')
authorization_origin = (f'api_key="{self.APIKey}", '
'algorithm="hmac-sha256", '
'headers="host date request-line", '
f'signature="{signature}"')
authorization = base64.b64encode(
authorization_origin.encode('utf-8')).decode(encoding='utf-8')
v = {
'authorization': authorization,
'date': date,
'host': self.hostname
}
url = self.path + '?' + self.urlencode(v)
return url
def generate(
self,
inputs: List[str or PromptList],
max_out_len: int = 512,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[str or PromptList]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs)))
return results
def _generate(
self,
input: str or PromptList,
max_out_len: int = 512,
) -> str:
"""Generate results given an input.
Args:
input (str or PromptList): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
# FIXME: messages only contains the last input
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
# word_ctr = 0
# TODO: Implement truncation in PromptList
for item in input:
msg = {'content': item['prompt']}
# if word_ctr >= self.max_seq_len:
# break
# if len(msg['content']) + word_ctr > self.max_seq_len:
# msg['content'] = msg['content'][word_ctr -
# self.max_seq_len:]
# word_ctr += len(msg['content'])
if item['role'] == 'HUMAN':
msg['role'] = 'user'
elif item['role'] == 'BOT':
msg['role'] = 'assistant'
messages.append(msg)
# in case the word break results in even number of messages
# if len(messages) > 0 and len(messages) % 2 == 0:
# messages = messages[:-1]
data = {
'header': {
'app_id': self.appid,
},
'parameter': {
'chat': {
'domain': 'general',
'max_tokens': max_out_len,
}
},
'payload': {
'message': {
'text': messages
}
}
}
msg = ''
err_code = None
err_data = None
content_received = self.threading.Event()
def on_open(ws):
nonlocal data
ws.send(json.dumps(data))
def on_message(ws, message):
nonlocal msg, err_code, err_data, content_received
err_data = json.loads(message)
err_code = err_data['header']['code']
if err_code != 0:
content_received.set()
ws.close()
else:
choices = err_data['payload']['choices']
status = choices['status']
msg += choices['text'][0]['content']
if status == 2:
content_received.set()
ws.close()
ws = self.websocket.WebSocketApp(self.get_url(),
on_message=on_message,
on_open=on_open)
ws.appid = self.appid
ws.question = messages[-1]['content']
for _ in range(self.retry):
self.wait()
ws.run_forever(sslopt={'cert_reqs': self.ssl.CERT_NONE})
content_received.wait()
if err_code == 0:
return msg.strip()
if err_code == 10013:
return err_data['header']['message']
raise RuntimeError(f'Code: {err_code}, data: {err_data}')
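# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the websocket URL
# built in `XunFei.get_url` is authenticated by HMAC-SHA256-signing the
# "host / date / request-line" string and base64-encoding the result. The
# helper below reproduces that signing scheme with obviously fake
# credentials so the steps can be inspected in isolation.
def _demo_xunfei_signature(api_key: str = 'fake-key',
                           api_secret: str = 'fake-secret',
                           host: str = 'example.invalid',
                           path: str = '/v1/chat') -> str:
    import base64
    import hashlib
    import hmac
    from datetime import datetime
    from time import mktime
    from urllib.parse import urlencode
    from wsgiref.handlers import format_date_time

    date = format_date_time(mktime(datetime.now().timetuple()))
    signing_input = f'host: {host}\ndate: {date}\nGET {path} HTTP/1.1'
    digest = hmac.new(api_secret.encode('utf-8'),
                      signing_input.encode('utf-8'),
                      digestmod=hashlib.sha256).digest()
    signature = base64.b64encode(digest).decode('utf-8')
    authorization_origin = (f'api_key="{api_key}", '
                            'algorithm="hmac-sha256", '
                            'headers="host date request-line", '
                            f'signature="{signature}"')
    authorization = base64.b64encode(
        authorization_origin.encode('utf-8')).decode('utf-8')
    # The query string that would be appended to the websocket url.
    return '?' + urlencode({
        'authorization': authorization,
        'date': date,
        'host': host,
    })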
"""Simple Dataset Reader."""
import random
from typing import Dict, List, Optional, Union
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.registry import ICL_DATASET_READERS
from opencompass.utils.types import (_check_dataset, _check_str,
_check_type_list)
@ICL_DATASET_READERS.register_module()
class DatasetReader:
"""In-conext Learning Dataset Reader Class Generate an DatasetReader
instance through 'dataset'.
Attributes:
dataset (:obj:`Dataset` or :obj:`DatasetDict`): The dataset to be read.
input_columns (:obj:`List[str]` or :obj:`str`): A list of column names
(a string of column name) in the dataset that represent(s) the
input field.
output_column (:obj:`str`): A column name in the dataset that
represents the prediction field.
input_template (:obj:`PromptTemplate`, optional): An instance of the
:obj:`PromptTemplate` class, used to format the input field
content during the retrieval process. (in some retrieval methods)
output_template (:obj:`PromptTemplate`, optional): An instance of the
:obj:`PromptTemplate` class, used to format the output field
content during the retrieval process. (in some learnable retrieval
methods)
train_split (str): The name of the training split. Defaults to 'train'.
train_range (int or float or str, optional): The size of the partial
training dataset to load.
If None, the entire training split will be loaded.
If int, a random subset of that many examples will be loaded; if
float, a random subset covering that fraction of the split will
be loaded.
If str, the subset given by the slice expression will be loaded
(e.g. "[:100]" for the first 100 examples, "[100:200]" for the
next 100, etc.). Defaults to None.
test_split (str): The name of the test split. Defaults to 'test'.
test_range (int or float or str, optional): The size of the partial
test dataset to load.
If None, the entire test split will be loaded.
If int, a random subset of that many examples will be loaded; if
float, a random subset covering that fraction of the split will
be loaded.
If str, the subset given by the slice expression will be loaded
(e.g. "[:100]" for the first 100 examples, "[100:200]" for the
next 100, etc.). Defaults to None.
"""
dataset = None
input_template = None
output_template = None
input_output_template = None
def __init__(self,
dataset: Union[Dataset, DatasetDict, str],
input_columns: Union[List[str], str],
output_column: str,
input_template: Optional[PromptTemplate] = None,
output_template: Optional[PromptTemplate] = None,
train_split: str = 'train',
train_range: Optional[Union[int, float, str]] = None,
test_split: str = 'test',
test_range: Optional[Union[int, float, str]] = None) -> None:
self.input_columns = _check_type_list(input_columns, [List, str])
if isinstance(self.input_columns, str):
self.input_columns = self.input_columns.split()
self.output_column = _check_str(output_column)
train_range = _check_type_list(train_range, [None, int, float, str])
test_range = _check_type_list(test_range, [None, int, float, str])
if input_template is not None:
self.input_template = PromptTemplate._check_prompt_template(
input_template)
if output_template is not None:
self.output_template = PromptTemplate._check_prompt_template(
output_template)
self.dataset = _check_dataset(dataset)
if isinstance(self.dataset, Dataset):
self.dataset = DatasetDict({
'train': self.dataset,
'test': self.dataset
})
# Normalize the dataset so that it has only "train" and "test" splits.
for origin_split, mapped_split, split_range in [[
train_split, 'train', train_range
], [test_split, 'test', test_range]]:
self.dataset[mapped_split] = load_partial_dataset(
self.dataset[origin_split], size=split_range)
def generate_input_field_prompt(self, entry: Dict) -> str:
"""Generate a prompt for the input field based on the provided
:obj:`entry` data.
Args:
entry (:obj:`Dict`): A piece of data to be used for generating the
prompt.
Returns:
:obj:`str`: The generated prompt.
"""
prompt = None
if self.input_template is None:
prompt = ' '.join([str(entry[ctx]) for ctx in self.input_columns])
else:
prompt = self.input_template.generate_item(entry)
return prompt
def generate_input_field_corpus(self,
dataset: Union[Dataset, DatasetDict],
split: Optional[str] = None) -> List[str]:
"""Generate corpus for input field.
Args:
dataset (:obj:`Dataset` or :obj:`DatasetDict`): A
:obj:`datasets.Dataset` or :obj:`datasets.DatasetDict`
instance.
split (:obj:`str`, optional): The split of the dataset to use. If
:obj:`None`, the entire dataset will be used. Defaults to
``None``.
Returns:
:obj:`List[str]`: A list of generated input field prompts.
"""
if split is not None:
dataset = dataset[split]
corpus = []
for entry in dataset:
corpus.append(self.generate_input_field_prompt(entry))
return corpus
def generate_output_field_prompt(self, entry: Dict) -> str:
"""Generate a prompt for the output field based on the provided
:obj:`entry` data.
Args:
entry (:obj:`Dict`): A piece of data to be used for generating the
prompt.
Returns:
:obj:`str`: The generated prompt.
"""
prompt = None
if self.output_template is None:
prompt = str(entry[self.output_column])
else:
prompt = self.output_template.generate_item(entry)
return prompt
def generate_output_field_corpus(self,
dataset: Union[Dataset, DatasetDict],
split: Optional[str] = None) -> List[str]:
"""Generate corpus for output field.
Args:
dataset (:obj:`Dataset` or :obj:`DatasetDict`): A
:obj:`datasets.Dataset` or :obj:`datasets.DatasetDict`
instance.
split (:obj:`str`, optional): The split of the dataset to use.
If :obj:`None`, the entire dataset will be used. Defaults to
``None``.
Returns:
:obj:`List[str]`: A list of generated output field prompts.
"""
if split is not None:
dataset = dataset[split]
corpus = []
for entry in dataset:
corpus.append(self.generate_output_field_prompt(entry))
return corpus
def generate_input_output_field_prompt(self, entry: Dict) -> str:
"""Generate a prompt for the input-output field based on the
provided:obj:`entry` data.
Args:
entry (:obj:`Dict`): A piece of data to be used for generating the
prompt.
Returns:
:obj:`str`: The generated prompt.
"""
prompt = None
if self.input_output_template is None:
prompt = ' '.join([entry[ctx] for ctx in self.input_columns] +
[str(entry[self.output_column])])
else:
prompt = self.input_output_template.generate_item(entry)
return prompt
def _check_dataset_reader(obj) -> 'DatasetReader':
if isinstance(obj, DatasetReader):
return obj
else:
raise TypeError(f'Expected a DatasetReader object, but got {obj}')
def __len__(self):
return len(self.dataset)
def __getitem__(self, idx):
return self.dataset[idx]
def __repr__(self):
return (f'DatasetReader({{\n dataset: {self.dataset},'
f'\n input_columns: {self.input_columns},\n'
f'    output_column: {self.output_column}\n}})')
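# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source): building a
# DatasetReader around a tiny in-memory dataset. The column names 'question'
# and 'answer' are placeholders for whatever the real dataset provides.
def _demo_dataset_reader():
    toy = Dataset.from_dict({
        'question': ['1+1=?', '2+2=?'],
        'answer': ['2', '4'],
    })
    reader = DatasetReader(toy,
                           input_columns=['question'],
                           output_column='answer')
    # Without an input_template, each prompt is the space-joined content of
    # the input columns.
    return reader.generate_input_field_corpus(reader.dataset, split='train')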
def load_partial_dataset(
dataset: Dataset,
size: Optional[Union[int, float, str]] = None) -> Dataset:
"""Load a partial dataset.
Args:
dataset (Dataset): A :obj:`datasets.Dataset` instance.
size (int or float or str, optional): The size of the partial
dataset to load. If None, the entire dataset will be loaded.
If int, a random subset of that many examples will be loaded; if
float, a random subset covering that fraction of the dataset will
be loaded. If str, the subset given by the slice expression
(e.g. "[:100]" for the first 100 examples, "[100:200]" for the
next 100, etc.) will be loaded. Defaults to None.
"""
total_size = len(dataset)
index_list = list(range(total_size))
if isinstance(size, (int, float)):
if size >= total_size or size <= 0:
return dataset
if size > 0 and size < 1:
size = int(size * total_size)
rand = random.Random(x=size)
rand.shuffle(index_list)
dataset = dataset.select(index_list[:size])
elif isinstance(size, str):
dataset = dataset.select(eval(f'index_list{size}'))
return dataset
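# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): how the three forms
# of `size` accepted by `load_partial_dataset` behave on a small in-memory
# dataset. The column name 'x' is an arbitrary choice for demonstration.
def _demo_load_partial_dataset():
    toy = Dataset.from_dict({'x': list(range(10))})
    subset_int = load_partial_dataset(toy, size=3)       # random 3 examples
    subset_float = load_partial_dataset(toy, size=0.5)   # random 50% subset
    subset_str = load_partial_dataset(toy, size='[:4]')  # first 4 examples
    return len(subset_int), len(subset_float), len(subset_str)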
class DatasetEncoder(torch.utils.data.Dataset):
def __init__(self,
datalist: List,
model_name=None,
tokenizer=None) -> None:
self.datalist = datalist
if model_name is None and tokenizer is None:
raise ValueError('model_name and tokenizer cannot both be None')
if tokenizer is not None:
self.tokenizer = tokenizer
else:
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.tokenizer.pad_token = self.tokenizer.eos_token
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.tokenizer.padding_side = 'left'
self.encode_dataset = []
self.init_dataset()
self.datalist_length = len(self.encode_dataset)
def init_dataset(self):
for idx, data in enumerate(self.datalist):
tokenized_data = self.tokenizer.encode_plus(data,
truncation=True,
return_tensors='pt',
verbose=False)
self.encode_dataset.append({
'input_ids':
tokenized_data.input_ids[0],
'attention_mask':
tokenized_data.attention_mask[0],
'metadata': {
'id': idx,
'len': len(tokenized_data.input_ids[0]),
'text': data
}
})
def __len__(self):
return self.datalist_length
def __getitem__(self, idx):
return self.encode_dataset[idx]
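# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source): wrapping a
# few raw strings with DatasetEncoder and inspecting one encoded item. The
# tokenizer name 'gpt2' is a placeholder for any HuggingFace tokenizer.
def _demo_dataset_encoder():
    encoder = DatasetEncoder(['hello world', 'in-context learning'],
                             model_name='gpt2')
    first = encoder[0]
    return first['metadata']['len'], first['input_ids'].shape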
from opencompass.registry import ICL_EVALUATORS
from opencompass.utils.text_postprocessors import general_postprocess
from .icl_base_evaluator import BaseEvaluator
@ICL_EVALUATORS.register_module()
class EMEvaluator(BaseEvaluator):
"""Exact match evaluator."""
def __init__(self) -> None:
super().__init__()
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
predictions = [
general_postprocess(prediction) for prediction in predictions
]
processed_answers = [[general_postprocess(j) for j in i]
for i in references]
cnt = 0
for pred, ans, origin_ans in zip(predictions, processed_answers,
references):
if pred in ans or pred in origin_ans:
cnt += 1
score = cnt / len(predictions) * 100
return {'score': score}
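# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source): each
# reference passed to EMEvaluator is a list of acceptable answers, and both
# sides are normalized with `general_postprocess` before comparison.
def _demo_em_evaluator():
    em = EMEvaluator()
    return em.score(predictions=['Paris', '42'],
                    references=[['Paris', 'paris'], ['forty-two', '42']])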
from typing import List
import evaluate
from opencompass.registry import ICL_EVALUATORS
from .icl_base_evaluator import BaseEvaluator
class HuggingfaceEvaluator(BaseEvaluator):
"""Use huggingface evaluate module to calculate the target metrics.
Args:
metric (str): Metric name in evaluate module.
"""
def __init__(self, metric: str) -> None:
self.metric = metric
super().__init__()
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
return {
'predictions': predictions,
'references': references,
}
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
return scores
def score(self, predictions: List, references: List) -> dict:
"""Calculate scores.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: calculated scores.
"""
if len(predictions) != len(references):
return {'error': 'predictions and references have different '
f'length. len(predictions): {len(predictions)}, '
f'len(references): {len(references)}'}
metric = evaluate.load(self.metric)
scores = metric.compute(**self._preprocess(predictions, references))
return self._postprocess(scores)
@ICL_EVALUATORS.register_module()
class AccEvaluator(HuggingfaceEvaluator):
"""Accuracy evaluator."""
def __init__(self) -> None:
super().__init__(metric='accuracy')
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
mapping_to_int_dict = {
label: idx
for idx, label in enumerate(set(map(str, references)))
}
pred_set = set(predictions)
for pred in pred_set:
if str(pred) not in mapping_to_int_dict.keys():
mapping_to_int_dict[str(pred)] = len(mapping_to_int_dict)
golds = [mapping_to_int_dict[str(gold)] for gold in references]
preds = [mapping_to_int_dict[str(pred)] for pred in predictions]
return {
'predictions': preds,
'references': golds,
}
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
scores["accuracy"] *= 100
return scores
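# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source): AccEvaluator
# maps string labels (and any unseen predictions) to integer ids before
# delegating to the huggingface 'accuracy' metric, so free-form labels such
# as 'A'/'B'/'C'/'D' can be scored directly. Loading the metric requires the
# `evaluate` package to be able to fetch it.
def _demo_acc_evaluator():
    acc = AccEvaluator()
    return acc.score(predictions=['A', 'B', 'B'], references=['A', 'B', 'C'])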
@ICL_EVALUATORS.register_module()
class RougeEvaluator(HuggingfaceEvaluator):
"""Rouge evaluator."""
def __init__(self) -> None:
super().__init__(metric='rouge')
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
return {k: v * 100 for k, v in scores.items()}
@ICL_EVALUATORS.register_module()
class BleuEvaluator(HuggingfaceEvaluator):
"""Bleu evaluator."""
def __init__(self) -> None:
super().__init__(metric='sacrebleu')
@ICL_EVALUATORS.register_module()
class MccEvaluator(AccEvaluator):
"""Matthews correlation evaluator."""
def __init__(self) -> None:
super(AccEvaluator, self).__init__(metric='matthews_correlation')
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
scores["matthews_correlation"] *= 100
return scores
@ICL_EVALUATORS.register_module()
class SquadEvaluator(HuggingfaceEvaluator):
"""Squad evaluator."""
def __init__(self) -> None:
super().__init__(metric='squad')
def _preprocess(self, predictions: List, references: List) -> dict:
"""Preprocess the final predictions and references to needed format.
Args:
predictions (List): List of predictions of each sample.
references (List): List of targets for each sample.
Returns:
dict: preprocessed results.
"""
p_list = [{
'prediction_text': pred.split('\n')[0],
'id': str(i)
} for i, pred in enumerate(predictions)]
r_list = [{
'answers': {
'answer_start': [0],
'text': [ref]
},
'id': str(i)
} for i, ref in enumerate(references)]
return {
'predictions': p_list,
'references': r_list,
}
def _postprocess(self, scores: dict) -> dict:
"""Postprocess for final scores.
Args:
scores (dict): Dict of calculated scores of metrics.
Returns:
dict: postprocessed scores.
"""
return scores['f1']
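# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original source): the
# SquadEvaluator only needs plain strings; `_preprocess` wraps them into the
# id/answers structure expected by the huggingface 'squad' metric, keeping
# only the first line of each prediction.
def _demo_squad_evaluator():
    squad = SquadEvaluator()
    return squad.score(
        predictions=['Denver Broncos\nsome trailing rationale'],
        references=['Denver Broncos'])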