Commit c94cc943 authored by Leymore, committed by gaotong

Add release contribution

parent e6b5bdcb
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class AXDataset_V2(BaseDataset):
@staticmethod
def load(path: str):
dataset = []
with open(path, 'r') as f:
for line in f:
line = json.loads(line)
line['label'] = {
'entailment': 'A',
'not_entailment': 'B'
}[line['label']]
dataset.append(line)
return Dataset.from_list(dataset)
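# A usage sketch of the loader above, assuming a JSONL file in which each
# line is an NLI example whose 'label' is either 'entailment' or
# 'not_entailment'; the loader remaps those labels to 'A' / 'B'. The path
# below is hypothetical.
#
# >>> ds = AXDataset_V2.load('data/SuperGLUE/AX-b/AX-b.jsonl')
# >>> ds[0]['label'] in ('A', 'B')
# True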
from abc import abstractmethod
from typing import Dict, Optional, Union
from datasets import Dataset, DatasetDict
from opencompass.openicl import DatasetReader
class BaseDataset:
    def __init__(self, reader_cfg: Optional[Dict] = None, **kwargs):
        self.dataset = self.load(**kwargs)
        self._init_reader(**(reader_cfg or {}))
def _init_reader(self, **kwargs):
self.reader = DatasetReader(self.dataset, **kwargs)
@property
def train(self):
return self.reader.dataset['train']
@property
def test(self):
return self.reader.dataset['test']
    @staticmethod
    @abstractmethod
def load(**kwargs) -> Union[Dataset, DatasetDict]:
pass
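# A minimal sketch of the contract subclasses follow: `load` returns a
# `Dataset` or `DatasetDict`, while `reader_cfg` (e.g. input_columns and
# output_column) is forwarded to `DatasetReader`, which backs the `train` and
# `test` properties. The class and column names below are illustrative.
#
# class ToyDataset(BaseDataset):
#
#     @staticmethod
#     def load(path: str) -> Dataset:
#         return Dataset.from_list([{'question': 'Q1', 'answer': 'A1'}])
#
# ds = ToyDataset(reader_cfg=dict(input_columns=['question'],
#                                 output_column='answer'),
#                 path='unused')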
import json
import os.path as osp
import re
from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
TEXT_POSTPROCESSORS)
from .base import BaseDataset
@LOAD_DATASET.register_module()
class BBHDataset(BaseDataset):
@staticmethod
def load(path: str, name: str):
with open(osp.join(path, f'{name}.json'), 'r') as f:
data = json.load(f)['examples']
dataset = Dataset.from_list(data)
return dataset
@TEXT_POSTPROCESSORS.register_module('bbh-mcq')
def bbh_mcq_postprocess(text: str) -> str:
ans = text
ans_line = ans.split('answer is ')
if len(ans_line) != 1:
ans = ans_line[1].strip()
match = re.search(r'\(([A-Z])\)*', ans)
if match:
return match.group(1)
match = re.search(r'([A-Z])', ans)
if match:
return match.group(1)
return ans
@TEXT_POSTPROCESSORS.register_module('bbh-freeform')
def bbh_freeform_postprocess(text: str) -> str:
ans = text
ans_line = ans.split('answer is ')
if len(ans_line) != 1:
ans = ans_line[1].strip()
ans = ans.split('\n')[0]
if ans.endswith('.'):
ans = ans[:-1]
return ans
@ICL_EVALUATORS.register_module()
class BBHEvaluator(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {
                'error': 'predictions and references have different '
                'lengths'
}
predictions = [bbh_freeform_postprocess(pred) for pred in predictions]
cnt = 0
for pred, ref in zip(predictions, references):
if pred == ref:
cnt += 1
score = cnt / len(predictions) * 100
return {'score': score}
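# A small worked example of the two postprocessors and the evaluator above;
# the strings are made up for illustration.
#
# >>> bbh_mcq_postprocess('... so the answer is (B).')
# 'B'
# >>> bbh_freeform_postprocess('... the answer is 42.\nIgnored trailing text')
# '42'
# >>> BBHEvaluator().score(['the answer is 42.'], ['42'])
# {'score': 100.0}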
import json
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class BoolQDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
def preprocess(example):
if example['label'] == 'true':
example['answer'] = 1
else:
example['answer'] = 0
return example
dataset = dataset.map(preprocess)
return dataset
@LOAD_DATASET.register_module()
class BoolQDataset_V2(BaseDataset):
@staticmethod
def load(path):
dataset = []
with open(path, 'r') as f:
for line in f:
line = json.loads(line)
line['label'] = {'true': 'A', 'false': 'B'}[line['label']]
dataset.append(line)
return Dataset.from_list(dataset)
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class bustumDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
for line in f:
line = json.loads(line)
line['label'] = 'AB'[int(line['label'])]
data.append(line)
return Dataset.from_list(data)
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class CMRCDataset(BaseDataset):
@staticmethod
def load(path: str):
with open(path) as f:
data = json.load(f)
        # Convert the raw data into the required format
rows = []
for index, paragraphs in enumerate(data['data']):
for paragraph in paragraphs['paragraphs']:
context = paragraph['context']
for question in paragraph['qas']:
answers = question['answers']
unique_answers = list(set([a['text'] for a in answers]))
rows.append({
'context': context,
'question': question['question'],
'answers': unique_answers
})
        # Build the Dataset
dataset = Dataset.from_dict({
'context': [row['context'] for row in rows],
'question': [row['question'] for row in rows],
'answers': [row['answers'] for row in rows]
})
return dataset
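# The loader above expects CMRC/SQuAD-style JSON, roughly of the form
# {"data": [{"paragraphs": [{"context": "...",
#                            "qas": [{"question": "...",
#                                     "answers": [{"text": "..."}, ...]}]}]}]}
# and flattens it into one row per question, de-duplicating answer texts with
# set() along the way.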
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class GovRepcrsDataset(BaseDataset):
@staticmethod
def load(path: str):
import json
import os
dataset_dict = DatasetDict()
splits = ['train', 'valid', 'test']
dataset_lists = {x: [] for x in splits}
for split in splits:
split_fp = os.path.join(path, 'gov-report', 'split_ids',
'crs_' + split + '.ids')
with open(split_fp, 'r') as f:
for line in f.readlines():
xpath = os.path.join(path, 'gov-report', 'crs',
line.strip() + '.json')
with open(xpath, 'r') as df:
data = json.load(df)
content = data['title'] + '\n' + '\n'.join(
[(x['section_title'] if x['section_title'] else '')
+ '\n' + '\n'.join(x['paragraphs'])
for x in data['reports']['subsections']])
summary = '\n'.join(data['summary'])
dataset_lists[split].append({
'content': content,
'summary': summary,
})
dataset_dict[split] = Dataset.from_list(dataset_lists[split])
return dataset_dict
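# Directory layout assumed by the loader above (derived from the paths it
# builds):
#
#   <path>/gov-report/split_ids/crs_{train,valid,test}.ids  one report id per line
#   <path>/gov-report/crs/<id>.json                         report with 'title',
#                                                           'reports.subsections'
#                                                           and 'summary' fields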
from datasets import load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class IWSLT2017Dataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
dataset = dataset.map(lambda example: example['translation']
).remove_columns('translation')
return dataset
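# A sketch of what the map above does: the nested 'translation' dict of each
# example (e.g. {'de': 'Hallo Welt', 'en': 'Hello world'}) is promoted to
# top-level columns, after which the original 'translation' column is
# dropped. The language pair is illustrative.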
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class NarrativeQADataset(BaseDataset):
@staticmethod
def load(path: str):
import csv
import os
dataset_dict = DatasetDict()
splits = ['train', 'valid', 'test']
dataset_lists = {x: [] for x in splits}
with open(os.path.join(path, 'qaps.csv'), 'r') as f:
reader = csv.reader(f, delimiter=',')
for row in reader:
if row[1] == 'set':
continue
split = row[1] # set
answers = [row[3], row[4]] # row['answer1'], row['answer2']
question = row[2] # question
x_path = os.path.join(path, 'tmp',
row[0] + '.content') # document_id
                try:
                    with open(x_path, 'r', encoding='utf-8') as f:
                        evidence = f.read(100000)
                except:  # noqa: E722
                    # skip entries whose story content file is missing or
                    # unreadable
                    continue
dataset_lists[split].append({
'answer': answers,
'question': question,
'evidence': evidence,
})
for split in splits:
dataset_dict[split] = Dataset.from_list(dataset_lists[split])
return dataset_dict
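# Expected inputs of the loader above (inferred from the column indices it
# reads): qaps.csv has columns
#   row[0] document_id, row[1] set (train/valid/test), row[2] question,
#   row[3] answer1, row[4] answer2,
# and each story's text lives in <path>/tmp/<document_id>.content, of which
# at most the first 100000 characters are used as evidence.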
import json
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class WSCDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
def preprocess(example):
text_list = example['text'].split(' ')
assert ' ' not in example['target']['span2_text']
# span1 may have 1 or more than 1 words
# span2 is the pronoun and has only 1 word
text_list[example['target']
['span2_index']] = example['target']['span1_text']
example['new_text'] = ' '.join(text_list)
if example['label'] == 'true':
example['answer'] = 1
else:
example['answer'] = 0
example['span1'] = example['target']['span1_text']
example['span2'] = example['target']['span2_text']
del example['target']
return example
dataset = dataset.map(preprocess)
return dataset
@LOAD_DATASET.register_module()
class WSCDataset_V2(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r') as f:
for line in f:
line = json.loads(line)
item = {
'span1': line['target']['span1_text'],
'span2': line['target']['span2_text'],
'text': line['text'],
'label': {
'true': 'A',
'false': 'B'
}[line['label']],
}
data.append(item)
return Dataset.from_list(data)
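# A worked example of the WSC preprocessing above (values are made up): the
# pronoun at span2_index is replaced in place by span1_text to build
# 'new_text'.
#
#   text        = 'Mark told Pete many lies about himself .'
#   span1_text  = 'Mark', span2_text = 'himself', span2_index = 6
#   new_text    = 'Mark told Pete many lies about Mark .'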
import os
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
from opencompass.registry import MODELS
from opencompass.utils.prompt import PromptList
from .base_api import BaseAPIModel
PromptType = Union[PromptList, str]
@MODELS.register_module()
class OpenAI(BaseAPIModel):
"""Model wrapper around OpenAI's models.
Args:
path (str): The name of OpenAI's model.
max_seq_len (int): The maximum allowed sequence length of a model.
Note that the length of prompt + generated tokens shall not exceed
this value. Defaults to 2048.
query_per_second (int): The maximum queries allowed per second
between two consecutive calls of the API. Defaults to 1.
        retry (int): Number of retries if the API call fails. Defaults to 2.
        key (str): OpenAI API key. If set to "ENV", the key will be read from
            the environment variable $OPENAI_API_KEY, matching the openai
            package's default behaviour. Defaults to 'ENV'.
        meta_template (Dict, optional): The model's meta prompt template, if
            any meta instructions need to be injected into or wrapped around
            the prompt.
openai_api_base (str): The base url of OpenAI's API. Defaults to
'https://api.openai.com/v1'.
"""
is_api: bool = True
def __init__(self,
path: str,
max_seq_len: int = 2048,
query_per_second: int = 1,
retry: int = 2,
key: str = 'ENV',
meta_template: Optional[Dict] = None,
openai_api_base: str = 'https://api.openai.com/v1'):
super().__init__(path=path,
max_seq_len=max_seq_len,
meta_template=meta_template,
query_per_second=query_per_second,
retry=retry)
import openai
import tiktoken
self.openai = openai
self.tiktoken = tiktoken
self.openai.api_key = os.getenv(
'OPENAI_API_KEY') if key == 'ENV' else key
        self.openai.api_base = openai_api_base
def generate(
self,
        inputs: List[PromptType],
max_out_len: int = 512,
temperature: float = 0.7,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
            inputs (List[str or PromptList]): A list of strings or
                PromptLists. Each PromptList should be organized in
                OpenCompass' API format.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use,
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic. Defaults to 0.7.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs),
[temperature] * len(inputs)))
return results
    def _generate(self, input: PromptType, max_out_len: int,
                  temperature: float) -> str:
"""Generate results given a list of inputs.
Args:
inputs (str or PromptList): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use,
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic.
Returns:
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
# max num token for gpt-3.5-turbo is 4097
max_out_len = min(max_out_len, 4000 - self.get_token_len(str(input)))
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
for item in input:
msg = {'content': item['prompt']}
if item['role'] == 'HUMAN':
msg['role'] = 'user'
elif item['role'] == 'BOT':
msg['role'] = 'assistant'
elif item['role'] == 'SYSTEM':
msg['role'] = 'system'
messages.append(msg)
        max_num_retries = 0
        while max_num_retries < self.retry:
            self.wait()
            try:
                response = self.openai.ChatCompletion.create(
                    model=self.path,
                    messages=messages,
                    max_tokens=max_out_len,
                    n=1,
                    stop=None,
                    temperature=temperature,
                )
                # Return as soon as a completion is received instead of
                # burning the remaining retries on successful calls.
                return response.choices[0].message.content.strip()
            except self.openai.error.RateLimitError:
                pass
            max_num_retries += 1
        raise RuntimeError(
            f'OpenAI API call failed after {self.retry} retries.')
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized string. Only English and Chinese
characters are counted for now. Users are encouraged to override this
method if more accurate length is needed.
Args:
prompt (str): Input string.
Returns:
int: Length of the input tokens
"""
enc = self.tiktoken.encoding_for_model(self.path)
return len(enc.encode(prompt))
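# A usage sketch, assuming a valid key in $OPENAI_API_KEY; the model name and
# the returned text are illustrative.
#
# >>> model = OpenAI(path='gpt-3.5-turbo', max_seq_len=4096, retry=3)
# >>> model.generate(['Say hi in one word.'], max_out_len=16)
# ['Hi.']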
"""Basic Inferencer."""
import json
import os
from pathlib import Path
from typing import List, Optional
import numpy as np
from mmengine.dist import is_main_process
from torch.utils.data import DataLoader
from ..icl_prompt_template import PromptTemplate
from ..icl_retriever import BaseRetriever
class BaseInferencer:
"""Base Inferencer class for all evaluation Inferencer.
Attributes:
model (:obj:`BaseModel`, optional): The module to inference.
max_model_token_num (:obj:`int`, optional): Maximum number of
tokenized words allowed by the LM.
batch_size (:obj:`int`, optional): Batch size for the
:obj:`DataLoader`.
output_json_filepath (:obj:`str`, optional): File path for output
`JSON` file.
output_json_filename (:obj:`str`, optional): File name for output
`JSON` file.
api_name (:obj:`str`, optional): Name of API service.
call_api (:obj:`bool`): If ``True``, an API for LM models will be used,
determined by :obj:`api_name`.
"""
model = None
def __init__(
self,
model,
max_seq_len: Optional[int] = None,
batch_size: Optional[int] = 1,
output_json_filepath: Optional[str] = './icl_inference_output',
output_json_filename: Optional[str] = 'predictions',
**kwargs,
) -> None:
self.model = model
self.max_seq_len = max_seq_len
self.batch_size = batch_size
self.output_json_filepath = output_json_filepath
self.output_json_filename = output_json_filename
self.is_main_process = is_main_process()
if not os.path.exists(self.output_json_filepath):
os.makedirs(self.output_json_filepath)
def inference(self,
retriever: BaseRetriever,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None,
output_json_filepath: Optional[str] = None,
output_json_filename: Optional[str] = None) -> List:
"""Perform In-Context Inference given a retriever and optional
templates.
Args:
retriever (:obj:`BaseRetriever`): An instance of a Retriever class
that will be used to retrieve in-context examples
ice_template (:obj:`PromptTemplate`, optional): A template for
generating the in-context examples prompt. Defaults to None.
prompt_template (:obj:`PromptTemplate`, optional): A template for
generating the final prompt. Defaults to None.
output_json_filepath (:obj:`str`, optional): The file path to save
the results as a `JSON` file. Defaults to None.
output_json_filename (:obj:`str`, optional): The file name to save
the results as a `JSON` file. Defaults to None.
Raises:
NotImplementedError: If the function is not implemented in the
subclass.
Returns:
            :obj:`List`: A list of strings, each representing the result of
            one inference.
"""
raise NotImplementedError("Method hasn't been implemented yet")
@staticmethod
def get_dataloader(datalist: List[List], batch_size: int) -> DataLoader:
"""Return a dataloader of the input data list."""
dataloader = DataLoader(datalist,
batch_size=batch_size,
collate_fn=lambda x: x)
return dataloader
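# The identity collate_fn above keeps each batch as a plain Python list of
# prompts instead of letting DataLoader try to stack or tensorize them, e.g.:
#
# >>> list(BaseInferencer.get_dataloader(['p1', 'p2', 'p3'], batch_size=2))
# [['p1', 'p2'], ['p3']]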
def dump_results_dict(results_dict, filename):
with open(filename, 'w', encoding='utf-8') as json_file:
json.dump(results_dict, json_file, indent=4, ensure_ascii=False)
class GenInferencerOutputHandler:
origin_prompt_dict = {}
output_dict = {}
prediction_dict = {}
results_dict = {}
def __init__(self) -> None:
self.results_dict = {}
def write_to_json(self, save_dir: str, filename: str):
"""Dump the result to a json file."""
dump_results_dict(self.results_dict, Path(save_dir) / filename)
def save_results(self, origin_prompt, prediction, idx):
self.results_dict[str(idx)] = {
'origin_prompt': origin_prompt,
'prediction': prediction,
}
class PPLInferencerOutputHandler:
results_dict = {}
def __init__(self) -> None:
self.results_dict = {}
def write_to_json(self, save_dir: str, filename: str):
"""Dump the result to a json file."""
dump_results_dict(self.results_dict, Path(save_dir) / filename)
def save_ice(self, ice):
for idx, example in enumerate(ice):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['in-context examples'] = example
def save_predictions(self, predictions):
for idx, prediction in enumerate(predictions):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['prediction'] = prediction
def save_prompt_and_ppl(self, label, input, prompt, ppl, idx):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
if 'label: ' + str(label) not in self.results_dict[str(idx)].keys():
self.results_dict[str(idx)]['label: ' + str(label)] = {}
self.results_dict[str(idx)]['label: ' +
str(label)]['testing input'] = input
self.results_dict[str(idx)]['label: ' + str(label)]['prompt'] = prompt
self.results_dict[str(idx)]['label: ' + str(label)]['PPL'] = ppl
def save_prompt_and_condprob(self, input, prompt, cond_prob, idx, choices):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
# TODO:
# for single token situation, the input will always be yes currently
self.results_dict[str(idx)]['testing input'] = input
self.results_dict[str(idx)]['prompt'] = prompt
# TODO: hard code here
self.results_dict[str(idx)]['choices'] = choices
        # To compute AUC scores later, store the scores as the prediction
self.results_dict[str(idx)]['prediction'] = cond_prob
# set pred label in case needed
self.results_dict[str(idx)]['pred_label'] = int(np.argmax(cond_prob))
"""MDL Retriever."""
from typing import List, Optional
import numpy as np
import torch
import tqdm
from transformers import AutoModelForCausalLM
from opencompass.openicl import PromptTemplate
from opencompass.openicl.icl_retriever.icl_topk_retriever import TopkRetriever
from opencompass.openicl.utils.logging import get_logger
from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS
logger = get_logger(__name__)
@ICL_RETRIEVERS.register_module()
class MDLRetriever(TopkRetriever):
"""MDL Retriever, subclass of `TopkRetriever`. MDL is a abbreviation of
Minimum Description Length, specially designed for ppl evaluation. You may
refer to the paper for more details: https://arxiv.org/pdf/2212.10375.pdf.
Args:
dataset (`BaseDataset`): Any BaseDataset instances.
Attributes of ``reader``, ``train`` and ``test`` will be used.
ice_separator (`Optional[str]`): The separator between each in-context
example template when origin `PromptTemplate` is provided. Defaults
to '\n'.
ice_eos_token (`Optional[str]`): The end of sentence token for
in-context example template when origin `PromptTemplate` is
provided. Defaults to '\n'.
ice_num (`Optional[int]`): The number of in-context example template
when origin `PromptTemplate` is provided. Defaults to 1.
sentence_transformers_model_name (`Optional[str]`): The name of the
sentence transformers model. Defaults to 'all-mpnet-base-v2'.
tokenizer_name (`Optional[str]`): The name of the tokenizer. Defaults
to 'gpt2-xl'.
batch_size (`Optional[int]`): The batch size for the dataloader.
Defaults to 1.
candidate_num (`Optional[int]`): The number of candidates to retrieve
for each example. Defaults to 1.
ce_model_name (`Optional[str]`): The name of the model for calculating
MDL. Defaults to 'gpt2-xl'.
select_time (`Optional[int]`): The number of times to select MDL.
Defaults to 5.
ice_template (`Optional[PromptTemplate]`): The template for in-context
example. Defaults to None.
prompt_template (`Optional[PromptTemplate]`): The template for prompt.
Defaults to None.
labels (`Optional[List]`): The labels for calculating MDL. Defaults to
None.
seed (`Optional[int]`): The seed for random. Defaults to 1.
"""
metric_model = None
def __init__(self,
dataset,
ice_separator: Optional[str] = '\n',
ice_eos_token: Optional[str] = '\n',
ice_num: Optional[int] = 1,
sentence_transformers_model_name: Optional[
str] = 'all-mpnet-base-v2',
tokenizer_name: Optional[str] = 'gpt2-xl',
batch_size: Optional[int] = 1,
candidate_num: Optional[int] = 1,
ce_model_name: Optional[str] = 'gpt2-xl',
select_time: Optional[int] = 5,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None,
labels: Optional[List] = None,
seed: Optional[int] = 1) -> None:
super().__init__(dataset, ice_separator, ice_eos_token, ice_num,
sentence_transformers_model_name, tokenizer_name,
batch_size)
self.ce_model_name = ce_model_name
self.candidate_num = candidate_num
self.select_time = select_time
self.ice_template = ICL_PROMPT_TEMPLATES.build(ice_template)
if prompt_template is not None:
self.prompt_template = ICL_PROMPT_TEMPLATES.build(prompt_template)
else:
self.prompt_template = None
self.labels = labels
self.seed = seed
def topk_search(self):
np.random.seed(self.seed)
res_list = self.forward(self.dataloader)
rtr_idx_list = [[] for _ in range(len(res_list))]
logger.info('Retrieving data for test set...')
for entry in tqdm.tqdm(res_list, disable=not self.is_main_process):
idx = entry['metadata']['id']
embed = np.expand_dims(entry['embed'], axis=0)
near_ids = self.index.search(
embed, min(self.candidate_num,
len(self.index_ds)))[1][0].tolist()
candidates = []
mdl_scores = []
for j in range(self.select_time):
if j == 0:
rand_idx_list = near_ids[:self.ice_num]
else:
rand_idx_list = np.random.choice(near_ids,
self.ice_num,
replace=False)
rand_idx_list = [int(i) for i in rand_idx_list]
candidates.append(rand_idx_list)
ice = self.generate_ice(rand_idx_list,
ice_template=self.ice_template)
ice = str(ice)
mask_length = len(
self.tokenizer(ice + self.ice_eos_token,
verbose=False)['input_ids'])
if self.labels is None:
labels = self.get_labels(self.ice_template,
self.prompt_template)
else:
labels = self.labels
prompt_list = []
for label in labels:
prompt = self.generate_label_prompt(
idx, ice, label, self.ice_template,
self.prompt_template)
prompt = str(prompt)
prompt_list.append(prompt)
loss_list = self.cal_ce(prompt_list, mask_length=mask_length)
probs = np.exp(-np.array(loss_list))
normalized_probs = probs / probs.sum(0, keepdims=True)
neg_entropy = -entropy(normalized_probs, label_dim=0)
mdl_scores.append(neg_entropy)
rtr_idx_list[idx] = candidates[mdl_scores.index(max(mdl_scores))]
rtr_idx_list[idx] = [int(i) for i in rtr_idx_list[idx]]
return rtr_idx_list
def retrieve(self):
"""Retrieve the in-context example index for each test example."""
return self.topk_search()
def cal_ce(self, input_texts: List[str], mask_length=None):
if self.metric_model is None:
logger.info(
f'Load model {self.ce_model_name} for calculating MDL...')
self.metric_model = AutoModelForCausalLM.from_pretrained(
self.ce_model_name)
self.metric_model.to(self.device)
inputs = self.tokenizer(input_texts,
padding=True,
return_tensors='pt',
truncation=True)
inputs = {k: v.to(self.device) for k, v in inputs.items()}
outputs = self.metric_model(**inputs)
shift_logits = outputs.logits[..., :-1, :].contiguous()
shift_labels = inputs['input_ids'][..., 1:].contiguous()
loss_fct = torch.nn.CrossEntropyLoss(
reduction='none', ignore_index=self.tokenizer.pad_token_id)
shift_logits = shift_logits.view(-1, shift_logits.size(-1))
loss = loss_fct(shift_logits,
shift_labels.view(-1)).view(shift_labels.size())
if mask_length is not None:
mask = torch.cat([
torch.zeros([loss.shape[0], mask_length], dtype=torch.float),
torch.ones([loss.shape[0], loss.shape[-1] - mask_length],
dtype=torch.float)
], -1)
mask = mask.to(self.device)
loss = torch.mul(mask, loss)
lens = (inputs['input_ids'] !=
self.tokenizer.pad_token_id).sum(-1).cpu().numpy()
if mask_length is not None:
lens -= mask_length
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
return ce_loss
def entropy(probs: np.ndarray, label_dim: int = 0, mask=None):
if mask is None:
return -(probs * np.log(probs)).sum(label_dim)
return -(mask * probs * np.log(probs)).sum(label_dim)
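# A numeric sketch of the MDL scoring used in topk_search above: lower
# per-label cross-entropy gives a more peaked (normalized) label
# distribution, hence a larger negative entropy, so that candidate ordering
# wins the max() over mdl_scores. The losses are made up.
#
# >>> loss_list = [0.2, 2.0]
# >>> probs = np.exp(-np.array(loss_list))
# >>> normalized_probs = probs / probs.sum(0, keepdims=True)
# >>> float(-entropy(normalized_probs, label_dim=0))  # approximately -0.41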
"""Votek Retriever."""
import json
import os
import random
from collections import defaultdict
from typing import Optional
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from opencompass.openicl.icl_retriever.icl_topk_retriever import TopkRetriever
class VotekRetriever(TopkRetriever):
"""Vote-k In-context Learning Retriever, subclass of `TopkRetriever`.
**WARNING**: This class has not been tested thoroughly. Please use it with
caution.
"""
def __init__(self,
dataset,
ice_separator: Optional[str] = '\n',
ice_eos_token: Optional[str] = '\n',
ice_num: Optional[int] = 1,
sentence_transformers_model_name: Optional[
str] = 'all-mpnet-base-v2',
tokenizer_name: Optional[str] = 'gpt2-xl',
batch_size: Optional[int] = 1,
votek_k: Optional[int] = 3) -> None:
super().__init__(dataset, ice_separator, ice_eos_token, ice_num,
sentence_transformers_model_name, tokenizer_name,
batch_size)
self.votek_k = votek_k
def votek_select(self,
embeddings=None,
select_num=None,
k=None,
overlap_threshold=None,
vote_file=None):
n = len(embeddings)
if vote_file is not None and os.path.isfile(vote_file):
with open(vote_file) as f:
vote_stat = json.load(f)
else:
vote_stat = defaultdict(list)
for i in range(n):
cur_emb = embeddings[i].reshape(1, -1)
cur_scores = np.sum(cosine_similarity(embeddings, cur_emb),
axis=1)
sorted_indices = np.argsort(cur_scores).tolist()[-k - 1:-1]
for idx in sorted_indices:
if idx != i:
vote_stat[idx].append(i)
if vote_file is not None:
with open(vote_file, 'w', encoding='utf-8') as f:
json.dump(vote_stat, f)
votes = sorted(vote_stat.items(),
key=lambda x: len(x[1]),
reverse=True)
j = 0
selected_indices = []
while len(selected_indices) < select_num and j < len(votes):
candidate_set = set(votes[j][1])
flag = True
for pre in range(j):
cur_set = set(votes[pre][1])
if len(candidate_set.intersection(
cur_set)) >= overlap_threshold * len(candidate_set):
flag = False
break
if not flag:
j += 1
continue
selected_indices.append(int(votes[j][0]))
j += 1
if len(selected_indices) < select_num:
unselected_indices = []
cur_num = len(selected_indices)
for i in range(n):
if i not in selected_indices:
unselected_indices.append(i)
selected_indices += random.sample(unselected_indices,
select_num - cur_num)
return selected_indices
def vote_k_search(self):
vote_k_idxs = self.votek_select(embeddings=self.embed_list,
select_num=self.ice_num,
k=self.votek_k,
overlap_threshold=1)
return [vote_k_idxs[:] for _ in range(len(self.test_ds))]
def retrieve(self):
return self.vote_k_search()
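# Sketch of the vote-k selection above: every candidate example "votes" for
# its k nearest neighbours (by cosine similarity of embeddings); candidates
# are then taken in descending vote count while skipping any whose voters
# overlap too heavily with those of already-selected candidates, and the
# selection is topped up with random unselected examples if needed.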
import json
import os
import time
import numpy as np
import openai
import requests
OPENICL_API_NAME_LIST = ['opt-175b', 'gpt3']
OPENICL_API_PARAMETER_DICT = {
'opt-175b': ['URL', 'headers'],
'gpt3': [
'engine', 'temperature', 'max_tokens', 'top_p', 'frequency_penalty',
'presence_penalty', 'sleep_time'
]
}
OPENICL_API_REQUEST_CONFIG = {
'opt-175b': {
'URL': '', # http://xxx/completions or http://xxx/generate
'headers': {
'Content-Type': 'application/json; charset=UTF-8'
}
},
'gpt3': {
'engine': 'text-davinci-003',
'temperature': 0,
'max_tokens': 256,
'top_p': 1.0,
'frequency_penalty': 0.0,
'presence_penalty': 0.0,
'sleep_time': 3
}
}
PROXIES = {'https': '', 'http': ''}
def is_api_available(api_name):
if api_name is None:
return False
return True if api_name in OPENICL_API_NAME_LIST else False
def update_openicl_api_request_config(api_name, **kwargs):
if api_name is None or not is_api_available(api_name):
return
parameter_list = OPENICL_API_PARAMETER_DICT[api_name]
for parameter in parameter_list:
if parameter in kwargs.keys():
OPENICL_API_REQUEST_CONFIG[api_name][parameter] = kwargs[parameter]
def api_get_ppl(api_name, input_texts):
if api_name == 'opt-175b':
        payload = {'prompt': input_texts, 'max_tokens': 0, 'echo': True}
response = json.loads(
requests.post(
OPENICL_API_REQUEST_CONFIG[api_name]['URL'],
                data=json.dumps(payload),
headers=OPENICL_API_REQUEST_CONFIG[api_name]['headers'],
proxies=PROXIES).text)
lens = np.array(
[len(r['logprobs']['tokens']) for r in response['choices']])
ce_loss = np.array([
-sum(r['logprobs']['token_logprobs']) for r in response['choices']
])
return ce_loss / lens
if api_name == 'gpt3':
raise NotImplementedError("GPT-3 API doesn't support PPL calculation")
def api_get_tokens(api_name, input_texts):
length_list = [len(text) for text in input_texts]
if api_name == 'opt-175b':
        payload = {'prompt': input_texts, 'max_tokens': 100, 'echo': True}
response = json.loads(
requests.post(
OPENICL_API_REQUEST_CONFIG[api_name]['URL'],
                data=json.dumps(payload),
headers=OPENICL_API_REQUEST_CONFIG[api_name]['headers'],
proxies=PROXIES).text)
return [r['text'] for r in response['choices']], [
r['text'][length:]
for r, length in zip(response['choices'], length_list)
]
if api_name == 'gpt3':
openai.api_key = os.getenv('OPENAI_API_KEY')
response = openai.Completion.create(
engine=OPENICL_API_REQUEST_CONFIG['gpt3']['engine'],
prompt=input_texts,
temperature=OPENICL_API_REQUEST_CONFIG['gpt3']['temperature'],
max_tokens=OPENICL_API_REQUEST_CONFIG['gpt3']['max_tokens'],
top_p=OPENICL_API_REQUEST_CONFIG['gpt3']['top_p'],
frequency_penalty=OPENICL_API_REQUEST_CONFIG['gpt3']
['frequency_penalty'],
presence_penalty=OPENICL_API_REQUEST_CONFIG['gpt3']
['presence_penalty'])
time.sleep(OPENICL_API_REQUEST_CONFIG['gpt3']['sleep_time'])
return [(input + r['text'])
for r, input in zip(response['choices'], input_texts)
], [r['text'] for r in response['choices']]
import os.path as osp
from typing import Dict, List
from mmengine.config import Config, ConfigDict
from opencompass.registry import PARTITIONERS
from opencompass.utils import get_infer_output_path
from .base import BasePartitioner
@PARTITIONERS.register_module()
class NaivePartitioner(BasePartitioner):
"""Naive task partitioner. This partitioner will generate a task for each
model-dataset pair.
Args:
config (ConfigDict): The full config dict.
"""
def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
work_dir: str, out_dir: str) -> List[Dict]:
"""Partition model-dataset pairs into tasks. Each task is defined as a
dict and will run independently as a unit. Its structure is as
follows:
.. code-block:: python
{
'models': [], # a list of model configs
'datasets': [[]], # a nested list of dataset configs, each
list corresponds to a model
'work_dir': '', # the work dir
}
Args:
models (List[ConfigDict]): A list of model configs.
datasets (List[ConfigDict]): A list of dataset configs.
work_dir (str): The work dir for the task.
out_dir (str): The full output path for the task, intended for
Partitioners to check whether the task is finished via the
                existence of the result file in this directory.
Returns:
List[Dict]: A list of tasks.
"""
tasks = []
for model in models:
for dataset in datasets:
filename = get_infer_output_path(model, dataset, out_dir)
if osp.exists(filename):
continue
task = Config({
'models': [model],
'datasets': [[dataset]],
'work_dir': work_dir
})
tasks.append(task)
return tasks
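# Resulting task layout: with models [m1, m2] and datasets [d1, d2],
# NaivePartitioner emits up to four tasks, each shaped like
#   {'models': [m1], 'datasets': [[d1]], 'work_dir': work_dir},
# skipping any pair whose inference output file already exists.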
import copy
import math
import os.path as osp
from typing import List, Tuple, Union
import mmengine
from mmengine.config import Config, ConfigDict
from opencompass.registry import PARTITIONERS
from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
get_infer_output_path)
from .base import BasePartitioner
@PARTITIONERS.register_module()
class SizePartitioner(BasePartitioner):
"""Task partitioner based on the size of the dataset (with some rough
expansion as an estimation of computational cost).
Args:
out_dir (str): The output directory of tasks.
max_task_size (int): The maximum size of a task.
gen_task_coef (int): The dataset cost measurement coefficient for
generation tasks.
dataset_size_path (str): The path to the dataset size cache file.
"""
def __init__(self,
out_dir: str,
max_task_size: int = 2000,
gen_task_coef: int = 20,
dataset_size_path: str = '.cache/dataset_size.json'):
super().__init__(out_dir)
self.max_task_size = max_task_size
self.gen_task_coef = gen_task_coef
self.dataset_size_path = dataset_size_path
def partition(self, models: List[ConfigDict], datasets: List[ConfigDict],
work_dir: str, out_dir: str) -> List[ConfigDict]:
"""Partition model-dataset pairs into tasks. Each task is defined as a
dict and will run independently as a unit. Its structure is as
follows:
.. code-block:: python
{
'models': [], # a list of model configs
'datasets': [[]], # a nested list of dataset configs, each
list corresponds to a model
'work_dir': '', # the work dir
}
Args:
models (List[ConfigDict]): A list of model configs.
datasets (List[ConfigDict]): A list of dataset configs.
work_dir (str): The work dir for the task.
out_dir (str): The full output path for the task, intended for
Partitioners to check whether the task is finished via the
                existence of the result file in this directory.
Returns:
List[ConfigDict]: A list of tasks.
"""
datasets = sorted(datasets,
key=lambda x: self.get_cost(x),
reverse=True)
tasks = []
for model in models:
task = Config({
'models': [model],
'datasets': [[]],
'work_dir': work_dir
})
num_data = 0
for dataset in datasets:
filename = get_infer_output_path(model, dataset, out_dir)
root, ext = osp.splitext(filename)
# skip the task if the task output exists
if osp.exists(filename):
continue
dataset_size = self.get_cost(dataset)
if dataset_size > self.max_task_size:
dataset_splits = self.split_dataset(dataset)
for i, dataset_split in enumerate(dataset_splits):
                        # skip the split if its output already exists
if not osp.exists(f'{root}_{i}{ext}'):
tasks.append(
Config({
'models': [model],
'datasets': [[dataset_split]],
'work_dir': work_dir
}))
else:
if num_data + dataset_size > self.max_task_size:
tasks.append(task)
task = Config({
'models': [model],
'datasets': [[]],
'work_dir': work_dir
})
num_data = 0
task['datasets'][0].append(dataset)
num_data = num_data + dataset_size
if task['datasets'][0]:
tasks.append(task)
return tasks
@property
def dataset_size(self):
if not hasattr(self, '_dataset_size'):
if osp.exists(self.dataset_size_path):
self._dataset_size = mmengine.load(self.dataset_size_path)
else:
self._dataset_size = {}
return self._dataset_size
def split_dataset(self, dataset_cfg: ConfigDict) -> List[ConfigDict]:
"""Split dataset into several parts."""
dataset_size, num_repeats = self.get_cost(dataset_cfg,
get_raw_factors=True)
split_configs = []
abbr = dataset_abbr_from_cfg(dataset_cfg)
step = self.max_task_size // num_repeats
# evenly distribute the task
step = math.ceil(dataset_size / math.ceil(dataset_size / step))
for part, i in enumerate(range(0, dataset_size, step)):
cfg = copy.deepcopy(dataset_cfg)
cfg['abbr'] = abbr + f'_{part}'
test_range = cfg['reader_cfg'].get('test_range', '')
cfg['reader_cfg']['test_range'] = f'{test_range}[{i}:{i+step}]'
split_configs.append(cfg)
return split_configs
def get_cost(self,
dataset: ConfigDict,
get_raw_factors: bool = False) -> Union[int, Tuple[int, int]]:
"""Get the computational cost of inferring on the dataset.
Args:
dataset (ConfigDict): The dataset config.
get_raw_factors (bool): If True, the raw factors of computational
cost will be returned.
Returns:
int or Tuple[int, int]: The size of the dataset. If get_raw_factors
is True, the number of repeats will also be returned.
"""
dataset_abbr = dataset_abbr_from_cfg(dataset)
# If it's the PPL template, the dataset size will be multiplied by the
# number of labels
infer_cfg = dataset.infer_cfg
test_range = dataset.reader_cfg.get('test_range', '')
template = (infer_cfg.prompt_template.template if 'prompt_template'
in infer_cfg else infer_cfg.ice_template.template)
# If it's the Gen template, the dataset size will be multiplied by the
# self.gen_task_coef
factor = self.gen_task_coef
if isinstance(template, dict):
ctr = sum(key in template for key in ('begin', 'round', 'end'))
if ctr != len(template.keys()):
factor = len(template.keys())
if dataset_abbr in self.dataset_size:
actual_size = eval('len(range(self.dataset_size[dataset_abbr])'
f'{test_range})')
if get_raw_factors:
return actual_size, factor
return factor * actual_size
dataset = build_dataset_from_cfg(dataset)
self.dataset_size[dataset_abbr] = len(dataset.test)
mmengine.mkdir_or_exist('.cache/')
mmengine.dump(self.dataset_size,
self.dataset_size_path,
indent=4,
ensure_ascii=False)
actual_size = eval('len(range(self.dataset_size[dataset_abbr])'
f'{test_range})')
if get_raw_factors:
return actual_size, factor
return factor * actual_size
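# A numeric sketch of split_dataset above: for a PPL-style dataset with two
# labels (num_repeats = 2) and 5000 test examples, step = 2000 // 2 = 1000,
# which is already even (ceil(5000 / ceil(5000 / 1000)) = 1000), so the
# test_range slices become [0:1000], [1000:2000], ..., [4000:5000] and each
# split gets the abbr suffix _0 ... _4.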
from mmengine.registry import Registry
PARTITIONERS = Registry('partitioner', locations=['opencompass.partitioners'])
RUNNERS = Registry('runner', locations=['opencompass.runners'])
TASKS = Registry('task', locations=['opencompass.tasks'])
MODELS = Registry('model', locations=['opencompass.models'])
# TODO: LOAD_DATASET -> DATASETS
LOAD_DATASET = Registry('load_dataset', locations=['opencompass.datasets'])
TEXT_POSTPROCESSORS = Registry(
'text_postprocessors', locations=['opencompass.utils.text_postprocessors'])
EVALUATORS = Registry('evaluators', locations=['opencompass.evaluators'])
ICL_INFERENCERS = Registry('icl_inferencers',
locations=['opencompass.openicl.icl_inferencer'])
ICL_RETRIEVERS = Registry('icl_retrievers',
locations=['opencompass.openicl.icl_retriever'])
ICL_DATASET_READERS = Registry(
'icl_dataset_readers',
locations=['opencompass.openicl.icl_dataset_reader'])
ICL_PROMPT_TEMPLATES = Registry(
'icl_prompt_templates',
locations=['opencompass.openicl.icl_prompt_template'])
ICL_EVALUATORS = Registry('icl_evaluators',
locations=['opencompass.openicl.icl_evaluator'])
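# Usage sketch: components register themselves into these registries and are
# later built from config dicts via mmengine's Registry API; the names below
# are illustrative.
#
# >>> @LOAD_DATASET.register_module()
# ... class MyDataset(BaseDataset):
# ...     ...
# >>> dataset = LOAD_DATASET.build(dict(type='MyDataset', path='data/my.jsonl'))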
import getpass
from abc import abstractmethod
from typing import Any, Dict, List, Optional, Tuple
from mmengine.config import Config, ConfigDict
from opencompass.utils import LarkReporter, get_logger
class BaseRunner:
"""Base class for all runners. A runner is responsible for launching
multiple tasks.
Args:
task (ConfigDict): Task type config.
debug (bool): Whether to run in debug mode.
lark_bot_url (str): Lark bot url.
"""
def __init__(self,
task: ConfigDict,
debug: bool = False,
                 lark_bot_url: Optional[str] = None):
self.task_cfg = Config(task)
self.debug = debug
if lark_bot_url:
self.lark_reporter = LarkReporter(lark_bot_url)
else:
self.lark_reporter = None
def __call__(self, tasks: List[Dict[str, Any]]):
"""Launch multiple tasks and summarize the results.
Args:
tasks (list[dict]): A list of task configs, usually generated by
Partitioner.
"""
status = self.launch(tasks)
self.summarize(status)
@abstractmethod
def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
"""Launch multiple tasks.
Args:
tasks (list[dict]): A list of task configs, usually generated by
Partitioner.
Returns:
list[tuple[str, int]]: A list of (task name, exit code).
"""
def summarize(self, status: List[Tuple[str, int]]) -> None:
"""Summarize the results of the tasks.
Args:
status (list[tuple[str, int]]): A list of (task name, exit code).
"""
failed_logs = []
for _task, code in status:
if code != 0:
get_logger().error(f'{_task} failed with code {code}')
failed_logs.append(_task)
if self.lark_reporter:
num_succeeded = len(status) - len(failed_logs)
            if len(failed_logs) > 0:
                content = f'The {self.task_cfg.type} tasks launched by '
                content += f'{getpass.getuser()} have finished: '
                content += f'{num_succeeded} succeeded, '
                content += f'{len(failed_logs)} failed. Failed tasks:'
                content += '\n' + '\n'.join(failed_logs)
                self.lark_reporter.post(
                    title=f'Bad news: {len(failed_logs)} task(s) failed',
                    content=content)
            else:
                content = f'The {self.task_cfg.type} tasks launched by '
                content += f'{getpass.getuser()} have finished: '
                content += f'all {num_succeeded} succeeded.'
                self.lark_reporter.post(title='Good news: all tasks finished',
                                        content=content)