Unverified commit e78857ac, authored by Hubert and committed by GitHub

[Sync] minor test (#683)

parent dd4318f6
......@@ -58,6 +58,7 @@ class OpenAI(BaseAPIModel):
path: str = 'gpt-3.5-turbo',
max_seq_len: int = 4096,
query_per_second: int = 1,
rpm_verbose: bool = False,
retry: int = 2,
key: Union[str, List[str]] = 'ENV',
org: Optional[Union[str, List[str]]] = None,
......@@ -70,6 +71,7 @@ class OpenAI(BaseAPIModel):
max_seq_len=max_seq_len,
meta_template=meta_template,
query_per_second=query_per_second,
rpm_verbose=rpm_verbose,
retry=retry)
import tiktoken
self.tiktoken = tiktoken
......
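For reference, a hypothetical model config exercising the new flag. The import path follows the usual OpenCompass convention, and the assumption that `rpm_verbose` turns on logging of the per-minute rate limiter is inferred from its name only, not from this hunk:

from opencompass.models import OpenAI  # assumed import path

models = [
    dict(
        type=OpenAI,
        path='gpt-3.5-turbo',
        max_seq_len=4096,
        query_per_second=1,
        rpm_verbose=True,  # new flag, threaded through to the base API model
        retry=2,
        key='ENV',
    ),
]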
......@@ -5,5 +5,6 @@ from .icl_circular_evaluator import CircularEvaluator # noqa
from .icl_em_evaluator import EMEvaluator # noqa
from .icl_hf_evaluator import * # noqa
from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator # noqa
from .icl_misc_evaluator import AveragePPLEvaluator # noqa
from .icl_toxic_evaluator import ToxicEvaluator # noqa
from .lm_evaluator import LMEvaluator # noqa
from opencompass.registry import ICL_EVALUATORS
from .icl_base_evaluator import BaseEvaluator
@ICL_EVALUATORS.register_module()
class AveragePPLEvaluator(BaseEvaluator):
def score(self, ppl):
average_ppl = sum(ppl) / len(ppl)
return {'average_ppl': average_ppl}
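A quick sanity check of the evaluator defined above, called directly with a made-up list of per-sample PPL values:

evaluator = AveragePPLEvaluator()
assert evaluator.score(ppl=[10.0, 20.0, 30.0]) == {'average_ppl': 20.0}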
......@@ -4,6 +4,8 @@ from .icl_base_inferencer import BaseInferencer # noqa
from .icl_chat_inferencer import ChatInferencer # noqa
from .icl_clp_inferencer import CLPInferencer # noqa
from .icl_gen_inferencer import GenInferencer # noqa
from .icl_loglikelihood_inferencer import LoglikelihoodInferencer # noqa
from .icl_ppl_inferencer import PPLInferencer # noqa
from .icl_ppl_only_inferencer import PPLOnlyInferencer # noqa
from .icl_sc_inferencer import SCInferencer # noqa
from .icl_tot_inferencer import ToTInferencer # noqa
......@@ -89,7 +89,7 @@ class AgentInferencer(ChatInferencer):
user_idx = assistant_indices[-1] - 1
self.model.set_history(chat[:user_idx])
answer, steps = self.model.chat(chat[user_idx]['content'])
answer, steps, _ = self.model.chat(chat[user_idx]['content'])
output_handler.save_results(
origin_prompt=chat[user_idx]['content'],
prediction=answer,
......@@ -104,10 +104,11 @@ class AgentInferencer(ChatInferencer):
i for i, item in enumerate(chat) if item['role'] == 'assistant'
]
self.model.set_history(chat[:assistant_indices[0] - 1])
history = chat[:assistant_indices[0] - 1]
for i in assistant_indices:
answer, steps = self.model.chat(chat[i - 1]['content'])
answer, steps, inner_steps = self.model.chat(
chat[i - 1]['content'], history)
history += inner_steps
output_handler.save_multiround_results(
origin_prompt=chat[i - 1]['content'],
prediction=answer,
......@@ -125,7 +126,7 @@ class AgentInferencer(ChatInferencer):
for i in assistant_indices:
self.model.set_history(chat[:i - 1])
answer, steps = self.model.chat(chat[i - 1]['content'])
answer, steps, _ = self.model.chat(chat[i - 1]['content'])
output_handler.save_multiround_results(
origin_prompt=chat[i - 1]['content'],
prediction=answer,
......
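A toy stand-in illustrating the three-value chat() contract the inferencer now relies on; the shape of steps/inner_steps is assumed from the call sites above, not from the agent implementation:

from typing import List, Tuple

class MockAgent:
    """Hypothetical agent mirroring `answer, steps, inner_steps = model.chat(...)`."""

    def chat(self, content: str, history: List[dict] = None) -> Tuple[str, list, List[dict]]:
        history = history or []
        answer = f'echo: {content}'
        steps = [{'type': 'thought', 'content': content}]
        # inner_steps: the turns appended to the running multi-round history
        inner_steps = [{'role': 'user', 'content': content},
                       {'role': 'assistant', 'content': answer}]
        return answer, steps, inner_steps

history, agent = [], MockAgent()
for turn in ['step 1', 'step 2']:
    answer, steps, inner_steps = agent.chat(turn, history)
    history += inner_steps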
......@@ -68,11 +68,11 @@ class LMTemplateParser:
prompt = ''
if self.roles:
for dialog in chat:
role_cfg = self.roles.get(dialog['role'])
prompt += role_cfg['begin']
role_cfg = self.roles.get(dialog['role'], {})
prompt += (role_cfg.get('begin') or '')
prompt += (dialog.get('content') or '')
prompt += role_cfg['end']
prompt += self.roles['assistant']['begin']
prompt += (role_cfg.get('end') or '')
prompt += (self.roles['assistant'].get('begin') or '')
else:
# in case the model does not have any meta template
last_sep = ''
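A minimal illustration of the defensive lookups introduced above: `dict.get(..., {})` tolerates unknown roles, and `x or ''` also covers roles whose `begin`/`end` is explicitly None. The sample role config is made up:

role_cfg = {'begin': None, 'end': '<eoh>\n'}
prompt = ''
prompt += (role_cfg.get('begin') or '')  # '' even though the key exists with value None
prompt += 'hello'
prompt += (role_cfg.get('end') or '')
assert prompt == 'hello<eoh>\n'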
......@@ -227,7 +227,11 @@ class ChatInferencer(BaseInferencer):
'tmp_' + output_json_filename)
if osp.exists(tmp_json_filepath):
# TODO: move resume to output handler
try:
tmp_result_dict = mmengine.load(tmp_json_filepath)
except Exception:
pass
else:
output_handler.results_dict = tmp_result_dict
index = len(tmp_result_dict)
......
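The same resume guard, sketched stand-alone (file name hypothetical): a corrupt or unreadable temporary dump makes inference restart from index 0 instead of crashing, and only a successful load advances the resume index.

import json

index, results = 0, {}
try:
    with open('tmp_predictions.json') as f:  # hypothetical tmp file
        tmp_result_dict = json.load(f)
except Exception:
    pass
else:
    results = tmp_result_dict
    index = len(tmp_result_dict)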
"""Direct Generation Inferencer."""
import inspect
import os
import os.path as osp
from typing import List, Optional
......@@ -46,6 +47,7 @@ class GenInferencer(BaseInferencer):
self,
model: BaseModel,
max_out_len: int,
stopping_criteria: List[str] = [],
max_seq_len: Optional[int] = None,
batch_size: Optional[int] = 1,
gen_field_replace_token: Optional[str] = '',
......@@ -64,6 +66,7 @@ class GenInferencer(BaseInferencer):
self.gen_field_replace_token = gen_field_replace_token
self.max_out_len = max_out_len
self.stopping_criteria = stopping_criteria
if self.model.is_api and save_every is None:
save_every = 1
......@@ -128,10 +131,14 @@ class GenInferencer(BaseInferencer):
entry = datum
golds = [None for _ in range(len(entry))]
# 5-1. Inference with local model
extra_gen_kwargs = {}
sig = inspect.signature(self.model.generate)
if 'stopping_criteria' in sig.parameters:
extra_gen_kwargs['stopping_criteria'] = self.stopping_criteria
with torch.no_grad():
parsed_entries = self.model.parse_template(entry, mode='gen')
results = self.model.generate_from_template(
entry, max_out_len=self.max_out_len)
entry, max_out_len=self.max_out_len, **extra_gen_kwargs)
generated = results
num_return_sequences = getattr(self.model, 'generation_kwargs',
......
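A self-contained sketch of the capability probe used above: the extra kwarg is only forwarded when the target generate() actually accepts it, so model wrappers without `stopping_criteria` keep working. Function names here are illustrative:

import inspect

def generate_old(inputs, max_out_len):
    return ['ok'] * len(inputs)

def generate_new(inputs, max_out_len, stopping_criteria=()):
    return ['ok'] * len(inputs)

for generate in (generate_old, generate_new):
    extra_gen_kwargs = {}
    if 'stopping_criteria' in inspect.signature(generate).parameters:
        extra_gen_kwargs['stopping_criteria'] = ['\n\n']
    generate(['prompt'], max_out_len=16, **extra_gen_kwargs)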
"""PPL Inferencer."""
import os
from typing import List, Optional
import torch
from tqdm import trange
from opencompass.models.base import BaseModel
from opencompass.registry import ICL_INFERENCERS
from ..icl_prompt_template import PromptTemplate
from ..icl_retriever import BaseRetriever
from ..utils import get_logger
from .icl_base_inferencer import BaseInferencer, dump_results_dict
logger = get_logger(__name__)
@ICL_INFERENCERS.register_module()
class LoglikelihoodInferencer(BaseInferencer):
"""Loglikelihood Inferencer class to evaluate by loglikelihood.
Attributes:
model (:obj:`BaseModel`, optional): The model to perform inference with.
max_seq_len (:obj:`int`): Maximum number of tokenized words allowed by
the LM.
batch_size (:obj:`int`, optional): Batch size for the :obj:`DataLoader`
output_json_filepath (:obj:`str`, optional): File path for output
`JSON` file.
output_json_filename (:obj:`str`, optional): File name for output
`JSON` file.
labels (:obj:`List`, optional): A list of labels for all classes.
"""
def __init__(
self,
model: BaseModel,
max_seq_len: Optional[int] = None,
batch_size: Optional[int] = 1,
output_json_filepath: Optional[str] = './icl_inference_output',
output_json_filename: Optional[str] = 'predictions',
labels: Optional[List] = None,
**kwargs) -> None:
super().__init__(
model=model,
max_seq_len=max_seq_len,
batch_size=batch_size,
output_json_filename=output_json_filename,
output_json_filepath=output_json_filepath,
**kwargs,
)
self.labels = labels
def inference(self,
retriever: BaseRetriever,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None,
output_json_filepath: Optional[str] = None,
output_json_filename: Optional[str] = None) -> List:
# 1. Preparation for output logs
output_handler = LoglikelihoodInferencerOutputHandler()
sub_predictions = []
ppl = []
ice = []
if output_json_filepath is None:
output_json_filepath = self.output_json_filepath
if output_json_filename is None:
output_json_filename = self.output_json_filename
# 2. Get results of retrieval process
ice_idx_list = retriever.retrieve()
# 3. Get labels of all the classes
if self.labels is None:
labels = retriever.get_labels(ice_template=ice_template,
prompt_template=prompt_template)
else:
labels = self.labels
# 4. Generate in-context examples for testing inputs
for idx in range(len(ice_idx_list)):
ice.append(
retriever.generate_ice(ice_idx_list[idx],
ice_template=ice_template))
output_handler.save_ice(self.model.parse_template(ice, mode='ppl'))
# 5. Calculating loglikelihood for prompts in each label's class
for label in labels:
index = 0
prompt_list = []
sub_ppl_list = []
token_num_list = []
cont_list = []
# 5.1 Generate prompts of current label and truncate
# TODO: Refactor
for idx in range(len(ice_idx_list)):
prompt = retriever.generate_label_prompt(
idx,
ice[idx],
label,
ice_template=ice_template,
prompt_template=prompt_template)
if self.max_seq_len is not None:
prompt_token_num = self.model.get_token_len_from_template(
prompt, mode='ppl')
while len(ice_idx_list[idx]
) > 0 and prompt_token_num > self.max_seq_len:
ice_idx_list[idx] = ice_idx_list[idx][:-1]
ice[idx] = retriever.generate_ice(
ice_idx_list[idx], ice_template=ice_template)
prompt = retriever.generate_label_prompt(
idx,
ice[idx],
label,
ice_template=ice_template,
prompt_template=prompt_template)
prompt_token_num = self.model.get_token_len_from_template( # noqa
prompt, mode='ppl') # noqa
prompt_list.append(prompt)
token_num_list.append(prompt_token_num)
cont_list.append(retriever.test_ds[idx]['cont'])
# 5.2 Get loglikelihood
logger.info(f"Calculating loglikelihood for prompts labeled '{label}'")
for idx in trange(0,
len(prompt_list),
self.batch_size,
disable=not self.is_main_process):
sub_prompt_list = prompt_list[idx:idx + self.batch_size]
sub_cont_list = cont_list[idx:idx + self.batch_size]
with torch.no_grad():
# main modification compared to PPLInferencer
sub_res = self.model.get_loglikelihood_from_template(
sub_prompt_list, sub_cont_list).tolist()
for res, prompt in zip(
sub_res,
self.model.parse_template(sub_prompt_list,
mode='ppl')):
sub_ppl_list.append(res)
ice_str = self.model.parse_template(ice[idx], mode='ppl')
output_handler.save_prompt_and_loglikelihood(
label, prompt.replace(ice_str, ''), prompt, res, index)
index = index + 1
ppl.append(sub_ppl_list)
# 6. Pick the label with the highest loglikelihood as the prediction
ppl = list(zip(*ppl))
for single_ppl in ppl:
sub_predictions.append(labels[single_ppl.index(max(single_ppl))])
output_handler.save_predictions(sub_predictions)
# 7. Fetch gold answers if exist
ds_reader = retriever.dataset_reader
if ds_reader.output_column:
golds = ds_reader.dataset['test'][ds_reader.output_column]
output_handler.save_golds(golds)
# 8. Output
if self.is_main_process:
os.makedirs(output_json_filepath, exist_ok=True)
output_handler.write_to_json(output_json_filepath,
output_json_filename)
return [
sample['prediction']
for sample in output_handler.results_dict.values()
]
class LoglikelihoodInferencerOutputHandler:
results_dict = {}
def __init__(self) -> None:
self.results_dict = {}
def write_to_json(self, save_dir: str, filename: str):
"""Dump the result to a json file."""
dump_results_dict(self.results_dict, os.path.join(save_dir, filename))
def save_ice(self, ice):
for idx, example in enumerate(ice):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['in-context examples'] = example
def save_predictions(self, predictions):
for idx, prediction in enumerate(predictions):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['prediction'] = prediction
def save_prompt_and_loglikelihood(self, label, input, prompt,
loglikelihood, idx):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
if 'label: ' + str(label) not in self.results_dict[str(idx)].keys():
self.results_dict[str(idx)]['label: ' + str(label)] = {}
self.results_dict[str(idx)]['label: ' +
str(label)]['testing input'] = input
self.results_dict[str(idx)]['label: ' + str(label)]['prompt'] = prompt
self.results_dict[str(idx)][
'label: ' + str(label)]['Loglikelihood'] = loglikelihood
def save_golds(self, golds):
for idx, gold in enumerate(golds):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['gold'] = gold
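A toy illustration of step 6 above: the per-label loglikelihood columns are zipped per sample, and the label with the highest (least negative) loglikelihood wins. The numbers are made up:

labels = ['A', 'B']
ll_per_label = [
    [-3.2, -7.1, -0.9],  # loglikelihoods of each sample under label 'A'
    [-1.5, -8.0, -2.4],  # loglikelihoods of each sample under label 'B'
]
predictions = [labels[sample.index(max(sample))] for sample in zip(*ll_per_label)]
assert predictions == ['B', 'A', 'A']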
"""PPL Inferencer."""
import os
from typing import List, Optional
import mmengine
import torch
from tqdm import tqdm
from opencompass.models.base import BaseModel
from opencompass.registry import ICL_INFERENCERS
from ..icl_prompt_template import PromptTemplate
from ..icl_retriever import BaseRetriever
from ..utils import get_logger
from .icl_base_inferencer import BaseInferencer, dump_results_dict
logger = get_logger(__name__)
@ICL_INFERENCERS.register_module()
class PPLOnlyInferencer(BaseInferencer):
"""PPLOnlyInferencer class to calculate PPL and PPL only, no choice is
made. This Inferencer is usually used along with AveragePPLEvaluator.
Attributes:
model (:obj:`BaseModel`, optional): The model to perform inference with.
max_seq_len (:obj:`int`): Maximum number of tokenized words allowed by
the LM.
batch_size (:obj:`int`, optional): Batch size for the :obj:`DataLoader`
output_json_filepath (:obj:`str`, optional): File path for output
`JSON` file.
output_json_filename (:obj:`str`, optional): File name for output
`JSON` file.
save_every (:obj:`int`, optional): Save intermediate results every `save_every` iterations. Defaults to 1.
"""
def __init__(
self,
model: BaseModel,
max_seq_len: Optional[int] = None,
batch_size: Optional[int] = 1,
output_json_filepath: Optional[str] = './icl_inference_output',
output_json_filename: Optional[str] = 'predictions',
save_every: Optional[int] = 1,
**kwargs) -> None:
super().__init__(
model=model,
max_seq_len=max_seq_len,
batch_size=batch_size,
output_json_filename=output_json_filename,
output_json_filepath=output_json_filepath,
**kwargs,
)
self.save_every = save_every
def inference(self,
retriever: BaseRetriever,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None,
output_json_filepath: Optional[str] = None,
output_json_filename: Optional[str] = None) -> List:
# 1. Preparation for output logs
output_handler = PPLOnlyInferencerOutputHandler()
if output_json_filepath is None:
output_json_filepath = self.output_json_filepath
if output_json_filename is None:
output_json_filename = self.output_json_filename
# 2. Get results of retrieval process
ice_idx_list = retriever.retrieve()
# 3. Generate prompts for testing input
prompt_list = self.get_generation_prompt_list_from_retriever_indices(
ice_idx_list,
retriever,
max_seq_len=self.max_seq_len,
ice_template=ice_template,
prompt_template=prompt_template)
# 3.1 Fetch and zip prompt & gold answer if output column exists
ds_reader = retriever.dataset_reader
assert ds_reader.output_column is None, (
'PPLOnlyInferencer supports `output_column=None` only.')
# Create tmp json file for saving intermediate results and future
# resuming
index = 0
tmp_json_filepath = os.path.join(output_json_filepath,
'tmp_' + output_json_filename)
if os.path.exists(tmp_json_filepath):
# TODO: move resume to output handler
try:
tmp_result_dict = mmengine.load(tmp_json_filepath)
except Exception:
pass
else:
output_handler.results_dict = tmp_result_dict
index = len(tmp_result_dict)
# 4. Wrap prompts with Dataloader
dataloader = self.get_dataloader(prompt_list[index:], self.batch_size)
# 5. Inference for prompts in each batch
logger.info('Starting inference process...')
for datum in tqdm(dataloader, disable=not self.is_main_process):
entry = datum
# 5-1. Inference with local model
with torch.no_grad():
ppls = self.model.get_ppl_from_template(entry).tolist()
parsed_entries = self.model.parse_template(entry, mode='gen')
# 5-3. Save current output
for prompt, ppl in zip(parsed_entries, ppls):
output_handler.save_results(prompt, ppl, index)
index = index + 1
# 5-4. Save intermediate results
if (self.save_every is not None and index % self.save_every == 0
and self.is_main_process):
output_handler.write_to_json(output_json_filepath,
'tmp_' + output_json_filename)
# 6. Output
if self.is_main_process:
os.makedirs(output_json_filepath, exist_ok=True)
output_handler.write_to_json(output_json_filepath,
output_json_filename)
if os.path.exists(tmp_json_filepath):
os.remove(tmp_json_filepath)
return [
sample['ppl'] for sample in output_handler.results_dict.values()
]
def get_generation_prompt_list_from_retriever_indices(
self,
ice_idx_list: List[List[int]],
retriever: BaseRetriever,
max_seq_len: Optional[int] = None,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None):
prompt_list = []
for idx, ice_idx in enumerate(ice_idx_list):
ice = retriever.generate_ice(ice_idx, ice_template=ice_template)
prompt = retriever.generate_prompt_for_generate_task(
idx,
ice,
ice_template=ice_template,
prompt_template=prompt_template)
if max_seq_len is not None:
prompt_token_num = self.model.get_token_len_from_template(
prompt, mode='gen')
while len(ice_idx) > 0 and prompt_token_num > max_seq_len:
ice_idx = ice_idx[:-1]
ice = retriever.generate_ice(ice_idx,
ice_template=ice_template)
prompt = retriever.generate_prompt_for_generate_task(
idx,
ice,
ice_template=ice_template,
prompt_template=prompt_template)
prompt_token_num = self.model.get_token_len_from_template(
prompt, mode='gen')
prompt_list.append(prompt)
return prompt_list
class PPLOnlyInferencerOutputHandler:
origin_prompt_dict = {}
output_dict = {}
results_dict = {}
def __init__(self) -> None:
self.results_dict = {}
def write_to_json(self, save_dir: str, filename: str):
"""Dump the result to a json file."""
dump_results_dict(self.results_dict, os.path.join(save_dir, filename))
def save_results(self, origin_prompt, ppl, idx):
self.results_dict[str(idx)] = {
'origin_prompt': origin_prompt,
'ppl': ppl,
}
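A hypothetical dataset-config fragment pairing the two new components; the key layout follows the usual OpenCompass infer_cfg/eval_cfg convention, and the import paths are assumed from the __init__ changes earlier in this commit:

from opencompass.openicl.icl_evaluator import AveragePPLEvaluator
from opencompass.openicl.icl_inferencer import PPLOnlyInferencer

infer_cfg = dict(inferencer=dict(type=PPLOnlyInferencer))
eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))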
import inspect
from abc import abstractmethod
from copy import deepcopy
from typing import Dict, List, Optional
from mmengine.config import ConfigDict
from opencompass.utils import get_logger, task_abbr_from_cfg
from opencompass.utils import (dataset_abbr_from_cfg, get_logger,
model_abbr_from_cfg, task_abbr_from_cfg)
class BasePartitioner:
......@@ -54,8 +56,7 @@ class BasePartitioner:
List[Dict]: A list of tasks.
"""
cfg = deepcopy(cfg)
models = cfg['models']
datasets = cfg['datasets']
work_dir = cfg['work_dir']
add_cfg = {}
......@@ -74,10 +75,11 @@ class BasePartitioner:
self.logger.debug(f'Key {k} not found in config, ignored.')
self.logger.debug(f'Additional config: {add_cfg}')
tasks = self.partition(models,
datasets,
work_dir,
self.out_dir,
model_and_dataset_args = self.parse_model_dataset_args(cfg)
tasks = self.partition(**model_and_dataset_args,
work_dir=work_dir,
out_dir=self.out_dir,
add_cfg=add_cfg)
self.logger.info(f'Partitioned into {len(tasks)} tasks.')
......@@ -86,6 +88,41 @@ class BasePartitioner:
return tasks
def parse_model_dataset_args(self, cfg: ConfigDict):
models = cfg['models']
datasets = cfg['datasets']
sig = inspect.signature(self.partition)
if 'model_dataset_combinations' in sig.parameters:
combs = cfg.get('model_dataset_combinations', None)
if combs is None:
combs = [{'models': models, 'datasets': datasets}]
else:
# sanity check
model_abbrs = [model_abbr_from_cfg(model) for model in models]
dataset_abbrs = [
dataset_abbr_from_cfg(dataset) for dataset in datasets
]
for comb in combs:
for model in comb['models']:
if model_abbr_from_cfg(model) not in model_abbrs:
raise ValueError(
f'Model {model_abbr_from_cfg(model)} '
'not found in config.')
for dataset in comb['datasets']:
if dataset_abbr_from_cfg(dataset) not in dataset_abbrs:
raise ValueError(
f'Dataset {dataset_abbr_from_cfg(dataset)} '
'not found in config.')
used_kwargs = {'model_dataset_combinations': combs}
else:
if cfg.get('model_dataset_combinations', None) is not None:
self.logger.warning(
'model_dataset_combinations is not supported by '
f'{self.__class__.__name__}. Ignored.')
used_kwargs = {'models': models, 'datasets': datasets}
return used_kwargs
@abstractmethod
def partition(self,
models: List[ConfigDict],
......
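The shape of the new optional top-level key, as implied by the sanity check above: each combination pins a subset of `models` to a subset of `datasets`, matched by abbr. All configs below are placeholders:

llama_7b = dict(abbr='llama-7b')    # placeholder model configs
llama_13b = dict(abbr='llama-13b')
gsm8k = dict(abbr='gsm8k')          # placeholder dataset configs
mmlu = dict(abbr='mmlu')

models = [llama_7b, llama_13b]
datasets = [gsm8k, mmlu]

# Evaluate llama-7b on gsm8k, but both models on mmlu:
model_dataset_combinations = [
    dict(models=[llama_7b], datasets=[gsm8k]),
    dict(models=[llama_7b, llama_13b], datasets=[mmlu]),
]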
......@@ -29,8 +29,8 @@ class NaivePartitioner(BasePartitioner):
self.n = n
def partition(self,
models: List[ConfigDict],
datasets: List[ConfigDict],
model_dataset_combinations: List[Dict[str,
List[ConfigDict]]],
work_dir: str,
out_dir: str,
add_cfg: Dict = {}) -> List[Dict]:
......@@ -48,8 +48,9 @@ class NaivePartitioner(BasePartitioner):
}
Args:
models (List[ConfigDict]): A list of model configs.
datasets (List[ConfigDict]): A list of dataset configs.
model_dataset_combinations (List[Dict]): List of
`{models: [...], datasets: [...]}` dicts. Each dict contains
a list of model configs and a list of dataset configs.
work_dir (str): The work dir for the task.
out_dir (str): The full output path for the task, intended for
Partitioners to check whether the task is finished via the
......@@ -60,9 +61,10 @@ class NaivePartitioner(BasePartitioner):
"""
tasks = []
for model in models:
for comb in model_dataset_combinations:
for model in comb['models']:
chunks = []
for dataset in datasets:
for dataset in comb['datasets']:
filename = get_infer_output_path(model, dataset, out_dir)
if osp.exists(filename):
continue
......
......@@ -51,8 +51,8 @@ class SizePartitioner(BasePartitioner):
self.strategy = strategy
def partition(self,
models: List[ConfigDict],
datasets: List[ConfigDict],
model_dataset_combinations: List[Dict[str,
List[ConfigDict]]],
work_dir: str,
out_dir: str,
add_cfg: Dict = {}) -> List[ConfigDict]:
......@@ -71,8 +71,9 @@ class SizePartitioner(BasePartitioner):
}
Args:
models (List[ConfigDict]): A list of model configs.
datasets (List[ConfigDict]): A list of dataset configs.
model_dataset_combinations (List[Dict]): List of
`{models: [...], datasets: [...]}` dicts. Each dict contains
a list of model configs and a list of dataset configs.
work_dir (str): The work dir for the task.
out_dir (str): The full output path for the task, intended for
Partitioners to check whether the task is finished via the
......@@ -84,13 +85,14 @@ class SizePartitioner(BasePartitioner):
List[ConfigDict]: A list of tasks.
"""
datasets = sorted(datasets,
tasks = []
for comb in model_dataset_combinations:
comb['datasets'] = sorted(comb['datasets'],
key=lambda x: self.get_cost(x),
reverse=True)
tasks = []
for model in models:
for model in comb['models']:
chunks = [] # elements: tuple(size, dataset_chunk)
for dataset in datasets:
for dataset in comb['datasets']:
filename = get_infer_output_path(model, dataset, out_dir)
# skip the task if the task output exists
if osp.exists(filename):
......@@ -101,7 +103,8 @@ class SizePartitioner(BasePartitioner):
dataset_splits = self.split_dataset(dataset)
for i, dataset_split in enumerate(dataset_splits):
if not osp.exists(f'{root}_{i}{ext}'):
chunks.append((self.max_task_size, dataset_split))
chunks.append(
(self.max_task_size, dataset_split))
else:
chunks.append((dataset_size, dataset))
......
......@@ -13,7 +13,7 @@ from mmengine.config import ConfigDict
from tqdm import tqdm
from opencompass.registry import RUNNERS, TASKS
from opencompass.utils import get_logger
from opencompass.utils import batched, get_logger
from .base import BaseRunner
......@@ -131,7 +131,11 @@ class SlurmSequentialRunner(BaseRunner):
break
parent_conn.close()
for job_id in tqdm(job_ids, desc='clear sruns'):
tbar = tqdm(total=len(job_ids), desc='clear sruns')
for batched_job_ids in batched(job_ids, 4):
ps = []
for job_id in batched_job_ids:
tbar.update()
if job_id is None:
continue
cmd = f'scancel {job_id}'
......@@ -139,7 +143,10 @@ class SlurmSequentialRunner(BaseRunner):
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
ps.append(p)
for p in ps:
p.wait()
tbar.close()
def _launch(self, cfg: ConfigDict, child_conn: Pipe = None):
logger = get_logger()
......
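A minimal sketch of the `batched` helper assumed above (the real one is imported from opencompass.utils; this version mirrors itertools.batched from Python 3.12 and shows why scancel commands now run in groups of four):

from itertools import islice

def batched(iterable, n):
    it = iter(iterable)
    while chunk := tuple(islice(it, n)):
        yield chunk

assert list(batched(range(7), 4)) == [(0, 1, 2, 3), (4, 5, 6)]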
......@@ -121,8 +121,9 @@ class OpenICLEvalTask(BaseTask):
pred_dicts = copy.deepcopy(preds)
preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}
pred_strs = preds.pop('prediction')
pred_list_flag = isinstance(pred_strs[0], list)
pred_strs = preds.pop('prediction', None)
pred_list_flag = pred_strs is not None and isinstance(
pred_strs[0], list)
if ('pred_role' in self.eval_cfg
and 'meta_template' in self.model_cfg
and not MODELS.get(self.model_cfg['type']).is_api):
......@@ -166,6 +167,12 @@ class OpenICLEvalTask(BaseTask):
]
icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator'])
# need results dir to save other files
out_path = get_infer_output_path(
self.model_cfg, self.dataset_cfg,
osp.join(self.work_dir, 'results'))
icl_evaluator._out_dir = osp.splitext(out_path)[
0] # strip extension
preds['predictions'] = pred_strs
preds['references'] = (test_set[self.output_column]
......
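A toy illustration of why the defensive pop matters: PPLOnlyInferencer results carry only 'origin_prompt' and 'ppl' fields, so the eval task must tolerate a missing 'prediction' key.

preds = [{'origin_prompt': 'p0', 'ppl': 11.2}, {'origin_prompt': 'p1', 'ppl': 9.7}]
preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}
pred_strs = preds.pop('prediction', None)  # None instead of KeyError
pred_list_flag = pred_strs is not None and isinstance(pred_strs[0], list)
assert pred_strs is None and pred_list_flag is False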
......@@ -49,6 +49,14 @@ def first_capital_postprocess(text: str) -> str:
return ''
@TEXT_POSTPROCESSORS.register_module('last-capital')
def last_capital_postprocess(text: str) -> str:
for t in text[::-1]:
if t.isupper():
return t
return ''
def first_option_postprocess(text: str, options: str) -> str:
"""Find first valid option for text."""
......
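A quick usage check of the new postprocessor (the import path is assumed; the function itself is shown in full above):

from opencompass.utils.text_postprocessors import last_capital_postprocess  # assumed module path

assert last_capital_postprocess('I think the answer is (B), not C') == 'C'
assert last_capital_postprocess('no capitals here') == ''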
json5
jupyter
jupyter_client
jupytext
lagent
scikit-image
sympy
faiss_gpu==1.7.2
jupyter
lagent
scikit-image