Unverified commit dbb20b82, authored by Fengzhe Zhou and committed by GitHub

[Sync] update (#517)

parent 6f07af30
@@ -13,9 +13,9 @@ class CMBDataset(BaseDataset):
     @staticmethod
     def load(path: str):
-        with open(osp.join(path, 'test.json'), 'r') as f:
+        with open(osp.join(path, 'test.json'), 'r', encoding='utf-8') as f:
             test_data = json.load(f)
-        with open(osp.join(path, 'val.json'), 'r') as f:
+        with open(osp.join(path, 'val.json'), 'r', encoding='utf-8') as f:
             val_data = json.load(f)
         for da in test_data:
...
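A note on the recurring change in this commit: the dataset loaders now pass encoding='utf-8' to every open() call. Without it, Python falls back to locale.getpreferredencoding(), which is platform dependent, so the Chinese JSON/JSONL files can raise UnicodeDecodeError or silently mis-decode on some Windows setups. A minimal sketch outside the patch (the temp-file path and sample record are made up for illustration):

    # Minimal sketch: why pinning the encoding matters for these loaders.
    import json
    import locale
    import tempfile

    sample = {'question': '以下哪项是正确的？', 'answer': 'A'}  # illustrative record
    with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False,
                                     encoding='utf-8') as f:
        json.dump(sample, f, ensure_ascii=False)
        path = f.name

    print('platform default encoding:', locale.getpreferredencoding(False))

    # The patched pattern: read the dataset with an explicit encoding.
    with open(path, 'r', encoding='utf-8') as f:
        print(json.load(f))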
@@ -13,7 +13,7 @@ class cmnliDataset_V2(BaseDataset):
     @staticmethod
     def load(path):
         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 if line['label'] == '-':
...
@@ -12,7 +12,7 @@ class CMRCDataset(BaseDataset):
     @staticmethod
     def load(path: str):
-        with open(path) as f:
+        with open(path, 'r', encoding='utf-8') as f:
             data = json.load(f)
         # Convert the raw data into the required format
         rows = []
...
@@ -13,7 +13,7 @@ class COPADataset_V2(BaseDataset):
     @staticmethod
     def load(path):
         dataset = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 line['label'] = 'AB'[line['label']]
...
@@ -31,7 +31,7 @@ class CslDataset_V2(BaseDataset):
     @staticmethod
     def load(path):
         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 item = {
...
@@ -12,7 +12,7 @@ class DRCDDataset(BaseDataset):
     @staticmethod
     def load(path: str):
-        with open(path) as f:
+        with open(path, 'r', encoding='utf-8') as f:
             data = json.load(f)
         # Convert the raw data into the required format
         rows = []
...
@@ -13,7 +13,7 @@ class eprstmtDataset_V2(BaseDataset):
     @staticmethod
     def load(path):
         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 item = {
...
+from opencompass.openicl import BaseEvaluator
 from opencompass.registry import TEXT_POSTPROCESSORS

@@ -26,3 +27,25 @@ def gsm8k_postprocess(text: str) -> str:
         if ret[i].isdigit():
             ret1 += ret[i]
     return ret1
+
+
+class Gsm8kEvaluator(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        correct = 0
+        count = 0
+        details = []
+        for i, j in zip(predictions, references):
+            detail = {'pred': i, 'answers': j, 'correct': False}
+            count += 1
+            if i == j:
+                correct += 1
+                detail['correct'] = True
+            details.append(detail)
+        result = {'accuracy': 100 * correct / count, 'details': details}
+        return result
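The new Gsm8kEvaluator scores by exact string match between post-processed predictions and references, and now returns per-sample details alongside the accuracy. A hypothetical usage sketch (the import path and values are assumptions, not taken from the diff):

    from opencompass.datasets.gsm8k import Gsm8kEvaluator  # assumed module path

    evaluator = Gsm8kEvaluator()
    result = evaluator.score(predictions=['18', '7', '42'],
                             references=['18', '7', '41'])
    print(round(result['accuracy'], 2))  # 66.67
    print(result['details'][2])
    # {'pred': '42', 'answers': '41', 'correct': False}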
@@ -49,7 +49,7 @@ class hellaswagDataset_V3(BaseDataset):
     @staticmethod
     def load(path):
         dataset = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 data = json.loads(line)
                 dataset.append({
...
@@ -148,11 +148,15 @@ class MATHEvaluator(BaseEvaluator):
             }
         correct = 0
         count = 0
+        details = []
         for i, j in zip(predictions, references):
+            detail = {'pred': i, 'answer': j, 'correct': False}
             count += 1
             if self.is_equiv(i, j):
                 correct += 1
-        result = {'accuracy': 100 * correct / count}
+                detail['correct'] = True
+            details.append(detail)
+        result = {'accuracy': 100 * correct / count, 'details': details}
         return result

     def _fix_fracs(self, string):
...
@@ -52,9 +52,14 @@ class NQEvaluator(BaseEvaluator):
         processed_answers = [[general_postprocess(j).lower() for j in i]
                              for i in references]

+        details = []
         cnt = 0
         for pred, cand_ans in zip(processed_predictions, processed_answers):
+            detail = {'pred': pred, 'answer': cand_ans, 'correct': False}
             cnt += int(any([cand == pred for cand in cand_ans]))
+            if int(any([cand == pred for cand in cand_ans])):
+                detail['correct'] = True
+            details.append(detail)
         score = cnt / len(predictions) * 100

-        return {'score': score}
+        return {'score': score, 'details': details}
@@ -67,7 +67,7 @@ class TNewsDataset_V2(BaseDataset):
         }

         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 item = {
...
@@ -51,9 +51,14 @@ class TriviaQAEvaluator(BaseEvaluator):
         processed_answers = [[general_postprocess(j).lower() for j in i]
                              for i in references]

+        details = []
         cnt = 0
         for pred, cand_ans in zip(processed_predictions, processed_answers):
+            detail = {'pred': pred, 'answer': cand_ans, 'correct': False}
             cnt += int(any([cand == pred for cand in cand_ans]))
+            if int(any([cand == pred for cand in cand_ans])):
+                detail['correct'] = True
+            details.append(detail)
         score = cnt / len(predictions) * 100

-        return {'score': score}
+        return {'score': score, 'details': details}
@@ -82,6 +82,20 @@ def strategyqa_pred_postprocess(text: str) -> str:
     return ''


+def flores_postprocess(text: str) -> str:
+    text = text.strip().split('\n')[-1].strip()
+    return text
+
+
+def flores_postprocess_chinese(text: str) -> str:
+    text = text.strip().split('\n')[-1].strip()
+    import jieba
+    truncated_text = text.strip().split('\n')[0]
+    cleaned_text = re.sub(r'\s+', ' ', truncated_text).strip()
+    cleaned_text = ' '.join(jieba.cut(cleaned_text))
+    return cleaned_text
+
+
 def record_postprocess(text: str) -> str:
     match = re.search(r'(?<=refers to )[^.]+', text)
...
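flores_postprocess_chinese keeps only the first line of the model output, collapses whitespace, and then re-tokenizes the Chinese text with jieba so that word-level metrics such as BLEU see space-separated tokens rather than one unsegmented string. A small sketch; the exact split depends on the installed jieba dictionary, so the output shown is only indicative:

    import re

    import jieba

    text = '今天的天气非常好，适合出门散步。'
    truncated = text.strip().split('\n')[0]
    cleaned = re.sub(r'\s+', ' ', truncated).strip()
    print(' '.join(jieba.cut(cleaned)))
    # e.g. 今天 的 天气 非常 好 ， 适合 出门 散步 。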
@@ -24,11 +24,18 @@ class EMEvaluator(BaseEvaluator):
                              for i in references]

         cnt = 0
+        details = []
         for pred, ans, origin_ans in zip(predictions, processed_answers,
                                          references):
+            answers = list(set(ans + origin_ans))
+            detail = {'pred': pred, 'answer': answers}
             if pred in ans or pred in origin_ans:
                 cnt += 1
+                detail['correct'] = True
+            else:
+                detail['correct'] = False
+            details.append(detail)

         score = cnt / len(predictions) * 100

-        return {'score': score}
+        return {'score': score, 'details': details}
@@ -51,8 +51,7 @@ class BaseInferencer:
         self.output_json_filepath = output_json_filepath
         self.output_json_filename = output_json_filename
         self.is_main_process = is_main_process()
-        if not os.path.exists(self.output_json_filepath):
-            os.makedirs(self.output_json_filepath)
+        os.makedirs(self.output_json_filepath, exist_ok=True)

     def inference(self,
                   retriever: BaseRetriever,
...
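The check-then-create pattern removed above is racy when several inferencer workers start at once: another process can create the directory between the os.path.exists check and os.makedirs, which then raises FileExistsError. With exist_ok=True the call is idempotent. A one-liner sketch with an illustrative path:

    import os

    out_dir = 'outputs/default/predictions'  # illustrative path, not from the diff
    os.makedirs(out_dir, exist_ok=True)      # safe even if another worker created it first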
@@ -94,6 +94,7 @@ class PPLInferencer(BaseInferencer):
             index = 0
             prompt_list = []
             sub_ppl_list = []
+            token_num_list = []
             normalizing_prompt_list = []
             context_length_list = []

@@ -144,6 +145,7 @@ class PPLInferencer(BaseInferencer):
                             mode='ppl'))
                     normalizing_prompt_list.append(normalizing_prompt)
                 prompt_list.append(prompt)
+                token_num_list.append(prompt_token_num)

             if normalizing_str is not None:
                 normalizing_str_len = self.model.get_token_len_from_template(

@@ -186,6 +188,10 @@ class PPLInferencer(BaseInferencer):
                     ice_str = self.model.parse_template(ice[idx], mode='ppl')
                     output_handler.save_prompt_and_ppl(
                         label, prompt.replace(ice_str, ''), prompt, res, index)
+                    output_handler.results_dict[str(
+                        index)][f'label: {str(label)}'][
+                            'BPB'] = res * token_num_list[idx] / len(
+                                prompt.replace(ice_str, '').encode())
                     index = index + 1
                 ppl.append(sub_ppl_list)
...
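The added 'BPB' entry rescales the perplexity result by prompt length in bytes. Assuming res is the mean per-token loss returned by the model's PPL path and token_num_list[idx] is the prompt's token count (both assumptions about the surrounding inferencer, not stated in the diff), the stored value is the total loss divided by the UTF-8 byte length of the prompt with the in-context examples stripped, which makes scores less sensitive to tokenizer granularity. A standalone sketch of the arithmetic:

    def loss_per_byte(res: float, token_num: int, prompt: str) -> float:
        # total loss over the prompt, normalized by its UTF-8 byte length
        return res * token_num / len(prompt.encode('utf-8'))

    print(loss_per_byte(2.1, 120, 'example prompt ' * 30))  # made-up numbers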
 from abc import abstractmethod
 from copy import deepcopy
-from typing import Dict, List
+from typing import Dict, List, Optional

 from mmengine.config import ConfigDict

@@ -13,16 +13,24 @@ class BasePartitioner:

     Args:
         out_dir (str): The output directory of tasks.
-        keep_keys (List[str]): The keys to be kept from the experiment config
-            to the task config.
+        keep_keys (Optional[List[str]], optional): The keys to be kept from the
+            experiment config to the task config. Defaults to None. If None,
+            the following keys will be kept:
+            - eval.runner.task.judge_cfg
+            - eval.runner.task.dump_details
     """

-    def __init__(self,
-                 out_dir: str,
-                 keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
+    def __init__(self, out_dir: str, keep_keys: Optional[List[str]] = None):
         self.logger = get_logger()
         self.out_dir = out_dir
-        self.keep_keys = keep_keys
+        if keep_keys is None:
+            self.keep_keys = [
+                'eval.runner.task.judge_cfg',
+                'eval.runner.task.dump_details',
+            ]
+        else:
+            self.keep_keys = keep_keys

     def __call__(self, cfg: ConfigDict) -> List[Dict]:
         """Generate tasks from config. Each task is defined as a

@@ -63,7 +71,8 @@ class BasePartitioner:
                     tgt_ptr = tgt_ptr[key]
                 tgt_ptr[key_chain[-1]] = ori_ptr[key_chain[-1]]
             except Exception:
-                self.logger.warning(f'Key {k} not found in config, ignored.')
+                self.logger.debug(f'Key {k} not found in config, ignored.')
+        self.logger.debug(f'Additional config: {add_cfg}')

         tasks = self.partition(models,
                                datasets,
...
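Switching keep_keys from a list-literal default to None, with the default filled in inside __init__, is the usual way to grow a default in one place (here it gains eval.runner.task.dump_details) and to sidestep Python's shared-mutable-default pitfall. A generic illustration of that pitfall, not opencompass code:

    def buggy(keys=['eval.runner.task.judge_cfg']):
        keys.append('extra')          # mutates the single shared default list
        return keys

    print(buggy())  # ['eval.runner.task.judge_cfg', 'extra']
    print(buggy())  # ['eval.runner.task.judge_cfg', 'extra', 'extra']

    def fixed(keys=None):
        keys = ['eval.runner.task.judge_cfg'] if keys is None else keys
        keys.append('extra')
        return keys

    print(fixed())  # ['eval.runner.task.judge_cfg', 'extra']
    print(fixed())  # ['eval.runner.task.judge_cfg', 'extra']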
 import os.path as osp
-from typing import Dict, List
+from typing import Dict, List, Optional

 from mmengine.config import Config, ConfigDict

@@ -11,15 +11,23 @@ from .base import BasePartitioner

 @PARTITIONERS.register_module()
 class NaivePartitioner(BasePartitioner):
-    """Naive task partitioner. This partitioner will generate a task for each
-    model-dataset pair.
+    """Naive task partitioner. This partitioner will generate a task for each n
+    model-dataset pairs.

     Args:
         out_dir (str): The output directory of tasks.
+        n (int): The number of model-dataset pairs in each task.
         keep_keys (List[str]): The keys to be kept from the experiment config
             to the task config.
     """

+    def __init__(self,
+                 out_dir: str,
+                 n: int = 1,
+                 keep_keys: Optional[List[str]] = None):
+        super().__init__(out_dir=out_dir, keep_keys=keep_keys)
+        self.n = n
+
     def partition(self,
                   models: List[ConfigDict],
                   datasets: List[ConfigDict],

@@ -53,13 +61,17 @@ class NaivePartitioner(BasePartitioner):

         tasks = []
         for model in models:
+            chunks = []
             for dataset in datasets:
                 filename = get_infer_output_path(model, dataset, out_dir)
                 if osp.exists(filename):
                     continue
+                chunks.append(dataset)
+
+            for i in range(0, len(chunks), self.n):
                 task = Config({
                     'models': [model],
-                    'datasets': [[dataset]],
+                    'datasets': [chunks[i:i + self.n]],
                     'work_dir': work_dir,
                     **add_cfg
                 })
...
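With the new n parameter, the partitioner first collects every dataset whose prediction file does not exist yet into chunks, then emits one task per group of up to n datasets. A sketch of the grouping with placeholder names instead of ConfigDicts:

    def group(datasets, n):
        return [datasets[i:i + n] for i in range(0, len(datasets), n)]

    print(group(['ds_a', 'ds_b', 'ds_c', 'ds_d', 'ds_e'], 2))
    # [['ds_a', 'ds_b'], ['ds_c', 'ds_d'], ['ds_e']]  -> one task per group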
@@ -2,7 +2,7 @@ import copy
 import math
 import os.path as osp
 from fnmatch import fnmatch
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union

 import mmengine
 from mmengine.config import Config, ConfigDict

@@ -24,6 +24,11 @@ class SizePartitioner(BasePartitioner):
         max_task_size (int): The maximum size of a task.
         gen_task_coef (int): The dataset cost measurement coefficient for
             generation tasks.
+        strategy (str): The partition strategy. Supported strategies are:
+            'heuristic' and 'split'. Defaults to 'heuristic'.
+            heuristic: split large datasets into several tasks, merge small
+            datasets into one task.
+            split: split large datasets into several tasks only.
         dataset_size_path (str): The path to the dataset size cache file.
         keep_keys (list[str]): The keys to be kept from the experiment config
             to the task config.

@@ -33,12 +38,17 @@ class SizePartitioner(BasePartitioner):
                  out_dir: str,
                  max_task_size: int = 40000,
                  gen_task_coef: int = 20,
+                 strategy: str = 'heuristic',
                  dataset_size_path: str = '.cache/dataset_size.json',
-                 keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
+                 keep_keys: Optional[List[str]] = None):
         super().__init__(out_dir=out_dir, keep_keys=keep_keys)
         self.max_task_size = max_task_size
         self.gen_task_coef = gen_task_coef
         self.dataset_size_path = dataset_size_path
+        assert strategy in ('heuristic', 'split'), \
+            f'Unsupported partition strategy: {strategy}. '\
+            'Supported strategies are: `heuristic`, `split` .'
+        self.strategy = strategy

     def partition(self,
                   models: List[ConfigDict],

@@ -79,47 +89,47 @@ class SizePartitioner(BasePartitioner):
                           reverse=True)
         tasks = []
         for model in models:
-            task = Config({
-                'models': [model],
-                'datasets': [[]],
-                'work_dir': work_dir,
-                **add_cfg
-            })
-            num_data = 0
+            chunks = []  # elements: tuple(size, dataset_chunk)
             for dataset in datasets:
                 filename = get_infer_output_path(model, dataset, out_dir)
+                root, ext = osp.splitext(filename)
                 # skip the task if the task output exists
                 if osp.exists(filename):
                     continue
                 dataset_size = self.get_cost(dataset)
                 if dataset_size > self.max_task_size:
-                    root, ext = osp.splitext(filename)
                     dataset_splits = self.split_dataset(dataset)
                     for i, dataset_split in enumerate(dataset_splits):
-                        # skip the task it the task output exists
                         if not osp.exists(f'{root}_{i}{ext}'):
-                            tasks.append(
-                                Config({
-                                    'models': [model],
-                                    'datasets': [[dataset_split]],
-                                    'work_dir': work_dir,
-                                    **add_cfg
-                                }))
+                            chunks.append((self.max_task_size, dataset_split))
                 else:
-                    if num_data + dataset_size > self.max_task_size:
-                        tasks.append(task)
-                        task = Config({
-                            'models': [model],
-                            'datasets': [[]],
-                            'work_dir': work_dir,
-                            **add_cfg
-                        })
-                        num_data = 0
-                    task['datasets'][0].append(dataset)
-                    num_data = num_data + dataset_size
-            if task['datasets'][0]:
-                tasks.append(task)
+                    chunks.append((dataset_size, dataset))
+
+            if self.strategy == 'heuristic':
+                chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
+                current_size, current_chunks = 0, []
+                for index in range(len(chunks)):
+                    current_size += chunks[index][0]
+                    current_chunks.append(chunks[index][1])
+                    if index == len(chunks) - 1 or current_size + chunks[
+                            index + 1][0] > self.max_task_size:
+                        tasks.append(
+                            Config({
+                                'models': [model],
+                                'datasets': [current_chunks],
+                                'work_dir': work_dir,
+                                **add_cfg
+                            }))
+                        current_size, current_chunks = 0, []
+            elif self.strategy == 'split':
+                for _, dataset in chunks:
+                    tasks.append(
+                        Config({
+                            'models': [model],
+                            'datasets': [[dataset]],
+                            'work_dir': work_dir,
+                            **add_cfg
+                        }))

         return tasks

     @property
...
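The new 'heuristic' strategy sorts the outstanding (cost, dataset) chunks by cost in descending order and greedily fills a task until appending the next chunk would exceed max_task_size; 'split' reproduces the old behaviour of one (possibly split) dataset per task. A standalone sketch of the greedy packing with plain tuples in place of dataset configs:

    def pack(chunks, max_task_size):
        chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
        tasks, current_size, current = [], 0, []
        for index, (size, name) in enumerate(chunks):
            current_size += size
            current.append(name)
            # close the task on the last chunk, or when the next chunk
            # would push the running size over the limit
            if index == len(chunks) - 1 or \
                    current_size + chunks[index + 1][0] > max_task_size:
                tasks.append(current)
                current_size, current = 0, []
        return tasks

    print(pack([(300, 'big'), (120, 'mid'), (80, 'small'), (40, 'tiny')], 400))
    # [['big'], ['mid', 'small', 'tiny']]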