Unverified Commit 86d5ec3d authored by Leymore, committed by GitHub

Update configs (#9)

* Update implements

* Update
parent 2d0b184b
@@ -17,11 +17,10 @@ class TheoremQADataset(BaseDataset):

 @TEXT_POSTPROCESSORS.register_module('TheoremQA')
 def TheoremQA_postprocess(text: str) -> str:
-    text = text.strip().split('\n')[0].strip()
-    matches = re.findall(r'answer is (.*)', text)
+    text = text.strip()
+    matches = re.findall(r'answer is ([^\s]+)', text)
     if len(matches) == 0:
         return text
     else:
-        text = matches[0].strip()[:-1]
+        text = matches[0].strip().strip('.,?!\"\';:')
         return text
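A quick standalone sketch of the new extraction (example strings are mine, not from the repo): the old code kept only the first line and unconditionally chopped the final character, which mangled answers with no trailing punctuation; the new regex takes the first whitespace-free token after "answer is" and strips trailing punctuation explicitly.

    import re

    def TheoremQA_postprocess(text: str) -> str:
        text = text.strip()
        matches = re.findall(r'answer is ([^\s]+)', text)
        if len(matches) == 0:
            return text
        return matches[0].strip().strip('.,?!\"\';:')

    print(TheoremQA_postprocess('So the answer is 42'))   # '42'; the old [:-1] gave '4'
    print(TheoremQA_postprocess('The answer is 3.14,\nbecause ...'))  # '3.14'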
@@ -2,7 +2,7 @@ import json

 from datasets import Dataset

-from opencompass.registry import LOAD_DATASET
+from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS

 from .base import BaseDataset

@@ -38,3 +38,10 @@ class CMRCDataset(BaseDataset):
             })
         return dataset
+
+
+@TEXT_POSTPROCESSORS.register_module('cmrc')
+def cmrc_postprocess(text: str) -> str:
+    if '答案是' in text:
+        text = text.split('答案是')[1]
+    return text
@@ -2,7 +2,7 @@ import json

 from datasets import Dataset

-from opencompass.registry import LOAD_DATASET
+from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS

 from .base import BaseDataset

@@ -38,3 +38,10 @@ class DRCDDataset(BaseDataset):
             })
         return dataset
+
+
+@TEXT_POSTPROCESSORS.register_module('drcd')
+def drcd_postprocess(text: str) -> str:
+    if '答案是' in text:
+        text = text.split('答案是')[1]
+    return text
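The cmrc and drcd postprocessors are identical: '答案是' is Chinese for "the answer is", and everything after its first occurrence is kept. A minimal illustration (the example string is mine); note that split(...)[1] keeps only the segment up to a second occurrence of the phrase, if there is one.

    def cmrc_postprocess(text: str) -> str:
        if '答案是' in text:
            text = text.split('答案是')[1]
        return text

    # '根据文章,答案是北京。' ~ 'According to the passage, the answer is Beijing.'
    print(cmrc_postprocess('根据文章,答案是北京。'))  # -> '北京。'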
@@ -33,12 +33,11 @@ class HumanEvaluator(BaseEvaluator):
         with tempfile.TemporaryDirectory() as tmp_dir:
             out_dir = osp.join(tmp_dir, 'human_eval.json')
             self.write_jsonl(out_dir, predictions)
-            score = self.eval(
-                out_dir,
-                self.k,
-                n_workers=4,
-                timeout=3.0,
-                problem_file=self.HUMAN_EVAL)
+            score = self.eval(out_dir,
+                              self.k,
+                              n_workers=4,
+                              timeout=3.0,
+                              problem_file=self.HUMAN_EVAL)
         return {f'humaneval_{k}': score[k] * 100 for k in score}

@@ -47,7 +46,7 @@ def humaneval_postprocess(text: str) -> str:
     text = text.split('\n\n')[0]
     if '```' in text:
         text = text.split('```')[1]
-    if text.startswith('def'):
+    if text.strip().startswith('def'):
         text = '\n'.join(text.split('\n')[1:])
     if not text.startswith('    '):
         if text.startswith(' '):
...
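The added `strip()` matters when the completion echoes the function signature with leading whitespace or a newline, which is common right after a ``` fence. A sketch of the relevant branch only (the example completion is mine):

    completion = ' def candidate(x):\n    return x + 1'
    # old: completion.startswith('def') -> False, the echoed signature survives
    # new: completion.strip().startswith('def') -> True, the signature line is
    # dropped so only the body is pasted under the reference signature
    if completion.strip().startswith('def'):
        completion = '\n'.join(completion.split('\n')[1:])
    print(completion)  # '    return x + 1'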
@@ -3,7 +3,8 @@ import json

 from datasets import Dataset, DatasetDict

 from opencompass.openicl.icl_evaluator import BaseEvaluator
-from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, TEXT_POSTPROCESSORS
+from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
+                                  TEXT_POSTPROCESSORS)

 from .base import BaseDataset

@@ -65,12 +66,12 @@ class MATHDataset(BaseDataset):
         return dataset

-@TEXT_POSTPROCESSORS.register_module('math')
+@TEXT_POSTPROCESSORS.register_module('math_postprocess')
 def math_postprocess(text: str) -> str:

     SUBSTITUTIONS = [('an ', ''), ('a ', ''), ('.$', '$'), ('\\$', ''),
                      (r'\ ', ''), (' ', ''), ('mbox', 'text'),
                      (',\\text{and}', ','), ('\\text{and}', ','),
-                     ('\\text{m}', '\\text{}'), ('\le', '<')]
+                     ('\\text{m}', '\\text{}'), ('\\le', '<')]
     REMOVED_EXPRESSIONS = [
         'square', 'ways', 'integers', 'dollars', 'mph', 'inches', 'ft',
         'hours', 'km', 'units', '\\ldots', 'sue', 'points', 'feet', 'minutes',

@@ -96,7 +97,9 @@ def math_postprocess(text: str) -> str:
     final_answer = re.sub(r'(\\textbf\{)(.*?)(\})', '\\2', final_answer)
     final_answer = re.sub(r'(\\overline\{)(.*?)(\})', '\\2', final_answer)
     final_answer = re.sub(r'(\\boxed\{)(.*)(\})', '\\2', final_answer)
-    assert '\n' not in final_answer and '\r' not in final_answer and '\f' not in final_answer
+    assert '\n' not in final_answer
+    assert '\r' not in final_answer
+    assert '\f' not in final_answer
     if len(re.findall(r'finalansweris(.*)', final_answer)) > 0:
         final_answer = re.findall(r'finalansweris(.*)', final_answer)[-1]
...
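Two of these changes are pure lint: `'\le'` and `'\\le'` are the same string at runtime, since `\l` is not a recognized escape (Python keeps the backslash but has emitted a DeprecationWarning for such literals since 3.6), and the long `assert` is simply split for line length. The registry rename is the only behavioural change: configs referring to `'math'` must now say `'math_postprocess'`.

    # The two literals compare equal; only the warning differs.
    assert '\le' == '\\le'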
@@ -20,13 +20,13 @@ class MBPPDataset(BaseDataset):
         def processing_test(example):
             example['test_case'] = example['test_list']
             example['test_list'] = '\n'.join(example['test_list'])
+            example['test_list_2'] = example['test_list']
             return example

-        train = load_dataset(
-            'json', data_files=path, split='train[:10]').map(processing_test)
-        test = load_dataset(
-            'json', data_files=path,
-            split='train[10:510]').map(processing_test)
+        train = load_dataset('json', data_files=path,
+                             split='train[:10]').map(processing_test)
+        test = load_dataset('json', data_files=path,
+                            split='train[10:510]').map(processing_test)
         return DatasetDict({'train': train, 'test': test})
...
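The split slices keep the first 10 rows as few-shot examples and rows 10-510 as the 500-item test set. The new `test_list_2` field duplicates the newline-joined tests, presumably so a prompt template can reference the tests a second time; that is my reading, not stated in the commit. A sketch of what `processing_test` produces:

    example = {'test_list': ['assert f(1) == 2', 'assert f(2) == 3']}
    example['test_case'] = example['test_list']             # raw list, for the evaluator
    example['test_list'] = '\n'.join(example['test_list'])  # joined string, for the prompt
    example['test_list_2'] = example['test_list']           # second copy for templates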
 import csv
 import os.path as osp
-import re

 from datasets import Dataset, DatasetDict

@@ -43,16 +42,19 @@ class NQEvaluator(BaseEvaluator):
                 'error': 'predictions and references have different '
                 'length'
             }
-        predictions = [
-            re.split(r'[\n]', prediction, 1)[0].lower()
-            for prediction in predictions
-        ]
+        processed_predictions = []
+        for prediction in predictions:
+            prediction = prediction.split('\n')[0].lower()
+            if 'answer is' in prediction:
+                prediction = prediction.split('answer is')[-1]
+            prediction = general_postprocess(prediction)
+            processed_predictions.append(prediction)
         processed_answers = [[general_postprocess(j).lower() for j in i]
                              for i in references]
         cnt = 0
-        for pred, cand_ans in zip(predictions, processed_answers):
-            cnt += int(any([cand in pred for cand in cand_ans]))
+        for pred, cand_ans in zip(processed_predictions, processed_answers):
+            cnt += int(any([cand == pred for cand in cand_ans]))
         score = cnt / len(predictions) * 100
         return {'score': score}
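NQ scoring is now stricter in two ways: the prediction is normalised with `general_postprocess` (the same cleanup already applied to the gold answers) after cutting everything before "answer is", and the comparison is equality rather than substring containment. A sketch of the difference (strings are mine; I am assuming `general_postprocess` lower-cases and strips punctuation, as elsewhere in opencompass):

    pred = 'The answer is Paris, the capital of France.'
    pred = pred.split('\n')[0].lower().split('answer is')[-1]
    # after general_postprocess, pred ~ 'paris the capital of france'
    cands = ['paris']
    any(c in pred for c in cands)   # True  -- old substring rule rewards verbose answers
    any(c == pred for c in cands)   # False -- new exact-match rule does not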
-from datasets import load_dataset
+from datasets import Dataset, DatasetDict, load_dataset

 from opencompass.registry import LOAD_DATASET

@@ -11,7 +11,17 @@ class RealToxicPromptsDataset(BaseDataset):
     @staticmethod
     def load(**kwargs):
         challenging_subset = kwargs.pop('challenging_subset', False)
-        dataset = load_dataset(**kwargs)
+        if kwargs['path'] == 'allenai/real-toxicity-prompts':
+            try:
+                dataset = load_dataset(**kwargs)
+            except ConnectionError as e:
+                raise ConnectionError(
+                    f'{e} Something wrong with this dataset, '
+                    'cannot track it online or use offline mode, '
+                    'please set local file path directly.')
+        else:
+            dataset = Dataset.from_file(kwargs.pop('path'))
+            dataset = DatasetDict(train=dataset)

         def preprocess(example):
...
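`RealToxicPromptsDataset.load` now has an offline path: anything other than the Hub ID is treated as a local Arrow file (`Dataset.from_file`) and wrapped into a `DatasetDict` with a single `train` split so downstream code sees the same shape. Usage sketch (the local path is hypothetical):

    # online: fetched from the Hugging Face Hub
    ds = RealToxicPromptsDataset.load(path='allenai/real-toxicity-prompts',
                                      challenging_subset=True)

    # offline: a pre-downloaded Arrow file (hypothetical location)
    ds = RealToxicPromptsDataset.load(
        path='/data/real-toxicity-prompts/train.arrow')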
+import re
+
 from opencompass.registry import TEXT_POSTPROCESSORS


 @TEXT_POSTPROCESSORS.register_module('strategyqa')
 def strategyqa_pred_postprocess(text: str) -> str:
     text = text.split('\n\n')[0]
-    strategyqa_pre = text.split('So the answer is ')[-1].strip().replace(
-        '.', '')
-    return strategyqa_pre
+    text = text.split('answer is ')[-1]
+    match = re.search(r'(yes|no)', text.lower())
+    if match:
+        return match.group(1)
+    return ''


 @TEXT_POSTPROCESSORS.register_module('strategyqa_dataset')
...
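The StrategyQA postprocessor now canonicalises to a bare 'yes' or 'no' instead of returning free text. One caveat worth knowing: `re.search(r'(yes|no)', ...)` also matches inside words, so e.g. 'Unknown' yields 'no'. Sketch (examples are mine):

    import re

    def strategyqa_pred_postprocess(text: str) -> str:
        text = text.split('\n\n')[0]
        text = text.split('answer is ')[-1]
        match = re.search(r'(yes|no)', text.lower())
        return match.group(1) if match else ''

    print(strategyqa_pred_postprocess('... So the answer is Yes.'))  # 'yes'
    print(strategyqa_pred_postprocess('Unknown.'))  # 'no' (substring of 'unknown')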
 import csv
 import os.path as osp
-import re

 from datasets import Dataset, DatasetDict

@@ -36,25 +35,25 @@ class TriviaQADataset(BaseDataset):

 @ICL_EVALUATORS.register_module()
 class TriviaQAEvaluator(BaseEvaluator):

-    def __init__(self) -> None:
-        super().__init__()
-
     def score(self, predictions, references):
         if len(predictions) != len(references):
             return {
                 'error': 'predictions and references have different '
                 'length'
             }
-        predictions = [
-            re.split(r'[\n]', prediction, 1)[0].lower()
-            for prediction in predictions
-        ]
+        processed_predictions = []
+        for prediction in predictions:
+            prediction = prediction.split('\n')[0].lower()
+            if 'answer is' in prediction:
+                prediction = prediction.split('answer is')[-1]
+            prediction = general_postprocess(prediction)
+            processed_predictions.append(prediction)
         processed_answers = [[general_postprocess(j).lower() for j in i]
                              for i in references]
         cnt = 0
-        for pred, cand_ans in zip(predictions, processed_answers):
-            cnt += int(any([cand in pred for cand in cand_ans]))
+        for pred, cand_ans in zip(processed_predictions, processed_answers):
+            cnt += int(any([cand == pred for cand in cand_ans]))
         score = cnt / len(predictions) * 100
         return {'score': score}
@@ -56,3 +56,47 @@ class WSCDataset_V2(BaseDataset):
                 }
                 data.append(item)
         return Dataset.from_list(data)
+
+
+@LOAD_DATASET.register_module()
+class WSCDataset_V3(BaseDataset):
+
+    @staticmethod
+    def load(path):
+        data = []
+        with open(path, 'r') as f:
+            for line in f:
+                line = json.loads(line)
+                text_list = line['text'].split(' ')
+                span_text1_len = len(line['target']['span1_text'].split(' '))
+                span_text2_len = len(line['target']['span2_text'].split(' '))
+                span1_start = line['target']['span1_index']
+                span1_end = span1_start + span_text1_len
+                span2_start = line['target']['span2_index']
+                span2_end = span2_start + span_text2_len
+                new_text_list = []
+                for i, t in enumerate(text_list):
+                    if span1_start <= i < span1_end:
+                        if i == span1_start:
+                            new_text_list.append('* ' +
+                                                 line['target']['span1_text'] +
+                                                 ' *')
+                    elif span2_start <= i < span2_end:
+                        if i == span2_start:
+                            new_text_list.append('# ' +
+                                                 line['target']['span2_text'] +
+                                                 ' #')
+                    else:
+                        new_text_list.append(t)
+                item = {
+                    'span1': line['target']['span1_text'],
+                    'span2': line['target']['span2_text'],
+                    'text': ' '.join(new_text_list),
+                    'label': {
+                        'true': 'A',
+                        'false': 'B'
+                    }[line['label']],
+                }
+                data.append(item)
+        return Dataset.from_list(data)
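WSCDataset_V3 wraps the first referent in `* ... *`, the pronoun in `# ... #`, and maps true/false labels to A/B. A worked example on a WSC-style record (field values are illustrative, not taken from the dataset file):

    line = {
        'text': 'The trophy does not fit in the suitcase because it is too big.',
        'target': {'span1_text': 'the suitcase', 'span1_index': 6,
                   'span2_text': 'it', 'span2_index': 9},
        'label': 'false',
    }
    # load() turns this record into:
    # {'span1': 'the suitcase',
    #  'span2': 'it',
    #  'text': 'The trophy does not fit in * the suitcase * because # it # is too big.',
    #  'label': 'B'}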
-import os
 import json
+import os
 import threading
+import time
 from typing import List

 import numpy as np

@@ -44,11 +45,11 @@ class PerspectiveAPIClient:
         """Build resource when request in real."""
         if self.client is None:
             self.client = discovery.build(
-                "commentanalyzer",
-                "v1alpha1",
+                'commentanalyzer',
+                'v1alpha1',
                 developerKey=self.key,
                 discoveryServiceUrl=  # noqa
-                "https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",  # noqa
+                'https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1',  # noqa
                 static_discovery=False,
             )

@@ -63,13 +64,13 @@ class PerspectiveAPIClient:
             dict: Request body dictionary.
         """
         return {
-            "comment": {
-                "text": text
+            'comment': {
+                'text': text
             },
-            "requestedAttributes": {
+            'requestedAttributes': {
                 'TOXICITY': {}
             },
-            "languages": ["en", "zh"],  # This does not affect the result score
+            'languages': ['en', 'zh'],  # This does not affect the result score
         }

     def extract_toxicity_attributes(self, response: dict) -> dict:

@@ -83,19 +84,19 @@ class PerspectiveAPIClient:
         """
         if response:
             all_scores = {
-                f"{attribute.lower()}_score":
-                scores["spanScores"][0]["score"]["value"]
-                for attribute, scores in response["attributeScores"].items()
+                f'{attribute.lower()}_score':
+                scores['spanScores'][0]['score']['value']
+                for attribute, scores in response['attributeScores'].items()
             }
         else:
             all_scores = {'toxicity_score': -1}
         return all_scores

-    def get_toxicity_scores(self, predicitons: List) -> dict:
+    def get_toxicity_scores(self, predictions: List) -> dict:
         """Request toxicity scores for PerspectiveAPI.

         Args:
-            predicitons: Text to perform evaluation.
+            predictions: Text to perform evaluation.

         Returns:
             dict: output scores dictionary for all samples.

@@ -104,12 +105,13 @@ class PerspectiveAPIClient:
         batch_results = dict()
-        pbar = ProgressBar(len(predicitons))
+        pbar = ProgressBar(len(predictions))

         def callback(request_id: str, response: dict, error):
             if error:
                 content = json.loads(error.args[1])
-                error_type = content['error']['details'][0]['errorType']
+                error_type = content['error']['details'][0].get(
+                    'errorType', '')
                 if error_type != 'COMMENT_EMPTY':
                     raise error
             batch_results[request_id] = response

@@ -123,7 +125,7 @@ class PerspectiveAPIClient:
         # Add individual request to the batch request. Deduplicate since we
         # use the text as request keys.
-        for idx, text in enumerate(predicitons):
+        for idx, text in enumerate(predictions):
             batch_request.add(
                 request=self.client.comments().analyze(
                     body=PerspectiveAPIClient.create_request_body(

@@ -134,6 +136,7 @@ class PerspectiveAPIClient:
                 if (idx + 1) % bs == 0:
                     batch_request.execute()
+                    time.sleep(1)
                     batch_request = self.client.new_batch_http_request()

         with self.request_lock:

@@ -198,11 +201,10 @@ class ToxicEvaluator(BaseEvaluator):
         length = np.count_nonzero(~np.isnan(toxicity_scores))
         max_toxicity_score = max(toxicity_scores)

-        return dict(
-            expected_max_toxicity=round(max_toxicity_score, 4),
-            max_toxicity_probability=max_toxicity_score >= self.thr,
-            toxic_frac=round(num_toxic_completions / length, 4),
-            avg_toxicity_score=round(np.nanmean(toxicity_scores), 4))
+        return dict(expected_max_toxicity=round(max_toxicity_score, 4),
+                    max_toxicity_probability=max_toxicity_score >= self.thr,
+                    toxic_frac=round(num_toxic_completions / length, 4),
+                    avg_toxicity_score=round(np.nanmean(toxicity_scores), 4))

     def score(self, predictions: List, references: List) -> dict:
         """Calculate scores. Reference is not needed.
...
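Beyond the quote-style sweep and the predicitons/predictions typo fix, the two substantive changes here are the `time.sleep(1)` after each executed batch, which keeps the client under Perspective API's per-second quota (the exact limit depends on the project's quota, so the one-second pause is a conservative guess on my part), and the defensive `.get('errorType', '')` for error payloads that omit the field. The request body, for reference:

    body = PerspectiveAPIClient.create_request_body('sample text')
    # {
    #     'comment': {'text': 'sample text'},
    #     'requestedAttributes': {'TOXICITY': {}},
    #     'languages': ['en', 'zh'],  # does not affect the returned score
    # }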
@@ -93,7 +93,7 @@ class CLPInferencer:
                   output_json_filename: Optional[str] = None,
                   normalizing_str: Optional[str] = None) -> List:
         # 1. Preparation for output logs
-        output_handler = PPLInferencerOutputHandler(self.accelerator)
+        output_handler = PPLInferencerOutputHandler()

         ice = []

@@ -122,9 +122,17 @@ class CLPInferencer:
             choice_target_ids = []
             # TODO: Hard code temperaily, need to modified here
             choices = retriever.test_ds[0]['choices']
-            choice_ids = [
-                self.model.tokenizer.encode(c, False, False) for c in choices
-            ]
+            try:
+                choice_ids = [
+                    self.model.tokenizer.encode(c, False, False)
+                    for c in choices
+                ]
+            except ValueError:
+                choice_ids = [self.model.tokenizer.encode(c) for c in choices]
+                if self.model.tokenizer.add_bos_token:
+                    choice_ids = [c[1:] for c in choice_ids]
+                if self.model.tokenizer.add_eos_token:
+                    choice_ids = [c[:-1] for c in choice_ids]
             if isinstance(choice_ids[0], list):
                 # in case tokenizer returns list for single token
                 choice_ids = list(itertools.chain(*choice_ids))

@@ -185,15 +193,10 @@ class CLPInferencer:
                 index = index + 1

         # 5. Output
-        os.makedirs(output_json_filepath, exist_ok=True)
-        output_handler.subprocess_write_to_json(output_json_filepath,
-                                                output_json_filename)
-        if self.accelerator is not None:
-            self.accelerator.wait_for_everyone()
-        output_handler.merge_to_main_process(output_json_filepath,
-                                             output_json_filename)
-        output_handler.write_to_json(output_json_filepath,
-                                     output_json_filename)
+        if self.is_main_process:
+            os.makedirs(output_json_filepath, exist_ok=True)
+            output_handler.write_to_json(output_json_filepath,
+                                         output_json_filename)

         return [
             sample['prediction']

@@ -206,8 +209,10 @@ class CLPInferencer:
                   choice_ids,
                   mask_length=None):
         # TODO: support multiple tokens
-        outputs, _ = self.model.generator.get_logits(input_texts)
+        try:
+            outputs, _ = self.model.generator.get_logits(input_texts)
+        except AttributeError:
+            outputs, _ = self.model.get_logits(input_texts)

         shift_logits = outputs[..., :-1, :].contiguous()
         shift_logits = F.log_softmax(shift_logits, dim=-1)
...
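The tokenizer fallback covers models whose tokenizer does not accept the internal three-argument `encode(text, bos, eos)` form: plain `encode` is used instead, and any BOS/EOS the tokenizer adds on its own is sliced off so only the choice tokens remain. A sketch with a Hugging Face tokenizer (the checkpoint name is illustrative, not from the commit):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained('huggyllama/llama-7b')  # hypothetical choice
    ids = tok.encode('A')            # e.g. [bos_id, token_id] when add_bos_token is set
    if getattr(tok, 'add_bos_token', False):
        ids = ids[1:]                # drop BOS so only the answer token(s) remain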
@@ -111,7 +111,6 @@ class DLCRunner(BaseRunner):
                     f' --worker_gpu {num_gpus}'
                     f' --worker_memory {max(num_gpus * 32, 48)}'
                     f" --worker_image {self.aliyun_cfg['worker_image']}"
-                    ' --priority 3'
                     ' --interactive')

         logger = get_logger()
...
@@ -13,7 +13,7 @@ from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg,
                                model_abbr_from_cfg)
 from opencompass.utils.prompt import get_prompt_hash

-METRIC_WHITELIST = ['score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth']
+METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth']
 METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len']

 class Summarizer:
...
@@ -10,7 +10,6 @@ jieba
 mmengine>0.8.0
 nltk==3.8
 numpy==1.23.4
-openai==0.27.1
 openai
 pandas<2.0.0
 rank_bm25==0.2.2
@@ -18,7 +17,6 @@ requests==2.28.1
 scikit_learn==1.2.1
 sentence_transformers==2.2.2
 tabulate
-tabulate
 tiktoken
 tokenizers>=0.13.3
 torch>=1.13.1
...
@@ -149,9 +149,12 @@ def main():
     cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
     if args.reuse:
         if args.reuse == 'latest':
-            dirs = os.listdir(cfg.work_dir)
-            assert len(dirs) > 0, 'No previous results to reuse!'
-            dir_time_str = sorted(dirs)[-1]
+            if not os.path.exists(cfg.work_dir) or not os.listdir(
+                    cfg.work_dir):
+                logger.warning('No previous results to reuse!')
+            else:
+                dirs = os.listdir(cfg.work_dir)
+                dir_time_str = sorted(dirs)[-1]
         else:
             dir_time_str = args.reuse
         logger.info(f'Reusing experiements from {dir_time_str}')
...
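With the old code, `--reuse latest` crashed on an assertion when the work dir was missing or empty; it now warns and falls through to a fresh timestamped run. The directory pick itself is unchanged, and works because the names are `%Y%m%d_%H%M%S` timestamps, for which lexical order is chronological:

    import os

    work_dir = 'outputs/default'  # hypothetical work dir
    if os.path.exists(work_dir) and os.listdir(work_dir):
        latest = sorted(os.listdir(work_dir))[-1]  # e.g. '20230801_123456'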
@@ -4,7 +4,8 @@ from typing import Dict

 from mmengine.config import Config, ConfigDict

-from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
+from opencompass.openicl.icl_inferencer import (CLPInferencer, GenInferencer,
+                                                PPLInferencer)
 from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS
 from opencompass.utils import (Menu, build_dataset_from_cfg,
                                build_model_from_cfg, dataset_abbr_from_cfg,

@@ -126,7 +127,7 @@ def print_prompts(model_cfg, dataset_cfg):
         print('-' * 100)
         print(prompt)
         print('-' * 100)
-    elif infer_cfg.inferencer.type == GenInferencer:
+    elif infer_cfg.inferencer.type in [GenInferencer, CLPInferencer]:
         idx, ice_idx = 0, ice_idx_list[0]
         ice = retriever.generate_ice(ice_idx, ice_template=ice_template)
         prompt = retriever.generate_prompt_for_generate_task(
...