Unverified Commit 9083dea6 authored by Fengzhe Zhou, committed by GitHub

[Sync] some renaming (#641)

parent 68c4c1ef
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    begin='<|startoftext|>',
    round=[
        dict(role="HUMAN", begin='Human: ', end='\n\n'),
        dict(role="BOT", begin="Assistant: <|endoftext|>", end='<|endoftext|>', generate=True),
    ],
    eos_token_id=2
)
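
# Illustrative note (an assumption about how OpenCompass applies a meta template,
# not part of the original config): with the template above, a single-turn query
# is expected to be serialized roughly as
#     <|startoftext|>Human: <question>\n\nAssistant: <|endoftext|>
# after which the model generates until it emits <|endoftext|> (eos_token_id=2).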

models = [
    dict(
        abbr='orionstar-yi-34b-chat-hf',
        type=HuggingFaceCausalLM,
        path='OrionStarAI/OrionStar-Yi-34B-Chat',
        tokenizer_path='OrionStarAI/OrionStar-Yi-34B-Chat',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=4, num_procs=1),
    )
]
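
# Hedged usage sketch of how such a model config is typically combined with a
# dataset config in OpenCompass; the module paths below are hypothetical
# placeholders, shown commented out so they do not alter this config:
#
#   from mmengine.config import read_base
#
#   with read_base():
#       from .models.hf_orionstar_yi_34b_chat import models                    # hypothetical path
#       from .datasets.ds1000.ds1000_service_eval_gen import ds1000_datasets   # hypothetical path
#
#   datasets = [*ds1000_datasets]
#
# and then launched with the standard entry point, e.g. `python run.py <eval config>`.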
ds1000_summary_groups = []
_ds1000_all = ['Pandas', 'Numpy', 'Tensorflow', 'Scipy', 'Sklearn', 'Pytorch', 'Matplotlib']
_ds1000_all = ['ds1000_' + d for d in _ds1000_all]
ds1000_summary_groups.append({'name': 'ds1000', 'subsets': _ds1000_all})
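
# For reference, the lines above expand to a single summary group:
#   {'name': 'ds1000',
#    'subsets': ['ds1000_Pandas', 'ds1000_Numpy', 'ds1000_Tensorflow',
#                'ds1000_Scipy', 'ds1000_Sklearn', 'ds1000_Pytorch',
#                'ds1000_Matplotlib']}
# so the summarizer can report one aggregated 'ds1000' score across the seven
# per-library subsets.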
import configparser
import importlib
import json
import os
import os.path as osp
import pickle
import re
import shutil
import signal
import subprocess
import sys
import tempfile
import threading
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from shutil import copyfile
from subprocess import PIPE, Popen
from typing import Optional, Union
...@@ -20,6 +24,11 @@ from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from .base import BaseDataset

_LIBRARY_NAME_LIST = [
    'Pandas', 'Numpy', 'Tensorflow', 'Scipy', 'Sklearn', 'Pytorch',
    'Matplotlib'
]


@LOAD_DATASET.register_module()
class DS1000Dataset(BaseDataset):
...@@ -323,3 +332,98 @@ def import_source_file(fname, modname):
    except FileNotFoundError as e:
        raise ImportError(f'{e.strerror}: {fname}') from e
    return module


class DS1000ServiceEvaluator(BaseEvaluator):
    """Evaluator for ds1000 eval by using a service.

    Before you use this Evaluator, launch a code eval service according to:
    https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html

    Args:
        lib (str): The library to be evaluated.
        ip_address (str): The IP address of the DS1000 code evaluate service.
            Defaults to 'localhost'.
        port (int): The port of the DS1000 code evaluate service.
            Defaults to 5000.
        timeout (int): Maximum wait time when accessing the service.
            Defaults to 180.
    """

    def __init__(self,
                 lib: str,
                 ip_address='localhost',
                 port=5000,
                 timeout=180) -> None:
        assert lib in _LIBRARY_NAME_LIST, (
            f' lib must be in {_LIBRARY_NAME_LIST}')
        self.lib = lib
        self.ip_address = ip_address
        self.port = port
        self.timeout = timeout
        super().__init__()

    def score(self, predictions, references):
        processed_predictions = {}
        assert len(predictions) == len(references)
        for i, (pred, gold) in enumerate(zip(predictions, references)):
            processed_predictions[str(i)] = {'prediction': pred, 'gold': gold}

        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_out_path = osp.join(tmp_dir, f'ds1000_{self.lib}.json')
            with open(tmp_out_path, 'w', encoding='utf-8') as json_file:
                json.dump(processed_predictions,
                          json_file,
                          indent=4,
                          ensure_ascii=False)

            succeed, output = self._code_eval_service(file_path=tmp_out_path)
            if succeed:
                if isinstance(output, str):
                    return json.loads(output)
                elif isinstance(output, dict):
                    return output
            else:
                result_file_path = os.path.join('outputs',
                                                f'ds1000_{self.lib}.json')
                copyfile(tmp_out_path, result_file_path)
                ref_url = 'https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html'  # noqa
                raise Exception(
                    'Call CodeEvalService Error in `DS1000ServiceEvaluator`, '
                    'The results have been saved in path '
                    f"'{result_file_path}'. You need to check that your "
                    'code evaluate service is launched and the network to '
                    'service is connected, you can also get results directly '
                    f'by using `curl` command refer to {ref_url}.'
                    f'\nError Information: {output}')

    def _code_eval_service(self, file_path: str) -> tuple:
        """Access the code eval service.

        Args:
            file_path (str): The file path to the file to be evaluated.

        Returns:
            tuple[bool, str]: Whether the access is successful and the output.
        """
        exec_result = subprocess.run([
            'curl', '-X', 'POST', '-F', f'file=@{file_path}',
            f'{self.ip_address}:{self.port}/evaluate'
        ],
                                     timeout=self.timeout,
                                     capture_output=True)
        if exec_result.returncode == 0 and re.match(
                "\"{.*:.*}\"", exec_result.stdout.decode('utf-8')):
            return True, json.loads(exec_result.stdout.decode('utf-8'))
        else:
            if exec_result.stderr:
                try:
                    err = exec_result.stderr.decode()
                except Exception:
                    err = exec_result.stderr
            else:
                try:
                    err = exec_result.stdout.decode()
                except Exception:
                    err = exec_result.stdout
            return False, err
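
# Hedged usage sketch: assumes a DS1000 code-eval service is already running at
# localhost:5000 (see the class docstring); the prediction and reference strings
# below are hypothetical placeholders rather than real DS-1000 items.
#
#   evaluator = DS1000ServiceEvaluator(lib='Pandas')
#   results = evaluator.score(
#       predictions=['result = df.dropna()'],
#       references=['<DS-1000 test specification for the matching problem>'],
#   )
#
# Internally this serializes the pairs to ds1000_Pandas.json in a temporary
# directory and POSTs it to <ip_address>:<port>/evaluate via curl, exactly as
# _code_eval_service() does above.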
...@@ -93,6 +93,7 @@ def humaneval_postprocess(text: str) -> str:
    if def_idx != -1:
        text = text[max(text.find('\n', def_idx) + 1, 0):]
    text = text.split('\n\n')[0]
    text = text.lstrip('\n')
    if text.strip().startswith('def'):
        text = '\n'.join(text.split('\n')[1:])
    if not text.startswith(' '):
...
...@@ -127,7 +127,9 @@ class MBPPEvaluator(BaseEvaluator):
        predictions = [self._process_answer(pred) for pred in predictions]
        result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
        details = {}
        for index, (test_case, pred) in enumerate(zip(references,
                                                      predictions)):
            programs = self._process_test(test_case, pred)
            try:
                # Add exec globals to prevent the exec to raise
...@@ -136,15 +138,18 @@ class MBPPEvaluator(BaseEvaluator):
                with swallow_io():
                    with time_limit(2):
                        exec(programs, exec_globals)
                r = 'pass'
            except TimeOutException:
                r = 'timeout'
            except AssertionError:
                r = 'wrong_answer'
            except BaseException:
                r = 'failed'
            result[r] += 1
            details[str(index)] = {'programs': programs, 'result': r}
        result['score'] = result['pass'] / len(predictions) * 100
        result['details'] = details
        return result

    def _process_answer(self, text):
...
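
# Shape of the result returned by MBPPEvaluator.score() after this change
# (illustrative values for two predictions; the 'programs' strings are the
# assembled test programs, abbreviated here):
#   {'pass': 1, 'timeout': 0, 'failed': 0, 'wrong_answer': 1, 'score': 50.0,
#    'details': {'0': {'programs': '<program 0>', 'result': 'pass'},
#                '1': {'programs': '<program 1>', 'result': 'wrong_answer'}}}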
...@@ -147,26 +147,26 @@ class DefaultSummarizer:
            if all(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']):
                group_metrics = [default_metric]
                for dataset_abbr, metric in sg['subsets']:
                    scores.setdefault(default_metric, {})[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][metric]
                    eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
            else:
                group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']]))
                if len(group_metrics) > 1:
                    for metric in group_metrics:
                        for dataset_abbr in sg['subsets']:
                            scores.setdefault(metric, {})[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][metric]
                    eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
                else:
                    group_metrics = [default_metric]
                    for dataset_abbr in sg['subsets']:
                        metric = dataset_metrics[dataset_abbr][0]
                        scores.setdefault(default_metric, {})[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][metric]
                        eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))

            result = {}
            for metric in scores:
                if default_metric == 'standard_deviation':
                    avg = sum(scores[metric].values()) / len(scores[metric])
                    # scores[metric] is now a dict keyed by dataset_abbr, so
                    # iterate its values when computing the variance
                    variance = sum((v - avg) ** 2 for v in scores[metric].values()) / len(scores[metric])
                    scores[metric] = result[metric] = math.sqrt(variance)
                else:
...@@ -174,7 +174,7 @@ class DefaultSummarizer:
                        numerator = sum(scores[metric][k] * sg['weights'][k] for k in sg['weights'])
                        denominator = sum(sg['weights'].values())
                    else:
                        numerator = sum(scores[metric].values())
                        denominator = len(scores[metric])
                    scores[metric] = result[metric] = numerator / denominator
            eval_modes = list(set(eval_modes))
...
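
# Minimal standalone sketch (made-up numbers) of what keying scores by
# dataset_abbr enables in the summarizer above: weighted group averages via a
# hypothetical sg['weights'] mapping, with the plain mean as the fallback.
scores_by_dataset = {'accuracy': {'ds_a': 80.0, 'ds_b': 60.0, 'ds_c': 70.0}}
weights = {'ds_a': 2, 'ds_b': 1, 'ds_c': 1}  # hypothetical per-dataset weights

weighted = sum(scores_by_dataset['accuracy'][k] * weights[k]
               for k in weights) / sum(weights.values())
unweighted = (sum(scores_by_dataset['accuracy'].values()) /
              len(scores_by_dataset['accuracy']))
print(weighted, unweighted)  # 72.5 70.0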
...@@ -51,19 +51,53 @@ def first_capital_postprocess(text: str) -> str:
def first_option_postprocess(text: str, options: str) -> str:
    """Find first valid option for text."""
    # yapf: disable
    # flake8: noqa: W605
    patterns = [
        f'答案是?\s?([{options}])',
        f'答案是?\s?:([{options}])',
        f'答案是?\s?:([{options}])',
        f'答案应该?是\s?([{options}])',
        f'答案应该?选\s?([{options}])',
        f'答案为\s?([{options}])',
        f'答案选\s?([{options}])',
        f'选择?\s?([{options}])',
        f'只有选?项?\s?([{options}])\s?是?对',
        f'只有选?项?\s?([{options}])\s?是?错',
        f'只有选?项?\s?([{options}])\s?不?正确',
        f'只有选?项?\s?([{options}])\s?错误',
        f'说法不?对选?项?的?是\s?([{options}])',
        f'说法不?正确选?项?的?是\s?([{options}])',
        f'说法错误选?项?的?是\s?([{options}])',
        f'([{options}])\s?是正确的',
        f'([{options}])\s?是正确答案',
        f'选项\s?([{options}])\s?正确',
        f'所以答\s?([{options}])',
        f'1.\s?([{options}])[.。$]?$',
        f'所以\s?([{options}][.。$]?$)',
        f'所有\s?([{options}][.。$]?$)',
        f'[\s,::,]([{options}])[。,,\.]?$',
        f'[\s,,::][故即]([{options}])[。\.]?$',
        f'[\s,,::]因此([{options}])[。\.]?$',
        f'[是为。]\s?([{options}])[。\.]?$',
        f'因此\s?([{options}])[。\.]?$',
        f'显然\s?([{options}])[。\.]?$',
        f'1.\s?(.*?)$',
        f'答案是\s?(\S+)(?:。|$)',
        f'答案应该是\s?(\S+)(?:。|$)',
        f'答案为\s?(\S+)(?:。|$)',
        f'(\s|^)[{options}][\s。,,::\.$]',
        f'[Tt]he answer is ([{options}])',
        f'[Tt]he answer is option ([{options}])',
        f'[Tt]he correct answer is ([{options}])',
        f'[Tt]he correct answer is option ([{options}])',
        f'[Tt]he answer to the question is ([{options}])',
        f'([{options}]):',
        f'(^|\s)[{options}](\s|$)',
        f'[{options}]',
    ]
    # flake8: noqa
    # yapf: enable

    regexes = [re.compile(pattern) for pattern in patterns]
    for regex in regexes:
...
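
# Hedged examples of how the extended pattern list above is intended to behave
# (the inputs are illustrative, not from the project's test suite):
#   first_option_postprocess('经过分析,答案是 B。', options='ABCD')             # -> 'B'
#   first_option_postprocess('The correct answer is option C.', options='ABCD')  # -> 'C'
# Patterns earlier in the list take precedence, so an explicit statement such as
# "答案是 X" is matched before the bare fall-through pattern f'[{options}]'.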
...@@ -84,20 +84,17 @@ def print_prompts(model_cfg, dataset_cfg, count=1):
    if infer_cfg.inferencer.type == PPLInferencer:
        labels = retriever.get_labels(ice_template=ice_template,
                                      prompt_template=prompt_template)
        ice = retriever.generate_ice(ice_idx_list[idx],
                                     ice_template=ice_template)
        print('-' * 100)
        print('ICE Template:')
        print('-' * 100)
        print(ice)
        print('-' * 100)
        for label in labels:
            prompt = retriever.generate_label_prompt(
                idx,
                ice,
                label,
                ice_template=ice_template,
                prompt_template=prompt_template,
...@@ -111,11 +108,11 @@ def print_prompts(model_cfg, dataset_cfg, count=1):
                print(f'Truncating ice {num_ice} -> {num_ice - 1}',
                      f'Number of tokens: {prompt_token_num} -> ...')
                ice_idx_list[idx] = ice_idx_list[idx][:-1]
                ice = retriever.generate_ice(ice_idx_list[idx],
                                             ice_template=ice_template)
                prompt = retriever.generate_label_prompt(
                    idx,
                    ice,
                    label,
                    ice_template=ice_template,
                    prompt_template=prompt_template)
...