Unverified Commit 9083dea6 authored by Fengzhe Zhou, committed by GitHub

[Sync] some renaming (#641)

parent 68c4c1ef
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    begin='<|startoftext|>',
    round=[
        dict(role="HUMAN", begin='Human: ', end='\n\n'),
        dict(role="BOT", begin="Assistant: <|endoftext|>", end='<|endoftext|>', generate=True),
    ],
    eos_token_id=2
)
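
# Illustrative note (an assumption about how OpenCompass applies a meta template,
# not part of the original config): with the template above, a single-turn query
# is expected to be serialized roughly as
#     <|startoftext|>Human: <question>\n\nAssistant: <|endoftext|>
# after which the model generates until it emits <|endoftext|> (eos_token_id=2).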

models = [
    dict(
        abbr='orionstar-yi-34b-chat-hf',
        type=HuggingFaceCausalLM,
        path='OrionStarAI/OrionStar-Yi-34B-Chat',
        tokenizer_path='OrionStarAI/OrionStar-Yi-34B-Chat',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=4, num_procs=1),
    )
]
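
# Hedged usage sketch of how such a model config is typically combined with a
# dataset config in OpenCompass; the module paths below are hypothetical
# placeholders, shown commented out so they do not alter this config:
#
#   from mmengine.config import read_base
#
#   with read_base():
#       from .models.hf_orionstar_yi_34b_chat import models                    # hypothetical path
#       from .datasets.ds1000.ds1000_service_eval_gen import ds1000_datasets   # hypothetical path
#
#   datasets = [*ds1000_datasets]
#
# and then launched with the standard entry point, e.g. `python run.py <eval config>`.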
ds1000_summary_groups = []
_ds1000_all = ['Pandas', 'Numpy', 'Tensorflow', 'Scipy', 'Sklearn', 'Pytorch', 'Matplotlib']
_ds1000_all = ['ds1000_' + d for d in _ds1000_all]
ds1000_summary_groups.append({'name': 'ds1000', 'subsets': _ds1000_all})
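
# For reference, the lines above expand to a single summary group:
#   {'name': 'ds1000',
#    'subsets': ['ds1000_Pandas', 'ds1000_Numpy', 'ds1000_Tensorflow',
#                'ds1000_Scipy', 'ds1000_Sklearn', 'ds1000_Pytorch',
#                'ds1000_Matplotlib']}
# so the summarizer can report one aggregated 'ds1000' score across the seven
# per-library subsets.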
import configparser
import importlib
import json
import os
import os.path as osp
import pickle
import re
import shutil
import signal
import subprocess
import sys
import tempfile
import threading
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from shutil import copyfile
from subprocess import PIPE, Popen
from typing import Optional, Union
...@@ -20,6 +24,11 @@ from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from .base import BaseDataset

_LIBRARY_NAME_LIST = [
    'Pandas', 'Numpy', 'Tensorflow', 'Scipy', 'Sklearn', 'Pytorch',
    'Matplotlib'
]


@LOAD_DATASET.register_module()
class DS1000Dataset(BaseDataset):
...@@ -323,3 +332,98 @@ def import_source_file(fname, modname):
    except FileNotFoundError as e:
        raise ImportError(f'{e.strerror}: {fname}') from e
    return module


class DS1000ServiceEvaluator(BaseEvaluator):
    """Evaluator for ds1000 eval by using a service.

    Before you use this Evaluator, launch a code eval service according to:
    https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html

    Args:
        lib (str): The library to be evaluated.
        ip_address (str): The IP address of the DS1000 code evaluate service.
            Defaults to 'localhost'.
        port (int): The port of the DS1000 code evaluate service.
            Defaults to 5000.
        timeout (int): Maximum wait time when accessing the service.
            Defaults to 180.
    """

    def __init__(self,
                 lib: str,
                 ip_address='localhost',
                 port=5000,
                 timeout=180) -> None:
        assert lib in _LIBRARY_NAME_LIST, (
            f' lib must be in {_LIBRARY_NAME_LIST}')
        self.lib = lib
        self.ip_address = ip_address
        self.port = port
        self.timeout = timeout
        super().__init__()

    def score(self, predictions, references):
        processed_predictions = {}
        assert len(predictions) == len(references)
        for i, (pred, gold) in enumerate(zip(predictions, references)):
            processed_predictions[str(i)] = {'prediction': pred, 'gold': gold}

        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_out_path = osp.join(tmp_dir, f'ds1000_{self.lib}.json')
            with open(tmp_out_path, 'w', encoding='utf-8') as json_file:
                json.dump(processed_predictions,
                          json_file,
                          indent=4,
                          ensure_ascii=False)

            succeed, output = self._code_eval_service(file_path=tmp_out_path)
            if succeed:
                if isinstance(output, str):
                    return json.loads(output)
                elif isinstance(output, dict):
                    return output
            else:
                result_file_path = os.path.join('outputs',
                                                f'ds1000_{self.lib}.json')
                copyfile(tmp_out_path, result_file_path)
                ref_url = 'https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html'  # noqa
                raise Exception(
                    'Call CodeEvalService Error in `DS1000ServiceEvaluator`, '
                    'The results have been saved in path '
                    f"'{result_file_path}'. You need to check that your "
                    'code evaluate service is launched and the network to '
                    'service is connected, you can also get results directly '
                    f'by using `curl` command refer to {ref_url}.'
                    f'\nError Information: {output}')

    def _code_eval_service(self, file_path: str) -> tuple:
        """Access the code eval service.

        Args:
            file_path (str): The file path to the file to be evaluated.

        Returns:
            tuple[bool, str]: Whether the access is successful and the output.
        """
        exec_result = subprocess.run([
            'curl', '-X', 'POST', '-F', f'file=@{file_path}',
            f'{self.ip_address}:{self.port}/evaluate'
        ],
                                     timeout=self.timeout,
                                     capture_output=True)
        if exec_result.returncode == 0 and re.match(
                "\"{.*:.*}\"", exec_result.stdout.decode('utf-8')):
            return True, json.loads(exec_result.stdout.decode('utf-8'))
        else:
            if exec_result.stderr:
                try:
                    err = exec_result.stderr.decode()
                except Exception:
                    err = exec_result.stderr
            else:
                try:
                    err = exec_result.stdout.decode()
                except Exception:
                    err = exec_result.stdout
            return False, err
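
# Hedged usage sketch: assumes a DS1000 code-eval service is already running at
# localhost:5000 (see the class docstring); the prediction and reference strings
# below are hypothetical placeholders rather than real DS-1000 items.
#
#   evaluator = DS1000ServiceEvaluator(lib='Pandas')
#   results = evaluator.score(
#       predictions=['result = df.dropna()'],
#       references=['<DS-1000 test specification for the matching problem>'],
#   )
#
# Internally this serializes the pairs to ds1000_Pandas.json in a temporary
# directory and POSTs it to <ip_address>:<port>/evaluate via curl, exactly as
# _code_eval_service() does above.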
...@@ -93,6 +93,7 @@ def humaneval_postprocess(text: str) -> str:
    if def_idx != -1:
        text = text[max(text.find('\n', def_idx) + 1, 0):]
    text = text.split('\n\n')[0]
    text = text.lstrip('\n')
    if text.strip().startswith('def'):
        text = '\n'.join(text.split('\n')[1:])
    if not text.startswith(' '):
...
...@@ -127,7 +127,9 @@ class MBPPEvaluator(BaseEvaluator):
        predictions = [self._process_answer(pred) for pred in predictions]
        result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
        details = {}
        for index, (test_case, pred) in enumerate(zip(references,
                                                      predictions)):
            programs = self._process_test(test_case, pred)
            try:
                # Add exec globals to prevent the exec to raise
...@@ -136,15 +138,18 @@ class MBPPEvaluator(BaseEvaluator):
                with swallow_io():
                    with time_limit(2):
                        exec(programs, exec_globals)
                r = 'pass'
            except TimeOutException:
                r = 'timeout'
            except AssertionError:
                r = 'wrong_answer'
            except BaseException:
                r = 'failed'
            result[r] += 1
            details[str(index)] = {'programs': programs, 'result': r}
        result['score'] = result['pass'] / len(predictions) * 100
        result['details'] = details
        return result

    def _process_answer(self, text):
...
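
# Shape of the result returned by MBPPEvaluator.score() after this change
# (illustrative values for two predictions; the 'programs' strings are the
# assembled test programs, abbreviated here):
#   {'pass': 1, 'timeout': 0, 'failed': 0, 'wrong_answer': 1, 'score': 50.0,
#    'details': {'0': {'programs': '<program 0>', 'result': 'pass'},
#                '1': {'programs': '<program 1>', 'result': 'wrong_answer'}}}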
...@@ -147,26 +147,26 @@ class DefaultSummarizer:
            if all(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']):
                group_metrics = [default_metric]
                for dataset_abbr, metric in sg['subsets']:
                    scores.setdefault(default_metric, {})[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][metric]
                    eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
            else:
                group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']]))
                if len(group_metrics) > 1:
                    for metric in group_metrics:
                        for dataset_abbr in sg['subsets']:
                            scores.setdefault(metric, {})[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][metric]
                    eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
                else:
                    group_metrics = [default_metric]
                    for dataset_abbr in sg['subsets']:
                        metric = dataset_metrics[dataset_abbr][0]
                        scores.setdefault(default_metric, {})[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][metric]
                        eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))

            result = {}
            for metric in scores:
                if default_metric == 'standard_deviation':
                    avg = sum(scores[metric].values()) / len(scores[metric])
                    # scores[metric] is now a dict keyed by dataset_abbr, so
                    # iterate its values when computing the variance
                    variance = sum((v - avg) ** 2 for v in scores[metric].values()) / len(scores[metric])
                    scores[metric] = result[metric] = math.sqrt(variance)
                else:
...@@ -174,7 +174,7 @@ class DefaultSummarizer:
                        numerator = sum(scores[metric][k] * sg['weights'][k] for k in sg['weights'])
                        denominator = sum(sg['weights'].values())
                    else:
                        numerator = sum(scores[metric].values())
                        denominator = len(scores[metric])
                    scores[metric] = result[metric] = numerator / denominator
            eval_modes = list(set(eval_modes))
...
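
# Minimal standalone sketch (made-up numbers) of what keying scores by
# dataset_abbr enables in the summarizer above: weighted group averages via a
# hypothetical sg['weights'] mapping, with the plain mean as the fallback.
scores_by_dataset = {'accuracy': {'ds_a': 80.0, 'ds_b': 60.0, 'ds_c': 70.0}}
weights = {'ds_a': 2, 'ds_b': 1, 'ds_c': 1}  # hypothetical per-dataset weights

weighted = sum(scores_by_dataset['accuracy'][k] * weights[k]
               for k in weights) / sum(weights.values())
unweighted = (sum(scores_by_dataset['accuracy'].values()) /
              len(scores_by_dataset['accuracy']))
print(weighted, unweighted)  # 72.5 70.0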
...@@ -51,19 +51,53 @@ def first_capital_postprocess(text: str) -> str:
def first_option_postprocess(text: str, options: str) -> str:
    """Find first valid option for text."""
    # yapf: disable
    # flake8: noqa: W605
    patterns = [
        f'答案是?\s?([{options}])',
        f'答案是?\s?:([{options}])',
        f'答案是?\s?:([{options}])',
        f'答案应该?是\s?([{options}])',
        f'答案应该?选\s?([{options}])',
        f'答案为\s?([{options}])',
        f'答案选\s?([{options}])',
        f'选择?\s?([{options}])',
        f'只有选?项?\s?([{options}])\s?是?对',
        f'只有选?项?\s?([{options}])\s?是?错',
        f'只有选?项?\s?([{options}])\s?不?正确',
        f'只有选?项?\s?([{options}])\s?错误',
        f'说法不?对选?项?的?是\s?([{options}])',
        f'说法不?正确选?项?的?是\s?([{options}])',
        f'说法错误选?项?的?是\s?([{options}])',
        f'([{options}])\s?是正确的',
        f'([{options}])\s?是正确答案',
        f'选项\s?([{options}])\s?正确',
        f'所以答\s?([{options}])',
        f'1.\s?([{options}])[.。$]?$',
        f'所以\s?([{options}][.。$]?$)',
        f'所有\s?([{options}][.。$]?$)',
        f'[\s,::,]([{options}])[。,,\.]?$',
        f'[\s,,::][故即]([{options}])[。\.]?$',
        f'[\s,,::]因此([{options}])[。\.]?$',
        f'[是为。]\s?([{options}])[。\.]?$',
        f'因此\s?([{options}])[。\.]?$',
        f'显然\s?([{options}])[。\.]?$',
        f'1.\s?(.*?)$',
        f'答案是\s?(\S+)(?:。|$)',
        f'答案应该是\s?(\S+)(?:。|$)',
        f'答案为\s?(\S+)(?:。|$)',
        f'(\s|^)[{options}][\s。,,::\.$]',
        f'[Tt]he answer is ([{options}])',
        f'[Tt]he answer is option ([{options}])',
        f'[Tt]he correct answer is ([{options}])',
        f'[Tt]he correct answer is option ([{options}])',
        f'[Tt]he answer to the question is ([{options}])',
        f'([{options}]):',
        f'(^|\s)[{options}](\s|$)',
        f'[{options}]',
    ]
    # flake8: noqa
    # yapf: enable

    regexes = [re.compile(pattern) for pattern in patterns]
    for regex in regexes:
...
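
# Hedged examples of how the extended pattern list above is intended to behave
# (the inputs are illustrative, not from the project's test suite):
#   first_option_postprocess('经过分析,答案是 B。', options='ABCD')             # -> 'B'
#   first_option_postprocess('The correct answer is option C.', options='ABCD')  # -> 'C'
# Patterns earlier in the list take precedence, so an explicit statement such as
# "答案是 X" is matched before the bare fall-through pattern f'[{options}]'.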
...@@ -84,20 +84,17 @@ def print_prompts(model_cfg, dataset_cfg, count=1):
    if infer_cfg.inferencer.type == PPLInferencer:
        labels = retriever.get_labels(ice_template=ice_template,
                                      prompt_template=prompt_template)
        ice = retriever.generate_ice(ice_idx_list[idx],
                                     ice_template=ice_template)
        print('-' * 100)
        print('ICE Template:')
        print('-' * 100)
        print(ice)
        print('-' * 100)
        for label in labels:
            prompt = retriever.generate_label_prompt(
                idx,
                ice,
                label,
                ice_template=ice_template,
                prompt_template=prompt_template,
...@@ -111,11 +108,11 @@ def print_prompts(model_cfg, dataset_cfg, count=1):
                print(f'Truncating ice {num_ice} -> {num_ice - 1}',
                      f'Number of tokens: {prompt_token_num} -> ...')
                ice_idx_list[idx] = ice_idx_list[idx][:-1]
                ice = retriever.generate_ice(ice_idx_list[idx],
                                             ice_template=ice_template)
                prompt = retriever.generate_label_prompt(
                    idx,
                    ice,
                    label,
                    ice_template=ice_template,
                    prompt_template=prompt_template)
...