Unverified Commit d34ba111 authored by Fengzhe Zhou, committed by GitHub

[Sync] Merge branch 'dev' into zfz/update-keyset-demo (#876)

parent 32b5948f
import mmengine
import os
import argparse
import numpy as np
# np.set_printoptions(precision=1)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--result_path', type=str)
args = parser.parse_args()
return args
def convert_results(result_path):
result = mmengine.load(result_path)
instruct_list = [(result['instruct_json']['json_format_metric'] + result['instruct_json']['json_args_em_metric']) / 2, \
(result['instruct_json']['string_format_metric'] + result['instruct_json']['string_args_em_metric']) / 2]
plan_list = [result['plan_str']['f1_score'], result['plan_json']['f1_score']]
reason_list = [result['reason_str']['thought'], result['rru_json']['thought']]
retrieve_list = [result['retrieve_str']['name'], result['rru_json']['name']]
understand_list = [result['understand_str']['args'], result['rru_json']['args']]
review_list = [result['review_str']['review_quality'], result['review_str']['review_quality']]
final_score = [np.mean(instruct_list), np.mean(plan_list), np.mean(reason_list), \
np.mean(retrieve_list), np.mean(understand_list), np.mean(review_list)]
overall = np.mean(final_score)
final_score.insert(0, overall)
name_list = ['Overall', 'Instruct', 'Plan', 'Reason', 'Retrieve', 'Understand', 'Review']
print("Cut Paste Results: ", np.array(final_score) * 100)
for i in range(len(name_list)):
print("%s: %.1f" % (name_list[i], final_score[i]*100), end='\t')
if __name__ == '__main__':
args = parse_args()
convert_results(args.result_path)
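# Illustrative sketch (hypothetical numbers, not from a real run): each pair
# of sub-metrics is averaged first, then the six category means are averaged
# again to give the 'Overall' score that is printed as a percentage.
def _demo_overall_score():
    category_means = [0.85, 0.72, 0.60, 0.55, 0.50, 0.90]
    overall = float(np.mean(category_means))
    return overall  # ~0.687, printed as 68.7 after multiplying by 100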
import ast
import json
def format_load(raw_data: str, start_character: str = '', end_character: str = ''):
"""Format the raw data into the format that can be evaluated.
Args:
raw_data (str): The raw data.
        start_character (str, optional): The start character. Defaults to ''.
            If set, the string is sliced from the first occurrence of
            start_character.
        end_character (str, optional): The end character. Defaults to ''.
            If set, the string is sliced up to the last occurrence of
            end_character.
    Returns:
        Any: The parsed Python object.
"""
    if not isinstance(raw_data, str):
# the data has been evaluated
return raw_data
if "```json" in raw_data:
raw_data = raw_data[raw_data.find("```json") + len("```json"):]
raw_data = raw_data.strip("`")
if start_character != '':
raw_data = raw_data[raw_data.find(start_character):]
if end_character != '':
raw_data = raw_data[:raw_data.rfind(end_character) + len(end_character)]
successful_parse = False
try:
data = ast.literal_eval(raw_data)
successful_parse = True
    except Exception:
pass
try:
if not successful_parse:
data = json.loads(raw_data)
successful_parse = True
    except Exception:
pass
try:
if not successful_parse:
data = json.loads(raw_data.replace("\'", "\""))
successful_parse = True
    except Exception:
pass
if not successful_parse:
raise Exception("Cannot parse raw data")
return data
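# Illustrative usage sketch (hypothetical model output, not from the original
# code): format_load strips a ```json fence and the start/end slicing markers,
# then falls back from ast.literal_eval to json.loads.
def _demo_format_load():
    raw = "Thought: done\n```json\n{'name': 'search', 'args': {'query': 'llm'}}\n```"
    parsed = format_load(raw, start_character='{', end_character='}')
    # parsed == {'name': 'search', 'args': {'query': 'llm'}}
    return parsed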
meta_template_dict = dict(
internlm = [
dict(role='system', begin='<|System|>:', end='\n'),
dict(role='user', begin='<|User|>:', end='\n'),
dict(
role='assistant',
begin='<|Bot|>:',
end='<eoa>\n',
generate=True)
],
)
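# Illustrative sketch (not the real OpenCompass prompt renderer): shows what
# the begin/end markers in a meta template are used for. The framework
# assembles prompts internally; this helper only mimics the idea.
def _demo_render_internlm(messages, template=meta_template_dict['internlm']):
    role_cfg = {item['role']: item for item in template}
    prompt = ''
    for msg in messages:
        cfg = role_cfg[msg['role']]
        if cfg.get('generate'):
            # the generating role keeps only its begin marker so the model
            # continues from there
            prompt += cfg['begin']
        else:
            prompt += cfg['begin'] + msg['content'] + cfg['end']
    return prompt
# _demo_render_internlm([{'role': 'user', 'content': 'Hi'},
#                        {'role': 'assistant', 'content': ''}])
# -> '<|User|>:Hi\n<|Bot|>:'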
import re
from string import Formatter
def format_string(template: str, input_data: dict) -> str:
"""Return string with input content according input format template.
Args:
template (str): Format string with keyword-only argument. For
example '{who} like {what}'
input_data (dict): Input data to fill in the input template.
Returns:
        str: The formatted string.
"""
return template.format(**input_data)
def parse_string(template: str, input_string: str, allow_newline: bool=False) -> dict:
"""Return a dictionary whose keys are from input template and value is
responding content from input_string.
Args:
template (str): Format template with keyword-only argument. For
example '{who} like {what}'
input_string (str): Input string will be parsed.
        allow_newline (bool): Whether to allow '\n' inside matched fields
            during regex matching. Defaults to False.
Returns:
        dict: Data parsed from input_string according to the template. If the
            input string does not match the template, None is returned.
    Examples:
        >>> template = '{who} like {what}'
        >>> input_string = 'monkey like banana'
        >>> data = parse_string(template, input_string)
        >>> data
        {'who': 'monkey', 'what': 'banana'}
        >>> input_string = 'monkey likes banana'
        >>> data = parse_string(template, input_string)
        >>> data is None
        True
        >>> template = '{what} like {what}'
        >>> input_string = 'monkey like banana'
        >>> data = parse_string(template, input_string)
        >>> data
        {'what': ['monkey', 'banana']}
"""
formatter = Formatter()
context = []
keys = []
for v in formatter.parse(template):
# v is (literal_text, field_name, format_spec, conversion)
if v[1] is not None:
keys.append(v[1])
context.append(v[0])
pattern = template
for k in keys:
pattern = pattern.replace('{' + f'{k}' + '}', '(.*)')
# pattern = re.compile(rf'{pattern}')
values = re.findall(pattern, input_string, re.S if allow_newline else 0)
if len(values) < 1:
return None
data = dict()
    # re.findall returns plain strings rather than tuples when the template
    # has a single field, so normalise before zipping with the keys
    matched = values[0] if isinstance(values[0], tuple) else (values[0], )
    for k, v in zip(keys, matched):
if k in data:
tmp = data[k]
if isinstance(tmp, list):
data[k].append(v)
else:
data[k] = [tmp, v]
else:
data[k] = v
return data
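# Illustrative round trip (hypothetical template, not from the original code):
# format_string fills the template, parse_string recovers the fields again.
def _demo_string_roundtrip():
    template = 'Action: {action}\nAction Input: {args}'
    filled = format_string(template, {'action': 'search', 'args': 'llm papers'})
    parsed = parse_string(template, filled)
    # parsed == {'action': 'search', 'args': 'llm papers'}
    return parsed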
......@@ -85,9 +85,10 @@ class TriviaQAEvaluator(BaseEvaluator):
cnt = 0
for pred, cand_ans in zip(processed_predictions, processed_answers):
detail = {'pred': pred, 'answer': cand_ans, 'correct': False}
cnt += int(any([cand == pred for cand in cand_ans]))
if int(any([cand == pred for cand in cand_ans])):
detail['correct'] = True
# is_correct = any([cand == pred for cand in cand_ans])
is_correct = any([cand in pred for cand in cand_ans])
cnt += int(is_correct)
detail['correct'] = is_correct
details.append(detail)
score = cnt / len(predictions) * 100
......
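# Behavioural note on the change above (illustrative values, not from a run):
# the old check required the prediction to equal a candidate answer exactly,
# whereas the new check only requires a candidate to appear inside it.
#   pred = 'the answer is paris', cand_ans = ['paris']
#   any(cand == pred for cand in cand_ans)  -> False  (exact match)
#   any(cand in pred for cand in cand_ans)  -> True   (containment)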
......@@ -150,7 +150,7 @@ class BaiChuan(BaseAPIModel):
return msg
if raw_response.status_code != 200:
print(raw_response)
print(raw_response.json())
time.sleep(1)
continue
print(response)
......
......@@ -109,10 +109,8 @@ class HuggingFace(BaseModel):
max_seq_len=max_seq_len,
tokenizer_only=tokenizer_only,
meta_template=meta_template)
from opencompass.utils.fileio import patch_hf_auto_model
if hf_cache_dir is None:
hf_cache_dir = os.getenv('HF_MODEL_HUB', None)
patch_hf_auto_model(hf_cache_dir)
self.logger = get_logger()
self.pad_token_id = pad_token_id
assert mode in ['none', 'mid']
......
......@@ -405,6 +405,7 @@ class OpenAIAllesAPIN(OpenAI):
except requests.JSONDecodeError:
self.logger.error('JsonDecode error, got',
str(raw_response.content))
time.sleep(1)
continue
if raw_response.status_code == 200 and response[
'msgCode'] == '10000':
......@@ -415,6 +416,8 @@ class OpenAIAllesAPIN(OpenAI):
else:
return choices[0]['message']['content'].strip()
self.logger.error(response['msg'])
self.logger.error(response)
time.sleep(1)
raise RuntimeError('API call failed.')
......
......@@ -193,6 +193,7 @@ class SenseTime(BaseAPIModel):
time.sleep(1)
continue
return ''
raise RuntimeError(
f'request id: '
f'{raw_response.headers.get("X-Request-Id")}, {raw_response.text}')
......@@ -119,6 +119,8 @@ class ZhiPuV2AI(BaseAPIModel):
while max_num_retries < self.retry:
self.acquire()
response = None
try:
response = self.client.chat.completions.create(**data)
except APIStatusError as err:
......
......@@ -30,12 +30,17 @@ class NumWorkerPartitioner(BasePartitioner):
out_dir: str,
num_worker: int = 8,
min_task_size: int = 16,
strategy: str = 'heuristic',
dataset_size_path: str = '.cache/dataset_size.json',
keep_keys: Optional[List[str]] = None):
super().__init__(out_dir=out_dir, keep_keys=keep_keys)
self.num_worker = num_worker
self.min_task_size = min_task_size
self.dataset_size_path = dataset_size_path
assert strategy in ('heuristic', 'split'), \
f'Unsupported partition strategy: {strategy}. '\
'Supported strategies are: `heuristic`, `split` .'
self.strategy = strategy
def partition(self,
model_dataset_combinations: List[Dict[str, List]],
......@@ -64,16 +69,26 @@ class NumWorkerPartitioner(BasePartitioner):
else:
chunks.append(dataset)
buckets = [[] for _ in range(self.num_worker)]
for i, chunk in enumerate(chunks):
buckets[i % self.num_worker].append(chunk)
for bucket in buckets:
if len(bucket) > 0:
if self.strategy == 'heuristic':
buckets = [[] for _ in range(self.num_worker)]
for i, chunk in enumerate(chunks):
buckets[i % self.num_worker].append(chunk)
for bucket in buckets:
if len(bucket) > 0:
tasks.append(
Config({
'models': [model],
'datasets': [bucket],
'work_dir': work_dir,
**add_cfg
}))
elif self.strategy == 'split':
for dataset in chunks:
tasks.append(
Config({
'models': [model],
'datasets': [bucket],
'datasets': [[dataset]],
'work_dir': work_dir,
**add_cfg
}))
......
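# Illustrative sketch (simplified, hypothetical dataset names; the real
# partitioner wraps each bucket in a Config as above): how the two strategies
# group dataset chunks when num_worker = 2.
def _demo_partition_strategies(chunks=('dsA', 'dsB', 'dsC'), num_worker=2):
    # 'heuristic': round-robin the chunks into num_worker buckets and emit
    # one task per non-empty bucket
    buckets = [[] for _ in range(num_worker)]
    for i, chunk in enumerate(chunks):
        buckets[i % num_worker].append(chunk)
    heuristic_tasks = [bucket for bucket in buckets if bucket]
    # 'split': one task per chunk, regardless of num_worker
    split_tasks = [[chunk] for chunk in chunks]
    return heuristic_tasks, split_tasks
# -> ([['dsA', 'dsC'], ['dsB']], [['dsA'], ['dsB'], ['dsC']])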
......@@ -16,7 +16,7 @@ from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg,
model_abbr_from_cfg)
from opencompass.utils.prompt import get_prompt_hash
METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth']
METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth', 'f1', 'exact_match']
METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len']
def model_abbr_from_cfg_used_in_summarizer(model):
......
......@@ -29,6 +29,62 @@ All_Dimensions = [
'公平与可负责程度', '丰富度', '综合得分'
]
MAPPING = {
'事实与解释型回答': ['事实正确性', '满足用户需求', '清晰度', '完备性'],
'逻辑推理型回答': ['事实正确性', '满足用户需求', '逻辑连贯性', '完备性'],
'生成型回答': ['事实正确性', '满足用户需求', '逻辑连贯性', '创造性', '丰富度'],
'建议型回答': ['事实正确性', '满足用户需求', '公平与可负责程度', '创造性']
}
def detect_mapping(text):
if '清晰度' in text and '完备性' in text:
return '事实与解释型回答'
elif '完备性' in text and '逻辑连贯性' in text:
return '逻辑推理型回答'
elif '创造性' in text and '丰富度' in text:
return '生成型回答'
elif '创造性' in text and '公平与可负责程度' in text:
return '建议型回答'
else:
return None
def extract_missing_rating(text, search_type):
searching_keys = MAPPING[search_type]
result_dict = {}
for k in searching_keys:
matches = re.findall(rf'{k}.*?\n', text)
result_dict[k] = None
for match in reversed(matches):
if re.findall(r'\d{1,2}', match):
result_dict[k] = int(re.findall(r'\d{1,2}', match)[-1])
break
    overall_number = re.findall(r'\d{1,2}', text)
    try:
        result_dict['综合得分'] = int(overall_number[-1])
    except (IndexError, ValueError):
        return {}
return result_dict
def extract_rating_plus(text):
    pattern = r'{(.*?)}(?![^{]*{)'  # match the last {...} block
match = re.search(pattern, text)
if match:
dictionary_str = match.group(1)
kv_pattern = r"'(.*?)': (\d+)"
matches = re.findall(kv_pattern, dictionary_str)
result_dict = {key: int(value) for key, value in matches}
return result_dict
else:
match_type = detect_mapping(text=text)
if match_type is not None:
return extract_missing_rating(text=text, search_type=match_type)
else:
return None
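# Illustrative usage sketch (hypothetical judge outputs, not from a model):
# the primary path parses the last {...} dict; without a dict the fallback
# detects the answer type and scrapes per-dimension numbers from plain text.
def _demo_extract_rating_plus():
    with_dict = "分析...{'事实正确性': 7, '满足用户需求': 8, '综合得分': 7}"
    # -> {'事实正确性': 7, '满足用户需求': 8, '综合得分': 7}
    without_dict = '事实正确性: 7\n满足用户需求: 8\n清晰度: 8\n完备性: 7\n综合得分: 7\n'
    # -> handled by extract_missing_rating via detect_mapping ('事实与解释型回答')
    return extract_rating_plus(with_dict), extract_rating_plus(without_dict)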
def extract_rating(text):
    pattern = r'{(.*?)}(?![^{]*{)'  # match the last {...} block
......@@ -56,6 +112,50 @@ def check_rating(rating, all_dimensions):
return rating
def post_process_alignbench_plus(judgement: str,
all_dimensions=All_Dimensions,
possible_keys=['综合得分']):
"""Input a string like below:
xxx{'事实正确性': 1, '满足用户需求': 1, '清晰度': 2, '完备性': 1, '综合得分': 1}xxx,
and extract each score
"""
def extract_score(text):
keys_pattern = '|'.join(map(re.escape, possible_keys))
pattern = rf"({'|'.join(possible_keys)}): (\d+(\.\d{{1,2}})?)"
match = re.search(pattern, text)
if match:
try:
return float(match.group(1))
except ValueError:
return -1
return -1
# judgement = judgement.replace('\n', '')
rating = extract_rating_plus(judgement)
if rating is not None:
score = -1
for key in possible_keys:
score = rating.get(key, -1)
if score != -1:
break
if score == -1:
score = extract_score(judgement)
        if not 0 <= score <= 10:
            score = -1
rating = check_rating(rating, all_dimensions)
else:
score = -1
    if rating is None or score == -1:
return None
else:
return {'rating': rating, 'score': score}
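# Illustrative usage sketch (hypothetical judgement string; assumes the
# dimensions below pass check_rating, whose body is collapsed in this diff):
def _demo_post_process_alignbench_plus():
    judgement = ("...分析...{'事实正确性': 6, '满足用户需求': 7, '逻辑连贯性': 7, "
                 "'创造性': 5, '丰富度': 6, '综合得分': 6}")
    # expected -> {'rating': {<six dimensions as ints>}, 'score': 6}
    return post_process_alignbench_plus(judgement)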
def post_process_alignbench(judgement: str,
all_dimensions=All_Dimensions,
possible_keys=['综合得分']):
......@@ -211,9 +311,12 @@ class AlignmentBenchSummarizer:
]
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
self.judge_type = judge_type
assert self.judge_type in ['general', 'autoj', 'judgelm']
assert self.judge_type in [
'general', 'autoj', 'judgelm', 'general_plus'
]
self.judge_map = {
'general': post_process_alignbench,
'general_plus': post_process_alignbench_plus,
'autoj': post_process_autoj,
'judgelm': post_process_judgelm
}
......
......@@ -67,8 +67,10 @@ class OpenICLEvalTask(BaseTask):
def __init__(self, cfg: ConfigDict):
super().__init__(cfg)
self.num_gpus = 0
self.logger = get_logger()
self.num_gpus = max(
c.get('eval_cfg', {}).get('num_gpus', 0)
for c in sum(self.dataset_cfgs, []))
self.dump_details = cfg.get('eval', {}).get('runner', {}).get(
'task', {}).get('dump_details', False)
......
......@@ -83,7 +83,6 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
f'([{options}])\s?是正确答案',
f'选项\s?([{options}])\s?正确',
f'所以答\s?([{options}])',
f'1.\s?([{options}])[.。$]?$',
f'所以\s?([{options}][.。$]?$)',
f'所有\s?([{options}][.。$]?$)',
f'[\s,::,]([{options}])[。,,\.]?$',
......@@ -105,6 +104,7 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
f'(\s|^)[{options}][\s。,,::\.$]',
f'(\s|^)[{options}](\s|$)',
f'1.\s?(.*?)$',
f'1.\s?([{options}])[.。$]?$',
]
cushion_patterns = [
f'([{options}]):',
......
......@@ -4,8 +4,9 @@ from typing import Dict
from mmengine.config import Config, ConfigDict
from opencompass.openicl.icl_inferencer import (CLPInferencer, GenInferencer,
PPLInferencer,
from opencompass.openicl.icl_inferencer import (AgentInferencer,
ChatInferencer, CLPInferencer,
GenInferencer, PPLInferencer,
PPLOnlyInferencer)
from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS
from opencompass.utils import (Menu, build_dataset_from_cfg,
......@@ -78,12 +79,16 @@ def print_prompts(model_cfg, dataset_cfg, count=1):
ice_idx_list = retriever.retrieve()
assert infer_cfg.inferencer.type in [
PPLInferencer, GenInferencer, CLPInferencer, PPLOnlyInferencer], \
'Only PPLInferencer and GenInferencer are supported'
supported_inferencer = [
AgentInferencer, PPLInferencer, GenInferencer, CLPInferencer,
PPLOnlyInferencer, ChatInferencer
]
if infer_cfg.inferencer.type not in supported_inferencer:
print(f'Only {supported_inferencer} are supported')
return
for idx in range(min(count, len(ice_idx_list))):
if infer_cfg.inferencer.type == PPLInferencer:
if issubclass(infer_cfg.inferencer.type, PPLInferencer):
labels = retriever.get_labels(ice_template=ice_template,
prompt_template=prompt_template)
ice = retriever.generate_ice(ice_idx_list[idx],
......@@ -129,9 +134,7 @@ def print_prompts(model_cfg, dataset_cfg, count=1):
print('-' * 100)
print(prompt)
print('-' * 100)
elif infer_cfg.inferencer.type in [
GenInferencer, CLPInferencer, PPLOnlyInferencer
]:
else:
ice_idx = ice_idx_list[idx]
ice = retriever.generate_ice(ice_idx, ice_template=ice_template)
prompt = retriever.generate_prompt_for_generate_task(
......