Unverified Commit 2b3d4150 authored by Fengzhe Zhou, committed by GitHub

[Sync] update evaluator (#1175)

parent 296ea599
@@ -227,9 +227,10 @@ class MBPPEvaluator(BaseEvaluator):
             from tqdm import tqdm
             for future in tqdm(as_completed(futures), total=len(futures)):
-                index, key = future.result()
-                result[key] += 1
-                details[str(index)]['result'] = key
+                index, ret = future.result()
+                result[ret] += 1
+                details[str(index)]['result'] = ret
+                details[str(index)]['is_correct'] = (ret == 'pass')
         result['score'] = result['pass'] / len(predictions) * 100
         result['details'] = details
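For orientation, here is a small standalone sketch (not part of the diff) of how the renamed `ret` value now drives both the per-problem details and the aggregate score; the non-`'pass'` outcome labels are made up for illustration.

```python
from collections import defaultdict

# Made-up per-problem outcomes; only 'pass' counts toward the score.
outcomes = {0: 'pass', 1: 'wrong_answer', 2: 'pass'}

result, details = defaultdict(int), {}
for index, ret in outcomes.items():
    result[ret] += 1
    details[str(index)] = {'result': ret, 'is_correct': ret == 'pass'}

result['score'] = result['pass'] / len(outcomes) * 100
print(result['score'])  # 66.66... for this toy input
```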
@@ -59,7 +59,7 @@ def _get_possible_max_seq_len(max_seq_len, path):
     raise ValueError('max_seq_len is not provided and cannot be inferred from the model config.')


-def _convert_chat_messages(inputs):
+def _convert_chat_messages(inputs, merge_role=True):
     outputs = []
     for _input in inputs:
         messages = []
@@ -73,7 +73,18 @@ def _convert_chat_messages(inputs):
                 'SYSTEM': 'system',
             }[item['role']]
             messages.append({'role': role, 'content': item['prompt']})
+        if merge_role:
+            merged_messages = []
+            for item in messages:
+                if merged_messages and merged_messages[-1]['role'] == item['role']:
+                    merged_messages[-1]['content'] += '\n' + item['content']
+                else:
+                    merged_messages.append(item)
+            messages = merged_messages
+
         outputs.append(messages)
+        print(messages)
     return outputs
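A minimal standalone sketch of the consecutive-role merging that the new `merge_role` flag performs; the message contents are made up.

```python
# Toy input: two consecutive 'user' turns followed by an assistant turn.
messages = [
    {'role': 'user', 'content': 'You are a helpful assistant.'},  # e.g. a system prompt remapped to user
    {'role': 'user', 'content': 'What is 2 + 2?'},
    {'role': 'assistant', 'content': '4'},
]

merged_messages = []
for item in messages:
    if merged_messages and merged_messages[-1]['role'] == item['role']:
        # Same role as the previous turn: fold the content into it.
        merged_messages[-1]['content'] += '\n' + item['content']
    else:
        merged_messages.append(item)

# merged_messages now holds two entries: one merged 'user' turn and the 'assistant' turn.
```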
@@ -104,6 +115,8 @@ def _get_meta_template(meta_template):
     default_meta_template = dict(
         round=[
             dict(role='HUMAN', api_role='HUMAN'),
+            # XXX: all system roles are mapped to human on purpose
+            dict(role='SYSTEM', api_role='HUMAN'),
             dict(role='BOT', api_role='BOT', generate=True),
         ]
     )
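A rough before/after sketch (illustrative only, not actual library output) of what the two changes above aim at: a SYSTEM turn is emitted under the human/user api role and then folded into the adjacent user turn by the merge step.

```python
# Illustrative conversation before conversion (OpenCompass-style prompt items).
items = [
    {'role': 'SYSTEM', 'prompt': 'Answer concisely.'},
    {'role': 'HUMAN', 'prompt': 'Name a prime number.'},
]

# With SYSTEM routed through the HUMAN api role and merge_role=True,
# the expected chat payload is a single user message (assumed shape):
expected = [
    {'role': 'user', 'content': 'Answer concisely.\nName a prime number.'},
]
```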
@@ -37,6 +37,9 @@ class TurboMindModel(BaseModel):
             arguments like session_len, max_batch_size for TurboMind.
         gen_config (Dict, optional): Generation config to set
             arguments like top_k, top_p, temperature.
+        end_str (str, optional): Trim the generated string at the first
+            occurrence of end_str, for models whose special ending strings
+            are not handled well. Defaults to None.
     """

     def __init__(self,
@@ -45,7 +48,8 @@ class TurboMindModel(BaseModel):
                  max_seq_len: int = 2048,
                  meta_template: Optional[Dict] = None,
                  engine_config: Dict = {},
-                 gen_config: Dict = {}):
+                 gen_config: Dict = {},
+                 end_str: Optional[str] = None):
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
                          meta_template=meta_template)
@@ -64,6 +68,7 @@ class TurboMindModel(BaseModel):
         self.generator_ids = [i + 1 for i in range(concurrency)]
         self.gen_config = gen_config
         self.major_version, self.minor_version, _ = version_info
+        self.end_str = end_str

     def generate(self,
                  inputs: List[str],
@@ -119,6 +124,7 @@ class TurboMindModel(BaseModel):
                         batch_input,
                         [max_out_len] * len(batch_input),
                         [gen_config] * len(batch_input),
+                        [self.end_str] * len(batch_input),
                     ))
             results += _results
         if stopping_criteria:
@@ -142,7 +148,8 @@ class TurboMindModel(BaseModel):
                   session_id,
                   prompt: PromptType,
                   max_out_len: int,
-                  gen_config=None) -> str:
+                  gen_config=None,
+                  end_str: Optional[str] = None) -> str:
         """Generate results given a list of inputs.

         Args:
@@ -152,6 +159,10 @@ class TurboMindModel(BaseModel):
             max_out_len (int): The maximum length of the output.
             gen_config (EngineGenerationConfig, optional): Generation
                 config to set arguments like top_k, top_p, temperature.
+            end_str (str, optional): Trim the generated string at the first
+                occurrence of end_str, for models whose special ending
+                strings are not handled well.
+                Defaults to None.
         Returns:
             str: The generated string.
         """
@@ -174,6 +185,9 @@ class TurboMindModel(BaseModel):
         _, output_ids, _ = outputs
         response = self.tokenizer.decode(output_ids)
         response = valid_str(response)
+        # trim everything after the first occurrence of end_str
+        if end_str:
+            response = response.split(end_str)[0]
         return response

     def get_ppl(self,
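A quick illustration of what the `end_str` trimming does to a raw completion; the strings and the ending token are made up.

```python
# Toy raw completion and an assumed special ending string.
response = 'The answer is 42.<|im_end|>\nsome trailing junk'
end_str = '<|im_end|>'

if end_str:
    # Keep only the text before the first occurrence of end_str.
    response = response.split(end_str)[0]

print(response)  # -> The answer is 42.
```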
@@ -342,3 +342,29 @@ class EDAccEvaluator(AccEvaluator):
             'predictions': preds,
             'references': golds,
         }
+
+
+@ICL_EVALUATORS.register_module()
+class AccwithDetailsEvaluator(BaseEvaluator):
+
+    def score(self, predictions, references, origin_prompt) -> dict:
+        if len(predictions) != len(references):
+            return {'error': 'preds and refrs have different lengths.'}
+
+        details = {}
+        correct, total = 0, 0
+        for index, (pred, ref) in enumerate(zip(predictions, references)):
+            is_correct = pred == ref
+            correct += is_correct
+            details[str(index)] = {
+                'prompt': origin_prompt[index],
+                'pred': pred,
+                'refr': ref,
+                'is_correct': is_correct,
+            }
+            total += 1
+
+        results = {'accuracy': correct / total * 100, 'details': details}
+        return results
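A hedged usage sketch of the new evaluator; the import path and the toy data are assumptions, but the returned dict shape follows the code above.

```python
# Import path is an assumption based on where the other ICL evaluators live.
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator

predictions = ['A', 'C', 'B']                    # toy model outputs
references = ['A', 'B', 'B']                     # toy gold answers
origin_prompt = ['Q1 ...', 'Q2 ...', 'Q3 ...']   # toy prompts, one per example

evaluator = AccwithDetailsEvaluator()
results = evaluator.score(predictions, references, origin_prompt)

print(results['accuracy'])       # 66.66... for this toy input
print(results['details']['1'])   # {'prompt': 'Q2 ...', 'pred': 'C', 'refr': 'B', 'is_correct': False}
```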