Unverified commit 2b3d4150, authored by Fengzhe Zhou, committed by GitHub

[Sync] update evaluator (#1175)

parent 296ea599
@@ -227,9 +227,10 @@ class MBPPEvaluator(BaseEvaluator):
             from tqdm import tqdm
             for future in tqdm(as_completed(futures), total=len(futures)):
-                index, key = future.result()
-                result[key] += 1
-                details[str(index)]['result'] = key
+                index, ret = future.result()
+                result[ret] += 1
+                details[str(index)]['result'] = ret
+                details[str(index)]['is_correct'] = (ret == 'pass')
         result['score'] = result['pass'] / len(predictions) * 100
         result['details'] = details
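For readers skimming the hunk, the new `is_correct` flag simply records whether the per-sample outcome equals `'pass'`. Below is a standalone sketch of that bookkeeping; the sample data and the non-`'pass'` outcome label are invented, not taken from the evaluator.

# Standalone sketch of the scoring bookkeeping above; sample outcomes are invented,
# and the 'failed' label is an assumption rather than the evaluator's real key.
predictions = ['def add(a, b): return a + b', 'def add(a, b): return a - b']
outcomes = [(0, 'pass'), (1, 'failed')]  # (index, ret) pairs as yielded by the futures

result = {'pass': 0, 'failed': 0}
details = {str(i): {'prediction': p} for i, p in enumerate(predictions)}

for index, ret in outcomes:
    result[ret] += 1
    details[str(index)]['result'] = ret
    details[str(index)]['is_correct'] = (ret == 'pass')

result['score'] = result['pass'] / len(predictions) * 100  # 50.0
result['details'] = details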
......
@@ -59,7 +59,7 @@ def _get_possible_max_seq_len(max_seq_len, path):
     raise ValueError('max_seq_len is not provided and cannot be inferred from the model config.')

-def _convert_chat_messages(inputs):
+def _convert_chat_messages(inputs, merge_role=True):
     outputs = []
     for _input in inputs:
         messages = []
@@ -73,7 +73,18 @@ def _convert_chat_messages(inputs):
                     'SYSTEM': 'system',
                 }[item['role']]
                 messages.append({'role': role, 'content': item['prompt']})
+
+        if merge_role:
+            merged_messages = []
+            for item in messages:
+                if merged_messages and merged_messages[-1]['role'] == item['role']:
+                    merged_messages[-1]['content'] += '\n' + item['content']
+                else:
+                    merged_messages.append(item)
+            messages = merged_messages
+
         outputs.append(messages)
+        print(messages)
     return outputs
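To make the new `merge_role` branch concrete, here is an isolated sketch of the merging loop; the message list is invented and already uses the lower-case roles produced earlier in the function.

# Isolated sketch of the merge_role behaviour; the messages are invented.
messages = [
    {'role': 'user', 'content': 'You are a concise assistant.'},  # e.g. a SYSTEM turn mapped to 'user'
    {'role': 'user', 'content': 'What is 2 + 2?'},
    {'role': 'assistant', 'content': '4'},
]

merged_messages = []
for item in messages:
    if merged_messages and merged_messages[-1]['role'] == item['role']:
        merged_messages[-1]['content'] += '\n' + item['content']
    else:
        merged_messages.append(item)

# merged_messages now has two entries: the consecutive 'user' turns are joined
# with a newline, and the 'assistant' turn is left untouched.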
@@ -104,6 +115,8 @@ def _get_meta_template(meta_template):
     default_meta_template = dict(
         round=[
             dict(role='HUMAN', api_role='HUMAN'),
+            # XXX: all system roles are mapped to human on purpose
+            dict(role='SYSTEM', api_role='HUMAN'),
             dict(role='BOT', api_role='BOT', generate=True),
         ]
     )
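The practical effect of the added entry is that prompts containing a SYSTEM turn no longer hit a missing-role error; they are rendered with the HUMAN api_role instead. A rough illustration follows; the role map dict and sample prompt are simplifications, not the real meta-template machinery.

# Rough illustration of mapping SYSTEM turns onto the HUMAN api_role.
# This is a simplification; the real rendering goes through the meta-template logic.
api_role_map = {'HUMAN': 'HUMAN', 'SYSTEM': 'HUMAN', 'BOT': 'BOT'}

prompt = [
    {'role': 'SYSTEM', 'prompt': 'Answer in one word.'},
    {'role': 'HUMAN', 'prompt': 'Name a prime number.'},
]

api_turns = [{'api_role': api_role_map[p['role']], 'prompt': p['prompt']} for p in prompt]
# Both turns now carry api_role 'HUMAN', matching the comment in the hunk above.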
......
@@ -37,6 +37,9 @@ class TurboMindModel(BaseModel):
             arguments like session_len, max_batch_size for TurboMind.
         gen_config (Dict, optional): Generation config to set
             arguments like top_k, top_p, temperature.
+        end_str (str, optional): If provided, generated strings are trimmed
+            at the first occurrence of end_str; useful when the model emits
+            special ending strings that are not handled well. Defaults to None.
     """

     def __init__(self,
@@ -45,7 +48,8 @@ class TurboMindModel(BaseModel):
                  max_seq_len: int = 2048,
                  meta_template: Optional[Dict] = None,
                  engine_config: Dict = {},
-                 gen_config: Dict = {}):
+                 gen_config: Dict = {},
+                 end_str: Optional[str] = None):
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
                          meta_template=meta_template)
@@ -64,6 +68,7 @@ class TurboMindModel(BaseModel):
         self.generator_ids = [i + 1 for i in range(concurrency)]
         self.gen_config = gen_config
         self.major_version, self.minor_version, _ = version_info
+        self.end_str = end_str

     def generate(self,
                  inputs: List[str],
@@ -119,6 +124,7 @@ class TurboMindModel(BaseModel):
                     batch_input,
                     [max_out_len] * len(batch_input),
                     [gen_config] * len(batch_input),
+                    [self.end_str] * len(batch_input),
                 ))
             results += _results
         if stopping_criteria:
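The `[self.end_str] * len(batch_input)` line gives every item in the batch the same trailing argument, so the per-item lists stay aligned when they are zipped into worker calls. A minimal sketch of that pattern follows; the worker function is a placeholder, not the real `_generate`.

# Minimal sketch of zipping equal-length argument lists into per-item calls.
# _fake_generate is a placeholder worker, not TurboMind's real _generate.
from concurrent.futures import ThreadPoolExecutor


def _fake_generate(prompt, max_out_len, gen_config, end_str):
    return f'echo: {prompt}'[:max_out_len]


batch_input = ['hello', 'world']
max_out_len, gen_config, end_str = 32, {'temperature': 1.0}, None

with ThreadPoolExecutor() as executor:
    results = list(
        executor.map(
            _fake_generate,
            batch_input,
            [max_out_len] * len(batch_input),
            [gen_config] * len(batch_input),
            [end_str] * len(batch_input),
        ))
# results == ['echo: hello', 'echo: world']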
@@ -142,7 +148,8 @@ class TurboMindModel(BaseModel):
                   session_id,
                   prompt: PromptType,
                   max_out_len: int,
-                  gen_config=None) -> str:
+                  gen_config=None,
+                  end_str: Optional[str] = None) -> str:
         """Generate results given a list of inputs.

         Args:
@@ -152,6 +159,10 @@ class TurboMindModel(BaseModel):
             max_out_len (int): The maximum length of the output.
             gen_config (EngineGenerationConfig, optional): Generation
                 config to set arguments like top_k, top_p, temperature.
+            end_str (str, optional): If provided, the generated string is
+                trimmed at the first occurrence of end_str; useful when the
+                model emits special ending strings that are not handled well.
+                Defaults to None.

         Returns:
             str: The generated string.
         """
@@ -174,6 +185,9 @@ class TurboMindModel(BaseModel):
         _, output_ids, _ = outputs
         response = self.tokenizer.decode(output_ids)
         response = valid_str(response)
+        # trim everything after the first occurrence of end_str, if given
+        if end_str:
+            response = response.split(end_str)[0]
         return response

     def get_ppl(self,
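The trimming itself is just a split on the first occurrence of `end_str`. For example (the response text and end string below are invented):

# What the end_str trimming does; the response and end string are invented.
end_str = '<|im_end|>'
response = 'The answer is 42.<|im_end|>\nsome trailing text the model kept generating'
if end_str:
    response = response.split(end_str)[0]
# response == 'The answer is 42.'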
......
@@ -342,3 +342,29 @@ class EDAccEvaluator(AccEvaluator):
             'predictions': preds,
             'references': golds,
         }
+
+
+@ICL_EVALUATORS.register_module()
+class AccwithDetailsEvaluator(BaseEvaluator):
+
+    def score(self, predictions, references, origin_prompt) -> dict:
+        if len(predictions) != len(references):
+            return {'error': 'preds and refrs have different lengths.'}
+
+        details = {}
+        correct, total = 0, 0
+        for index, (pred, ref) in enumerate(zip(predictions, references)):
+            is_correct = pred == ref
+            correct += is_correct
+            details[str(index)] = {
+                'prompt': origin_prompt[index],
+                'pred': pred,
+                'refr': ref,
+                'is_correct': is_correct,
+            }
+            total += 1
+
+        results = {'accuracy': correct / total * 100, 'details': details}
+        return results
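A quick usage sketch of the new evaluator, assuming the class above is importable; the predictions, references, and prompts are invented.

# Usage sketch for AccwithDetailsEvaluator; all inputs are invented.
evaluator = AccwithDetailsEvaluator()
scores = evaluator.score(
    predictions=['A', 'C', 'B'],
    references=['A', 'B', 'B'],
    origin_prompt=['Q1 ...', 'Q2 ...', 'Q3 ...'],
)
# scores['accuracy'] is 66.66..., and scores['details']['1']['is_correct'] is False.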