Unverified Commit 9afbfa36 authored by Fengzhe Zhou, committed by GitHub

[Sync] Fix TEvalEvaluator (#929)

parent ba7cd58d
@@ -7,5 +7,6 @@ from .icl_hf_evaluator import *  # noqa
 from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator  # noqa
 from .icl_misc_evaluator import AverageMinKEvaluator  # noqa
 from .icl_misc_evaluator import AveragePPLEvaluator  # noqa
+from .icl_plugin_evaluator import TEvalEvaluator  # noqa
 from .icl_toxic_evaluator import ToxicEvaluator  # noqa
 from .lm_evaluator import LMEvaluator  # noqa
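With TEvalEvaluator re-exported here, downstream code can import it alongside the other ICL evaluators (a minimal sketch; only the import path comes from the diff above):

from opencompass.openicl.icl_evaluator import TEvalEvaluator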
"""Plugin Evaluator."""
import json
class TEvalEvaluator:
"""This module contains the following evaluators for evaluating the
capabilities of the various dimensions of the LLM.
specifically, InstructEvaluator is used to evaluate the instruction
following capability of LLM, i.e. the ability of the model to perform tool
calls according to an predefined format. ReasoningEvaluator is used to
evaluate the model's ability to reason about the next execution step based
on historical observations. PlanningEvaluator is used to evaluate the
model's ability to plan a solution or program based on a given task.
APIRetrievalEvaluator is used to evaluate the model's ability to retrieve a
subset of tools relevant to the given task from a large number of tools.
ReviewEvaluator is used to evaluate the model's ability to review whether a
task was successfully completed.
"""
    def __init__(self, subset) -> None:
        from opencompass.datasets.teval.evaluators import (
            InstructEvaluator, PlanningEvaluator,
            ReasonRetrieveUnderstandEvaluator, ReviewEvaluator)

        super().__init__()
        self.subset = subset
        # Select the backend evaluator that matches the requested T-Eval
        # subset name.
        if subset == 'instruct':
            self.evaluator = InstructEvaluator('')
        elif subset == 'plan':
            self.evaluator = PlanningEvaluator('')
        elif subset == 'review':
            self.evaluator = ReviewEvaluator('')
        elif subset == 'reason_retrieve_understand':
            self.evaluator = ReasonRetrieveUnderstandEvaluator('')
        elif subset == 'reason':
            self.evaluator = ReasonRetrieveUnderstandEvaluator(
                '', default_prompt_type='str', eval_type='reason')
        elif subset == 'retrieve':
            self.evaluator = ReasonRetrieveUnderstandEvaluator(
                '', default_prompt_type='str', eval_type='retrieve')
        elif subset == 'understand':
            self.evaluator = ReasonRetrieveUnderstandEvaluator(
                '', default_prompt_type='str', eval_type='understand')
        elif subset == 'instruct_zh':
            self.evaluator = InstructEvaluator('')
        elif subset == 'plan_zh':
            self.evaluator = PlanningEvaluator(
                '', bert_score_model='thenlper/gte-large-zh')
        elif subset == 'review_zh':
            self.evaluator = ReviewEvaluator('')
        elif subset == 'reason_retrieve_understand_zh':
            self.evaluator = ReasonRetrieveUnderstandEvaluator(
                '', bert_score_model='thenlper/gte-large-zh')
        elif subset == 'reason_zh':
            self.evaluator = ReasonRetrieveUnderstandEvaluator(
                '',
                default_prompt_type='str',
                eval_type='reason',
                bert_score_model='thenlper/gte-large-zh')
        elif subset == 'retrieve_zh':
            self.evaluator = ReasonRetrieveUnderstandEvaluator(
                '', default_prompt_type='str', eval_type='retrieve')
        elif subset == 'understand_zh':
            self.evaluator = ReasonRetrieveUnderstandEvaluator(
                '',
                default_prompt_type='str',
                eval_type='understand',
                bert_score_model='thenlper/gte-large-zh')
        else:
            raise NotImplementedError
    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different '
                'length'
            }
        results_list = []
        for prediction, reference in zip(predictions, references):
            # Each reference is a JSON-encoded sample; attach the model
            # prediction and let the backend evaluator parse and score it.
            datum = json.loads(reference)
            datum['prediction'] = prediction
            data_sample = self.evaluator._process_response(datum)
            if isinstance(data_sample, tuple):
                data_sample = data_sample[0]
            metrics_result = self.evaluator._evaluate(data_sample)
            results_list.append(metrics_result)
        results_dict = self.evaluator._post_process(results_list)
        # Report every metric as a percentage.
        results_dict = {k: v * 100 for k, v in results_dict.items()}
        return results_dict
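Below is a minimal usage sketch of the new class. The variable names and the 'plan' subset are illustrative; the reference strings must be the JSON-encoded sample dicts produced by the corresponding T-Eval dataset, and since their exact keys depend on the subset, the score call is shown commented out:

from opencompass.openicl.icl_evaluator import TEvalEvaluator

# Pick the subset that matches the T-Eval split being evaluated
# ('instruct', 'plan', 'reason', ..., or their `_zh` variants).
evaluator = TEvalEvaluator(subset='plan')

# `predictions` are raw model outputs; `references` are JSON strings whose
# contents are defined by the T-Eval dataset for the chosen subset.
# scores = evaluator.score(predictions, references)
# `scores` maps each metric name to a value already scaled to 0-100.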
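For context, this is roughly how such an evaluator is wired into an OpenCompass dataset config. The fragment is a hedged sketch with all other dataset fields omitted, not a config shipped by this commit:

from opencompass.openicl.icl_evaluator import TEvalEvaluator

# Hypothetical eval_cfg fragment: the evaluator is referenced by type and
# parameterised with the subset name, following the usual config pattern.
teval_instruct_eval_cfg = dict(
    evaluator=dict(type=TEvalEvaluator, subset='instruct'))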