OpenDAS / opencompass · Commits · 9afbfa36

Unverified commit 9afbfa36, authored Feb 28, 2024 by Fengzhe Zhou, committed by GitHub on Feb 28, 2024
Parent: ba7cd58d

[Sync] Fix TEvalEvaluator (#929)
Showing 2 changed files with 98 additions and 0 deletions (+98, −0):

opencompass/openicl/icl_evaluator/__init__.py (+1, −0)
opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py (+97, −0)
opencompass/openicl/icl_evaluator/__init__.py

@@ -7,5 +7,6 @@ from .icl_hf_evaluator import *  # noqa
 from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator  # noqa
 from .icl_misc_evaluator import AverageMinKEvaluator  # noqa
 from .icl_misc_evaluator import AveragePPLEvaluator  # noqa
+from .icl_plugin_evaluator import TEvalEvaluator  # noqa
 from .icl_toxic_evaluator import ToxicEvaluator  # noqa
 from .lm_evaluator import LMEvaluator  # noqa
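With this single added line, the new evaluator is re-exported from the evaluator package, so downstream configs and code can refer to it directly. A minimal illustration, assuming an OpenCompass installation that includes this commit:

# TEvalEvaluator is now exposed alongside the other ICL evaluators.
from opencompass.openicl.icl_evaluator import TEvalEvaluator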
opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py (new file, mode 100644)
"""Plugin Evaluator."""
import
json
class
TEvalEvaluator
:
"""This module contains the following evaluators for evaluating the
capabilities of the various dimensions of the LLM.
specifically, InstructEvaluator is used to evaluate the instruction
following capability of LLM, i.e. the ability of the model to perform tool
calls according to an predefined format. ReasoningEvaluator is used to
evaluate the model's ability to reason about the next execution step based
on historical observations. PlanningEvaluator is used to evaluate the
model's ability to plan a solution or program based on a given task.
APIRetrievalEvaluator is used to evaluate the model's ability to retrieve a
subset of tools relevant to the given task from a large number of tools.
ReviewEvaluator is used to evaluate the model's ability to review whether a
task was successfully completed.
"""
def
__init__
(
self
,
subset
)
->
None
:
from
opencompass.datasets.teval.evaluators
import
(
InstructEvaluator
,
PlanningEvaluator
,
ReasonRetrieveUnderstandEvaluator
,
ReviewEvaluator
)
super
().
__init__
()
self
.
subset
=
subset
if
subset
==
'instruct'
:
self
.
evaluator
=
InstructEvaluator
(
''
)
elif
subset
==
'plan'
:
self
.
evaluator
=
PlanningEvaluator
(
''
)
elif
subset
==
'review'
:
self
.
evaluator
=
ReviewEvaluator
(
''
)
elif
subset
==
'reason_retrieve_understand'
:
self
.
evaluator
=
ReasonRetrieveUnderstandEvaluator
(
''
)
elif
subset
==
'reason'
:
self
.
evaluator
=
ReasonRetrieveUnderstandEvaluator
(
''
,
default_prompt_type
=
'str'
,
eval_type
=
'reason'
)
elif
subset
==
'retrieve'
:
self
.
evaluator
=
ReasonRetrieveUnderstandEvaluator
(
''
,
default_prompt_type
=
'str'
,
eval_type
=
'retrieve'
)
elif
subset
==
'understand'
:
self
.
evaluator
=
ReasonRetrieveUnderstandEvaluator
(
''
,
default_prompt_type
=
'str'
,
eval_type
=
'understand'
)
elif
subset
==
'instruct_zh'
:
self
.
evaluator
=
InstructEvaluator
(
''
)
elif
subset
==
'plan_zh'
:
self
.
evaluator
=
PlanningEvaluator
(
''
,
bert_score_model
=
'thenlper/gte-large-zh'
)
elif
subset
==
'review_zh'
:
self
.
evaluator
=
ReviewEvaluator
(
''
)
elif
subset
==
'reason_retrieve_understand_zh'
:
self
.
evaluator
=
ReasonRetrieveUnderstandEvaluator
(
''
,
bert_score_model
=
'thenlper/gte-large-zh'
)
elif
subset
==
'reason_zh'
:
self
.
evaluator
=
ReasonRetrieveUnderstandEvaluator
(
''
,
default_prompt_type
=
'str'
,
eval_type
=
'reason'
,
bert_score_model
=
'thenlper/gte-large-zh'
)
elif
subset
==
'retrieve_zh'
:
self
.
evaluator
=
ReasonRetrieveUnderstandEvaluator
(
''
,
default_prompt_type
=
'str'
,
eval_type
=
'retrieve'
)
elif
subset
==
'understand_zh'
:
self
.
evaluator
=
ReasonRetrieveUnderstandEvaluator
(
''
,
default_prompt_type
=
'str'
,
eval_type
=
'understand'
,
bert_score_model
=
'thenlper/gte-large-zh'
)
else
:
raise
NotImplementedError
def
score
(
self
,
predictions
,
references
):
if
len
(
predictions
)
!=
len
(
references
):
return
{
'error'
:
'predictions and references have different '
'length'
}
results_list
=
[]
for
prediction
,
reference
in
zip
(
predictions
,
references
):
datum
=
json
.
loads
(
reference
)
datum
[
'prediction'
]
=
prediction
data_sample
=
self
.
evaluator
.
_process_response
(
datum
)
if
isinstance
(
data_sample
,
tuple
):
data_sample
=
data_sample
[
0
]
metrics_result
=
self
.
evaluator
.
_evaluate
(
data_sample
)
results_list
.
append
(
metrics_result
)
results_dict
=
self
.
evaluator
.
_post_process
(
results_list
)
results_dict
=
{
k
:
v
*
100
for
k
,
v
in
results_dict
.
items
()}
return
results_dict
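For reference, `score` relies only on three hooks of the wrapped T-Eval evaluator (`_process_response`, `_evaluate`, `_post_process`) before scaling every metric to a percentage. The standalone sketch below mirrors that loop with a stub evaluator; the stub class, its `answer` field, and the `accuracy` metric are illustrative assumptions, not the real T-Eval schema or API.

import json


class StubEvaluator:
    """Illustrative stand-in for a T-Eval evaluator (not the real API)."""

    def _process_response(self, datum):
        # The real evaluators may return either a sample or a (sample, extra) tuple.
        return datum

    def _evaluate(self, data_sample):
        # Score 1.0 when the prediction matches the stored answer, else 0.0.
        return {'accuracy': float(data_sample['prediction'] == data_sample['answer'])}

    def _post_process(self, results_list):
        # Average each metric over all samples.
        keys = results_list[0].keys()
        return {k: sum(r[k] for r in results_list) / len(results_list) for k in keys}


# Mirror TEvalEvaluator.score: references are JSON strings, predictions are raw model output.
evaluator = StubEvaluator()
predictions = ['42', '41']
references = [json.dumps({'answer': '42'}), json.dumps({'answer': '42'})]

results_list = []
for prediction, reference in zip(predictions, references):
    datum = json.loads(reference)
    datum['prediction'] = prediction
    data_sample = evaluator._process_response(datum)
    if isinstance(data_sample, tuple):
        data_sample = data_sample[0]
    results_list.append(evaluator._evaluate(data_sample))

results_dict = {k: v * 100 for k, v in evaluator._post_process(results_list).items()}
print(results_dict)  # {'accuracy': 50.0}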