Unverified Commit d7ff933a authored by Leymore, committed by GitHub

[Fix] Use jieba rouge in lcsts (#459)

* use jieba rouge in lcsts

* use rouge_chinese
parent c903e7f6
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import RougeEvaluator
+from opencompass.openicl.icl_evaluator import JiebaRougeEvaluator
from opencompass.datasets import LCSTSDataset, lcsts_postprocess

lcsts_reader_cfg = dict(input_columns=['content'], output_column='abst')

@@ -16,7 +16,7 @@ lcsts_infer_cfg = dict(
    inferencer=dict(type=GenInferencer))

lcsts_eval_cfg = dict(
-    evaluator=dict(type=RougeEvaluator),
+    evaluator=dict(type=JiebaRougeEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=lcsts_postprocess),
)
...
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import RougeEvaluator
+from opencompass.openicl.icl_evaluator import JiebaRougeEvaluator
from opencompass.datasets import LCSTSDataset, lcsts_postprocess

lcsts_reader_cfg = dict(input_columns=['content'], output_column='abst')

@@ -13,7 +13,7 @@ lcsts_infer_cfg = dict(
    inferencer=dict(type=GenInferencer))

lcsts_eval_cfg = dict(
-    evaluator=dict(type=RougeEvaluator),
+    evaluator=dict(type=JiebaRougeEvaluator),
    pred_postprocessor=dict(type=lcsts_postprocess),
)
...
@@ -14,20 +14,21 @@ There is also a type of **scoring-type** evaluation task without standard answer
Currently, in OpenCompass, commonly used Evaluators are mainly located in the [`opencompass/openicl/icl_evaluator`](https://github.com/open-compass/opencompass/tree/main/opencompass/openicl/icl_evaluator) folder, and some dataset-specific metrics are placed in parts of [`opencompass/datasets`](https://github.com/open-compass/opencompass/tree/main/opencompass/datasets). Below is a summary:
| Evaluation Strategy   | Evaluation Metrics   | Common Postprocessing Method | Datasets                                                             |
| --------------------- | -------------------- | ---------------------------- | -------------------------------------------------------------------- |
| `ACCEvaluator`        | Accuracy             | `first_capital_postprocess`  | agieval, ARC, bbh, mmlu, ceval, commonsenseqa, crowspairs, hellaswag |
| `EMEvaluator`         | Match Rate           | None, dataset-specific       | drop, CLUE_CMRC, CLUE_DRCD                                           |
| `BleuEvaluator`       | BLEU                 | None, `flores`               | flores, iwslt2017, summscreen, govrepcrs                             |
-| `RougeEvaluator`      | ROUGE                | None, dataset-specific       | lcsts, truthfulqa, Xsum, XLSum                                       |
+| `RougeEvaluator`      | ROUGE                | None, dataset-specific       | truthfulqa, Xsum, XLSum                                              |
+| `JiebaRougeEvaluator` | ROUGE                | None, dataset-specific       | lcsts                                                                |
| `HumanEvaluator`      | pass@k               | `humaneval_postprocess`      | humaneval                                                            |
| `MBPPEvaluator`       | Execution Pass Rate  | None                         | mbpp                                                                 |
| `ToxicEvaluator`      | PerspectiveAPI       | None                         | realtoxicityprompts                                                  |
| `AGIEvalEvaluator`    | Accuracy             | None                         | agieval                                                              |
| `AUCROCEvaluator`     | AUC-ROC              | None                         | jigsawmultilingual, civilcomments                                    |
| `MATHEvaluator`       | Accuracy             | `math_postprocess`           | math                                                                 |
| `MccEvaluator`        | Matthews Correlation | None                         | --                                                                   |
| `SquadEvaluator`      | F1-scores            | None                         | --                                                                   |
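As the lcsts configs changed in this commit illustrate, an evaluator from the table above is selected inside a dataset's eval config. A minimal sketch, taken from this PR (fields such as `pred_role` vary per dataset and are omitted here):

```python
from opencompass.openicl.icl_evaluator import JiebaRougeEvaluator
from opencompass.datasets import lcsts_postprocess

# Each dataset config names its evaluator; lcsts now uses the jieba-based ROUGE.
lcsts_eval_cfg = dict(
    evaluator=dict(type=JiebaRougeEvaluator),
    pred_postprocessor=dict(type=lcsts_postprocess),
)
```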
## How to Configure

...
@@ -2,11 +2,11 @@

During evaluation, the evaluation strategy is generally chosen according to the characteristics of the dataset itself, above all the **type of the reference answer**, which usually falls into the following categories:

- **Options**: common in classification tasks, true/false questions, and multiple-choice questions. Datasets of this type currently make up the largest share, e.g. MMLU, CEval, etc.; accuracy is usually the metric -- `ACCEvaluator`
- **Phrases**: common in QA and reading-comprehension tasks; these datasets mainly include CLUE_CMRC, CLUE_DRCD, DROP, etc.; the match rate is usually the metric -- `EMEvaluator`
- **Sentences**: common in translation and pseudo-code or command-line generation tasks; datasets mainly include Flores, Summscreen, Govrepcrs, Iwslt2017, etc.; BLEU (Bilingual Evaluation Understudy) is usually the metric -- `BleuEvaluator`
- **Paragraphs**: common in text-summarization tasks; datasets mainly include LCSTS, TruthfulQA, Xsum, etc.; ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is usually the metric -- `RougeEvaluator`
- **Code**: common in code-generation tasks; datasets mainly include HumanEval, MBPP, etc.; the execution pass rate and `pass@k` are usually the metrics; OpenCompass currently supports `MBPPEvaluator` and `HumanEvaluator`

There is also a type of **scoring-type** evaluation task without standard answers, for example judging whether a model's output is toxic, which can be scored directly through the relevant API service. `ToxicEvaluator` is currently supported for this, and the realtoxicityprompts dataset uses this evaluation method.
@@ -14,20 +14,21 @@

Currently, in OpenCompass, commonly used Evaluators are mainly located in the [`opencompass/openicl/icl_evaluator`](https://github.com/open-compass/opencompass/tree/main/opencompass/openicl/icl_evaluator) folder, and some dataset-specific metrics are placed in parts of [`opencompass/datasets`](https://github.com/open-compass/opencompass/tree/main/opencompass/datasets). Below is a summary:
| Evaluation Strategy   | Evaluation Metrics   | Common Postprocessing Method | Datasets                                                             |
| --------------------- | -------------------- | ---------------------------- | -------------------------------------------------------------------- |
| `ACCEvaluator`        | Accuracy             | `first_capital_postprocess`  | agieval, ARC, bbh, mmlu, ceval, commonsenseqa, crowspairs, hellaswag |
| `EMEvaluator`         | Match Rate           | None, dataset-specific       | drop, CLUE_CMRC, CLUE_DRCD                                           |
| `BleuEvaluator`       | BLEU                 | None, `flores`               | flores, iwslt2017, summscreen, govrepcrs                             |
-| `RougeEvaluator`      | ROUGE                | None, dataset-specific       | lcsts, truthfulqa, Xsum, XLSum                                       |
+| `RougeEvaluator`      | ROUGE                | None, dataset-specific       | truthfulqa, Xsum, XLSum                                              |
+| `JiebaRougeEvaluator` | ROUGE                | None, dataset-specific       | lcsts                                                                |
| `HumanEvaluator`      | pass@k               | `humaneval_postprocess`      | humaneval                                                            |
| `MBPPEvaluator`       | Execution Pass Rate  | None                         | mbpp                                                                 |
| `ToxicEvaluator`      | PerspectiveAPI       | None                         | realtoxicityprompts                                                  |
| `AGIEvalEvaluator`    | Accuracy             | None                         | agieval                                                              |
| `AUCROCEvaluator`     | AUC-ROC              | None                         | jigsawmultilingual, civilcomments                                    |
| `MATHEvaluator`       | Accuracy             | `math_postprocess`           | math                                                                 |
| `MccEvaluator`        | Matthews Correlation | None                         | --                                                                   |
| `SquadEvaluator`      | F1-scores            | None                         | --                                                                   |
## How to Configure

...
@@ -3,5 +3,6 @@ from .icl_aucroc_evaluator import AUCROCEvaluator  # noqa
from .icl_base_evaluator import BaseEvaluator  # noqa
from .icl_em_evaluator import EMEvaluator  # noqa
from .icl_hf_evaluator import *  # noqa
+from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator  # noqa
from .icl_toxic_evaluator import ToxicEvaluator  # noqa
from .lm_evaluator import LMEvaluator  # noqa
@@ -134,7 +134,10 @@ class AccEvaluator(HuggingfaceEvaluator):

@ICL_EVALUATORS.register_module()
class RougeEvaluator(HuggingfaceEvaluator):
-    """Rouge evaluator."""  # noqa
+    """Rouge evaluator.
+
+    Note: this evaluator is not suitable for Chinese datasets.
+    """

    def __init__(self) -> None:
        super().__init__(metric='rouge')
...
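The note added above matters because the default ROUGE tokenization is built for space-separated English text, so an unsegmented Chinese sentence yields essentially no usable tokens and the n-gram overlap degenerates. A minimal sketch of the jieba segmentation step the new evaluator relies on (the example sentence is illustrative, not from this PR):

```python
import jieba

# jieba splits an unsegmented Chinese sentence into words,
# so ROUGE can form meaningful n-grams over them.
text = '今天天气很好'
print(' '.join(jieba.cut(text)))  # e.g. '今天 天气 很 好' (exact segmentation may vary)
```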
import jieba
from rouge_chinese import Rouge

from opencompass.registry import ICL_EVALUATORS
from opencompass.utils.text_postprocessors import general_postprocess

from .icl_base_evaluator import BaseEvaluator


@ICL_EVALUATORS.register_module()
class JiebaRougeEvaluator(BaseEvaluator):
    """This Evaluator will first use jieba for tokenization, and then
    calculate the rouge score.

    This Evaluator is especially suitable for evaluating Chinese datasets.
    """

    def __init__(self) -> None:
        super().__init__()

    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different length'
            }
        predictions = [general_postprocess(i) for i in predictions]
        references = [general_postprocess(i) for i in references]
        metric = Rouge()
        predictions = [' '.join(jieba.cut(i)) for i in predictions]
        references = [' '.join(jieba.cut(i)) for i in references]
        score = metric.get_scores(predictions, references, avg=True)
        return {
            'rouge1': score['rouge-1']['f'] * 100,
            'rouge2': score['rouge-2']['f'] * 100,
            'rougeL': score['rouge-l']['f'] * 100,
        }
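For reference, a minimal sketch of how the new evaluator could be exercised on its own (the Chinese strings below are illustrative, not from this PR):

```python
from opencompass.openicl.icl_evaluator import JiebaRougeEvaluator

# Illustrative prediction/reference pair; any Chinese summaries would do.
predictions = ['今天天气很好,适合出门散步。']
references = ['今天天气不错,适合出去走走。']

evaluator = JiebaRougeEvaluator()
# Returns a dict with 'rouge1', 'rouge2' and 'rougeL' F-scores scaled to 0-100.
print(evaluator.score(predictions, references))
```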
@@ -18,6 +18,7 @@ rank_bm25==0.2.2
rapidfuzz
requests==2.31.0
rouge
+rouge_chinese
rouge_score
scikit_learn==1.2.1
sentence_transformers==2.2.2
...