"...composable_kernel.git" did not exist on "740149fcf1708a4a023f4d951629cd32aa2c3f3e"
Unverified commit dbb20b82, authored by Fengzhe Zhou and committed by GitHub

[Sync] update (#517)

parent 6f07af30
@@ -11,7 +11,7 @@ configs/eval_debug*.py
 configs/viz_*.py
 data
 work_dirs
+configs/internal/

 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -86,3 +86,6 @@ docs/zh_cn/_build/
 # .zip
 *.zip
+
+# sft config ignore list
+configs/sft_cfg/*B_*

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator
from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi

agieval_reader_cfg = dict(
    input_columns=['question', 'options'], output_column='label')

agieval_single_choice_sets = [
    'gaokao-chinese',
    'gaokao-english',
    'gaokao-geography',
    'gaokao-history',
    'gaokao-biology',
    'gaokao-chemistry',
    'gaokao-physics',
    'gaokao-mathqa',
    'logiqa-zh',
    'lsat-ar',
    'lsat-lr',
    'lsat-rc',
    'logiqa-en',
    'sat-math',
    'sat-en',
    'sat-en-without-passage',
    'aqua-rat',
]
agieval_multiple_choices_sets = [
    'jec-qa-kd',
    'jec-qa-ca',
]
agieval_cloze_sets = ['gaokao-mathcloze', 'math']
agieval_chinese_sets = [
    'gaokao-chinese',
    'gaokao-english',
    'gaokao-geography',
    'gaokao-history',
    'gaokao-biology',
    'gaokao-chemistry',
    'gaokao-physics',
    'gaokao-mathqa',
    'logiqa-zh',
    'gaokao-mathcloze',
]
agieval_english_sets = [
    'lsat-ar',
    'lsat-lr',
    'lsat-rc',
    'logiqa-en',
    'sat-math',
    'sat-en',
    'sat-en-without-passage',
    'aqua-rat',
    'math',
]
agieval_gaokao_sets = [
    'gaokao-chinese',
    'gaokao-english',
    'gaokao-geography',
    'gaokao-history',
    'gaokao-biology',
    'gaokao-chemistry',
    'gaokao-physics',
    'gaokao-mathqa',
]

agieval_datasets = []
for _name in agieval_single_choice_sets:
    if _name in agieval_chinese_sets:
        _hint = '答案是: '
    else:
        _hint = 'The answer is '
    agieval_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}')
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024))

    agieval_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess))

    agieval_datasets.append(
        dict(
            type=AGIEvalDataset_v2,
            path='./data/AGIEval/data/v1/',
            name=_name,
            abbr='agieval-' + _name,
            setting_name='zero-shot',
            reader_cfg=agieval_reader_cfg,
            infer_cfg=agieval_infer_cfg.copy(),
            eval_cfg=agieval_eval_cfg.copy()))

for _name in agieval_multiple_choices_sets:
    if _name in agieval_chinese_sets:
        _hint = '答案是: '
    else:
        _hint = 'The answer is '
    agieval_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}')
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024))

    agieval_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess_multi))

    agieval_datasets.append(
        dict(
            type=AGIEvalDataset_v2,
            path='./data/AGIEval/data/v1/',
            name=_name,
            abbr='agieval-' + _name,
            setting_name='zero-shot',
            reader_cfg=agieval_reader_cfg,
            infer_cfg=agieval_infer_cfg.copy(),
            eval_cfg=agieval_eval_cfg.copy()))

for _name in agieval_cloze_sets:
    if _name in agieval_chinese_sets:
        _hint = '答案是: '
    else:
        _hint = 'The answer is '
    agieval_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[dict(role='HUMAN', prompt=f'{{question}}\n{_hint}')])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024))

    agieval_eval_cfg = dict(evaluator=dict(type=AGIEvalEvaluator))

    agieval_datasets.append(
        dict(
            type=AGIEvalDataset_v2,
            path='./data/AGIEval/data/v1/',
            name=_name,
            abbr='agieval-' + _name,
            setting_name='zero-shot',
            reader_cfg=agieval_reader_cfg,
            infer_cfg=agieval_infer_cfg.copy(),
            eval_cfg=agieval_eval_cfg.copy()))

for _item in agieval_datasets:
    _name = _item['name']
    _intro = {
        'gaokao-chinese':
        '以下是一道中国高考语文选择题,请选择正确的答案。',
        'gaokao-english':
        '以下是一道中国高考英语选择题,请选择正确的答案。',
        'gaokao-geography':
        '以下是一道中国高考地理选择题,请选择正确的答案。',
        'gaokao-history':
        '以下是一道中国高考历史选择题,请选择正确的答案。',
        'gaokao-biology':
        '以下是一道中国高考生物选择题,请选择正确的答案。',
        'gaokao-chemistry':
        '以下是一道中国高考化学选择题,请选择正确的答案。',
        'gaokao-physics':
        '以下是一道中国高考物理选择题,请选择正确的答案。',
        'gaokao-mathqa':
        '以下是一道中国高考数学选择题,请选择正确的答案。',
        'logiqa-zh':
        '以下是一道中国公务员考试题,请选择正确的答案。',
        'lsat-ar':
        'The following is a LSAT Analytical Reasoning question. Please select the correct answer.',
        'lsat-lr':
        'The following is a LSAT Logical Reasoning question. Please select the correct answer.',
        'lsat-rc':
        'The following is a LSAT Reading Comprehension question. Please select the correct answer.',
        'logiqa-en':
        'The following is a Logic Reasoning question. Please select the correct answer.',
        'sat-math':
        'The following is a SAT Math question. Please select the correct answer.',
        'sat-en':
        'The following is a SAT English question. Please select the correct answer.',
        'sat-en-without-passage':
        'The following is a SAT English question. Please select the correct answer.',
        'aqua-rat':
        'The following is a AQUA-RAT question. Please select the correct answer.',
        'jec-qa-kd':
        '以下是一道中国司法考试基础知识题,请选择正确的答案。',
        'jec-qa-ca':
        '以下是一道中国司法考试案例分析题,请选择正确的答案。',
        'gaokao-mathcloze':
        '以下是一道中国高考数学填空题,请填入正确的答案。',
        'math':
        'The following is a Math question. Please select the correct answer.',
    }[_name]
    _templates = _item['infer_cfg']['prompt_template']['template']
    _templates['round'][0][
        'prompt'] = _intro + '\n' + _templates['round'][0]['prompt']

del _item, _intro, _templates, _name, _hint, agieval_infer_cfg, agieval_eval_cfg
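
For orientation (not part of the diff): a dataset config like this is normally pulled into a top-level experiment config with `read_base`. A minimal sketch, assuming the file sits under `configs/datasets/agieval/`; the exact module name below is a placeholder, not the real file name from this commit:

```python
# Sketch only: wiring agieval_datasets into an OpenCompass run config.
# `.datasets.agieval.agieval_gen` is a placeholder module path.
from mmengine.config import read_base

with read_base():
    from .datasets.agieval.agieval_gen import agieval_datasets

datasets = [*agieval_datasets]
```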
@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import PPLInferencer, GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator
+from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator, AGIEvalEvaluator_mcq
 from opencompass.utils.text_postprocessors import first_capital_postprocess_multi

 agieval_single_choice_sets = [
@@ -116,7 +116,7 @@ for _name in agieval_multiple_choices_sets:
         inferencer=dict(type=GenInferencer, max_out_len=1024))

     agieval_eval_cfg = dict(
-        evaluator=dict(type=AccEvaluator),
+        evaluator=dict(type=AGIEvalEvaluator_mcq),
         pred_postprocessor=dict(type=first_capital_postprocess_multi))

     agieval_datasets.append(

@@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess
+from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

 bbh_reader_cfg = dict(input_columns=["input"], output_column="target")

@@ -56,7 +56,7 @@ for _name in bbh_multiple_choice_sets:
         retriever=dict(type=ZeroRetriever),
         inferencer=dict(type=GenInferencer, max_out_len=512))
     bbh_eval_cfg = dict(
-        evaluator=dict(type=AccEvaluator),
+        evaluator=dict(type=BBHEvaluator_mcq),
         pred_role="BOT",
         pred_postprocessor=dict(type=bbh_mcq_postprocess),
         dataset_postprocessor=dict(type=bbh_mcq_postprocess))

@@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess
+from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

 bbh_reader_cfg = dict(input_columns=["input"], output_column="target")

@@ -56,7 +56,7 @@ for _name in bbh_multiple_choice_sets:
         retriever=dict(type=ZeroRetriever),
         inferencer=dict(type=GenInferencer, max_out_len=512))
     bbh_eval_cfg = dict(
-        evaluator=dict(type=AccEvaluator),
+        evaluator=dict(type=BBHEvaluator_mcq),
         pred_role="BOT",
         pred_postprocessor=dict(type=bbh_mcq_postprocess),
         dataset_postprocessor=dict(type=bbh_mcq_postprocess))

@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
+from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator

 gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')

@@ -25,7 +25,7 @@ gsm8k_infer_cfg = dict(
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=GenInferencer, max_out_len=512))

-gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator),
+gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
                       pred_postprocessor=dict(type=gsm8k_postprocess),
                       dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))

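The GSM8K configs now score with the dedicated `Gsm8kEvaluator` from `opencompass.datasets` instead of the generic `AccEvaluator`. Its implementation is not shown in this diff; as a rough sketch of the intent only, it amounts to exact match over the answers already extracted by `gsm8k_postprocess` / `gsm8k_dataset_postprocess`:

```python
# Rough sketch of the idea, not the actual Gsm8kEvaluator implementation.
def gsm8k_exact_match_sketch(predictions, references):
    """Compare post-processed numeric answers, e.g. '18' == '18'."""
    correct = sum(pred == ref for pred, ref in zip(predictions, references))
    return {'score': correct / len(predictions) * 100}
```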
@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
+from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator

 gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')

@@ -72,7 +72,7 @@ Question: {question}{answer}
     inferencer=dict(type=GenInferencer, max_out_len=512))

 gsm8k_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
+    evaluator=dict(type=Gsm8kEvaluator),
     pred_postprocessor=dict(type=gsm8k_postprocess),
     dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))

@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import SCInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
+from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator

 gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer' )
 generation_kwargs = dict(do_sample=True, temperature=0.7, top_k=40)

@@ -73,7 +73,7 @@ Question: {question}{answer}
     inferencer=dict(type=SCInferencer, max_out_len=512, generation_kwargs = generation_kwargs, infer_type='sc', sc_size = 20))

 gsm8k_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
+    evaluator=dict(type=Gsm8kEvaluator),
     pred_postprocessor=dict(type=gsm8k_postprocess),
     dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
     sc_size = 20)

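The SC config above samples 20 generations per question (`do_sample=True`, `temperature=0.7`, `sc_size=20`), and in the usual self-consistency setup the final answer is a majority vote over the extracted answers. A minimal sketch of that voting idea, not OpenCompass's actual SC implementation:

```python
from collections import Counter

# Sketch only: majority vote over sc_size post-processed answers.
def majority_vote(extracted_answers):
    counts = Counter(a for a in extracted_answers if a)
    return counts.most_common(1)[0][0] if counts else ''

# e.g. majority_vote(['18', '18', '24', '18']) -> '18'
```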
@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess
+from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator

 gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')

@@ -34,7 +34,7 @@ gsm8k_infer_cfg = dict(
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=GenInferencer))

-gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator),
+gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
                       pred_role="BOT",
                       pred_postprocessor=dict(type=gsm8k_postprocess),
                       dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))

 from opencompass.models.claude_api.claude_api import Claude
-from opencompass.models.claude_api.postprocessors import gsm8k_postprocess, humaneval_postprocess, lcsts_postprocess, mbpp_postprocess, strategyqa_pred_postprocess
+from opencompass.models.claude_api.postprocessors import (
+    flores_postprocess, gsm8k_postprocess, humaneval_postprocess,
+    lcsts_postprocess, mbpp_postprocess, strategyqa_pred_postprocess)
 from opencompass.utils.text_postprocessors import last_option_postprocess

 agieval_single_choice_sets = [
     'gaokao-chinese',
@@ -47,6 +49,8 @@ claude_postprocessors = {
     'lcsts': dict(type=lcsts_postprocess),
     'mbpp': dict(type=mbpp_postprocess),
     'strategyqa': dict(type=strategyqa_pred_postprocess),
+    'commonsense_qa': dict(type=last_option_postprocess, options='ABCDE'),
+    'flores_100_*-zho_simpl': dict(type=flores_postprocess),
 }

 for _name in agieval_multiple_choices_sets + agieval_single_choice_sets:

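The new `commonsense_qa` entry routes Claude's free-form replies through `last_option_postprocess` with `options='ABCDE'`, i.e. the last option letter mentioned in the reply is taken as the prediction. A rough sketch of that idea (the real helper lives in `opencompass.utils.text_postprocessors`; this is not its actual code):

```python
import re

# Sketch of the idea behind last_option_postprocess, not the real implementation.
def last_option_sketch(text: str, options: str = 'ABCDE') -> str:
    matches = re.findall(f'[{options}]', text)
    return matches[-1] if matches else ''

# e.g. last_option_sketch('It might be B, but the best answer is D.') -> 'D'
```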
@@ -5,7 +5,7 @@

 The program entry for the evaluation task is `run.py`. The usage is as follows:

 ```shell
-python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run]
+python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] [--dump-eval-details]
 ```

 Task Configuration (`$EXP`):

@@ -66,6 +66,7 @@ The parameter explanation is as follows:
 - `-w`: Specify the working path, default is `./outputs/default`.
 - `-l`: Enable status reporting via Lark bot.
 - `--dry-run`: When enabled, inference and evaluation tasks will be dispatched but won't actually run for debugging.
+- `--dump-eval-details`: When enabled, the evaluation results under the `results` folder will include more details, such as the correctness of each sample.

 Using run mode `-m all` as an example, the overall execution flow is as follows:

@@ -5,7 +5,7 @@

 评测任务的程序入口为 `run.py`,使用方法如下:

 ```shell
-python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run]
+python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l] [--dry-run] [--dump-eval-details]
 ```

 任务配置 (`$EXP`):

@@ -66,6 +66,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb
 - `-w`: 指定工作路径,默认为 `./outputs/default`
 - `-l`: 打开飞书机器人状态上报。
 - `--dry-run`: 开启时,推理和评测任务仅会分发但不会真正运行,便于调试;
+- `--dump-eval-details`: 开启时,`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。

 以运行模式 `-m all` 为例,整体运行流如下:

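With the evaluator changes later in this commit, the extra information dumped by `--dump-eval-details` is a per-sample `details` list; each entry records the post-processed prediction, the reference answer, and whether they matched, roughly:

```python
# Illustrative shape of one details entry (field names come from the
# evaluators below; the exact on-disk layout under results/ may differ).
detail = {'pred': 'B', 'answer': 'B', 'correct': True}
```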
@@ -13,7 +13,7 @@ class AFQMCDataset_V2(BaseDataset):
     @staticmethod
     def load(path):
         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 line['label'] = 'AB'[int(line['label'])]

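The `encoding='utf-8'` additions here and in the loader diffs that follow matter because `open()` otherwise falls back to the platform's locale encoding (`locale.getpreferredencoding(False)`), which is not UTF-8 on many Windows setups and can raise `UnicodeDecodeError` on the Chinese datasets. A minimal sketch, with a hypothetical file path:

```python
import json

path = 'data/afqmc/dev.json'  # hypothetical path to a Chinese JSONL file

# Without an explicit encoding this may decode with cp1252/GBK and fail;
# passing encoding='utf-8' keeps the loaders portable across platforms.
with open(path, 'r', encoding='utf-8') as f:
    first_record = json.loads(next(f))
```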
@@ -64,9 +64,36 @@ class AGIEvalEvaluator(BaseEvaluator):

     def score(self, predictions, references):
         predictions = [parse_math_answer('', pred) for pred in predictions]
+        details = []
         cnt = 0
         for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
             if is_equiv(pred, ref):
                 cnt += 1
+                detail['correct'] = True
+            details.append(detail)
         score = cnt / len(predictions) * 100
-        return {'score': score}
+        return {'score': score, 'details': details}
+
+
+@ICL_EVALUATORS.register_module()
+class AGIEvalEvaluator_mcq(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        details = []
+        cnt = 0
+        for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
+            if pred == ref:
+                cnt += 1
+                detail['correct'] = True
+            details.append(detail)
+
+        score = cnt / len(predictions) * 100
+
+        return {'score': score, 'details': details}
@@ -61,11 +61,38 @@ class BBHEvaluator(BaseEvaluator):

         predictions = [bbh_freeform_postprocess(pred) for pred in predictions]

+        details = []
         cnt = 0
         for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
             if pred == ref:
                 cnt += 1
+                detail['correct'] = True
+            details.append(detail)

         score = cnt / len(predictions) * 100

-        return {'score': score}
+        return {'score': score, 'details': details}
+
+
+@ICL_EVALUATORS.register_module()
+class BBHEvaluator_mcq(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        details = []
+        cnt = 0
+        for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
+            if pred == ref:
+                cnt += 1
+                detail['correct'] = True
+            details.append(detail)
+
+        score = cnt / len(predictions) * 100
+
+        return {'score': score, 'details': details}
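
A quick illustration of how the new exact-match MCQ evaluator behaves on already post-processed option strings (this `details` list is what `--dump-eval-details` surfaces); in a real run the evaluator is built from the config rather than instantiated by hand:

```python
# Illustrative call; inputs are already post-processed (e.g. by bbh_mcq_postprocess).
evaluator = BBHEvaluator_mcq()
result = evaluator.score(predictions=['(A)', '(C)'], references=['(A)', '(B)'])
# result == {'score': 50.0, 'details': [
#     {'pred': '(A)', 'answer': '(A)', 'correct': True},
#     {'pred': '(C)', 'answer': '(B)', 'correct': False}]}
```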
@@ -13,7 +13,7 @@ class bustumDataset_V2(BaseDataset):
     @staticmethod
     def load(path):
         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 line['label'] = 'AB'[int(line['label'])]

@@ -13,7 +13,7 @@ class C3Dataset(BaseDataset):

     @staticmethod
     def load(path: str):
-        with open(path) as f:
+        with open(path, 'r', encoding='utf-8') as f:
             data = json.load(f)
             rows = []
             for _, row in enumerate(data):
@@ -58,7 +58,7 @@ class C3Dataset_V2(BaseDataset):

     @staticmethod
     def load(path: str):
-        with open(path) as f:
+        with open(path, 'r', encoding='utf-8') as f:
             raw = json.load(f)
             data = []
             for line in raw:

@@ -15,7 +15,8 @@ class CEvalDataset(BaseDataset):
     def load(path: str, name: str):
         dataset = {}
         for split in ['dev', 'val', 'test']:
-            with open(osp.join(path, split, f'{name}_{split}.csv')) as f:
+            filename = osp.join(path, split, f'{name}_{split}.csv')
+            with open(filename, encoding='utf-8') as f:
                 reader = csv.reader(f)
                 header = next(reader)
                 for row in reader:

@@ -31,7 +31,7 @@ class CHIDDataset_V2(BaseDataset):
     @staticmethod
     def load(path):
         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 item = {}

@@ -41,7 +41,7 @@ class CluewscDataset_V2(BaseDataset):
     @staticmethod
     def load(path):
         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 item = {