Commit abd17276 authored by Baber's avatar Baber
Browse files

Merge branch 'smolrefact' into tasklist

# Conflicts:
#	lm_eval/__main__.py
#	lm_eval/api/group.py
#	lm_eval/api/task.py
#	lm_eval/evaluator_utils.py
#	lm_eval/tasks/__init__.py
#	lm_eval/utils.py
#	pyproject.toml
parents 00afd536 70314843
dataset_name: lm_syneval__reflexives__simple_reflexives__sing_MS_ANPHR
include: _template_yaml
task: lm_syneval__reflexives__simple_reflexives__sing_MS_ANPHR
group: lm_syneval
task:
- group: lm_syneval__reflexives
task:
- group: lm_syneval__reflexives__simple_reflexives
task:
- lm_syneval__reflexives__simple_reflexives__sing_MS_ANPHR
- lm_syneval__reflexives__simple_reflexives__plur_MS_ANPHR
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__reflexives__reflexive_sent_comp
task:
- lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_sing_BS
- lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_plur_BS
- lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_sing_BS
- lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_plur_BS
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__reflexives__reflexives_across
task:
- lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_sing_ES_EV
- lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_plur_ES_EV
- lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_sing_ES_EV
- lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_plur_ES_EV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement
task:
- group: lm_syneval__agreement__obj_rel_within_inanim
task:
- lm_syneval__agreement__obj_rel_within_inanim__sing_ES_EV_sing_IS_IV
- lm_syneval__agreement__obj_rel_within_inanim__sing_ES_EV_plur_IS_IV
- lm_syneval__agreement__obj_rel_within_inanim__plur_ES_EV_sing_IS_IV
- lm_syneval__agreement__obj_rel_within_inanim__plur_ES_EV_plur_IS_IV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__vp_coord
task:
- lm_syneval__agreement__vp_coord__sing_MS_MV_MV
- lm_syneval__agreement__vp_coord__plur_MS_MV_MV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__sent_comp
task:
- lm_syneval__agreement__sent_comp__sing_MS_MV_sing_BS
- lm_syneval__agreement__sent_comp__sing_MS_MV_plur_BS
- lm_syneval__agreement__sent_comp__plur_MS_MV_sing_BS
- lm_syneval__agreement__sent_comp__plur_MS_MV_plur_BS
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__obj_rel_no_comp_within_inanim
task:
- lm_syneval__agreement__obj_rel_no_comp_within_inanim__sing_ES_EV_sing_IS_IV
- lm_syneval__agreement__obj_rel_no_comp_within_inanim__sing_ES_EV_plur_IS_IV
- lm_syneval__agreement__obj_rel_no_comp_within_inanim__plur_ES_EV_sing_IS_IV
- lm_syneval__agreement__obj_rel_no_comp_within_inanim__plur_ES_EV_plur_IS_IV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__obj_rel_within_anim
task:
- lm_syneval__agreement__obj_rel_within_anim__sing_ES_EV_sing_MS_MV
- lm_syneval__agreement__obj_rel_within_anim__sing_ES_EV_plur_MS_MV
- lm_syneval__agreement__obj_rel_within_anim__plur_ES_EV_sing_MS_MV
- lm_syneval__agreement__obj_rel_within_anim__plur_ES_EV_plur_MS_MV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__subj_rel
task:
- lm_syneval__agreement__subj_rel__sing_MS_EV_MV_sing_ES
- lm_syneval__agreement__subj_rel__sing_MS_EV_MV_plur_ES
- lm_syneval__agreement__subj_rel__plur_MS_EV_MV_sing_ES
- lm_syneval__agreement__subj_rel__plur_MS_EV_MV_plur_ES
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__prep_inanim
task:
- lm_syneval__agreement__prep_inanim__sing_IS_IV_sing_ES
- lm_syneval__agreement__prep_inanim__sing_IS_IV_plur_ES
- lm_syneval__agreement__prep_inanim__plur_IS_IV_sing_ES
- lm_syneval__agreement__prep_inanim__plur_IS_IV_plur_ES
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__long_vp_coord
task:
- lm_syneval__agreement__long_vp_coord__sing_MS_LMV_LMV
- lm_syneval__agreement__long_vp_coord__plur_MS_LMV_LMV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__obj_rel_across_anim
task:
- lm_syneval__agreement__obj_rel_across_anim__sing_MS_MV_sing_ES_EV
- lm_syneval__agreement__obj_rel_across_anim__sing_MS_MV_plur_ES_EV
- lm_syneval__agreement__obj_rel_across_anim__plur_MS_MV_sing_ES_EV
- lm_syneval__agreement__obj_rel_across_anim__plur_MS_MV_plur_ES_EV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__obj_rel_across_inanim
task:
- lm_syneval__agreement__obj_rel_across_inanim__sing_IS_IV_sing_ES_EV
- lm_syneval__agreement__obj_rel_across_inanim__sing_IS_IV_plur_ES_EV
- lm_syneval__agreement__obj_rel_across_inanim__plur_IS_IV_sing_ES_EV
- lm_syneval__agreement__obj_rel_across_inanim__plur_IS_IV_plur_ES_EV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__obj_rel_no_comp_across_anim
task:
- lm_syneval__agreement__obj_rel_no_comp_across_anim__sing_MS_MV_sing_ES_EV
- lm_syneval__agreement__obj_rel_no_comp_across_anim__sing_MS_MV_plur_ES_EV
- lm_syneval__agreement__obj_rel_no_comp_across_anim__plur_MS_MV_sing_ES_EV
- lm_syneval__agreement__obj_rel_no_comp_across_anim__plur_MS_MV_plur_ES_EV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__obj_rel_no_comp_across_inanim
task:
- lm_syneval__agreement__obj_rel_no_comp_across_inanim__sing_IS_IV_sing_ES_EV
- lm_syneval__agreement__obj_rel_no_comp_across_inanim__sing_IS_IV_plur_ES_EV
- lm_syneval__agreement__obj_rel_no_comp_across_inanim__plur_IS_IV_sing_ES_EV
- lm_syneval__agreement__obj_rel_no_comp_across_inanim__plur_IS_IV_plur_ES_EV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__simple_agrmt
task:
- lm_syneval__agreement__simple_agrmt__sing_MS_MV
- lm_syneval__agreement__simple_agrmt__plur_MS_MV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__prep_anim
task:
- lm_syneval__agreement__prep_anim__sing_MS_MV_sing_ES
- lm_syneval__agreement__prep_anim__sing_MS_MV_plur_ES
- lm_syneval__agreement__prep_anim__plur_MS_MV_sing_ES
- lm_syneval__agreement__prep_anim__plur_MS_MV_plur_ES
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__obj_rel_no_comp_within_anim
task:
- lm_syneval__agreement__obj_rel_no_comp_within_anim__sing_ES_EV_sing_MS_MV
- lm_syneval__agreement__obj_rel_no_comp_within_anim__sing_ES_EV_plur_MS_MV
- lm_syneval__agreement__obj_rel_no_comp_within_anim__plur_ES_EV_sing_MS_MV
- lm_syneval__agreement__obj_rel_no_comp_within_anim__plur_ES_EV_plur_MS_MV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__npi
task:
- group: lm_syneval__npi__npi_across_anim
task:
- lm_syneval__npi__npi_across_anim__past
- lm_syneval__npi__npi_across_anim__future
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__npi__npi_across_inanim
task:
- lm_syneval__npi__npi_across_inanim__past
- lm_syneval__npi__npi_across_inanim__future
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__npi__simple_npi_anim
task:
- lm_syneval__npi__simple_npi_anim__past
- lm_syneval__npi__simple_npi_anim__future
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__npi__simple_npi_inanim
task:
- lm_syneval__npi__simple_npi_inanim__past
- lm_syneval__npi__simple_npi_inanim__future
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
......@@ -5,17 +5,17 @@ task: longbench_2wikimqa
dataset_path: THUDM/LongBench
test_split: test
dataset_name: 2wikimqa
doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_text: "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_score
generation_kwargs:
max_gen_toks: 32
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_2wikimqa_e
dataset_path: THUDM/LongBench
test_split: test
dataset_name: 2wikimqa_e
doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_text: "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_score
generation_kwargs:
max_gen_toks: 32
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -101,4 +101,7 @@ If other tasks on this dataset are already supported:
### Changelog
v2: fix doc_to_target; add vcsum
v3: properly use all answers for metric calculation; trim whitespace from resps; fix stop sequences not parsing correctly.
v4: fixed special characters in prompts; use greedy decoding by default.
......@@ -149,7 +149,7 @@ task: {{ task }}
dataset_path: {{ dataset_path }}
test_split: {{ test_split }}
dataset_name: {{ dataset_name }}
doc_to_text: '{{ doc_to_text }}'
doc_to_text: "{{ doc_to_text }}"
doc_to_target: '{{ doc_to_target }}'
process_results: {{ process_results }}
generation_kwargs:
......@@ -180,13 +180,14 @@ if __name__ == "__main__":
generation_kwargs = {
"max_gen_toks": dataset2maxlen[df],
"temperature": 1,
"do_sample": True,
"do_sample": False,
# We'll handle the until value directly in the template
}
raw_doc_to_text = (
dataset2prompt[df]
.replace("\n", "\\n")
.replace('"', '\\"')
.replace("{", "{{")
.replace("}", "}}")
)
......@@ -210,7 +211,7 @@ if __name__ == "__main__":
"generation_kwargs": generation_kwargs,
"has_newline": has_newline, # Add the flag to the template context
"metric_list": metric_list,
"metadata": {"version": "3.0"},
"metadata": {"version": "4.0"},
}
# Render template
......
......@@ -5,17 +5,17 @@ task: longbench_dureader
dataset_path: THUDM/LongBench
test_split: test
dataset_name: dureader
doc_to_text: '请基于给定的文章回答下述问题。\n\n文章:{{context}}\n\n请基于上述文章回答下面的问题。\n\n问题:{{input}}\n回答:'
doc_to_text: "请基于给定的文章回答下述问题。\n\n文章:{{context}}\n\n请基于上述文章回答下面的问题。\n\n问题:{{input}}\n回答:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_rouge_zh_score
generation_kwargs:
max_gen_toks: 128
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "rouge_zh_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_gov_report
dataset_path: THUDM/LongBench
test_split: test
dataset_name: gov_report
doc_to_text: 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{{context}}\n\nNow, write a one-page summary of the report.\n\nSummary:'
doc_to_text: "You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{{context}}\n\nNow, write a one-page summary of the report.\n\nSummary:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_rouge_score
generation_kwargs:
max_gen_toks: 512
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "rouge_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_gov_report_e
dataset_path: THUDM/LongBench
test_split: test
dataset_name: gov_report_e
doc_to_text: 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{{context}}\n\nNow, write a one-page summary of the report.\n\nSummary:'
doc_to_text: "You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{{context}}\n\nNow, write a one-page summary of the report.\n\nSummary:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_rouge_score
generation_kwargs:
max_gen_toks: 512
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "rouge_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_hotpotqa
dataset_path: THUDM/LongBench
test_split: test
dataset_name: hotpotqa
doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_text: "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_score
generation_kwargs:
max_gen_toks: 32
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_hotpotqa_e
dataset_path: THUDM/LongBench
test_split: test
dataset_name: hotpotqa_e
doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_text: "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_score
generation_kwargs:
max_gen_toks: 32
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_lcc
dataset_path: THUDM/LongBench
test_split: test
dataset_name: lcc
doc_to_text: 'Please complete the code given below. \n{{context}}Next line of code:\n'
doc_to_text: "Please complete the code given below. \n{{context}}Next line of code:\n"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_code_sim_score
generation_kwargs:
max_gen_toks: 64
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "code_sim_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_lcc_e
dataset_path: THUDM/LongBench
test_split: test
dataset_name: lcc_e
doc_to_text: 'Please complete the code given below. \n{{context}}Next line of code:\n'
doc_to_text: "Please complete the code given below. \n{{context}}Next line of code:\n"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_code_sim_score
generation_kwargs:
max_gen_toks: 64
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "code_sim_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_lsht
dataset_path: THUDM/LongBench
test_split: test
dataset_name: lsht
doc_to_text: '请判断给定新闻的类别,下面是一些例子。\n\n{{context}}\n{{input}}'
doc_to_text: "请判断给定新闻的类别,下面是一些例子。\n\n{{context}}\n{{input}}"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_classification_score
generation_kwargs:
max_gen_toks: 64
temperature: 1
do_sample: True
do_sample: False
until: ["\n"]
metric_list:
- metric: "classification_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_multi_news
dataset_path: THUDM/LongBench
test_split: test
dataset_name: multi_news
doc_to_text: 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{{context}}\n\nNow, write a one-page summary of all the news.\n\nSummary:'
doc_to_text: "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{{context}}\n\nNow, write a one-page summary of all the news.\n\nSummary:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_rouge_score
generation_kwargs:
max_gen_toks: 512
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "rouge_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_multi_news_e
dataset_path: THUDM/LongBench
test_split: test
dataset_name: multi_news_e
doc_to_text: 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{{context}}\n\nNow, write a one-page summary of all the news.\n\nSummary:'
doc_to_text: "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{{context}}\n\nNow, write a one-page summary of all the news.\n\nSummary:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_rouge_score
generation_kwargs:
max_gen_toks: 512
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "rouge_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_multifieldqa_en
dataset_path: THUDM/LongBench
test_split: test
dataset_name: multifieldqa_en
doc_to_text: 'Read the following text and answer briefly.\n\n{{context}}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_text: "Read the following text and answer briefly.\n\n{{context}}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_score
generation_kwargs:
max_gen_toks: 64
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_multifieldqa_en_e
dataset_path: THUDM/LongBench
test_split: test
dataset_name: multifieldqa_en_e
doc_to_text: 'Read the following text and answer briefly.\n\n{{context}}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_text: "Read the following text and answer briefly.\n\n{{context}}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_score
generation_kwargs:
max_gen_toks: 64
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_multifieldqa_zh
dataset_path: THUDM/LongBench
test_split: test
dataset_name: multifieldqa_zh
doc_to_text: '阅读以下文字并用中文简短回答:\n\n{{context}}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{{input}}\n回答:'
doc_to_text: "阅读以下文字并用中文简短回答:\n\n{{context}}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{{input}}\n回答:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_zh_score
generation_kwargs:
max_gen_toks: 64
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_zh_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_musique
dataset_path: THUDM/LongBench
test_split: test
dataset_name: musique
doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_text: "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_score
generation_kwargs:
max_gen_toks: 32
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment