Unverified commit aa2dd2b5 authored by Fengzhe Zhou, committed by GitHub

[Format] Add config lints (#892)

parent 3dbba119
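
Every hunk in this diff is mechanical: the lint rewrites double-quoted string literals in the dataset configs to single quotes (and trims trailing whitespace in a few Markdown lines) without changing string contents. The commit itself does not say which tool enforces the rule, so the snippet below is only a minimal sketch of the same rewrite using Python's `tokenize` module; the `normalize_quotes` helper is hypothetical and not part of OpenCompass.

```python
# Hypothetical sketch: rewrite plain "..." string literals to '...' with the
# tokenizer, the same normalisation this commit applies to the config files.
import io
import tokenize


def normalize_quotes(source: str) -> str:
    """Return `source` with double-quoted literals rewritten to single quotes.

    Prefixed (f/r/b) and triple-quoted strings are left alone, as are strings
    that already contain a single quote, so runtime behaviour never changes.
    Because '"' and "'" have the same length, token positions stay valid and
    untokenize() reproduces the original spacing exactly.
    """
    tokens = []
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        text = tok.string
        if (tok.type == tokenize.STRING
                and text.startswith('"')
                and not text.startswith('"""')
                and "'" not in text):
            tok = tok._replace(string="'" + text[1:-1] + "'")
        tokens.append(tok)
    return tokenize.untokenize(tokens)


print(normalize_quotes('data_path ="data/subjective/compass_arena"\n'), end='')
# data_path ='data/subjective/compass_arena'
```

In practice a pre-commit hook such as `double-quote-string-fixer` (or a flake8-quotes check) applies the same normalisation repository-wide, which is presumably how the hunks below were generated.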
......@@ -9,7 +9,7 @@ subjective_reader_cfg = dict(
output_column='judge',
)
data_path ="data/subjective/compass_arena"
data_path ='data/subjective/compass_arena'
subjective_datasets = []
......@@ -99,7 +99,7 @@ creation_prompt = """
{question}
""" + base_prompt
sub_map = {"language": language_prompt, "knowledge": knowledge_prompt, "reason_v2": reason_prompt, "math_v2": math_prompt, "creationv2_zh": creation_prompt}
sub_map = {'language': language_prompt, 'knowledge': knowledge_prompt, 'reason_v2': reason_prompt, 'math_v2': math_prompt, 'creationv2_zh': creation_prompt}
for _name, _prompt in sub_map.items():
subjective_infer_cfg = dict(
......@@ -108,7 +108,7 @@ for _name, _prompt in sub_map.items():
template=dict(round=[
dict(
role='HUMAN',
prompt="{question}"
prompt='{question}'
),
]),
),
......@@ -129,12 +129,12 @@ for _name, _prompt in sub_map.items():
]),
),
),
pred_role="BOT",
pred_role='BOT',
)
subjective_datasets.append(
dict(
abbr=f"{_name}",
abbr=f'{_name}',
type=CompassArenaDataset,
path=data_path,
name=_name,
......
......@@ -9,7 +9,7 @@ subjective_reader_cfg = dict(
output_column='judge',
)
data_path ="data/subjective/compass_arena"
data_path ='data/subjective/compass_arena'
subjective_datasets = []
......@@ -99,7 +99,7 @@ creation_prompt = """
{question}
""" + base_prompt
sub_map = {"creationv3": creation_prompt}
sub_map = {'creationv3': creation_prompt}
for _name, _prompt in sub_map.items():
subjective_infer_cfg = dict(
......@@ -108,7 +108,7 @@ for _name, _prompt in sub_map.items():
template=dict(round=[
dict(
role='HUMAN',
prompt="{question}"
prompt='{question}'
),
]),
),
......@@ -130,12 +130,12 @@ for _name, _prompt in sub_map.items():
]),
),
),
pred_role="BOT",
pred_role='BOT',
)
subjective_datasets.append(
dict(
abbr=f"{_name}",
abbr=f'{_name}',
type=CompassArenaDataset,
path=data_path,
name=_name,
......
......@@ -9,7 +9,7 @@ subjective_reader_cfg = dict(
output_column='judge',
)
data_path ="data/subjective/compass_arena"
data_path ='data/subjective/compass_arena'
subjective_datasets = []
......@@ -91,7 +91,7 @@ reason_prompt = math_prompt
creation_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 好的回答必须首先符合用户问题里的各种需求,不能跑题
1. 好的回答必须首先符合用户问题里的各种需求,不能跑题
2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
......@@ -99,7 +99,7 @@ creation_prompt = """
{question}
""" + base_prompt
sub_map = {"knowledge": knowledge_prompt, "language": language_prompt, "math_v2": math_prompt, "reason_v2": reason_prompt, "creationv2_zh": creation_prompt}
sub_map = {'knowledge': knowledge_prompt, 'language': language_prompt, 'math_v2': math_prompt, 'reason_v2': reason_prompt, 'creationv2_zh': creation_prompt}
meta_prompt = """
\n你是一个评判专家,请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。\n评分要求(重要性依次递减):\n1. 好的回答必须首先符合用户问题里的各种需求,不能跑题 \n2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答\n3. 好的回答必须具有创造性的词语和表达丰富度\n\n[用户问题]\n{question}\n[回答1开始]\n{prediction}\n[回答1结束]\n[回答2开始]\n{prediction2}\n[回答2结束]\n此外,还有两个其他评判专家的评判意见供你参考。\n[评判意见1]\n{judgement}\n[评判意见2]\n{judgement2}\n\n最终请你综合其他评判专家的评判意见与你自己的意见,在以下 3 个选项中做出选择:\nA. 回答1更好\nB. 回答2更好\nC. 回答1、2平局\n并提供你的解释原因。\n\n如果你认为回答1更好,你的输出应形如:\n选择:A\n原因:blahblah blahblah\n\n\n如果你认为回答2更好,你的输出应形如:\n选择:B\n原因:blahblah blahblah\n\n\n如果你认为回答1、2打成平手,你的输出应形如:\n选择:C\n原因:blahblah blahblah\n\n
......@@ -111,7 +111,7 @@ for _name, _prompt in sub_map.items():
template=dict(round=[
dict(
role='HUMAN',
prompt="{question}"
prompt='{question}'
),
]),
),
......@@ -141,12 +141,12 @@ for _name, _prompt in sub_map.items():
]),
),
),
pred_role="BOT",
pred_role='BOT',
)
subjective_datasets.append(
dict(
abbr=f"{_name}",
abbr=f'{_name}',
type=CompassArenaDataset,
path=data_path,
name=_name,
......
......@@ -10,9 +10,9 @@ subjective_reader_cfg = dict(
)
subjective_all_sets = [
"creationbench",
'creationbench',
]
data_path ="data/subjective/"
data_path ='data/subjective/'
subjective_datasets = []
......@@ -23,7 +23,7 @@ for _name in subjective_all_sets:
template=dict(round=[
dict(
role='HUMAN',
prompt="{question}"
prompt='{question}'
),
]),
),
......@@ -39,17 +39,17 @@ for _name in subjective_all_sets:
template=dict(round=[
dict(
role='HUMAN',
prompt = "{gpt4_prefix}{prediction}{gpt4_suffix}"
prompt = '{gpt4_prefix}{prediction}{gpt4_suffix}'
),
]),
),
),
pred_role="BOT",
pred_role='BOT',
)
subjective_datasets.append(
dict(
abbr=f"{_name}",
abbr=f'{_name}',
type=CreationBenchDataset,
multi_dimension=True,
path=data_path,
......
......@@ -10,9 +10,9 @@ subjective_reader_cfg = dict(
)
subjective_all_sets = [
"creationv2_zh",
'creationv2_zh',
]
data_path ="data/subjective/"
data_path ='data/subjective/'
subjective_datasets = []
......@@ -23,7 +23,7 @@ for _name in subjective_all_sets:
template=dict(round=[
dict(
role='HUMAN',
prompt="{question}"
prompt='{question}'
),
]),
),
......@@ -39,17 +39,17 @@ for _name in subjective_all_sets:
template=dict(round=[
dict(
role='HUMAN',
prompt = "{score_with_ref_prefix}{prediction}{score_with_ref_suffix}"
prompt = '{score_with_ref_prefix}{prediction}{score_with_ref_suffix}'
),
]),
),
),
pred_role="BOT",
pred_role='BOT',
)
subjective_datasets.append(
dict(
abbr=f"{_name}",
abbr=f'{_name}',
type=CreationBenchDataset,
multi_dimension=True,
path=data_path,
......
......@@ -11,9 +11,9 @@ subjective_reader_cfg = dict(
)
subjective_all_sets = [
"FunctionalMT",
'FunctionalMT',
]
data_path ="data/subjective/"
data_path ='data/subjective/'
subjective_datasets = []
......@@ -36,17 +36,17 @@ for _name in subjective_all_sets:
template=dict(round=[
dict(
role='HUMAN',
prompt = "{gpt4_prefix}{prediction}{gpt4_suffix}"
prompt = '{gpt4_prefix}{prediction}{gpt4_suffix}'
),
]),
),
),
pred_role="BOT",
pred_role='BOT',
)
subjective_datasets.append(
dict(
abbr=f"{_name}",
abbr=f'{_name}',
type=MultiroundDataset,
path=data_path,
name=_name,
......
......@@ -11,9 +11,9 @@ subjective_reader_cfg = dict(
)
subjective_all_sets = [
"mtbench",
'mtbench',
]
data_path ="data/subjective/"
data_path ='data/subjective/'
subjective_datasets = []
......@@ -38,22 +38,22 @@ for _name in subjective_all_sets:
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="{system_prompt}")
prompt='{system_prompt}')
],
round=[
dict(
role='HUMAN',
prompt = "{prompt_template}"
prompt = '{prompt_template}'
),
]),
),
),
pred_role="BOT",
pred_role='BOT',
)
subjective_datasets.append(
dict(
abbr=f"{_name}",
abbr=f'{_name}',
type=MTBenchDataset,
path=data_path,
name=_name,
......
......@@ -11,9 +11,9 @@ subjective_reader_cfg = dict(
)
subjective_all_sets = [
"mtbench",
'mtbench',
]
data_path ="data/subjective/"
data_path ='data/subjective/'
subjective_datasets = []
......@@ -37,22 +37,22 @@ for _name in subjective_all_sets:
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="{system_prompt}")
prompt='{system_prompt}')
],
round=[
dict(
role='HUMAN',
prompt = "{prompt_template}"
prompt = '{prompt_template}'
),
]),
),
),
pred_role="BOT",
pred_role='BOT',
)
subjective_datasets.append(
dict(
abbr=f"{_name}",
abbr=f'{_name}',
type=MTBenchDataset,
path=data_path,
name=_name,
......
......@@ -11,9 +11,9 @@ subjective_reader_cfg = dict(
)
subjective_all_sets = [
"mtbench_0.0","mtbench_0.1","mtbench_0.7"
'mtbench_0.0','mtbench_0.1','mtbench_0.7'
]
data_path ="data/subjective/mtbench"
data_path ='data/subjective/mtbench'
subjective_datasets = []
......@@ -39,22 +39,22 @@ for _name in subjective_all_sets:
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="{system_prompt}")
prompt='{system_prompt}')
],
round=[
dict(
role='HUMAN',
prompt = "{prompt_template}"
prompt = '{prompt_template}'
),
]),
),
),
pred_role="BOT",
pred_role='BOT',
)
subjective_datasets.append(
dict(
abbr=f"{_name}",
abbr=f'{_name}',
type=MTBenchDataset,
path=data_path,
name=_name,
......
......@@ -10,7 +10,7 @@ subjective_reader_cfg = dict(
train_split='test')
subjective_all_sets = [
"creation_v0.1",
'creation_v0.1',
]
subjective_datasets = []
......@@ -22,7 +22,7 @@ for _name in subjective_all_sets:
template=dict(round=[
dict(
role='HUMAN',
prompt="{question}"
prompt='{question}'
),
]),
),
......@@ -39,21 +39,21 @@ for _name in subjective_all_sets:
template=dict(
begin=[
dict(
role="SYSTEM",
fallback_role="HUMAN",
prompt="{prompt}"
role='SYSTEM',
fallback_role='HUMAN',
prompt='{prompt}'
),
],
round=[dict(role="HUMAN",
prompt="回答 1: <回答 1 开始> {prediction} <回答 1 结束>\n回答 2: <回答 2 开始> {prediction2} <回答 2 结束>\n")]))),
pred_role="BOT",
round=[dict(role='HUMAN',
prompt='回答 1: <回答 1 开始> {prediction} <回答 1 结束>\n回答 2: <回答 2 开始> {prediction2} <回答 2 结束>\n')]))),
pred_role='BOT',
)
subjective_datasets.append(
dict(
abbr=f"{_name}",
abbr=f'{_name}',
type=SubjectiveCmpDataset,
path="./data/subjective/",
path='./data/subjective/',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
......
......@@ -12,7 +12,7 @@ subjective_reader_cfg = dict(
)
subjective_all_sets = [
"COREV2_6A_all",
'COREV2_6A_all',
]
......@@ -25,7 +25,7 @@ for _name in subjective_all_sets:
template=dict(round=[
dict(
role='HUMAN',
prompt="{question}"
prompt='{question}'
),
]),
),
......@@ -42,19 +42,19 @@ for _name in subjective_all_sets:
template=dict(round=[
dict(
role='HUMAN',
prompt = "{prefix}问题: <问题开始> {question} <问题结束>\n\n回答 1: <回答 1 开始> {prediction} <回答 1 结束>\n\n回答 2: <回答 2 开始> {prediction2} <回答 2 结束>\n\n{suffix}"
prompt = '{prefix}问题: <问题开始> {question} <问题结束>\n\n回答 1: <回答 1 开始> {prediction} <回答 1 结束>\n\n回答 2: <回答 2 开始> {prediction2} <回答 2 结束>\n\n{suffix}'
),
]),
),
),
pred_role="BOT",
pred_role='BOT',
)
subjective_datasets.append(
dict(
abbr=f"{_name}",
abbr=f'{_name}',
type=Corev2Dataset,
path="./data/subjective/",
path='./data/subjective/',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
......
......@@ -11,7 +11,7 @@ subjective_reader_cfg = dict(
)
subjective_all_sets = [
"creation_v0.1",
'creation_v0.1',
]
subjective_datasets = []
......@@ -23,7 +23,7 @@ for _name in subjective_all_sets:
template=dict(round=[
dict(
role='HUMAN',
prompt="{question}"
prompt='{question}'
),
]),
),
......@@ -39,19 +39,19 @@ for _name in subjective_all_sets:
template=dict(round=[
dict(
role='HUMAN',
prompt = "{prefix}问题: <问题开始> {question} <问题结束>\n\n回答: <回答开始> {prediction} <回答结束>\n\n{suffix}"
prompt = '{prefix}问题: <问题开始> {question} <问题结束>\n\n回答: <回答开始> {prediction} <回答结束>\n\n{suffix}'
),
]),
),
),
pred_role="BOT",
pred_role='BOT',
)
subjective_datasets.append(
dict(
abbr=f"{_name}",
abbr=f'{_name}',
type=Creationv01Dataset,
path="./data/subjective/",
path='./data/subjective/',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
......
......@@ -13,7 +13,7 @@ summedits_infer_cfg = dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN",
role='HUMAN',
prompt=
"""Given the document below, you have to determine if "Yes" or "No", the summary is factually consistent with the document.
......@@ -36,7 +36,7 @@ Answer:"""
summedits_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role="BOT",
pred_role='BOT',
pred_postprocessor=dict(type=first_capital_postprocess),
)
......
......@@ -13,7 +13,7 @@ summedits_infer_cfg = dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN",
role='HUMAN',
prompt=
'Document:\n{doc}Summary:\n{summary}\nQuestion:\nIs the summary factually consistent with the document?\nA. Yes\nB. No\nAnswer:'
),
......@@ -23,7 +23,7 @@ summedits_infer_cfg = dict(
summedits_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role="BOT",
pred_role='BOT',
pred_postprocessor=dict(type=first_capital_postprocess),
)
......
......@@ -16,20 +16,20 @@ summedits_infer_cfg = dict(
0:
dict(round=[
dict(
role="HUMAN",
role='HUMAN',
prompt=
"""\nDocument:\n{doc}\nSummary:\n{summary}\nIs the summary factually consistent with the document? """
),
dict(role="BOT", prompt="No")
dict(role='BOT', prompt='No')
]),
1:
dict(round=[
dict(
role="HUMAN",
role='HUMAN',
prompt=
"""Document:\n{doc}\nSummary:\n{summary}\nIs the summary factually consistent with the document? """
),
dict(role="BOT", prompt="Yes")
dict(role='BOT', prompt='Yes')
]),
}),
retriever=dict(type=ZeroRetriever),
......
......@@ -10,7 +10,7 @@ summedits_reader_cfg = dict(
test_split='train')
summedits_prompt1 = "Given the document below, you have to determine if 'Yes' or 'No', the summary is factually consistent with the document."
summedits_prompt2 = "Document:\n{doc}\nSummary:\n{summary}\nIs the summary factually consistent with the document? "
summedits_prompt2 = 'Document:\n{doc}\nSummary:\n{summary}\nIs the summary factually consistent with the document? '
summedits_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
......@@ -24,8 +24,8 @@ summedits_infer_cfg = dict(
prompt=summedits_prompt1)
],
round=[
dict(role="HUMAN", prompt=summedits_prompt2),
dict(role="BOT", prompt="No")
dict(role='HUMAN', prompt=summedits_prompt2),
dict(role='BOT', prompt='No')
]),
1:
dict(
......@@ -36,8 +36,8 @@ summedits_infer_cfg = dict(
prompt=summedits_prompt1)
],
round=[
dict(role="HUMAN", prompt=summedits_prompt2),
dict(role="BOT", prompt="Yes")
dict(role='HUMAN', prompt=summedits_prompt2),
dict(role='BOT', prompt='Yes')
]),
}),
retriever=dict(type=ZeroRetriever),
......
......@@ -21,8 +21,8 @@ summedits_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
0: f"{summedits_prompt}Answer: No.",
1: f"{summedits_prompt}Answer: Yes."
0: f'{summedits_prompt}Answer: No.',
1: f'{summedits_prompt}Answer: Yes.'
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
......
......@@ -18,7 +18,7 @@ summscreen_infer_cfg = dict(
begin=[
dict(
role='SYSTEM',
fallback_role="HUMAN",
fallback_role='HUMAN',
prompt=
'Please summarize the following English play script in English:'
),
......
......@@ -15,7 +15,7 @@ summscreen_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=
"Please summarize the following English report in English:{content}\n{summary}."),
'Please summarize the following English report in English:{content}\n{summary}.'),
retriever=dict(type=ZeroRetriever),
inferencer=dict(
type=GenInferencer, batch_size=4, max_out_len=500, max_seq_len=8192))
......
......@@ -32,9 +32,9 @@ taco_skills = load_dataset('BAAI/TACO', skills=['Sorting', 'Range queries'], tok
```
## Evaluation results
| dataset | metric | CodeLlama-7b-Python | internlm2-chat-1.8b-sft-hf | internlm2-chat-7b-sft-hf | internlm2-chat-20b-sft-hf |
| dataset | metric | CodeLlama-7b-Python | internlm2-chat-1.8b-sft-hf | internlm2-chat-7b-sft-hf | internlm2-chat-20b-sft-hf |
|-----------------------|----------|-------------|-------------|-------------|-------------|
| TACO | pass@1 | 0.7 | 0.7 | 1.7 | 2.7 |
| TACO | pass@1 | 0.7 | 0.7 | 1.7 | 2.7 |
Please refer to [repo](https://github.com/FlagOpen/TACO/tree/main?tab=readme-ov-file) for original results if needed.
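
The pass@1 figures in the table above are conventionally computed with the unbiased pass@k estimator from the Codex paper (Chen et al., 2021); this excerpt does not show how OpenCompass scores TACO, so the following is only a reference sketch of that formula, not the repository's scorer.

```python
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: n generated samples per problem, c of them passed."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


# For k=1 this reduces to c/n, the fraction of passing samples per problem;
# averaging over problems (and reporting as a percentage) gives table entries
# such as 0.7 or 2.7.
print(pass_at_k(n=10, c=2, k=1))  # 0.2
```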
......@@ -47,4 +47,4 @@ Please refer to [repo](https://github.com/FlagOpen/TACO/tree/main?tab=readme-ov-
journal={arXiv preprint arXiv:2312.14852},
year={2023}
}
```
\ No newline at end of file
```