Commit 03be40e2 authored by lintangsutawika
Browse files

update

parent e795efcf
......@@ -26,6 +26,10 @@ Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard
#### Groups
- `bbh_flan_zeroshot`
- `bbh_flan_fewshot`
- `bbh_flan_cot_fewshot`
- `bbh_flan_cot_zeroshot`
#### Tasks
......
......@@ -7,8 +7,8 @@ metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
# ignore_case: true
# ignore_punctuation: true
generation_kwargs:
until:
- "</s>"
......
......@@ -7,8 +7,8 @@ metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
# ignore_case: true
# ignore_punctuation: true
generation_kwargs:
until:
- "</s>"
......
......@@ -7,8 +7,8 @@ metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
# ignore_case: true
# ignore_punctuation: true
generation_kwargs:
until:
- "</s>"
......
......@@ -7,8 +7,8 @@ metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
# ignore_case: true
# ignore_punctuation: true
generation_kwargs:
until:
- "</s>"
......
......@@ -92,7 +92,6 @@ if __name__ == "__main__":
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
base_yaml = yaml.full_load(f)
print(base_yaml)
if args.cot_prompt_path is not None:
import json
......@@ -115,4 +114,4 @@ if __name__ == "__main__":
file_save_path = args.save_prefix_path + f"_{subject}.yaml"
eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(yaml_dict, yaml_file, width=float("inf"))
yaml.dump(yaml_dict, yaml_file, width=float("inf"), allow_unicode=True, default_style='"')
# MMLU (original prompt format) subtask: abstract algebra, scored as multiple choice.
group:
  - mmlu
  - mmlu_original
  - multiple_choice
task: mmlu_original_abstract_algebra
dataset_path: cais/mmlu
dataset_name: abstract_algebra
output_type: multiple_choice
validation_split: validation
test_split: test
description: "The following are multiple choice questions (with answers) about abstract algebra.\n\n"
# Prompt: the question, the four lettered options, then an "Answer:" cue.
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{answer}}"
# Each metric is one mapping; its options must be nested under the same list item.
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
......@@ -2,24 +2,23 @@ group: mmlu_flan_cot_fewshot
dataset_path: cais/mmlu
validation_split: validation
fewshot_split: dev
doc_to_text: "\n\nQ: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
fewshot_delimiter: ""
output_type: greedy_until
doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "(?<=The answer is )(.*)(?=.)"
- function: "take_first"
generation_kwargs:
until:
- "</s>"
do_sample: false
temperature: 0.0
filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "(?<=The answer is )(.*)(.)"
- function: "take_first"
\ No newline at end of file
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
\ No newline at end of file
......@@ -2,24 +2,23 @@ group: mmlu_flan_cot_zeroshot
dataset_path: cais/mmlu
validation_split: validation
fewshot_split: dev
doc_to_text: "\n\nQ: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
output_type: greedy_until
fewshot_delimiter: ""
doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "(?<=The answer is )(.*)(?=.)"
- function: "take_first"
generation_kwargs:
until:
- "</s>"
do_sample: false
temperature: 0.0
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
generation_kwargs:
until:
- "</s>"
do_sample: false
temperature: 0.0
filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "(?<=The answer is )(.*)(.)"
- function: "take_first"
\ No newline at end of file
......@@ -2,19 +2,13 @@ group: mmlu_flan_n_shot_generative
dataset_path: cais/mmlu
test_split: test
fewshot_split: dev
# doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
output_type: greedy_until
# doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
doc_to_target: "{{['A', 'B', 'C', 'D'][answer]}}"
doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
generation_kwargs:
until:
- "</s>"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
generation_kwargs:
until:
- "</s>"
# do_sample: false
# temperature: 0.0
\ No newline at end of file
group: mmlu_flan_n_shot_loglikelihood
dataset_path: cais/mmlu
# validation_split: validation
test_split: test
fewshot_split: dev
output_type: multiple_choice
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
doc_to_choice: ["(A)", "(B)", "(C)", "(D)"]
doc_to_target: answer
metric_list:
- metric: acc
......
# MMLU FLAN n-shot subtask: abstract algebra.
dataset_name: abstract_algebra
# Double-quoted so the trailing blank line ("\n\n") after the preamble is explicit.
description: "The following are multiple choice questions (with answers) about abstract algebra.\n\n"
# Remaining settings are pulled in from the included template file.
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_abstract_algebra
# MMLU FLAN n-shot subtask: anatomy.
dataset_name: anatomy
# Double-quoted so the trailing blank line ("\n\n") after the preamble is explicit.
description: "The following are multiple choice questions (with answers) about anatomy.\n\n"
# Remaining settings are pulled in from the included template file.
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_anatomy
# MMLU FLAN n-shot subtask: astronomy.
dataset_name: astronomy
# Double-quoted so the trailing blank line ("\n\n") after the preamble is explicit.
description: "The following are multiple choice questions (with answers) about astronomy.\n\n"
# Remaining settings are pulled in from the included template file.
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_astronomy
# MMLU FLAN n-shot subtask: business ethics.
dataset_name: business_ethics
# Double-quoted so the trailing blank line ("\n\n") after the preamble is explicit.
description: "The following are multiple choice questions (with answers) about business ethics.\n\n"
# Remaining settings are pulled in from the included template file.
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_business_ethics
# MMLU FLAN n-shot subtask: clinical knowledge.
dataset_name: clinical_knowledge
# Double-quoted so the trailing blank line ("\n\n") after the preamble is explicit.
description: "The following are multiple choice questions (with answers) about clinical knowledge.\n\n"
# Remaining settings are pulled in from the included template file.
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_clinical_knowledge
# MMLU FLAN n-shot subtask: college biology.
dataset_name: college_biology
# Double-quoted so the trailing blank line ("\n\n") after the preamble is explicit.
description: "The following are multiple choice questions (with answers) about college biology.\n\n"
# Remaining settings are pulled in from the included template file.
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_college_biology
# MMLU FLAN n-shot subtask: college chemistry.
dataset_name: college_chemistry
# Double-quoted so the trailing blank line ("\n\n") after the preamble is explicit.
description: "The following are multiple choice questions (with answers) about college chemistry.\n\n"
# Remaining settings are pulled in from the included template file.
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_college_chemistry
# MMLU FLAN n-shot subtask: college computer science.
dataset_name: college_computer_science
# Double-quoted so the trailing blank line ("\n\n") after the preamble is explicit.
description: "The following are multiple choice questions (with answers) about college computer science.\n\n"
# Remaining settings are pulled in from the included template file.
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_college_computer_science
# MMLU FLAN n-shot subtask: college mathematics.
dataset_name: college_mathematics
# Double-quoted so the trailing blank line ("\n\n") after the preamble is explicit.
description: "The following are multiple choice questions (with answers) about college mathematics.\n\n"
# Remaining settings are pulled in from the included template file.
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_college_mathematics
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment