Commit 03be40e2 authored by lintangsutawika's avatar lintangsutawika
Browse files

update

parent e795efcf
...@@ -26,6 +26,10 @@ Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard ...@@ -26,6 +26,10 @@ Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard
#### Groups #### Groups
- `bbh_flan_zeroshot` - `bbh_flan_zeroshot`
- `bbh_flan_fewshot`
- `bbh_flan_cot_fewshot`
- `bbh_flan_cot_zeroshot`
#### Tasks #### Tasks
......
...@@ -7,8 +7,8 @@ metric_list: ...@@ -7,8 +7,8 @@ metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
ignore_case: true # ignore_case: true
ignore_punctuation: true # ignore_punctuation: true
generation_kwargs: generation_kwargs:
until: until:
- "</s>" - "</s>"
......
...@@ -7,8 +7,8 @@ metric_list: ...@@ -7,8 +7,8 @@ metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
ignore_case: true # ignore_case: true
ignore_punctuation: true # ignore_punctuation: true
generation_kwargs: generation_kwargs:
until: until:
- "</s>" - "</s>"
......
...@@ -7,8 +7,8 @@ metric_list: ...@@ -7,8 +7,8 @@ metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
ignore_case: true # ignore_case: true
ignore_punctuation: true # ignore_punctuation: true
generation_kwargs: generation_kwargs:
until: until:
- "</s>" - "</s>"
......
...@@ -7,8 +7,8 @@ metric_list: ...@@ -7,8 +7,8 @@ metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
ignore_case: true # ignore_case: true
ignore_punctuation: true # ignore_punctuation: true
generation_kwargs: generation_kwargs:
until: until:
- "</s>" - "</s>"
......
...@@ -92,7 +92,6 @@ if __name__ == "__main__": ...@@ -92,7 +92,6 @@ if __name__ == "__main__":
base_yaml_name = os.path.split(args.base_yaml_path)[-1] base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f: with open(args.base_yaml_path) as f:
base_yaml = yaml.full_load(f) base_yaml = yaml.full_load(f)
print(base_yaml)
if args.cot_prompt_path is not None: if args.cot_prompt_path is not None:
import json import json
...@@ -115,4 +114,4 @@ if __name__ == "__main__": ...@@ -115,4 +114,4 @@ if __name__ == "__main__":
file_save_path = args.save_prefix_path + f"_{subject}.yaml" file_save_path = args.save_prefix_path + f"_{subject}.yaml"
eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}") eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w") as yaml_file: with open(file_save_path, "w") as yaml_file:
yaml.dump(yaml_dict, yaml_file, width=float("inf")) yaml.dump(yaml_dict, yaml_file, width=float("inf"), allow_unicode=True, default_style='"')
# Task config `mmlu_original_abstract_algebra`: the `abstract_algebra` subset of
# `cais/mmlu`, with `output_type: multiple_choice`.
group:
  - mmlu
  - mmlu_original
  - multiple_choice
task: mmlu_original_abstract_algebra
dataset_path: cais/mmlu
dataset_name: abstract_algebra
output_type: multiple_choice
validation_split: validation
test_split: test
# Double-quoted so the `\n` escapes are parsed as real newlines.
description: "The following are multiple choice questions (with answers) about abstract algebra.\n\n"
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{answer}}"
# `aggregation` and `higher_is_better` must be nested under their own metric
# entry; at column 0 they would become duplicated top-level keys.
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
...@@ -2,24 +2,23 @@ group: mmlu_flan_cot_fewshot ...@@ -2,24 +2,23 @@ group: mmlu_flan_cot_fewshot
dataset_path: cais/mmlu dataset_path: cais/mmlu
validation_split: validation validation_split: validation
fewshot_split: dev fewshot_split: dev
doc_to_text: "\n\nQ: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
fewshot_delimiter: ""
output_type: greedy_until output_type: greedy_until
doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
metric_list: filter_list:
- metric: exact_match - name: "get-answer"
aggregation: mean filter:
higher_is_better: true - function: "regex"
ignore_case: true regex_pattern: "(?<=The answer is )(.*)(?=.)"
ignore_punctuation: true - function: "take_first"
generation_kwargs: generation_kwargs:
until: until:
- "</s>" - "</s>"
do_sample: false do_sample: false
temperature: 0.0 temperature: 0.0
filter_list: metric_list:
- name: "get-answer" - metric: exact_match
filter: aggregation: mean
- function: "regex" higher_is_better: true
regex_pattern: "(?<=The answer is )(.*)(.)" ignore_case: true
- function: "take_first" ignore_punctuation: true
\ No newline at end of file \ No newline at end of file
...@@ -2,24 +2,23 @@ group: mmlu_flan_cot_zeroshot ...@@ -2,24 +2,23 @@ group: mmlu_flan_cot_zeroshot
dataset_path: cais/mmlu dataset_path: cais/mmlu
validation_split: validation validation_split: validation
fewshot_split: dev fewshot_split: dev
doc_to_text: "\n\nQ: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
output_type: greedy_until output_type: greedy_until
fewshot_delimiter: "" doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "(?<=The answer is )(.*)(?=.)"
- function: "take_first"
generation_kwargs:
until:
- "</s>"
do_sample: false
temperature: 0.0
metric_list: metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
ignore_case: true ignore_case: true
ignore_punctuation: true ignore_punctuation: true
generation_kwargs:
until:
- "</s>"
do_sample: false
temperature: 0.0
filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "(?<=The answer is )(.*)(.)"
- function: "take_first"
\ No newline at end of file
...@@ -2,19 +2,13 @@ group: mmlu_flan_n_shot_generative ...@@ -2,19 +2,13 @@ group: mmlu_flan_n_shot_generative
dataset_path: cais/mmlu dataset_path: cais/mmlu
test_split: test test_split: test
fewshot_split: dev fewshot_split: dev
# doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
output_type: greedy_until output_type: greedy_until
# doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
doc_to_target: "{{['A', 'B', 'C', 'D'][answer]}}" doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
generation_kwargs:
until:
- "</s>"
metric_list: metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
generation_kwargs:
until:
- "</s>"
# do_sample: false
# temperature: 0.0
\ No newline at end of file
group: mmlu_flan_n_shot_loglikelihood group: mmlu_flan_n_shot_loglikelihood
dataset_path: cais/mmlu dataset_path: cais/mmlu
# validation_split: validation
test_split: test test_split: test
fewshot_split: dev fewshot_split: dev
output_type: multiple_choice output_type: multiple_choice
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:" doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
doc_to_choice: ["A", "B", "C", "D"] doc_to_choice: ["(A)", "(B)", "(C)", "(D)"]
doc_to_target: answer doc_to_target: answer
metric_list: metric_list:
- metric: acc - metric: acc
......
# Per-subject task file for `abstract_algebra`; pulls in
# `_mmlu_flan_generative_template_yaml` via `include`.
dataset_name: abstract_algebra
# Double-quoted on one line so the trailing `\n\n` is parsed as two newlines
# instead of a folded line break.
description: "The following are multiple choice questions (with answers) about abstract algebra.\n\n"
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_abstract_algebra
# Per-subject task file for `anatomy`; pulls in
# `_mmlu_flan_generative_template_yaml` via `include`.
dataset_name: anatomy
# Double-quoted on one line so the trailing `\n\n` is parsed as two newlines
# instead of a folded line break.
description: "The following are multiple choice questions (with answers) about anatomy.\n\n"
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_anatomy
# Per-subject task file for `astronomy`; pulls in
# `_mmlu_flan_generative_template_yaml` via `include`.
dataset_name: astronomy
# Double-quoted on one line so the trailing `\n\n` is parsed as two newlines
# instead of a folded line break.
description: "The following are multiple choice questions (with answers) about astronomy.\n\n"
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_astronomy
# Per-subject task file for `business_ethics`; pulls in
# `_mmlu_flan_generative_template_yaml` via `include`.
dataset_name: business_ethics
# Double-quoted on one line so the trailing `\n\n` is parsed as two newlines
# instead of a folded line break.
description: "The following are multiple choice questions (with answers) about business ethics.\n\n"
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_business_ethics
# Per-subject task file for `clinical_knowledge`; pulls in
# `_mmlu_flan_generative_template_yaml` via `include`.
dataset_name: clinical_knowledge
# Double-quoted on one line so the trailing `\n\n` is parsed as two newlines
# instead of a folded line break.
description: "The following are multiple choice questions (with answers) about clinical knowledge.\n\n"
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_clinical_knowledge
# Per-subject task file for `college_biology`; pulls in
# `_mmlu_flan_generative_template_yaml` via `include`.
dataset_name: college_biology
# Double-quoted on one line so the trailing `\n\n` is parsed as two newlines
# instead of a folded line break.
description: "The following are multiple choice questions (with answers) about college biology.\n\n"
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_college_biology
# Per-subject task file for `college_chemistry`; pulls in
# `_mmlu_flan_generative_template_yaml` via `include`.
dataset_name: college_chemistry
# Double-quoted on one line so the trailing `\n\n` is parsed as two newlines
# instead of a folded line break.
description: "The following are multiple choice questions (with answers) about college chemistry.\n\n"
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_college_chemistry
# Per-subject task file for `college_computer_science`; pulls in
# `_mmlu_flan_generative_template_yaml` via `include`.
dataset_name: college_computer_science
# Double-quoted on one line so the trailing `\n\n` is parsed as two newlines
# instead of a folded line break.
description: "The following are multiple choice questions (with answers) about college computer science.\n\n"
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_college_computer_science
# Per-subject task file for `college_mathematics`; pulls in
# `_mmlu_flan_generative_template_yaml` via `include`.
dataset_name: college_mathematics
# Double-quoted on one line so the trailing `\n\n` is parsed as two newlines
# instead of a folded line break.
description: "The following are multiple choice questions (with answers) about college mathematics.\n\n"
include: _mmlu_flan_generative_template_yaml
task: mmlu_flan_n_shot_college_mathematics
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment