update

03be40e2 · lintangsutawika · e795efcf · 03be40e2 · 03be40e2 · 03be40e2
Commit 03be40e2 authored Sep 04, 2023 by lintangsutawika
20 changed files
--- a/lm_eval/tasks/bbh/README.md
+++ b/lm_eval/tasks/bbh/README.md
@@ -26,6 +26,10 @@ Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard
 #### Groups
 - `bbh_flan_zeroshot`
+- `bbh_flan_fewshot`
+- `bbh_flan_cot_fewshot`
+- `bbh_flan_cot_zeroshot`
 #### Tasks

--- a/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml
@@ -7,8 +7,8 @@ metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
-    ignore_case: true
+    # ignore_case: true
-    ignore_punctuation: true
+    # ignore_punctuation: true
 generation_kwargs:
  until:
    - "</s>"

--- a/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml
@@ -7,8 +7,8 @@ metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
-    ignore_case: true
+    # ignore_case: true
-    ignore_punctuation: true
+    # ignore_punctuation: true
 generation_kwargs:
  until:
    - "</s>"

--- a/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml
@@ -7,8 +7,8 @@ metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
-    ignore_case: true
+    # ignore_case: true
-    ignore_punctuation: true
+    # ignore_punctuation: true
 generation_kwargs:
  until:
    - "</s>"

--- a/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml
@@ -7,8 +7,8 @@ metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
-    ignore_case: true
+    # ignore_case: true
-    ignore_punctuation: true
+    # ignore_punctuation: true
 generation_kwargs:
  until:
    - "</s>"

--- a/lm_eval/tasks/mmlu/_generate_configs.py
+++ b/lm_eval/tasks/mmlu/_generate_configs.py
@@ -92,7 +92,6 @@ if __name__ == "__main__":
    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
    with open(args.base_yaml_path) as f:
        base_yaml = yaml.full_load(f)
-    print(base_yaml)
    if args.cot_prompt_path is not None:
        import json
@@ -115,4 +114,4 @@ if __name__ == "__main__":
        file_save_path = args.save_prefix_path + f"_{subject}.yaml"
        eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
        with open(file_save_path, "w") as yaml_file:
-            yaml.dump(yaml_dict, yaml_file, width=float("inf"))
+            yaml.dump(yaml_dict, yaml_file, width=float("inf"), allow_unicode=True, default_style='"')
--- a/lm_eval/tasks/mmlu/default/hendrycks_test_original_default.yaml
+++ b/lm_eval/tasks/mmlu/default/hendrycks_test_original_default.yaml
-group:
-  - mmlu
-  - mmlu_original
-  - multiple_choice
-task: mmlu_original_abstract_algebra
-dataset_path: cais/mmlu
-dataset_name: abstract_algebra
-output_type: multiple_choice
-validation_split: validation
-test_split: test
-description: "The following are multiple choice questions (with answers) about abstract algebra.\n\n"
-doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
-doc_to_choice: ["A", "B", "C", "D"]
-doc_to_target: "{{answer}}"
-metric_list:
-  - metric: acc
-    aggregation: mean
-    higher_is_better: true
-  - metric: acc_norm
-    aggregation: mean
-    higher_is_better: true
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
@@ -2,24 +2,23 @@ group: mmlu_flan_cot_fewshot
 dataset_path: cais/mmlu
 validation_split: validation
 fewshot_split: dev
-doc_to_text: "\n\nQ: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
-fewshot_delimiter: ""
 output_type: greedy_until
+doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
 doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
-metric_list:
+filter_list:
-  - metric: exact_match
+  - name: "get-answer"
-    aggregation: mean
+    filter:
-    higher_is_better: true
+      - function: "regex"
-    ignore_case: true
+        regex_pattern: "(?<=The answer is )(.*)(?=.)"
-    ignore_punctuation: true
+      - function: "take_first"
 generation_kwargs:
  until:
    - "</s>"
  do_sample: false
  temperature: 0.0
-filter_list:
+metric_list:
-  - name: "get-answer"
+  - metric: exact_match
-    filter:
+    aggregation: mean
-      - function: "regex"
+    higher_is_better: true
-        regex_pattern: "(?<=The answer is )(.*)(.)"
+    ignore_case: true
-      - function: "take_first"
+    ignore_punctuation: true
\ No newline at end of file
--- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_generative_template_yaml
@@ -2,24 +2,23 @@ group: mmlu_flan_cot_zeroshot
 dataset_path: cais/mmlu
 validation_split: validation
 fewshot_split: dev
-doc_to_text: "\n\nQ: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
 output_type: greedy_until
-fewshot_delimiter: ""
+doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
 doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
+filter_list:
+  - name: "get-answer"
+    filter:
+      - function: "regex"
+        regex_pattern: "(?<=The answer is )(.*)(?=.)"
+      - function: "take_first"
+generation_kwargs:
+  until:
+    - "</s>"
+  do_sample: false
+  temperature: 0.0
 metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
-generation_kwargs:
-  until:
-    - "</s>"
-  do_sample: false
-  temperature: 0.0
-filter_list:
-  - name: "get-answer"
-    filter:
-      - function: "regex"
-        regex_pattern: "(?<=The answer is )(.*)(.)"
-      - function: "take_first"
\ No newline at end of file
--- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml
@@ -2,19 +2,13 @@ group: mmlu_flan_n_shot_generative
 dataset_path: cais/mmlu
 test_split: test
 fewshot_split: dev
-# doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
-doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
 output_type: greedy_until
-# doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
+doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
-doc_to_target: "{{['A', 'B', 'C', 'D'][answer]}}"
+doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
+generation_kwargs:
+  until:
+    - "</s>"
 metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
-    # ignore_case: true
-    # ignore_punctuation: true
-generation_kwargs:
-  until:
-    - "</s>"
-#   do_sample: false
-#   temperature: 0.0
\ No newline at end of file
--- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml
 group: mmlu_flan_n_shot_loglikelihood
 dataset_path: cais/mmlu
-# validation_split: validation
 test_split: test
 fewshot_split: dev
 output_type: multiple_choice
-doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
+doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
-doc_to_choice: ["A", "B", "C", "D"]
+doc_to_choice: ["(A)", "(B)", "(C)", "(D)"]
 doc_to_target: answer
 metric_list:
  - metric: acc

--- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml
-dataset_name: abstract_algebra
-description: 'The following are multiple choice questions (with answers) about abstract algebra.
-  '
-include: _mmlu_flan_generative_template_yaml
-task: mmlu_flan_n_shot_abstract_algebra
--- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_anatomy.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_anatomy.yaml
-dataset_name: anatomy
-description: 'The following are multiple choice questions (with answers) about anatomy.
-  '
-include: _mmlu_flan_generative_template_yaml
-task: mmlu_flan_n_shot_anatomy
--- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_astronomy.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_astronomy.yaml
-dataset_name: astronomy
-description: 'The following are multiple choice questions (with answers) about astronomy.
-  '
-include: _mmlu_flan_generative_template_yaml
-task: mmlu_flan_n_shot_astronomy
--- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml
-dataset_name: business_ethics
-description: 'The following are multiple choice questions (with answers) about business ethics.
-  '
-include: _mmlu_flan_generative_template_yaml
-task: mmlu_flan_n_shot_business_ethics
--- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_clinical_knowledge.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_clinical_knowledge.yaml
-dataset_name: clinical_knowledge
-description: 'The following are multiple choice questions (with answers) about clinical knowledge.
-  '
-include: _mmlu_flan_generative_template_yaml
-task: mmlu_flan_n_shot_clinical_knowledge
--- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_biology.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_biology.yaml
-dataset_name: college_biology
-description: 'The following are multiple choice questions (with answers) about college biology.
-  '
-include: _mmlu_flan_generative_template_yaml
-task: mmlu_flan_n_shot_college_biology
--- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_chemistry.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_chemistry.yaml
-dataset_name: college_chemistry
-description: 'The following are multiple choice questions (with answers) about college chemistry.
-  '
-include: _mmlu_flan_generative_template_yaml
-task: mmlu_flan_n_shot_college_chemistry
--- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_computer_science.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_computer_science.yaml
-dataset_name: college_computer_science
-description: 'The following are multiple choice questions (with answers) about college computer science.
-  '
-include: _mmlu_flan_generative_template_yaml
-task: mmlu_flan_n_shot_college_computer_science
--- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_mathematics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_college_mathematics.yaml
-dataset_name: college_mathematics
-description: 'The following are multiple choice questions (with answers) about college mathematics.
-  '
-include: _mmlu_flan_generative_template_yaml
-task: mmlu_flan_n_shot_college_mathematics