update

06d3406e · lintangsutawika · f23ae748
Commit 06d3406e authored Sep 04, 2023 by lintangsutawika
20 changed files
--- a/lm_eval/tasks/bbh/multistep_arithmetic_two.yaml
+++ b/lm_eval/tasks/bbh/multistep_arithmetic_two.yaml
-# Generated by _generate_configs.py
-dataset_name: multistep_arithmetic_two
-include: _template_yaml
-task: bbh_multistep_arithmetic_two
--- a/lm_eval/tasks/bbh/navigate.yaml
+++ b/lm_eval/tasks/bbh/navigate.yaml
-# Generated by _generate_configs.py
-dataset_name: navigate
-include: _template_yaml
-task: bbh_navigate
--- a/lm_eval/tasks/bbh/object_counting.yaml
+++ b/lm_eval/tasks/bbh/object_counting.yaml
-# Generated by _generate_configs.py
-dataset_name: object_counting
-include: _template_yaml
-task: bbh_object_counting
--- a/lm_eval/tasks/bbh/penguins_in_a_table.yaml
+++ b/lm_eval/tasks/bbh/penguins_in_a_table.yaml
-# Generated by _generate_configs.py
-dataset_name: penguins_in_a_table
-include: _template_yaml
-task: bbh_penguins_in_a_table
--- a/lm_eval/tasks/bbh/reasoning_about_colored_objects.yaml
+++ b/lm_eval/tasks/bbh/reasoning_about_colored_objects.yaml
-# Generated by _generate_configs.py
-dataset_name: reasoning_about_colored_objects
-include: _template_yaml
-task: bbh_reasoning_about_colored_objects
--- a/lm_eval/tasks/bbh/ruin_names.yaml
+++ b/lm_eval/tasks/bbh/ruin_names.yaml
-# Generated by _generate_configs.py
-dataset_name: ruin_names
-include: _template_yaml
-task: bbh_ruin_names
--- a/lm_eval/tasks/bbh/salient_translation_error_detection.yaml
+++ b/lm_eval/tasks/bbh/salient_translation_error_detection.yaml
-# Generated by _generate_configs.py
-dataset_name: salient_translation_error_detection
-include: _template_yaml
-task: bbh_salient_translation_error_detection
--- a/lm_eval/tasks/bbh/snarks.yaml
+++ b/lm_eval/tasks/bbh/snarks.yaml
-# Generated by _generate_configs.py
-dataset_name: snarks
-include: _template_yaml
-task: bbh_snarks
--- a/lm_eval/tasks/bbh/sports_understanding.yaml
+++ b/lm_eval/tasks/bbh/sports_understanding.yaml
-# Generated by _generate_configs.py
-dataset_name: sports_understanding
-include: _template_yaml
-task: bbh_sports_understanding
--- a/lm_eval/tasks/bbh/temporal_sequences.yaml
+++ b/lm_eval/tasks/bbh/temporal_sequences.yaml
-# Generated by _generate_configs.py
-dataset_name: temporal_sequences
-include: _template_yaml
-task: bbh_temporal_sequences
--- a/lm_eval/tasks/bbh/tracking_shuffled_objects_five_objects.yaml
+++ b/lm_eval/tasks/bbh/tracking_shuffled_objects_five_objects.yaml
-# Generated by _generate_configs.py
-dataset_name: tracking_shuffled_objects_five_objects
-include: _template_yaml
-task: bbh_tracking_shuffled_objects_five_objects
--- a/lm_eval/tasks/bbh/tracking_shuffled_objects_seven_objects.yaml
+++ b/lm_eval/tasks/bbh/tracking_shuffled_objects_seven_objects.yaml
-# Generated by _generate_configs.py
-dataset_name: tracking_shuffled_objects_seven_objects
-include: _template_yaml
-task: bbh_tracking_shuffled_objects_seven_objects
--- a/lm_eval/tasks/bbh/tracking_shuffled_objects_three_objects.yaml
+++ b/lm_eval/tasks/bbh/tracking_shuffled_objects_three_objects.yaml
-# Generated by _generate_configs.py
-dataset_name: tracking_shuffled_objects_three_objects
-include: _template_yaml
-task: bbh_tracking_shuffled_objects_three_objects
--- a/lm_eval/tasks/bbh/web_of_lies.yaml
+++ b/lm_eval/tasks/bbh/web_of_lies.yaml
-# Generated by _generate_configs.py
-dataset_name: web_of_lies
-include: _template_yaml
-task: bbh_web_of_lies
--- a/lm_eval/tasks/bbh/word_sorting.yaml
+++ b/lm_eval/tasks/bbh/word_sorting.yaml
-# Generated by _generate_configs.py
-dataset_name: word_sorting
-include: _template_yaml
-task: bbh_word_sorting
--- a/lm_eval/tasks/mmlu/_generate_configs.py
+++ b/lm_eval/tasks/mmlu/_generate_configs.py
@@ -115,4 +115,4 @@ if __name__ == "__main__":
        file_save_path = args.save_prefix_path + f"_{subject}.yaml"
        eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
        with open(file_save_path, "w") as yaml_file:
-            yaml.dump(yaml_dict, yaml_file)
+            yaml.dump(yaml_dict, yaml_file, width=float("inf"))
--- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_generative_template_yaml
 group: mmlu_flan
 dataset_path: cais/mmlu
-validation_split: validation
+# validation_split: validation
+test_split: test
 fewshot_split: dev
-doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA:"
+# doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
+doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
 output_type: greedy_until
-doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
+# doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
+doc_to_target: "{{['A', 'B', 'C', 'D'][answer]}}"
 metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: true
+    # ignore_case: true
+    # ignore_punctuation: true
 generation_kwargs:
  until:
    - "</s>"
-  do_sample: false
-  temperature: 0.0
\ No newline at end of file
+#   do_sample: false
+#   temperature: 0.0
\ No newline at end of file
--- a/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/_mmlu_flan_loglikelihood_template_yaml
 group: mmlu_flan_loglikelihood
 dataset_path: cais/mmlu
-validation_split: validation
+# validation_split: validation
+test_split: test
 fewshot_split: dev
-doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA:"
 output_type: multiple_choice
-doc_to_choice: ['(A)', '(B)', '(C)', '(D)']
+doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
+doc_to_choice: ["A", "B", "C", "D"]
 doc_to_target: answer
 metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
\ No newline at end of file
--- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_abstract_algebra.yaml
 dataset_name: abstract_algebra
-description: 'The following are multiple choice questions (with answers) about abstract
-  algebra.
+description: 'The following are multiple choice questions (with answers) about abstract algebra.


  '

--- a/lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/mmlu_business_ethics.yaml
 dataset_name: business_ethics
-description: 'The following are multiple choice questions (with answers) about business
-  ethics.
+description: 'The following are multiple choice questions (with answers) about business ethics.


  '