gaoqiong / lm-evaluation-harness

Commit 06d3406e ("update"), authored Sep 04, 2023 by lintangsutawika. Parent: f23ae748.

129 changed files in total; this page shows 20 of them, with 113 additions and 180 deletions (+113 −180).
lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml (+18 −18)
lm_eval/benchmarks/flan/prompt_templates/flan_bbh.yaml (+0 −29)
lm_eval/benchmarks/flan/yaml_templates/held_in_template_yaml (+1 −1)
lm_eval/benchmarks/flan_held_in.yaml (+10 −10)
lm_eval/benchmarks/flan_held_out.yaml (+2 −2)
lm_eval/benchmarks/t0_eval.yaml (+77 −68)
lm_eval/tasks/bbh/_generate_configs.py (+3 −0)
lm_eval/tasks/bbh/_template_yaml (+2 −4)
lm_eval/tasks/bbh/boolean_expressions.yaml (+0 −4)
lm_eval/tasks/bbh/causal_judgement.yaml (+0 −4)
lm_eval/tasks/bbh/date_understanding.yaml (+0 −4)
lm_eval/tasks/bbh/disambiguation_qa.yaml (+0 −4)
lm_eval/tasks/bbh/dyck_languages.yaml (+0 −4)
lm_eval/tasks/bbh/formal_fallacies.yaml (+0 −4)
lm_eval/tasks/bbh/geometric_shapes.yaml (+0 −4)
lm_eval/tasks/bbh/hyperbaton.yaml (+0 −4)
lm_eval/tasks/bbh/logical_deduction_five_objects.yaml (+0 −4)
lm_eval/tasks/bbh/logical_deduction_seven_objects.yaml (+0 −4)
lm_eval/tasks/bbh/logical_deduction_three_objects.yaml (+0 −4)
lm_eval/tasks/bbh/movie_recommendation.yaml (+0 −4)
lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml (+18 −18)

 # Flan Prompt Templates
 prompts:
   "template-0":
-    doc_to_text: "{{context}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
+    doc_to_text: "{{context}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nI think the answer is"
-    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
   "template-1":
-    doc_to_text: "{{context}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+    doc_to_text: "{{context}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No"
-    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
   "template-2":
-    doc_to_text: "{{context}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+    doc_to_text: "{{context}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No"
-    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
   "template-3":
-    doc_to_text: "{{context}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+    doc_to_text: "{{context}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No"
-    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
   "template-4":
-    doc_to_text: "{{context}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
+    doc_to_text: "{{context}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nThe answer is:"
-    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
   "template-5":
-    doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{context}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
+    doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{context}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
-    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
   "template-6":
-    doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{context}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+    doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{context}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No"
-    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
   "template-7":
-    doc_to_text: "Can we draw the following hypothesis from the context (see options)?\n\nContext:\n\n{{context}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+    doc_to_text: "Can we draw the following hypothesis from the context (see options)?\n\nContext:\n\n{{context}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No"
-    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
   "template-8":
-    doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{context}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+    doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{context}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No"
-    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
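The `doc_to_target` expression above is a Jinja2 index into a fixed answer list. As a plain-Python sketch of what that rendering does (hypothetical helper, not harness code; the 0/1/2 label order for ANLI entailment/neutral/contradiction is an assumption read off the template itself):

```python
# Mirrors the Jinja2 expression:
#   {{['Yes', 'It\'s impossible to say', 'No'][label]}}
# Assumed ANLI label order: 0 = entailment, 1 = neutral, 2 = contradiction.
ANSWERS = ["Yes", "It's impossible to say", "No"]

def doc_to_target(doc):
    # Index the fixed answer list with the document's integer label,
    # exactly as the template's list-index expression does.
    return ANSWERS[doc["label"]]

print(doc_to_target({"label": 1}))  # It's impossible to say
```

The commit's quoting change matters because the old template nested unescaped double quotes inside a double-quoted YAML scalar; switching the inner strings to single quotes avoids that.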
lm_eval/benchmarks/flan/prompt_templates/flan_bbh.yaml (deleted, 100644 → 0, −29)

-# Flan Prompt Templates
-prompts:
-  "template-0":
-    doc_to_text: "{{context}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
-  "template-1":
-    doc_to_text: "{{context}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
-  "template-2":
-    doc_to_text: "{{context}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
-  "template-3":
-    doc_to_text: "{{context}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
-  "template-4":
-    doc_to_text: "{{context}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
-  "template-5":
-    doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{context}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
-  "template-6":
-    doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{context}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
-  "template-7":
-    doc_to_text: "Can we draw the following hypothesis from the context (see options)?\n\nContext:\n\n{{context}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
-  "template-8":
-    doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{context}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
lm_eval/benchmarks/flan/yaml_templates/held_in_template_yaml (+1 −1)

@@ -8,6 +8,6 @@ metric_list:
     ignore_punctuation: true
 generation_kwargs:
   until:
-    - "\n\n"
+    - "</s>"
   do_sample: false
   temperature: 0.0
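The change above swaps the generation stop sequence from a blank line to the `</s>` end-of-sequence token. A minimal sketch of what an `until` list implies for post-processing, assuming the convention that a continuation is cut at the first occurrence of any stop string (hypothetical helper, not the harness's own code):

```python
def truncate_at_stops(text, stops=("</s>",)):
    # Cut the generated text at the earliest occurrence of any stop string.
    cut = len(text)
    for stop in stops:
        idx = text.find(stop)
        if idx != -1:
            cut = min(cut, idx)
    return text[:cut]

print(truncate_at_stops("Yes</s> extra tokens"))  # Yes
```

With the old `"\n\n"` stop, any model that emitted a blank line mid-answer would have been truncated there; stopping on `</s>` only cuts at the model's own end-of-sequence marker.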
lm_eval/benchmarks/flan_held_in.yaml (+10 −10)

@@ -25,13 +25,13 @@ task:
     dataset_path: anli
     use_prompt: flan/prompt_templates/flan_anli.yaml:*
     validation_split: dev_r3
-  # - include: flan/yaml_templates/held_in_template_yaml
-  #   task: ai2_arc
-  #   dataset_path: ARC-Easy
-  #   use_prompt: local:*
-  #   validation_split: validation
-  # - include: flan/yaml_templates/held_in_template_yaml
-  #   task: ai2_arc
-  #   dataset_path: ARC-Challange
-  #   use_prompt: local:*
-  #   validation_split: validation
+  - include: flan/yaml_templates/held_in_template_yaml
+    task: ai2_arc
+    dataset_path: ARC-Easy
+    use_prompt: local:*
+    validation_split: validation
+  - include: flan/yaml_templates/held_in_template_yaml
+    task: ai2_arc
+    dataset_path: ARC-Challange
+    use_prompt: local:*
+    validation_split: validation
lm_eval/benchmarks/flan_held_out.yaml (+2 −2)

 group: flan_held_out
 task:
-  - bbh
-  - mmlu
+  - bbh_flan
+  - mmlu_flan
lm_eval/benchmarks/t0_eval.yaml (+77 −68)

@@ -6,6 +6,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
+    output_type: greedy_until
     metric_list:
       - metric: exact_match
         aggregation: mean
@@ -18,18 +19,6 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
-  # Natural Language Inference
-  - dataset_path: super_glue
-    dataset_name: cb
-    use_prompt: promptsource:*
-    training_split: train
-    validation_split: validation
     output_type: greedy_until
     metric_list:
       - metric: exact_match
@@ -37,67 +26,86 @@ task:
         higher_is_better: true
         ignore_case: true
         ignore_punctuation: true
-  - dataset_path: super_glue
-    dataset_name: rte
-    use_prompt: promptsource:*
-    training_split: train
-    validation_split: validation
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
-  - task: anli_r1
-    dataset_path: anli
-    use_prompt: promptsource:*
-    training_split: train_r1
-    validation_split: dev_r1
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
-  - task: anli_r2
-    dataset_path: anli
-    use_prompt: promptsource:*
-    training_split: train_r2
-    validation_split: dev_r2
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
-  - task: anli_r3
-    dataset_path: anli
-    use_prompt: promptsource:*
-    training_split: train_r3
-    validation_split: dev_r3
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
-  # Sentence Completion
-  - dataset_path: super_glue
-    dataset_name: copa
-    use_prompt: promptsource:*
-    training_split: train
-    validation_split: validation
-    metric_list:
-      - metric: exact_match
-        aggregation: mean
-        higher_is_better: true
-        ignore_case: true
-        ignore_punctuation: true
+  # # Natural Language Inference
+  # - dataset_path: super_glue
+  #   dataset_name: cb
+  #   use_prompt: promptsource:*
+  #   training_split: train
+  #   validation_split: validation
+  #   output_type: greedy_until
+  #   metric_list:
+  #     - metric: exact_match
+  #       aggregation: mean
+  #       higher_is_better: true
+  #       ignore_case: true
+  #       ignore_punctuation: true
+  # - dataset_path: super_glue
+  #   dataset_name: rte
+  #   use_prompt: promptsource:*
+  #   training_split: train
+  #   validation_split: validation
+  #   output_type: greedy_until
+  #   metric_list:
+  #     - metric: exact_match
+  #       aggregation: mean
+  #       higher_is_better: true
+  #       ignore_case: true
+  #       ignore_punctuation: true
+  # - task: anli_r1
+  #   dataset_path: anli
+  #   use_prompt: promptsource:*
+  #   training_split: train_r1
+  #   validation_split: dev_r1
+  #   output_type: greedy_until
+  #   metric_list:
+  #     - metric: exact_match
+  #       aggregation: mean
+  #       higher_is_better: true
+  #       ignore_case: true
+  #       ignore_punctuation: true
+  # - task: anli_r2
+  #   dataset_path: anli
+  #   use_prompt: promptsource:*
+  #   training_split: train_r2
+  #   validation_split: dev_r2
+  #   output_type: greedy_until
+  #   metric_list:
+  #     - metric: exact_match
+  #       aggregation: mean
+  #       higher_is_better: true
+  #       ignore_case: true
+  #       ignore_punctuation: true
+  # - task: anli_r3
+  #   dataset_path: anli
+  #   use_prompt: promptsource:*
+  #   training_split: train_r3
+  #   validation_split: dev_r3
+  #   output_type: greedy_until
+  #   metric_list:
+  #     - metric: exact_match
+  #       aggregation: mean
+  #       higher_is_better: true
+  #       ignore_case: true
+  #       ignore_punctuation: true
+  # # Sentence Completion
+  # - dataset_path: super_glue
+  #   dataset_name: copa
+  #   use_prompt: promptsource:*
+  #   training_split: train
+  #   validation_split: validation
+  #   output_type: greedy_until
+  #   metric_list:
+  #     - metric: exact_match
+  #       aggregation: mean
+  #       higher_is_better: true
+  #       ignore_case: true
+  #       ignore_punctuation: true
   # Natural Language Inference
   - dataset_path: hellaswag
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
+    output_type: greedy_until
     metric_list:
       - metric: exact_match
         aggregation: mean
@@ -110,6 +118,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
+    output_type: greedy_until
     metric_list:
       - metric: exact_match
         aggregation: mean
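The metric options repeated throughout this file (`exact_match` with `ignore_case: true` and `ignore_punctuation: true`) describe a normalized string comparison. A minimal sketch of what those flags mean, as an illustration rather than the harness's actual metric implementation:

```python
import string

def exact_match(prediction, target, ignore_case=True, ignore_punctuation=True):
    # Normalize both strings per the YAML flags, then compare.
    if ignore_case:
        prediction, target = prediction.lower(), target.lower()
    if ignore_punctuation:
        strip = str.maketrans("", "", string.punctuation)
        prediction, target = prediction.translate(strip), target.translate(strip)
    # Per-example score; "aggregation: mean" averages these over the dataset.
    return float(prediction.strip() == target.strip())

print(exact_match("Yes.", "yes"))  # 1.0
```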
lm_eval/tasks/bbh/_generate_configs.py (+3 −0)

@@ -27,3 +27,6 @@ def main() -> None:
 if __name__ == "__main__":
     main()
+
+# https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/cot-prompts/boolean_expressions.txt
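The per-subtask YAML files deleted below were emitted by `_generate_configs.py`. A hypothetical sketch of what such a generator produces (the helper name and subtask list here are illustrative, not the script's actual code):

```python
# Emit one small YAML per BBH subtask, each including the shared _template_yaml.
SUBTASKS = ["boolean_expressions", "causal_judgement", "date_understanding"]

def render_config(name):
    # Matches the four-line shape of the generated files shown in this commit.
    return (
        "# Generated by _generate_configs.py\n"
        f"dataset_name: {name}\n"
        "include: _template_yaml\n"
        f"task: bbh_{name}\n"
    )

for name in SUBTASKS:
    print(render_config(name))
```

Keeping shared fields in `_template_yaml` and generating only the per-task overrides is why each generated file is just four lines.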
lm_eval/tasks/bbh/_template_yaml (+2 −4)

@@ -2,16 +2,14 @@ group: bbh
 dataset_path: lukaemon/bbh
 output_type: greedy_until
 test_split: test
-doc_to_text: "{{input}}"
+doc_to_text: "Q: {{input}}\nA:"
 doc_to_target: "{{target}}"
 metric_list:
   - metric: exact_match
     aggregation: mean
     higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: false
 generation_kwargs:
   until:
-    - "\n\n"
+    - "</s>"
   do_sample: false
   temperature: 0.0
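The template change above wraps the raw input in a Q/A scaffold. As a plain-Python sketch of the two updated fields, assuming documents expose `input` and `target` keys as the Jinja2 templates imply:

```python
def doc_to_text(doc):
    # New template: "Q: {{input}}\nA:" — prompt ends mid-line so the model
    # completes the answer after "A:".
    return f"Q: {doc['input']}\nA:"

def doc_to_target(doc):
    # Unchanged: "{{target}}".
    return doc["target"]

print(doc_to_text({"input": "not ( True ) is"}))
```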
lm_eval/tasks/bbh/boolean_expressions.yaml (deleted, 100644 → 0)

-# Generated by _generate_configs.py
-dataset_name: boolean_expressions
-include: _template_yaml
-task: bbh_boolean_expressions

lm_eval/tasks/bbh/causal_judgement.yaml (deleted, 100644 → 0)

-# Generated by _generate_configs.py
-dataset_name: causal_judgement
-include: _template_yaml
-task: bbh_causal_judgement

lm_eval/tasks/bbh/date_understanding.yaml (deleted, 100644 → 0)

-# Generated by _generate_configs.py
-dataset_name: date_understanding
-include: _template_yaml
-task: bbh_date_understanding

lm_eval/tasks/bbh/disambiguation_qa.yaml (deleted, 100644 → 0)

-# Generated by _generate_configs.py
-dataset_name: disambiguation_qa
-include: _template_yaml
-task: bbh_disambiguation_qa

lm_eval/tasks/bbh/dyck_languages.yaml (deleted, 100644 → 0)

-# Generated by _generate_configs.py
-dataset_name: dyck_languages
-include: _template_yaml
-task: bbh_dyck_languages

lm_eval/tasks/bbh/formal_fallacies.yaml (deleted, 100644 → 0)

-# Generated by _generate_configs.py
-dataset_name: formal_fallacies
-include: _template_yaml
-task: bbh_formal_fallacies

lm_eval/tasks/bbh/geometric_shapes.yaml (deleted, 100644 → 0)

-# Generated by _generate_configs.py
-dataset_name: geometric_shapes
-include: _template_yaml
-task: bbh_geometric_shapes

lm_eval/tasks/bbh/hyperbaton.yaml (deleted, 100644 → 0)

-# Generated by _generate_configs.py
-dataset_name: hyperbaton
-include: _template_yaml
-task: bbh_hyperbaton

lm_eval/tasks/bbh/logical_deduction_five_objects.yaml (deleted, 100644 → 0)

-# Generated by _generate_configs.py
-dataset_name: logical_deduction_five_objects
-include: _template_yaml
-task: bbh_logical_deduction_five_objects

lm_eval/tasks/bbh/logical_deduction_seven_objects.yaml (deleted, 100644 → 0)

-# Generated by _generate_configs.py
-dataset_name: logical_deduction_seven_objects
-include: _template_yaml
-task: bbh_logical_deduction_seven_objects

lm_eval/tasks/bbh/logical_deduction_three_objects.yaml (deleted, 100644 → 0)

-# Generated by _generate_configs.py
-dataset_name: logical_deduction_three_objects
-include: _template_yaml
-task: bbh_logical_deduction_three_objects

lm_eval/tasks/bbh/movie_recommendation.yaml (deleted, 100644 → 0)

-# Generated by _generate_configs.py
-dataset_name: movie_recommendation
-include: _template_yaml
-task: bbh_movie_recommendation