Update FLAN held-in tasks: fix prompt templates and task configs

ec03783f · lintangsutawika · 4f5b72bc
Commit ec03783f authored Sep 04, 2023 by lintangsutawika
8 changed files
--- a/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml
+++ b/lm_eval/benchmarks/flan/prompt_templates/flan_anli.yaml
 # Flan Prompt Templates
 prompts:
  "template-0":
-    doc_to_text: "{{context}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nI think the answer is"
+    doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-1":
-    doc_to_text: "{{context}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No"
+    doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-2":
-    doc_to_text: "{{context}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No"
+    doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-3":
-    doc_to_text: "{{context}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No"
+    doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-4":
-    doc_to_text: "{{context}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nThe answer is:"
+    doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-5":
-    doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{context}}\n\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
+    doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-6":
-    doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{context}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No"
+    doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-7":
-    doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{context}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No"
+    doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-8":
-    doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{context}}\nOPTIONS:\n- Yes\n- It\'s impossible to say\n- No"
+    doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
-    doc_to_target: "{{['Yes', 'It\'s impossible to say', 'No'][label]}}"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
--- a/lm_eval/benchmarks/flan/prompt_templates/flan_arc.yaml
+++ b/lm_eval/benchmarks/flan/prompt_templates/flan_arc.yaml
@@ -2,23 +2,22 @@
 prompts:
  "template-0":
    doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
-    doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}"
+    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-1":
    doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
-    doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}"
+    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-2":
    doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
-    doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}"
+    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-3":
    doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
-    doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}"
+    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-4":
    doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
-    doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}"
+    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-5":
    doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
-    doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}"
+    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-6":
    doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
-    doc_to_target: "{{[choices.text][choices.label.index(answerKey)]}}"
+    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
--- a/lm_eval/benchmarks/flan/yaml_templates/cot_template_yaml
+++ b/lm_eval/benchmarks/flan/yaml_templates/cot_template_yaml
@@ -6,8 +6,6 @@ metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: true
 generation_kwargs:
  until:
    - "\n\n"

--- a/lm_eval/benchmarks/flan/yaml_templates/held_in_template_yaml
+++ b/lm_eval/benchmarks/flan/yaml_templates/held_in_template_yaml
@@ -4,8 +4,6 @@ metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: true
 generation_kwargs:
  until:
    - "</s>"

--- a/lm_eval/benchmarks/flan_anli.yaml
+++ b/lm_eval/benchmarks/flan_anli.yaml
+group: flan_anli
+task:
+  - include: flan/yaml_templates/held_in_template_yaml
+    task: anli_r1
+    dataset_path: anli
+    use_prompt: flan/prompt_templates/flan_anli.yaml:*
+    validation_split: dev_r1
+  - include: flan/yaml_templates/held_in_template_yaml
+    task: anli_r2
+    dataset_path: anli
+    use_prompt: flan/prompt_templates/flan_anli.yaml:*
+    validation_split: dev_r2
+  - include: flan/yaml_templates/held_in_template_yaml
+    task: anli_r3
+    dataset_path: anli
+    use_prompt: flan/prompt_templates/flan_anli.yaml:*
+    validation_split: dev_r3
--- a/lm_eval/benchmarks/flan_boolq.yaml
+++ b/lm_eval/benchmarks/flan_boolq.yaml
+group: flan_boolq
+task:
+  - include: flan/yaml_templates/held_in_template_yaml
+    dataset_path: super_glue
+    dataset_name: boolq
+    use_prompt: flan/prompt_templates/flan_boolq.yaml:*
+    validation_split: validation
--- a/lm_eval/benchmarks/flan_held_in.yaml
+++ b/lm_eval/benchmarks/flan_held_in.yaml
@@ -26,12 +26,14 @@ task:
    use_prompt: flan/prompt_templates/flan_anli.yaml:*
    validation_split: dev_r3
  - include: flan/yaml_templates/held_in_template_yaml
-    task: ai2_arc
+    task: arc_easy
-    dataset_path: ARC-Easy
+    dataset_path: ai2_arc
-    use_prompt: local:*
+    dataset_name: ARC-Easy
+    use_prompt: flan/prompt_templates/flan_arc.yaml:*
    validation_split: validation
  - include: flan/yaml_templates/held_in_template_yaml
-    task: ai2_arc
+    task: arc_challenge
-    dataset_path: ARC-Challange
+    dataset_path: ai2_arc
-    use_prompt: local:*
+    dataset_name: ARC-Challenge
+    use_prompt: flan/prompt_templates/flan_arc.yaml:*
    validation_split: validation
--- a/lm_eval/benchmarks/flan_rte.yaml
+++ b/lm_eval/benchmarks/flan_rte.yaml
+group: flan_rte
+task:
+  - include: flan/yaml_templates/held_in_template_yaml
+    dataset_path: super_glue
+    dataset_name: rte
+    use_prompt: flan/prompt_templates/flan_rte.yaml:*
+    validation_split: validation