Commit 3041681f authored by silencealiang

init

parent 291fc518
group: flan_held_in
group_alias: Flan (Held-In)
task:
  # ANLI R1
  - group: anli_r1_flan
    group_alias: ANLI R1
    task:
      - task: anli_r1
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  # ANLI R2
  - group: anli_r2_flan
    group_alias: ANLI R2
    task:
      - task: anli_r2
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  # ANLI R3
  - group: anli_r3_flan
    group_alias: ANLI R3
    task:
      - task: anli_r3
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  # Arc Easy
  - group: arc_easy_flan
    group_alias: Arc Easy
    task:
      - task: arc_easy
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  # Arc Challenge
  - group: arc_challenge_flan
    group_alias: Arc Challenge
    task:
      - task: arc_challenge
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  # BoolQ
  - group: boolq_flan
    group_alias: BoolQ
    task:
      - task: boolq
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-9
        include: _held_in_template_yaml
        doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
  # RTE
  - group: rte_flan
    group_alias: RTE
    task:
      - task: rte
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{sentence2}}\"?\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\n\nQ with options: Can we draw the following conclusion?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\nDoes this next sentence follow, given the preceding text?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{sentence2}}"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nThe answer is"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "Read the text and determine if the sentence is true:\n\n{{sentence1}}\n\nSentence: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{sentence2}}\n\n{{sentence1}}\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
group: flan_held_out
task:
  # BBH
  - bbh_zeroshot
  - bbh_fewshot
  - bbh_cot_fewshot
  - bbh_cot_zeroshot
  # MMLU
  - mmlu
  - mmlu_flan_n_shot_generative
  - mmlu_flan_n_shot_loglikelihood
  - mmlu_flan_cot_zeroshot
  - mmlu_flan_cot_fewshot
group: minerva_math
task:
  - minerva_math_algebra
  - minerva_math_counting_and_prob
  - minerva_math_geometry
  - minerva_math_intermediate_algebra
  - minerva_math_num_theory
  - minerva_math_prealgebra
  - minerva_math_precalc
# MultiMedQA (multiple-choice subset)
### Paper
Title: Large Language Models Encode Clinical Knowledge
Abstract: https://arxiv.org/abs/2212.13138
A benchmark combining four existing multiple-choice question answering datasets spanning professional medical exams and research queries.
### Citation
```
@Article{Singhal2023,
  author={Singhal, Karan and Azizi, Shekoofeh and Tu, Tao and Mahdavi, S. Sara and Wei, Jason and Chung, Hyung Won and Scales, Nathan and Tanwani, Ajay and Cole-Lewis, Heather and Pfohl, Stephen and Payne, Perry and Seneviratne, Martin and Gamble, Paul and Kelly, Chris and Babiker, Abubakr and Sch{\"a}rli, Nathanael and Chowdhery, Aakanksha and Mansfield, Philip and Demner-Fushman, Dina and Ag{\"u}era y Arcas, Blaise and Webster, Dale and Corrado, Greg S. and Matias, Yossi and Chou, Katherine and Gottweis, Juraj and Tomasev, Nenad and Liu, Yun and Rajkomar, Alvin and Barral, Joelle and Semturs, Christopher and Karthikesalingam, Alan and Natarajan, Vivek},
  title={Large language models encode clinical knowledge},
  journal={Nature},
  year={2023},
  month={Aug},
  day={01},
  volume={620},
  number={7972},
  pages={172-180},
  issn={1476-4687},
  doi={10.1038/s41586-023-06291-2},
  url={https://doi.org/10.1038/s41586-023-06291-2}
}
```
### Tasks
* [PubMedQA](https://pubmedqa.github.io/) - 1,000 expert-labeled Q&A pairs in which a question and a corresponding PubMed abstract are given as context and a yes/maybe/no answer must be produced. Unlike the rest of the tasks in this suite, PubMedQA is a closed-domain Q&A task.
* [MedQA](https://github.com/jind11/MedQA) - United States Medical Licensing Examination (USMLE) questions with 4 or 5 possible answers. Typically, only the 4-option questions are used.
* [MedMCQA](https://medmcqa.github.io/) - 4-option multiple-choice questions from Indian medical entrance examinations, >191k questions in total.
* [MMLU](https://arxiv.org/abs/2009.03300) - 4-option multiple-choice exam questions from a variety of domains. The following 6 domains are used here:
  * Anatomy
  * Clinical Knowledge
  * College Medicine
  * Medical Genetics
  * Professional Medicine
  * College Biology
Note that MultiMedQA also includes some short-form and long-form Q&A tasks (LiveQA, MedicationQA, HealthSearchQA). Evaluation of those tasks is typically carried out by human experts rather than automatically, so they are omitted here.
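For a quick sanity check of the data the closed-domain task draws on, here is a hedged sketch of loading PubMedQA (this assumes the Hugging Face dataset id `pubmed_qa` with the expert-labeled `pqa_labeled` config; the harness task may pin a different source or split):

```python
# Sketch: peek at PubMedQA's yes/maybe/no labels via Hugging Face datasets.
from datasets import load_dataset

ds = load_dataset("pubmed_qa", "pqa_labeled", split="train")
ex = ds[0]
print(ex["question"])        # the research question
print(ex["final_decision"])  # one of "yes", "no", "maybe"
```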
group: multimedqa
task:
  - pubmedqa
  - medmcqa
  - medqa_4options
  - task: mmlu_anatomy
    task_alias: "anatomy (mmlu)"
  - task: mmlu_clinical_knowledge
    task_alias: "clinical_knowledge (mmlu)"
  - task: mmlu_college_medicine
    task_alias: "college_medicine (mmlu)"
  - task: mmlu_medical_genetics
    task_alias: "medical_genetics (mmlu)"
  - task: mmlu_professional_medicine
    task_alias: "professional_medicine (mmlu)"
  - task: mmlu_college_biology
    task_alias: "college_biology (mmlu)"
group: openllm
group_alias: Open LLM Leaderboard
task:
  - task: arc_challenge
    fewshot_split: validation
    num_fewshot: 25
  - task: hellaswag
    fewshot_split: train
    num_fewshot: 10
  - task: truthfulqa
    num_fewshot: 0
  - task: mmlu
    num_fewshot: 5
  - task: winogrande
    fewshot_split: train
    num_fewshot: 5
  - task: gsm8k
    num_fewshot: 5
group: pythia
task:
  - lambada_openai
  - logiqa
  - piqa
  - sciq
  - wikitext
  - winogrande
  - wsc
  - ai2_arc
  - blimp
  - mmlu
group: t0_eval
task:
  # Coreference Resolution
  - dataset_path: super_glue
    dataset_name: wsc.fixed
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    output_type: generate_until
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  # Coreference Resolution
  - dataset_path: winogrande
    dataset_name: winogrande_xl
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    output_type: generate_until
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  # Natural Language Inference
  - dataset_path: super_glue
    dataset_name: cb
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    output_type: generate_until
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  - dataset_path: super_glue
    dataset_name: rte
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    output_type: generate_until
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  - task: anli_r1
    dataset_path: anli
    use_prompt: promptsource:*
    training_split: train_r1
    validation_split: dev_r1
    output_type: generate_until
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  - task: anli_r2
    dataset_path: anli
    use_prompt: promptsource:*
    training_split: train_r2
    validation_split: dev_r2
    output_type: generate_until
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  - task: anli_r3
    dataset_path: anli
    use_prompt: promptsource:*
    training_split: train_r3
    validation_split: dev_r3
    output_type: generate_until
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  # Sentence Completion
  - dataset_path: super_glue
    dataset_name: copa
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    output_type: generate_until
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  # Sentence Completion
  - dataset_path: hellaswag
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    output_type: generate_until
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
  # Word Sense Disambiguation
  - dataset_path: super_glue
    dataset_name: wic
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
    output_type: generate_until
    metric_list:
      - metric: exact_match
        aggregation: mean
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
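Every `t0_eval` entry generates text and scores it with exact match under the same normalization flags. A sketch of what those `metric_list` options amount to (my reading of the flags, not the harness's exact implementation):

```python
# Sketch of the normalization implied by ignore_case / ignore_punctuation.
import string

def exact_match(prediction: str, reference: str) -> float:
    # Lowercase both sides (ignore_case: true).
    prediction, reference = prediction.lower(), reference.lower()
    # Strip punctuation from both sides (ignore_punctuation: true).
    table = str.maketrans("", "", string.punctuation)
    prediction, reference = prediction.translate(table), reference.translate(table)
    return 1.0 if prediction.strip() == reference.strip() else 0.0

scores = [exact_match("Yes.", "yes"), exact_match("No", "yes")]
print(sum(scores) / len(scores))  # aggregation: mean -> 0.5
```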
# BigBench
### Paper
Title: `Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models`
Abstract: https://arxiv.org/abs/2206.04615
The Beyond the Imitation Game Benchmark (BIG-bench) is a collaborative benchmark intended to probe large language models and extrapolate their future capabilities.
Homepage: https://github.com/google/BIG-bench
### Citation
```
@misc{srivastava2022imitation,
title={Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models},
author={Aarohi Srivastava and Abhinav Rastogi and Abhishek Rao and Abu Awal Md Shoeb and Abubakar Abid and Adam Fisch and Adam R. Brown and Adam Santoro and Aditya Gupta and Adrià Garriga-Alonso and Agnieszka Kluska and Aitor Lewkowycz and Akshat Agarwal and Alethea Power and Alex Ray and Alex Warstadt and Alexander W. Kocurek and Ali Safaya and Ali Tazarv and Alice Xiang and Alicia Parrish and Allen Nie and Aman Hussain and Amanda Askell and Amanda Dsouza and Ambrose Slone and Ameet Rahane and Anantharaman S. Iyer and Anders Andreassen and Andrea Madotto and Andrea Santilli and Andreas Stuhlmüller and Andrew Dai and Andrew La and Andrew Lampinen and Andy Zou and Angela Jiang and Angelica Chen and Anh Vuong and Animesh Gupta and Anna Gottardi and Antonio Norelli and Anu Venkatesh and Arash Gholamidavoodi and Arfa Tabassum and Arul Menezes and Arun Kirubarajan and Asher Mullokandov and Ashish Sabharwal and Austin Herrick and Avia Efrat and Aykut Erdem and Ayla Karakaş and B. Ryan Roberts and Bao Sheng Loe and Barret Zoph and Bartłomiej Bojanowski and Batuhan Özyurt and Behnam Hedayatnia and Behnam Neyshabur and Benjamin Inden and Benno Stein and Berk Ekmekci and Bill Yuchen Lin and Blake Howald and Cameron Diao and Cameron Dour and Catherine Stinson and Cedrick Argueta and César Ferri Ramírez and Chandan Singh and Charles Rathkopf and Chenlin Meng and Chitta Baral and Chiyu Wu and Chris Callison-Burch and Chris Waites and Christian Voigt and Christopher D. Manning and Christopher Potts and Cindy Ramirez and Clara E. Rivera and Clemencia Siro and Colin Raffel and Courtney Ashcraft and Cristina Garbacea and Damien Sileo and Dan Garrette and Dan Hendrycks and Dan Kilman and Dan Roth and Daniel Freeman and Daniel Khashabi and Daniel Levy and Daniel Moseguí González and Danielle Perszyk and Danny Hernandez and Danqi Chen and Daphne Ippolito and Dar Gilboa and David Dohan and David Drakard and David Jurgens and Debajyoti Datta and Deep Ganguli and Denis Emelin and Denis Kleyko and Deniz Yuret and Derek Chen and Derek Tam and Dieuwke Hupkes and Diganta Misra and Dilyar Buzan and Dimitri Coelho Mollo and Diyi Yang and Dong-Ho Lee and Ekaterina Shutova and Ekin Dogus Cubuk and Elad Segal and Eleanor Hagerman and Elizabeth Barnes and Elizabeth Donoway and Ellie Pavlick and Emanuele Rodola and Emma Lam and Eric Chu and Eric Tang and Erkut Erdem and Ernie Chang and Ethan A. Chi and Ethan Dyer and Ethan Jerzak and Ethan Kim and Eunice Engefu Manyasi and Evgenii Zheltonozhskii and Fanyue Xia and Fatemeh Siar and Fernando Martínez-Plumed and Francesca Happé and Francois Chollet and Frieda Rong and Gaurav Mishra and Genta Indra Winata and Gerard de Melo and Germán Kruszewski and Giambattista Parascandolo and Giorgio Mariani and Gloria Wang and Gonzalo Jaimovitch-López and Gregor Betz and Guy Gur-Ari and Hana Galijasevic and Hannah Kim and Hannah Rashkin and Hannaneh Hajishirzi and Harsh Mehta and Hayden Bogar and Henry Shevlin and Hinrich Schütze and Hiromu Yakura and Hongming Zhang and Hugh Mee Wong and Ian Ng and Isaac Noble and Jaap Jumelet and Jack Geissinger and Jackson Kernion and Jacob Hilton and Jaehoon Lee and Jaime Fernández Fisac and James B. 
Simon and James Koppel and James Zheng and James Zou and Jan Kocoń and Jana Thompson and Jared Kaplan and Jarema Radom and Jascha Sohl-Dickstein and Jason Phang and Jason Wei and Jason Yosinski and Jekaterina Novikova and Jelle Bosscher and Jennifer Marsh and Jeremy Kim and Jeroen Taal and Jesse Engel and Jesujoba Alabi and Jiacheng Xu and Jiaming Song and Jillian Tang and Joan Waweru and John Burden and John Miller and John U. Balis and Jonathan Berant and Jörg Frohberg and Jos Rozen and Jose Hernandez-Orallo and Joseph Boudeman and Joseph Jones and Joshua B. Tenenbaum and Joshua S. Rule and Joyce Chua and Kamil Kanclerz and Karen Livescu and Karl Krauth and Karthik Gopalakrishnan and Katerina Ignatyeva and Katja Markert and Kaustubh D. Dhole and Kevin Gimpel and Kevin Omondi and Kory Mathewson and Kristen Chiafullo and Ksenia Shkaruta and Kumar Shridhar and Kyle McDonell and Kyle Richardson and Laria Reynolds and Leo Gao and Li Zhang and Liam Dugan and Lianhui Qin and Lidia Contreras-Ochando and Louis-Philippe Morency and Luca Moschella and Lucas Lam and Lucy Noble and Ludwig Schmidt and Luheng He and Luis Oliveros Colón and Luke Metz and Lütfi Kerem Şenel and Maarten Bosma and Maarten Sap and Maartje ter Hoeve and Maheen Farooqi and Manaal Faruqui and Mantas Mazeika and Marco Baturan and Marco Marelli and Marco Maru and Maria Jose Ramírez Quintana and Marie Tolkiehn and Mario Giulianelli and Martha Lewis and Martin Potthast and Matthew L. Leavitt and Matthias Hagen and Mátyás Schubert and Medina Orduna Baitemirova and Melody Arnaud and Melvin McElrath and Michael A. Yee and Michael Cohen and Michael Gu and Michael Ivanitskiy and Michael Starritt and Michael Strube and Michał Swędrowski and Michele Bevilacqua and Michihiro Yasunaga and Mihir Kale and Mike Cain and Mimee Xu and Mirac Suzgun and Mo Tiwari and Mohit Bansal and Moin Aminnaseri and Mor Geva and Mozhdeh Gheini and Mukund Varma T and Nanyun Peng and Nathan Chi and Nayeon Lee and Neta Gur-Ari Krakover and Nicholas Cameron and Nicholas Roberts and Nick Doiron and Nikita Nangia and Niklas Deckers and Niklas Muennighoff and Nitish Shirish Keskar and Niveditha S. Iyer and Noah Constant and Noah Fiedel and Nuan Wen and Oliver Zhang and Omar Agha and Omar Elbaghdadi and Omer Levy and Owain Evans and Pablo Antonio Moreno Casares and Parth Doshi and Pascale Fung and Paul Pu Liang and Paul Vicol and Pegah Alipoormolabashi and Peiyuan Liao and Percy Liang and Peter Chang and Peter Eckersley and Phu Mon Htut and Pinyu Hwang and Piotr Miłkowski and Piyush Patil and Pouya Pezeshkpour and Priti Oli and Qiaozhu Mei and Qing Lyu and Qinlang Chen and Rabin Banjade and Rachel Etta Rudolph and Raefer Gabriel and Rahel Habacker and Ramón Risco Delgado and Raphaël Millière and Rhythm Garg and Richard Barnes and Rif A. Saurous and Riku Arakawa and Robbe Raymaekers and Robert Frank and Rohan Sikand and Roman Novak and Roman Sitelew and Ronan LeBras and Rosanne Liu and Rowan Jacobs and Rui Zhang and Ruslan Salakhutdinov and Ryan Chi and Ryan Lee and Ryan Stovall and Ryan Teehan and Rylan Yang and Sahib Singh and Saif M. Mohammad and Sajant Anand and Sam Dillavou and Sam Shleifer and Sam Wiseman and Samuel Gruetter and Samuel R. Bowman and Samuel S. Schoenholz and Sanghyun Han and Sanjeev Kwatra and Sarah A. 
Rous and Sarik Ghazarian and Sayan Ghosh and Sean Casey and Sebastian Bischoff and Sebastian Gehrmann and Sebastian Schuster and Sepideh Sadeghi and Shadi Hamdan and Sharon Zhou and Shashank Srivastava and Sherry Shi and Shikhar Singh and Shima Asaadi and Shixiang Shane Gu and Shubh Pachchigar and Shubham Toshniwal and Shyam Upadhyay and Shyamolima and Debnath and Siamak Shakeri and Simon Thormeyer and Simone Melzi and Siva Reddy and Sneha Priscilla Makini and Soo-Hwan Lee and Spencer Torene and Sriharsha Hatwar and Stanislas Dehaene and Stefan Divic and Stefano Ermon and Stella Biderman and Stephanie Lin and Stephen Prasad and Steven T. Piantadosi and Stuart M. Shieber and Summer Misherghi and Svetlana Kiritchenko and Swaroop Mishra and Tal Linzen and Tal Schuster and Tao Li and Tao Yu and Tariq Ali and Tatsu Hashimoto and Te-Lin Wu and Théo Desbordes and Theodore Rothschild and Thomas Phan and Tianle Wang and Tiberius Nkinyili and Timo Schick and Timofei Kornev and Timothy Telleen-Lawton and Titus Tunduny and Tobias Gerstenberg and Trenton Chang and Trishala Neeraj and Tushar Khot and Tyler Shultz and Uri Shaham and Vedant Misra and Vera Demberg and Victoria Nyamai and Vikas Raunak and Vinay Ramasesh and Vinay Uday Prabhu and Vishakh Padmakumar and Vivek Srikumar and William Fedus and William Saunders and William Zhang and Wout Vossen and Xiang Ren and Xiaoyu Tong and Xinran Zhao and Xinyi Wu and Xudong Shen and Yadollah Yaghoobzadeh and Yair Lakretz and Yangqiu Song and Yasaman Bahri and Yejin Choi and Yichi Yang and Yiding Hao and Yifu Chen and Yonatan Belinkov and Yu Hou and Yufang Hou and Yuntao Bai and Zachary Seid and Zhuoye Zhao and Zijian Wang and Zijie J. Wang and Zirui Wang and Ziyi Wu},
year={2022},
eprint={2206.04615},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
* `group_name`: `Short description`
#### Tasks
* `task_name`: `1-sentence description of what this particular task does`
* `task_name2`: ...
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import os

import yaml

all_subtasks = [
    "abstract_narrative_understanding",
    "anachronisms",
    "analogical_similarity",
    "analytic_entailment",
    "arithmetic",
    "ascii_word_recognition",
    "authorship_verification",
    "auto_categorization",
    "auto_debugging",
    "bbq_lite_json",
    "bridging_anaphora_resolution_barqa",
    "causal_judgment",
    "cause_and_effect",
    "checkmate_in_one",
    "chess_state_tracking",
    "chinese_remainder_theorem",
    "cifar10_classification",
    "code_line_description",
    "codenames",
    "color",
    "common_morpheme",
    "conceptual_combinations",
    "conlang_translation",
    "contextual_parametric_knowledge_conflicts",
    "crash_blossom",
    "crass_ai",
    "cryobiology_spanish",
    "cryptonite",
    "cs_algorithms",
    "dark_humor_detection",
    "date_understanding",
    "disambiguation_qa",
    "discourse_marker_prediction",
    "disfl_qa",
    "dyck_languages",
    "elementary_math_qa",
    "emoji_movie",
    "emojis_emotion_prediction",
    "empirical_judgments",
    "english_proverbs",
    "english_russian_proverbs",
    "entailed_polarity",
    "entailed_polarity_hindi",
    "epistemic_reasoning",
    "evaluating_information_essentiality",
    "fact_checker",
    "fantasy_reasoning",
    "few_shot_nlg",
    "figure_of_speech_detection",
    "formal_fallacies_syllogisms_negation",
    "gem",
    "gender_inclusive_sentences_german",
    "general_knowledge",
    "geometric_shapes",
    "goal_step_wikihow",
    "gre_reading_comprehension",
    "hhh_alignment",
    "hindi_question_answering",
    "hindu_knowledge",
    "hinglish_toxicity",
    "human_organs_senses",
    "hyperbaton",
    "identify_math_theorems",
    "identify_odd_metaphor",
    "implicatures",
    "implicit_relations",
    "intent_recognition",
    "international_phonetic_alphabet_nli",
    "international_phonetic_alphabet_transliterate",
    "intersect_geometry",
    "irony_identification",
    "kanji_ascii",
    "kannada",
    "key_value_maps",
    "known_unknowns",
    "language_games",
    "language_identification",
    "linguistic_mappings",
    "linguistics_puzzles",
    "list_functions",
    "logic_grid_puzzle",
    "logical_args",
    "logical_deduction",
    "logical_fallacy_detection",
    "logical_sequence",
    "mathematical_induction",
    "matrixshapes",
    "metaphor_boolean",
    "metaphor_understanding",
    "minute_mysteries_qa",
    "misconceptions",
    "misconceptions_russian",
    "mnist_ascii",
    "modified_arithmetic",
    "moral_permissibility",
    "movie_dialog_same_or_different",
    "movie_recommendation",
    "mult_data_wrangling",
    "multiemo",
    "natural_instructions",
    "navigate",
    "nonsense_words_grammar",
    "novel_concepts",
    "object_counting",
    "odd_one_out",
    "operators",
    "paragraph_segmentation",
    "parsinlu_qa",
    "parsinlu_reading_comprehension",
    "penguins_in_a_table",
    "periodic_elements",
    "persian_idioms",
    "phrase_relatedness",
    "physical_intuition",
    "physics",
    "physics_questions",
    "play_dialog_same_or_different",
    "polish_sequence_labeling",
    "presuppositions_as_nli",
    "qa_wikidata",
    "question_selection",
    "real_or_fake_text",
    "reasoning_about_colored_objects",
    "repeat_copy_logic",
    "rephrase",
    "riddle_sense",
    "ruin_names",
    "salient_translation_error_detection",
    "scientific_press_release",
    "semantic_parsing_in_context_sparc",
    "semantic_parsing_spider",
    "sentence_ambiguity",
    "similarities_abstraction",
    "simp_turing_concept",
    "simple_arithmetic_json",
    "simple_arithmetic_json_multiple_choice",
    "simple_arithmetic_json_subtasks",
    "simple_arithmetic_multiple_targets_json",
    "simple_ethical_questions",
    "simple_text_editing",
    "snarks",
    "social_iqa",
    "social_support",
    "sports_understanding",
    "strange_stories",
    "strategyqa",
    "sufficient_information",
    "suicide_risk",
    "swahili_english_proverbs",
    "swedish_to_german_proverbs",
    "symbol_interpretation",
    "temporal_sequences",
    "tense",
    "timedial",
    "topical_chat",
    "tracking_shuffled_objects",
    "understanding_fables",
    "undo_permutation",
    "unit_conversion",
    "unit_interpretation",
    "unnatural_in_context_learning",
    "vitaminc_fact_verification",
    "what_is_the_tao",
    "which_wiki_edit",
    "winowhy",
    "word_sorting",
    "word_unscrambling",
]


def main() -> None:
    # Emit one YAML config per subtask, once for each output type.
    for path, task_type in zip(
        ["multiple_choice", "generate_until"],
        ["multiple_choice_template_yaml", "generate_until_template_yaml"],
    ):
        os.makedirs(path, exist_ok=True)
        for task in all_subtasks:
            file_name = f"{task}.yaml"
            try:
                with open(f"{path}/{file_name}", "w", encoding="utf-8") as f:
                    f.write("# Generated by utils.py\n")
                    # Each generated config includes the shared template for its
                    # output type and selects the zero-shot dataset variant.
                    yaml.dump(
                        {
                            "include": f"../{task_type}",
                            "task": "bigbench_"
                            + task
                            + "_{}".format(task_type.split("_template_yaml")[0]),
                            "dataset_name": task
                            + "_zero_shot",  # zero-shot version of the dataset
                        },
                        f,
                        width=float("inf"),
                        allow_unicode=True,
                    )
            except FileExistsError:
                pass


if __name__ == "__main__":
    main()
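Running `python utils.py` creates the `multiple_choice/` and `generate_until/` directories next to the script and writes one small config per subtask into each; the `# Generated by utils.py` files below are representative `generate_until` outputs.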
# Generated by utils.py
dataset_name: abstract_narrative_understanding_zero_shot
include: ../generate_until_template_yaml
task: bigbench_abstract_narrative_understanding_generate_until
# Generated by utils.py
dataset_name: anachronisms_zero_shot
include: ../generate_until_template_yaml
task: bigbench_anachronisms_generate_until
# Generated by utils.py
dataset_name: analogical_similarity_zero_shot
include: ../generate_until_template_yaml
task: bigbench_analogical_similarity_generate_until
# Generated by utils.py
dataset_name: analytic_entailment_zero_shot
include: ../generate_until_template_yaml
task: bigbench_analytic_entailment_generate_until
# Generated by utils.py
dataset_name: arithmetic_zero_shot
include: ../generate_until_template_yaml
task: bigbench_arithmetic_generate_until
# Generated by utils.py
dataset_name: ascii_word_recognition_zero_shot
include: ../generate_until_template_yaml
task: bigbench_ascii_word_recognition_generate_until
# Generated by utils.py
dataset_name: authorship_verification_zero_shot
include: ../generate_until_template_yaml
task: bigbench_authorship_verification_generate_until
# Generated by utils.py
dataset_name: auto_categorization_zero_shot
include: ../generate_until_template_yaml
task: bigbench_auto_categorization_generate_until
# Generated by utils.py
dataset_name: auto_debugging_zero_shot
include: ../generate_until_template_yaml
task: bigbench_auto_debugging_generate_until
# Generated by utils.py
dataset_name: bbq_lite_json_zero_shot
include: ../generate_until_template_yaml
task: bigbench_bbq_lite_json_generate_until