Merge pull request #816 from EleutherAI/flan-benchmark

[Refactor] Flan benchmark

Merge pull request #816 from EleutherAI/flan-benchmark
[Refactor] Flan benchmark
6769119f · Hailey Schoelkopf · GitHub · 4824a832 · 7d5e511c · 6769119f
Unverified Commit 6769119f authored Oct 06, 2023 by Hailey Schoelkopf Committed by GitHub Oct 06, 2023
20 changed files
--- a/lm_eval/tasks/benchmarks/flan/flan_anli.yaml
+++ b/lm_eval/tasks/benchmarks/flan/flan_anli.yaml
+# FLAN held-in ANLI group: one entry per ANLI round (r1, r2, r3). Each entry
+# includes the shared held-in template, reads from the `anli` dataset and
+# evaluates on that round's dev split.
+# NOTE(review): the trailing `:*` in use_prompt presumably selects every
+# prompt defined in the referenced file -- confirm against the prompt loader.
+group: flan_anli
+task:
+  - include: yaml_templates/held_in_template_yaml
+    task: anli_r1
+    dataset_path: anli
+    use_prompt: prompt_templates/anli.yaml:*
+    validation_split: dev_r1
+  - include: yaml_templates/held_in_template_yaml
+    task: anli_r2
+    dataset_path: anli
+    use_prompt: prompt_templates/anli.yaml:*
+    validation_split: dev_r2
+  - include: yaml_templates/held_in_template_yaml
+    task: anli_r3
+    dataset_path: anli
+    use_prompt: prompt_templates/anli.yaml:*
+    validation_split: dev_r3
--- a/lm_eval/tasks/benchmarks/flan/flan_arc.yaml
+++ b/lm_eval/tasks/benchmarks/flan/flan_arc.yaml
+# FLAN held-in ARC group: two entries, one per `ai2_arc` subset (ARC-Easy and
+# ARC-Challenge). Both include the shared held-in template, use the prompts
+# from prompt_templates/arc.yaml and evaluate on the `validation` split.
+group: flan_arc
+task:
+  - include: yaml_templates/held_in_template_yaml
+    task: arc_easy
+    dataset_path: ai2_arc
+    dataset_name: ARC-Easy
+    use_prompt: prompt_templates/arc.yaml:*
+    validation_split: validation
+  - include: yaml_templates/held_in_template_yaml
+    task: arc_challenge
+    dataset_path: ai2_arc
+    dataset_name: ARC-Challenge
+    use_prompt: prompt_templates/arc.yaml:*
+    validation_split: validation
--- a/lm_eval/tasks/benchmarks/flan/flan_boolq.yaml
+++ b/lm_eval/tasks/benchmarks/flan/flan_boolq.yaml
+# FLAN held-in BoolQ group: a single entry on the `boolq` subset of
+# `super_glue`, using the prompts from prompt_templates/boolq.yaml.
+# NOTE(review): unlike the anli/arc entries this one sets no explicit `task:`
+# name -- presumably the harness derives one; confirm.
+group: flan_boolq
+task:
+  - include: yaml_templates/held_in_template_yaml
+    dataset_path: super_glue
+    dataset_name: boolq
+    use_prompt: prompt_templates/boolq.yaml:*
+    validation_split: validation
--- a/lm_eval/tasks/benchmarks/flan/flan_cot.yaml
+++ b/lm_eval/tasks/benchmarks/flan/flan_cot.yaml
+# FLAN chain-of-thought group: datasets evaluated with the shared CoT template
+# (yaml_templates/cot_template_yaml), which extracts a numeric answer.
+group: flan_cot
+task:
+  - include: yaml_templates/cot_template_yaml
+    # Fixed: the dataset id was misspelled `gsmk` and carried a copy-pasted
+    # `boolq` subset. The Hugging Face Hub id is `gsm8k`, whose configs are
+    # `main` and `socratic`.
+    dataset_path: gsm8k
+    dataset_name: main
+    use_prompt: promptsource:*
+    # NOTE(review): gsm8k publishes only train/test splits -- confirm that
+    # `validation` resolves here, or point this entry at the test split.
+    validation_split: validation
+  - include: yaml_templates/cot_template_yaml
+    dataset_path: EleutherAI/asdiv
+    use_prompt: promptsource:*
+    validation_split: validation
--- a/lm_eval/tasks/benchmarks/flan/flan_held_in.yaml
+++ b/lm_eval/tasks/benchmarks/flan/flan_held_in.yaml
+# Aggregate FLAN held-in group. Members are referenced by group name; each is
+# declared in its own file next to this one (flan_boolq.yaml, flan_rte.yaml,
+# flan_anli.yaml, flan_arc.yaml).
+group: flan_held_in
+task:
+  - flan_boolq
+  - flan_rte
+  - flan_anli
+  - flan_arc
--- a/lm_eval/tasks/benchmarks/flan/flan_held_in_yaml
+++ b/lm_eval/tasks/benchmarks/flan/flan_held_in_yaml
+# Inlined variant of the FLAN held-in group: the same BoolQ, RTE, ANLI and ARC
+# entries spelled out in one file, with include/use_prompt paths prefixed by
+# `flan/` (i.e. relative to the parent benchmarks directory).
+# NOTE(review): this declares the same group name as flan_held_in.yaml. The
+# file name ends in `_yaml` rather than `.yaml`, so it is presumably not picked
+# up by task discovery -- confirm, otherwise the two definitions collide.
+group: flan_held_in
+task:
+  - include: flan/yaml_templates/held_in_template_yaml
+    dataset_path: super_glue
+    dataset_name: boolq
+    use_prompt: flan/prompt_templates/boolq.yaml:*
+    validation_split: validation
+  - include: flan/yaml_templates/held_in_template_yaml
+    dataset_path: super_glue
+    dataset_name: rte
+    use_prompt: flan/prompt_templates/rte.yaml:*
+    validation_split: validation
+  - include: flan/yaml_templates/held_in_template_yaml
+    task: anli_r1
+    dataset_path: anli
+    use_prompt: flan/prompt_templates/anli.yaml:*
+    validation_split: dev_r1
+  - include: flan/yaml_templates/held_in_template_yaml
+    task: anli_r2
+    dataset_path: anli
+    use_prompt: flan/prompt_templates/anli.yaml:*
+    validation_split: dev_r2
+  - include: flan/yaml_templates/held_in_template_yaml
+    task: anli_r3
+    dataset_path: anli
+    use_prompt: flan/prompt_templates/anli.yaml:*
+    validation_split: dev_r3
+  - include: flan/yaml_templates/held_in_template_yaml
+    task: arc_easy
+    dataset_path: ai2_arc
+    dataset_name: ARC-Easy
+    use_prompt: flan/prompt_templates/arc.yaml:*
+    validation_split: validation
+  - include: flan/yaml_templates/held_in_template_yaml
+    task: arc_challenge
+    dataset_path: ai2_arc
+    dataset_name: ARC-Challenge
+    use_prompt: flan/prompt_templates/arc.yaml:*
+    validation_split: validation
--- a/lm_eval/tasks/benchmarks/flan/flan_held_out.yaml
+++ b/lm_eval/tasks/benchmarks/flan/flan_held_out.yaml
+# Aggregate FLAN held-out group. Members are referenced by name only; their
+# definitions are not part of this directory listing.
+group: flan_held_out
+task:
+  # BBH
+  - bbh_flan_zeroshot
+  - bbh_flan_fewshot
+  - bbh_flan_cot_fewshot
+  - bbh_flan_cot_zeroshot
+  # MMLU
+  - mmlu
+  - mmlu_flan_n_shot_generative
+  - mmlu_flan_n_shot_loglikelihood
+  - mmlu_flan_cot_zeroshot
+  - mmlu_flan_cot_fewshot
--- a/lm_eval/tasks/benchmarks/flan/flan_rte.yaml
+++ b/lm_eval/tasks/benchmarks/flan/flan_rte.yaml
+# FLAN held-in RTE group: a single entry on the `rte` subset of `super_glue`,
+# using the prompts from prompt_templates/rte.yaml.
+# NOTE(review): no explicit `task:` name is set here -- presumably the harness
+# derives one; confirm.
+group: flan_rte
+task:
+  - include: yaml_templates/held_in_template_yaml
+    dataset_path: super_glue
+    dataset_name: rte
+    use_prompt: prompt_templates/rte.yaml:*
+    validation_split: validation
--- a/lm_eval/tasks/benchmarks/flan/prompt_templates/anli.yaml
+++ b/lm_eval/tasks/benchmarks/flan/prompt_templates/anli.yaml
+# Flan Prompt Templates
+# Nine ANLI prompt variants (template-0 .. template-8). Each doc_to_text renders
+# the document's `premise` and `hypothesis` fields followed by a fixed
+# three-way OPTIONS list; each doc_to_target indexes
+# ["Yes", "It's impossible to say", "No"] by the document's `label`.
+# NOTE(review): assumes label 0/1/2 means entailment/neutral/contradiction --
+# confirm against the dataset.
+prompts:
+  "template-0":
+    doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+  "template-1":
+    doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+  "template-2":
+    doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+  "template-3":
+    doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+  "template-4":
+    doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+  "template-5":
+    doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+  "template-6":
+    doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+  "template-7":
+    doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+  "template-8":
+    doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
--- a/lm_eval/tasks/benchmarks/flan/prompt_templates/arc.yaml
+++ b/lm_eval/tasks/benchmarks/flan/prompt_templates/arc.yaml
+# Flan Prompt Templates
+# Seven ARC prompt variants (template-0 .. template-6). Each doc_to_text renders
+# the `question` and an OPTIONS list built by joining `choices.text` with
+# "\n- "; each doc_to_target finds the position of `answerKey` in
+# `choices.label` and emits the choice text at that position.
+prompts:
+  "template-0":
+    doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+  "template-1":
+    doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
+    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+  "template-2":
+    doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+  "template-3":
+    doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
+    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+  "template-4":
+    doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+  "template-5":
+    doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+  "template-6":
+    doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
--- a/lm_eval/tasks/benchmarks/flan/prompt_templates/boolq.yaml
+++ b/lm_eval/tasks/benchmarks/flan/prompt_templates/boolq.yaml
+# Flan Prompt Templates
+# Ten BoolQ prompt variants (template-0 .. template-9). Each doc_to_text renders
+# the `passage` and `question` fields followed by a fixed no/yes OPTIONS list;
+# each doc_to_target indexes ['no', 'yes'] by the document's `label`.
+# NOTE(review): the prompt wording (including "supposed by the text" in
+# template-6) presumably mirrors the upstream FLAN templates verbatim -- do not
+# "correct" it without checking upstream.
+prompts:
+  "template-0":
+    doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
+    doc_to_target: "{{['no', 'yes'][label]}}"
+  "template-1":
+    doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
+    doc_to_target: "{{['no', 'yes'][label]}}"
+  "template-2":
+    doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
+    doc_to_target: "{{['no', 'yes'][label]}}"
+  "template-3":
+    doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
+    doc_to_target: "{{['no', 'yes'][label]}}"
+  "template-4":
+    doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
+    doc_to_target: "{{['no', 'yes'][label]}}"
+  "template-5":
+    doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
+    doc_to_target: "{{['no', 'yes'][label]}}"
+  "template-6":
+    doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
+    doc_to_target: "{{['no', 'yes'][label]}}"
+  "template-7":
+    doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
+    doc_to_target: "{{['no', 'yes'][label]}}"
+  "template-8":
+    # Variant with a leading {{title}} field, kept commented out; the active
+    # line below omits the title.
+    # doc_to_text: "{{title}}\n\n{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
+    doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
+    doc_to_target: "{{['no', 'yes'][label]}}"
+  "template-9":
+    doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
+    doc_to_target: "{{['no', 'yes'][label]}}"
--- a/lm_eval/tasks/benchmarks/flan/prompt_templates/rte.yaml
+++ b/lm_eval/tasks/benchmarks/flan/prompt_templates/rte.yaml
+# Flan Prompt Templates
+# Nine RTE prompt variants (template-0 .. template-8). Each doc_to_text renders
+# the `premise` and `hypothesis` fields together with a fixed yes/no OPTIONS
+# list; each doc_to_target indexes ['yes', 'no'] by the document's `label`.
+# NOTE(review): assumes label 0 = entailment and 1 = not entailment -- confirm
+# against the dataset.
+prompts:
+  "template-0":
+    doc_to_text: "{{premise}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- yes\n- no"
+    doc_to_target: "{{['yes', 'no'][label]}}"
+  "template-1":
+    doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
+    doc_to_target: "{{['yes', 'no'][label]}}"
+  "template-2":
+    doc_to_text: "{{premise}}\n\nQ with options: Can we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
+    doc_to_target: "{{['yes', 'no'][label]}}"
+  "template-3":
+    doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
+    doc_to_target: "{{['yes', 'no'][label]}}"
+  "template-4":
+    doc_to_text: "{{premise}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{hypothesis}}"
+    doc_to_target: "{{['yes', 'no'][label]}}"
+  "template-5":
+    doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nThe answer is"
+    doc_to_target: "{{['yes', 'no'][label]}}"
+  "template-6":
+    doc_to_text: "Read the text and determine if the sentence is true:\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:"
+    doc_to_target: "{{['yes', 'no'][label]}}"
+  "template-7":
+    doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:"
+    doc_to_target: "{{['yes', 'no'][label]}}"
+  "template-8":
+    doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- yes\n- no"
+    doc_to_target: "{{['yes', 'no'][label]}}"
--- a/lm_eval/tasks/benchmarks/flan/yaml_templates/cot_template_yaml
+++ b/lm_eval/tasks/benchmarks/flan/yaml_templates/cot_template_yaml
+# Shared template for the FLAN chain-of-thought entries (included from
+# flan_cot.yaml). Generation is greedy and stops at the first blank line; the
+# target is the document's `answer` field; scoring is mean exact_match.
+# NOTE(review): the group here is spelled `flan-cot` (hyphen) while
+# flan_cot.yaml declares `flan_cot` (underscore) -- confirm which one wins.
+group: flan-cot
+output_type: greedy_until
+validation_split: validation
+doc_to_target: "{{answer}}"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+generation_kwargs:
+  until:
+    - "\n\n"
+  do_sample: false
+  temperature: 0.0
+# Post-processing: capture the number following "The answer is" (optional
+# leading minus; digits, dots and commas), then apply `take_first`.
+filter_list:
+  - name: "get-answer"
+    filter:
+      - function: "regex"
+        regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
+      - function: "take_first"
--- a/lm_eval/tasks/benchmarks/flan/yaml_templates/held_in_template_yaml
+++ b/lm_eval/tasks/benchmarks/flan/yaml_templates/held_in_template_yaml
+# Shared template for the FLAN held-in entries (included from flan_anli,
+# flan_arc, flan_boolq and flan_rte). Generation is greedy and stops at the
+# literal string "</s>"; scoring is mean exact_match.
+# NOTE(review): entries that include this template set their own
+# validation_split (e.g. dev_r1), presumably overriding the default below.
+output_type: greedy_until
+validation_split: validation
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+generation_kwargs:
+  until:
+    - "</s>"
+  do_sample: false
+  temperature: 0.0
--- a/lm_eval/tasks/benchmarks/t0_eval.yaml
+++ b/lm_eval/tasks/benchmarks/t0_eval.yaml
@@ -6,6 +6,7 @@ task:
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
+    output_type: greedy_until
    metric_list:
      - metric: exact_match
        aggregation: mean
@@ -18,6 +19,7 @@ task:
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
+    output_type: greedy_until
    metric_list:
      - metric: exact_match
        aggregation: mean
@@ -42,6 +44,7 @@ task:
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
+    output_type: greedy_until
    metric_list:
      - metric: exact_match
        aggregation: mean
@@ -53,6 +56,7 @@ task:
    use_prompt: promptsource:*
    training_split: train_r1
    validation_split: dev_r1
+    output_type: greedy_until
    metric_list:
      - metric: exact_match
        aggregation: mean
@@ -64,6 +68,7 @@ task:
    use_prompt: promptsource:*
    training_split: train_r2
    validation_split: dev_r2
+    output_type: greedy_until
    metric_list:
      - metric: exact_match
        aggregation: mean
@@ -75,6 +80,7 @@ task:
    use_prompt: promptsource:*
    training_split: train_r3
    validation_split: dev_r3
+    output_type: greedy_until
    metric_list:
      - metric: exact_match
        aggregation: mean
@@ -87,6 +93,7 @@ task:
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
+    output_type: greedy_until
    metric_list:
      - metric: exact_match
        aggregation: mean
@@ -98,6 +105,7 @@ task:
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
+    output_type: greedy_until
    metric_list:
      - metric: exact_match
        aggregation: mean
@@ -110,6 +118,7 @@ task:
    use_prompt: promptsource:*
    training_split: train
    validation_split: validation
+    output_type: greedy_until
    metric_list:
      - metric: exact_match
        aggregation: mean

--- a/lm_eval/tasks/bigbench/README.md
+++ b/lm_eval/tasks/bigbench/README.md
@@ -6,7 +6,7 @@ Title: `Beyond the Imitation Game: Quantifying and extrapolating the capabilitie

 Abstract: https://arxiv.org/abs/2206.04615

-The Beyond the Imitation Game Benchmark (BIG-bench) is a collaborative benchmark intended to probe large language models and extrapolate their future capabilities. 
+The Beyond the Imitation Game Benchmark (BIG-bench) is a collaborative benchmark intended to probe large language models and extrapolate their future capabilities.

 Homepage: https://github.com/google/BIG-bench


--- a/lm_eval/tasks/bigbench/generate_tasks.py
+++ b/lm_eval/tasks/bigbench/generate_tasks.py
@@ -2,179 +2,182 @@ import os
 import yaml

 all_subtasks = [
-    'abstract_narrative_understanding',
-    'anachronisms',
-    'analogical_similarity',
-    'analytic_entailment',
-    'arithmetic',
-    'ascii_word_recognition',
-    'authorship_verification',
-    'auto_categorization',
-    'auto_debugging',
-    'bbq_lite_json',
-    'bridging_anaphora_resolution_barqa',
-    'causal_judgment',
-    'cause_and_effect',
-    'checkmate_in_one',
-    'chess_state_tracking',
-    'chinese_remainder_theorem',
-    'cifar10_classification',
-    'code_line_description',
-    'codenames',
-    'color',
-    'common_morpheme',
-    'conceptual_combinations',
-    'conlang_translation',
-    'contextual_parametric_knowledge_conflicts',
-    'crash_blossom',
-    'crass_ai',
-    'cryobiology_spanish',
-    'cryptonite',
-    'cs_algorithms',
-    'dark_humor_detection',
-    'date_understanding',
-    'disambiguation_qa',
-    'discourse_marker_prediction',
-    'disfl_qa',
-    'dyck_languages',
-    'elementary_math_qa',
-    'emoji_movie',
-    'emojis_emotion_prediction',
-    'empirical_judgments',
-    'english_proverbs',
-    'english_russian_proverbs',
-    'entailed_polarity',
-    'entailed_polarity_hindi',
-    'epistemic_reasoning',
-    'evaluating_information_essentiality',
-    'fact_checker',
-    'fantasy_reasoning',
-    'few_shot_nlg',
-    'figure_of_speech_detection',
-    'formal_fallacies_syllogisms_negation',
-    'gem',
-    'gender_inclusive_sentences_german',
-    'general_knowledge',
-    'geometric_shapes',
-    'goal_step_wikihow',
-    'gre_reading_comprehension',
-    'hhh_alignment',
-    'hindi_question_answering',
-    'hindu_knowledge',
-    'hinglish_toxicity',
-    'human_organs_senses',
-    'hyperbaton',
-    'identify_math_theorems',
-    'identify_odd_metaphor',
-    'implicatures',
-    'implicit_relations',
-    'intent_recognition',
-    'international_phonetic_alphabet_nli',
-    'international_phonetic_alphabet_transliterate',
-    'intersect_geometry',
-    'irony_identification',
-    'kanji_ascii',
-    'kannada',
-    'key_value_maps',
-    'known_unknowns',
-    'language_games',
-    'language_identification',
-    'linguistic_mappings',
-    'linguistics_puzzles',
-    'list_functions',
-    'logic_grid_puzzle',
-    'logical_args',
-    'logical_deduction',
-    'logical_fallacy_detection',
-    'logical_sequence',
-    'mathematical_induction',
-    'matrixshapes',
-    'metaphor_boolean',
-    'metaphor_understanding',
-    'minute_mysteries_qa',
-    'misconceptions',
-    'misconceptions_russian',
-    'mnist_ascii',
-    'modified_arithmetic',
-    'moral_permissibility',
-    'movie_dialog_same_or_different',
-    'movie_recommendation',
-    'mult_data_wrangling',
-    'multiemo',
-    'natural_instructions',
-    'navigate',
-    'nonsense_words_grammar',
-    'novel_concepts',
-    'object_counting',
-    'odd_one_out',
-    'operators',
-    'paragraph_segmentation',
-    'parsinlu_qa',
-    'parsinlu_reading_comprehension',
-    'penguins_in_a_table',
-    'periodic_elements',
-    'persian_idioms',
-    'phrase_relatedness',
-    'physical_intuition',
-    'physics',
-    'physics_questions',
-    'play_dialog_same_or_different',
-    'polish_sequence_labeling',
-    'presuppositions_as_nli',
-    'qa_wikidata',
-    'question_selection',
-    'real_or_fake_text',
-    'reasoning_about_colored_objects',
-    'repeat_copy_logic',
-    'rephrase',
-    'riddle_sense',
-    'ruin_names',
-    'salient_translation_error_detection',
-    'scientific_press_release',
-    'semantic_parsing_in_context_sparc',
-    'semantic_parsing_spider',
-    'sentence_ambiguity',
-    'similarities_abstraction',
-    'simp_turing_concept',
-    'simple_arithmetic_json',
-    'simple_arithmetic_json_multiple_choice',
-    'simple_arithmetic_json_subtasks',
-    'simple_arithmetic_multiple_targets_json',
-    'simple_ethical_questions',
-    'simple_text_editing',
-    'snarks',
-    'social_iqa',
-    'social_support',
-    'sports_understanding',
-    'strange_stories',
-    'strategyqa',
-    'sufficient_information',
-    'suicide_risk',
-    'swahili_english_proverbs',
-    'swedish_to_german_proverbs',
-    'symbol_interpretation',
-    'temporal_sequences',
-    'tense',
-    'timedial',
-    'topical_chat',
-    'tracking_shuffled_objects',
-    'understanding_fables',
-    'undo_permutation',
-    'unit_conversion',
-    'unit_interpretation',
-    'unnatural_in_context_learning',
-    'vitaminc_fact_verification',
-    'what_is_the_tao',
-    'which_wiki_edit',
-    'winowhy',
-    'word_sorting',
-    'word_unscrambling'
-    ]
+    "abstract_narrative_understanding",
+    "anachronisms",
+    "analogical_similarity",
+    "analytic_entailment",
+    "arithmetic",
+    "ascii_word_recognition",
+    "authorship_verification",
+    "auto_categorization",
+    "auto_debugging",
+    "bbq_lite_json",
+    "bridging_anaphora_resolution_barqa",
+    "causal_judgment",
+    "cause_and_effect",
+    "checkmate_in_one",
+    "chess_state_tracking",
+    "chinese_remainder_theorem",
+    "cifar10_classification",
+    "code_line_description",
+    "codenames",
+    "color",
+    "common_morpheme",
+    "conceptual_combinations",
+    "conlang_translation",
+    "contextual_parametric_knowledge_conflicts",
+    "crash_blossom",
+    "crass_ai",
+    "cryobiology_spanish",
+    "cryptonite",
+    "cs_algorithms",
+    "dark_humor_detection",
+    "date_understanding",
+    "disambiguation_qa",
+    "discourse_marker_prediction",
+    "disfl_qa",
+    "dyck_languages",
+    "elementary_math_qa",
+    "emoji_movie",
+    "emojis_emotion_prediction",
+    "empirical_judgments",
+    "english_proverbs",
+    "english_russian_proverbs",
+    "entailed_polarity",
+    "entailed_polarity_hindi",
+    "epistemic_reasoning",
+    "evaluating_information_essentiality",
+    "fact_checker",
+    "fantasy_reasoning",
+    "few_shot_nlg",
+    "figure_of_speech_detection",
+    "formal_fallacies_syllogisms_negation",
+    "gem",
+    "gender_inclusive_sentences_german",
+    "general_knowledge",
+    "geometric_shapes",
+    "goal_step_wikihow",
+    "gre_reading_comprehension",
+    "hhh_alignment",
+    "hindi_question_answering",
+    "hindu_knowledge",
+    "hinglish_toxicity",
+    "human_organs_senses",
+    "hyperbaton",
+    "identify_math_theorems",
+    "identify_odd_metaphor",
+    "implicatures",
+    "implicit_relations",
+    "intent_recognition",
+    "international_phonetic_alphabet_nli",
+    "international_phonetic_alphabet_transliterate",
+    "intersect_geometry",
+    "irony_identification",
+    "kanji_ascii",
+    "kannada",
+    "key_value_maps",
+    "known_unknowns",
+    "language_games",
+    "language_identification",
+    "linguistic_mappings",
+    "linguistics_puzzles",
+    "list_functions",
+    "logic_grid_puzzle",
+    "logical_args",
+    "logical_deduction",
+    "logical_fallacy_detection",
+    "logical_sequence",
+    "mathematical_induction",
+    "matrixshapes",
+    "metaphor_boolean",
+    "metaphor_understanding",
+    "minute_mysteries_qa",
+    "misconceptions",
+    "misconceptions_russian",
+    "mnist_ascii",
+    "modified_arithmetic",
+    "moral_permissibility",
+    "movie_dialog_same_or_different",
+    "movie_recommendation",
+    "mult_data_wrangling",
+    "multiemo",
+    "natural_instructions",
+    "navigate",
+    "nonsense_words_grammar",
+    "novel_concepts",
+    "object_counting",
+    "odd_one_out",
+    "operators",
+    "paragraph_segmentation",
+    "parsinlu_qa",
+    "parsinlu_reading_comprehension",
+    "penguins_in_a_table",
+    "periodic_elements",
+    "persian_idioms",
+    "phrase_relatedness",
+    "physical_intuition",
+    "physics",
+    "physics_questions",
+    "play_dialog_same_or_different",
+    "polish_sequence_labeling",
+    "presuppositions_as_nli",
+    "qa_wikidata",
+    "question_selection",
+    "real_or_fake_text",
+    "reasoning_about_colored_objects",
+    "repeat_copy_logic",
+    "rephrase",
+    "riddle_sense",
+    "ruin_names",
+    "salient_translation_error_detection",
+    "scientific_press_release",
+    "semantic_parsing_in_context_sparc",
+    "semantic_parsing_spider",
+    "sentence_ambiguity",
+    "similarities_abstraction",
+    "simp_turing_concept",
+    "simple_arithmetic_json",
+    "simple_arithmetic_json_multiple_choice",
+    "simple_arithmetic_json_subtasks",
+    "simple_arithmetic_multiple_targets_json",
+    "simple_ethical_questions",
+    "simple_text_editing",
+    "snarks",
+    "social_iqa",
+    "social_support",
+    "sports_understanding",
+    "strange_stories",
+    "strategyqa",
+    "sufficient_information",
+    "suicide_risk",
+    "swahili_english_proverbs",
+    "swedish_to_german_proverbs",
+    "symbol_interpretation",
+    "temporal_sequences",
+    "tense",
+    "timedial",
+    "topical_chat",
+    "tracking_shuffled_objects",
+    "understanding_fables",
+    "undo_permutation",
+    "unit_conversion",
+    "unit_interpretation",
+    "unnatural_in_context_learning",
+    "vitaminc_fact_verification",
+    "what_is_the_tao",
+    "which_wiki_edit",
+    "winowhy",
+    "word_sorting",
+    "word_unscrambling",
+]


 def main() -> None:

-    for path, task_type in zip(["multiple_choice", "greedy_until"], ["multiple_choice_template_yaml", "greedy_until_template_yaml"]):
+    for path, task_type in zip(
+        ["multiple_choice", "greedy_until"],
+        ["multiple_choice_template_yaml", "greedy_until_template_yaml"],
+    ):
        os.makedirs(path, exist_ok=True)
        for task in all_subtasks:
            file_name = f"{task}.yaml"
@@ -184,11 +187,15 @@ def main() -> None:
                    yaml.dump(
                        {
                            "include": f"../{task_type}",
-                            "task": "bigbench_" + task + "_{}".format(task_type.split("_template_yaml")[0]),
-                            "dataset_name": task + "_zero_shot", # zero-shot version of the dataset
+                            "task": "bigbench_"
+                            + task
+                            + "_{}".format(task_type.split("_template_yaml")[0]),
+                            "dataset_name": task
+                            + "_zero_shot",  # zero-shot version of the dataset
                        },
                        f,
-                        width=float("inf"), allow_unicode=True
+                        width=float("inf"),
+                        allow_unicode=True,
                    )
            except FileExistsError:
                pass

--- a/lm_eval/tasks/code_x_glue/code-text/bleu.py
+++ b/lm_eval/tasks/code_x_glue/code-text/bleu.py
+#!/usr/bin/python
+import os
+import re
+import sys
+import math
+import subprocess
+import xml.sax.saxutils
+
+from typing import List, Pattern, Tuple, Union, Dict, Any, Optional
+
+"""
+This script was adapted from the original version by hieuhoang1972 which is part of MOSES.
+"""
+
+# $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $
+
+"""Provides:
+
+cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
+cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
+score_cooked(alltest, n=4): Score a list of cooked test sentences.
+
+score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids.
+
+The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible.
+"""
+
+# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
+# When truthy, normalize() returns s.split() and skips every rule below.
+nonorm = 0
+
+# When False, normalize() lowercases the text before applying normalize2.
+preserve_case = False
+# Read by cook_test() to pick the reference length: "shortest", "average" or "closest".
+eff_ref_len = "shortest"
+
+# (pattern, replacement) pairs applied in order by normalize() before XML unescaping.
+normalize1: List[Tuple[Union[Pattern[str], str], str]] = [
+    ("<skipped>", ""),  # strip "skipped" tags
+    (r"-\n", ""),  # strip end-of-line hyphenation and join lines
+    (r"\n", " "),  # join lines
+    #    (r'(\d)\s+(?=\d)', r'\1'), # join digits
+]
+# Compile each pattern once at import time; the replacement strings are kept as-is.
+normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1]
+
+# (pattern, replacement) pairs applied in order by normalize() after the optional lowercasing.
+normalize2: List[Tuple[Union[Pattern[str], str], str]] = [
+    (
+        r"([\{-\~\[-\` -\&\(-\+\:-\@\/])",
+        r" \1 ",
+    ),  # tokenize punctuation. apostrophe is missing
+    (
+        r"([^0-9])([\.,])",
+        r"\1 \2 ",
+    ),  # tokenize period and comma unless preceded by a digit
+    (
+        r"([\.,])([^0-9])",
+        r" \1 \2",
+    ),  # tokenize period and comma unless followed by a digit
+    (r"([0-9])(-)", r"\1 \2 "),  # tokenize dash when preceded by a digit
+]
+# Compile each pattern once at import time; the replacement strings are kept as-is.
+normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2]
+
+
+def normalize(s):
+    """Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl.
+
+    Args:
+        s: A string, or an iterable of strings that is joined with single
+            spaces before processing.
+
+    Returns:
+        The list of tokens obtained by whitespace-splitting the processed text.
+        When the module-level ``nonorm`` flag is truthy, ``s.split()`` is
+        returned without any further processing.
+    """
+    # Added to bypass NIST-style pre-processing of hyp and ref files -- wade
+    if nonorm:
+        return s.split()
+    # isinstance (not an exact type comparison) so that str subclasses are
+    # treated as text instead of being joined character by character.
+    if not isinstance(s, str):
+        s = " ".join(s)
+    # language-independent part:
+    for pattern, replace in normalize1:
+        s = re.sub(pattern, replace, s)
+    # Besides the default XML entities, also turn &quot; back into a double quote.
+    s = xml.sax.saxutils.unescape(s, {"&quot;": '"'})
+    # language-dependent part (assuming Western languages):
+    # Pad with spaces so the normalize2 rules that need a neighbouring
+    # character can also match at the very start and end of the text.
+    s = " %s " % s
+    if not preserve_case:
+        s = s.lower()  # this might not be identical to the original
+    for pattern, replace in normalize2:
+        s = re.sub(pattern, replace, s)
+    return s.split()
+
+
+def count_ngrams(words, n=4):
+    """Count every n-gram of order 1 through ``n`` found in ``words``.
+
+    Returns a dict mapping each n-gram (a tuple of consecutive items of
+    ``words``) to the number of times it occurs.
+    """
+    tally: Dict[Any, int] = {}
+    total = len(words)
+    for order in range(1, n + 1):
+        # One window per start position; the range is empty when ``words``
+        # holds fewer than ``order`` items.
+        for start in range(total - order + 1):
+            gram = tuple(words[start : start + order])
+            if gram in tally:
+                tally[gram] += 1
+            else:
+                tally[gram] = 1
+    return tally
+
+
+def cook_refs(refs, n=4):
+    """Pre-process the reference sentences of a single segment.
+
+    Every reference is tokenized with normalize(). The result is a pair:
+    the list of reference lengths (in tokens), and a dict mapping each
+    n-gram to the highest count it reaches in any single reference.
+    """
+    tokenized = [normalize(ref) for ref in refs]
+    best: Dict[Tuple[str], int] = {}
+    for tokens in tokenized:
+        for ngram, seen in count_ngrams(tokens, n).items():
+            # Counts are always >= 1, so this also covers first-time inserts.
+            if seen > best.get(ngram, 0):
+                best[ngram] = seen
+    lengths = [len(tokens) for tokens in tokenized]
+    return (lengths, best)
+
+
+def cook_test(test, item, n=4):
+    """Pre-process one test sentence against cooked references.
+
+    ``item`` is the (reference lengths, max n-gram counts) pair returned by
+    cook_refs(). The returned dict holds "testlen", "reflen" (chosen according
+    to the module-level ``eff_ref_len``), and the per-order lists "guess"
+    (number of candidate n-grams) and "correct" (clipped n-gram matches).
+    """
+    reflens, refmaxcounts = item
+    tokens = normalize(test)
+    testlen = len(tokens)
+    result: Dict[str, Any] = {"testlen": testlen}
+
+    # Calculate effective reference sentence length.
+    if eff_ref_len == "shortest":
+        result["reflen"] = min(reflens)
+    elif eff_ref_len == "average":
+        result["reflen"] = float(sum(reflens)) / len(reflens)
+    elif eff_ref_len == "closest":
+        best_gap: Optional[int] = None
+        for reflen in reflens:
+            gap = abs(reflen - testlen)
+            # Strict comparison: on ties the earliest reference length wins.
+            if best_gap is None or gap < best_gap:
+                best_gap = gap
+                result["reflen"] = reflen
+
+    result["guess"] = [max(testlen - order + 1, 0) for order in range(1, n + 1)]
+
+    matched = [0] * n
+    for ngram, seen in count_ngrams(tokens, n).items():
+        # Clip each n-gram's credit to its maximum count in the references.
+        matched[len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), seen)
+    result["correct"] = matched
+
+    return result
+
+
+def score_cooked(allcomps, n=4, ground=0, smooth=1):
+    """Combine cooked per-sentence statistics into BLEU scores.
+
+    Args:
+        allcomps: Iterable of dicts with the keys "testlen", "reflen",
+            "guess" and "correct" (the shape cook_test() returns).
+        n: Highest n-gram order to score.
+        ground: Accepted but not used by this function.
+        smooth: When equal to 1, one is added to the matched and guessed
+            counts of every order above unigrams in the combined score.
+
+    Returns:
+        A list of ``n + 1`` floats: the combined score with the brevity
+        penalty applied, followed by the unsmoothed precision of each order
+        (0.0 for an order with no guessed n-grams).
+    """
+    testlen = 0
+    reflen = 0
+    guess_totals = [0] * n
+    correct_totals = [0] * n
+    for comps in allcomps:
+        testlen += comps["testlen"]
+        reflen += comps["reflen"]
+        for k in range(n):
+            guess_totals[k] += comps["guess"][k]
+        for k in range(n):
+            correct_totals[k] += comps["correct"][k]
+
+    # Smallest positive normalized float; keeps math.log() away from zero.
+    tiny = sys.float_info.min
+    log_sum = 0.0
+    per_order: List[float] = []
+    for k, (correct, guess) in enumerate(zip(correct_totals, guess_totals)):
+        bump = 1 if (smooth == 1 and k > 0) else 0
+        log_sum += math.log(correct + bump + tiny) - math.log(guess + bump + tiny)
+        if guess == 0:
+            # Sentinel log-precision; math.exp() turns it into 0.0 below.
+            per_order.append(-10000000.0)
+        else:
+            per_order.append(math.log(correct + tiny) - math.log(guess))
+
+    log_sum /= float(n)
+
+    # Non-positive penalty, applied in log space to the combined score only.
+    brevity = min(0, 1 - float(reflen + 1) / (testlen + 1))
+    return [math.exp(log_sum + brevity)] + [math.exp(value) for value in per_order]
+
+
+def bleu(refs, candidate, ground=0, smooth=1):
+    """Score one candidate against its references.
+
+    Chains cook_refs(), cook_test() and score_cooked() for a single sentence
+    and returns the list produced by score_cooked().
+    """
+    cooked_refs = cook_refs(refs)
+    cooked_candidate = cook_test(candidate, cooked_refs)
+    return score_cooked([cooked_candidate], ground=ground, smooth=smooth)
+
+
+def splitPuncts(line):
+    """Return ``line`` with word runs and punctuation marks separated by single spaces."""
+    # A token is either a run of word characters or one character that is
+    # neither whitespace nor a word character.
+    pieces = re.findall(r"[\w]+|[^\s\w]", line)
+    return " ".join(pieces)
+
+
+def computeMaps(predictions, goldfile):
+    """Build the reference and prediction maps, keyed by example id.
+
+    Args:
+        predictions: Iterable of lines holding an id and a prediction
+            separated by a tab. A line without a tab yields an empty
+            prediction; columns after the second are ignored.
+        goldfile: Path of a text file whose lines each hold an id and a
+            reference separated by exactly one tab.
+
+    Returns:
+        ``(goldMap, predictionMap)``. ``predictionMap`` maps each id to a
+        one-element list with the lowercased, punctuation-split prediction.
+        ``goldMap`` maps each id that also has a prediction to the list of its
+        lowercased, punctuation-split references.
+
+    Raises:
+        ValueError: If a line of ``goldfile`` does not split into exactly two
+            tab-separated fields.
+    """
+    predictionMap: Dict[str, list] = {}
+    goldMap: Dict[str, list] = {}
+
+    # The context manager closes the gold file even when a malformed line
+    # raises; it is still opened first, so a bad path fails before any
+    # prediction is consumed.
+    with open(goldfile, "r") as gf:
+        for row in predictions:
+            cols = row.strip().split("\t")
+            if len(cols) == 1:
+                (rid, pred) = (cols[0], "")
+            else:
+                (rid, pred) = (cols[0], cols[1])
+            predictionMap[rid] = [splitPuncts(pred.strip().lower())]
+
+        for row in gf:
+            (rid, pred) = row.split("\t")
+            if rid in predictionMap:  # Only insert if the id exists for the method
+                goldMap.setdefault(rid, []).append(splitPuncts(pred.strip().lower()))
+
+    sys.stderr.write("Total: " + str(len(goldMap)) + "\n")
+    return (goldMap, predictionMap)
+
+
+# m1 is the reference map
+# m2 is the prediction map
+def bleuFromMaps(m1, m2):
+    """Average the sentence-level BLEU scores over ids present in both maps.
+
+    Args:
+        m1: Reference map; each value is the list of references for that id.
+        m2: Prediction map; the first element of each value is the prediction.
+
+    Returns:
+        A list of five floats scaled by 100: the mean of each entry returned
+        by bleu() over the shared ids. When the maps share no id (for example
+        when either is empty) a list of zeros is returned instead of dividing
+        by zero.
+    """
+    score = [0] * 5
+    num = 0.0
+
+    for key, refs in m1.items():
+        if key not in m2:
+            continue
+        bl = bleu(refs, m2[key][0])
+        score = [score[i] + bl[i] for i in range(0, len(bl))]
+        num += 1
+
+    if num == 0:
+        # Nothing was scored; avoid a ZeroDivisionError on the average.
+        return [0.0] * len(score)
+    return [s * 100.0 / num for s in score]
+
+
+def smoothed_bleu_4(references, predictions, **kwargs):
+    """Return the first bleuFromMaps() score for positionally paired texts.
+
+    ``references`` and ``predictions`` are iterables of strings paired by
+    position. Each string is stripped, lowercased and punctuation-split before
+    scoring. Extra keyword arguments are accepted and ignored.
+    """
+    predictionMap = {
+        idx: [splitPuncts(pred.strip().lower())]
+        for idx, pred in enumerate(predictions)
+    }
+    goldMap = {
+        idx: [splitPuncts(ref.strip().lower())]
+        for idx, ref in enumerate(references)
+    }
+    return bleuFromMaps(goldMap, predictionMap)[0]
+
+
+if __name__ == "__main__":
+    # Usage: the reference file path is the first argument; predictions are
+    # read line by line from standard input.
+    reference_file = sys.argv[1]
+    predictions = list(sys.stdin)
+    goldMap, predictionMap = computeMaps(predictions, reference_file)
+    scores = bleuFromMaps(goldMap, predictionMap)
+    print(scores[0])
--- a/lm_eval/tasks/code_x_glue/code-text/go.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/go.yaml
+# Task config for code2text_go, a member of the codexglue_code2text group.
+group:
+  - codexglue_code2text
+task: code2text_go
+dataset_path: CM/codexglue_code2text_go
+training_split: train
+validation_split: validation
+test_split: test
+output_type: greedy_until
+generation_kwargs:
+  num_beams: 10
+  max_length: 128
+  until:
+    - "</s>"
+doc_to_text: !function utils.doc_to_text
+doc_to_target: !function utils.doc_to_target
+metric_list:
+  # Scored with smoothed_bleu_4 from bleu.py, averaged across examples.
+  - metric: !function bleu.smoothed_bleu_4
+    aggregation: mean
+    higher_is_better: true
--- a/lm_eval/tasks/code_x_glue/code-text/java.yaml
+++ b/lm_eval/tasks/code_x_glue/code-text/java.yaml
+# Task config for code2text_java, a member of the codexglue_code2text group.
+group:
+  - codexglue_code2text
+task: code2text_java
+dataset_path: CM/codexglue_code2text_java
+training_split: train
+validation_split: validation
+test_split: test
+output_type: greedy_until
+generation_kwargs:
+  num_beams: 10
+  max_length: 128
+  until:
+    - "</s>"
+doc_to_text: !function utils.doc_to_text
+doc_to_target: !function utils.doc_to_target
+metric_list:
+  # Scored with smoothed_bleu_4 from bleu.py, averaged across examples.
+  - metric: !function bleu.smoothed_bleu_4
+    aggregation: mean
+    higher_is_better: true