Unverified commit 6769119f, authored by Hailey Schoelkopf, committed by GitHub

Merge pull request #816 from EleutherAI/flan-benchmark

[Refactor] Flan benchmark
parents 4824a832 7d5e511c
group: flan_anli
task:
  - include: yaml_templates/held_in_template_yaml
    task: anli_r1
    dataset_path: anli
    use_prompt: prompt_templates/anli.yaml:*
    validation_split: dev_r1
  - include: yaml_templates/held_in_template_yaml
    task: anli_r2
    dataset_path: anli
    use_prompt: prompt_templates/anli.yaml:*
    validation_split: dev_r2
  - include: yaml_templates/held_in_template_yaml
    task: anli_r3
    dataset_path: anli
    use_prompt: prompt_templates/anli.yaml:*
    validation_split: dev_r3
group: flan_arc
task:
  - include: yaml_templates/held_in_template_yaml
    task: arc_easy
    dataset_path: ai2_arc
    dataset_name: ARC-Easy
    use_prompt: prompt_templates/arc.yaml:*
    validation_split: validation
  - include: yaml_templates/held_in_template_yaml
    task: arc_challenge
    dataset_path: ai2_arc
    dataset_name: ARC-Challenge
    use_prompt: prompt_templates/arc.yaml:*
    validation_split: validation
group: flan_boolq
task:
  - include: yaml_templates/held_in_template_yaml
    dataset_path: super_glue
    dataset_name: boolq
    use_prompt: prompt_templates/boolq.yaml:*
    validation_split: validation
group: flan_cot
task:
  - include: yaml_templates/cot_template_yaml
    dataset_path: gsm8k
    dataset_name: main
    use_prompt: promptsource:*
    validation_split: validation
  - include: yaml_templates/cot_template_yaml
    dataset_path: EleutherAI/asdiv
    use_prompt: promptsource:*
    validation_split: validation
group: flan_held_in
task:
  - flan_boolq
  - flan_rte
  - flan_anli
  - flan_arc
group: flan_held_in
task:
  - include: flan/yaml_templates/held_in_template_yaml
    dataset_path: super_glue
    dataset_name: boolq
    use_prompt: flan/prompt_templates/boolq.yaml:*
    validation_split: validation
  - include: flan/yaml_templates/held_in_template_yaml
    dataset_path: super_glue
    dataset_name: rte
    use_prompt: flan/prompt_templates/rte.yaml:*
    validation_split: validation
  - include: flan/yaml_templates/held_in_template_yaml
    task: anli_r1
    dataset_path: anli
    use_prompt: flan/prompt_templates/anli.yaml:*
    validation_split: dev_r1
  - include: flan/yaml_templates/held_in_template_yaml
    task: anli_r2
    dataset_path: anli
    use_prompt: flan/prompt_templates/anli.yaml:*
    validation_split: dev_r2
  - include: flan/yaml_templates/held_in_template_yaml
    task: anli_r3
    dataset_path: anli
    use_prompt: flan/prompt_templates/anli.yaml:*
    validation_split: dev_r3
  - include: flan/yaml_templates/held_in_template_yaml
    task: arc_easy
    dataset_path: ai2_arc
    dataset_name: ARC-Easy
    use_prompt: flan/prompt_templates/arc.yaml:*
    validation_split: validation
  - include: flan/yaml_templates/held_in_template_yaml
    task: arc_challenge
    dataset_path: ai2_arc
    dataset_name: ARC-Challenge
    use_prompt: flan/prompt_templates/arc.yaml:*
    validation_split: validation
group: flan_held_out
task:
  # BBH
  - bbh_flan_zeroshot
  - bbh_flan_fewshot
  - bbh_flan_cot_fewshot
  - bbh_flan_cot_zeroshot
  # MMLU
  - mmlu
  - mmlu_flan_n_shot_generative
  - mmlu_flan_n_shot_loglikelihood
  - mmlu_flan_cot_zeroshot
  - mmlu_flan_cot_fewshot
group: flan_rte
task:
  - include: yaml_templates/held_in_template_yaml
    dataset_path: super_glue
    dataset_name: rte
    use_prompt: prompt_templates/rte.yaml:*
    validation_split: validation
# Flan Prompt Templates
prompts:
  "template-0":
    doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-1":
    doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-2":
    doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-3":
    doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-4":
    doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-5":
    doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-6":
    doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-7":
    doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-8":
    doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
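Each doc_to_text / doc_to_target entry above is a Jinja2 template rendered against a dataset row. As a minimal illustration, using plain jinja2 and an invented ANLI-style row (not the harness's internal prompt machinery), "template-0" expands like this:

# Minimal illustration: render "template-0" above with jinja2 against a toy
# ANLI-style row. The example row is invented for demonstration.
from jinja2 import Template

doc = {
    "premise": "A dog is sleeping on the porch.",
    "hypothesis": "An animal is resting.",
    "label": 0,  # ANLI: 0 = entailment, 1 = neutral, 2 = contradiction
}

doc_to_text = Template(
    "{{premise}}\n\nChoose your answer: based on the paragraph above can we "
    "conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible "
    "to say\n- No\nI think the answer is"
)
doc_to_target = Template("{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}")

print(doc_to_text.render(**doc))
print(doc_to_target.render(**doc))  # -> Yes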
# Flan Prompt Templates
prompts:
  "template-0":
    doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-1":
    doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-2":
    doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-3":
    doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-4":
    doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-5":
    doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-6":
    doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
# Flan Prompt Templates
prompts:
  "template-0":
    doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-1":
    doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-2":
    doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-3":
    doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-4":
    doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-5":
    doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-6":
    doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-7":
    doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-8":
    # doc_to_text: "{{title}}\n\n{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
    doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-9":
    doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
# Flan Prompt Templates
prompts:
  "template-0":
    doc_to_text: "{{premise}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- yes\n- no"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-1":
    doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-2":
    doc_to_text: "{{premise}}\n\nQ with options: Can we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-3":
    doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-4":
    doc_to_text: "{{premise}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{hypothesis}}"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-5":
    doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nThe answer is"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-6":
    doc_to_text: "Read the text and determine if the sentence is true:\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-7":
    doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-8":
    doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- yes\n- no"
    doc_to_target: "{{['yes', 'no'][label]}}"
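The task configs reference these files as use_prompt: prompt_templates/<name>.yaml:*; the trailing * selects every key under prompts, so each template-N becomes its own prompt variant of the task. A rough sketch of that expansion with plain PyYAML (the harness's own loader may differ):

# Rough sketch of what "use_prompt: prompt_templates/rte.yaml:*" implies:
# load the prompts file and enumerate every template key defined in it.
import yaml

with open("prompt_templates/rte.yaml") as f:
    spec = yaml.safe_load(f)

for name, fields in spec["prompts"].items():
    # one prompt variant per "template-N" entry
    print(name, "uses a doc_to_text of length", len(fields["doc_to_text"]))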
group: flan-cot
output_type: greedy_until
validation_split: validation
doc_to_target: "{{answer}}"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
generation_kwargs:
  until:
    - "\n\n"
  do_sample: false
  temperature: 0.0
filter_list:
  - name: "get-answer"
    filter:
      - function: "regex"
        regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
      - function: "take_first"
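The get-answer filter chain first applies the regex to each model generation and then keeps only the first match. The same extraction in plain Python (the sample generation string is invented):

# Plain-Python illustration of the "get-answer" filter above: extract the
# final numeric answer from a chain-of-thought generation, then keep the
# first match, mirroring the "regex" + "take_first" pipeline.
import re

pattern = re.compile(r"The answer is (\-?[0-9\.\,]+)")
generation = "She bakes 4 dozen cookies, so 4 * 12 = 48. The answer is 48"

matches = pattern.findall(generation)
answer = matches[0] if matches else ""  # "take_first"
print(answer)  # -> 48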
output_type: greedy_until
validation_split: validation
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
generation_kwargs:
  until:
    - "</s>"
  do_sample: false
  temperature: 0.0
@@ -6,6 +6,7 @@ task:
 use_prompt: promptsource:*
 training_split: train
 validation_split: validation
+output_type: greedy_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -18,6 +19,7 @@ task:
 use_prompt: promptsource:*
 training_split: train
 validation_split: validation
+output_type: greedy_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -42,6 +44,7 @@ task:
 use_prompt: promptsource:*
 training_split: train
 validation_split: validation
+output_type: greedy_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -53,6 +56,7 @@ task:
 use_prompt: promptsource:*
 training_split: train_r1
 validation_split: dev_r1
+output_type: greedy_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -64,6 +68,7 @@ task:
 use_prompt: promptsource:*
 training_split: train_r2
 validation_split: dev_r2
+output_type: greedy_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -75,6 +80,7 @@ task:
 use_prompt: promptsource:*
 training_split: train_r3
 validation_split: dev_r3
+output_type: greedy_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -87,6 +93,7 @@ task:
 use_prompt: promptsource:*
 training_split: train
 validation_split: validation
+output_type: greedy_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -98,6 +105,7 @@ task:
 use_prompt: promptsource:*
 training_split: train
 validation_split: validation
+output_type: greedy_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -110,6 +118,7 @@ task:
 use_prompt: promptsource:*
 training_split: train
 validation_split: validation
+output_type: greedy_until
 metric_list:
 - metric: exact_match
   aggregation: mean
...
@@ -6,7 +6,7 @@ Title: `Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models`
 Abstract: https://arxiv.org/abs/2206.04615
 The Beyond the Imitation Game Benchmark (BIG-bench) is a collaborative benchmark intended to probe large language models and extrapolate their future capabilities.
 Homepage: https://github.com/google/BIG-bench
...
@@ -2,179 +2,182 @@ import os
import yaml

all_subtasks = [
    "abstract_narrative_understanding",
    "anachronisms",
    "analogical_similarity",
    "analytic_entailment",
    "arithmetic",
    "ascii_word_recognition",
    "authorship_verification",
    "auto_categorization",
    "auto_debugging",
    "bbq_lite_json",
    "bridging_anaphora_resolution_barqa",
    "causal_judgment",
    "cause_and_effect",
    "checkmate_in_one",
    "chess_state_tracking",
    "chinese_remainder_theorem",
    "cifar10_classification",
    "code_line_description",
    "codenames",
    "color",
    "common_morpheme",
    "conceptual_combinations",
    "conlang_translation",
    "contextual_parametric_knowledge_conflicts",
    "crash_blossom",
    "crass_ai",
    "cryobiology_spanish",
    "cryptonite",
    "cs_algorithms",
    "dark_humor_detection",
    "date_understanding",
    "disambiguation_qa",
    "discourse_marker_prediction",
    "disfl_qa",
    "dyck_languages",
    "elementary_math_qa",
    "emoji_movie",
    "emojis_emotion_prediction",
    "empirical_judgments",
    "english_proverbs",
    "english_russian_proverbs",
    "entailed_polarity",
    "entailed_polarity_hindi",
    "epistemic_reasoning",
    "evaluating_information_essentiality",
    "fact_checker",
    "fantasy_reasoning",
    "few_shot_nlg",
    "figure_of_speech_detection",
    "formal_fallacies_syllogisms_negation",
    "gem",
    "gender_inclusive_sentences_german",
    "general_knowledge",
    "geometric_shapes",
    "goal_step_wikihow",
    "gre_reading_comprehension",
    "hhh_alignment",
    "hindi_question_answering",
    "hindu_knowledge",
    "hinglish_toxicity",
    "human_organs_senses",
    "hyperbaton",
    "identify_math_theorems",
    "identify_odd_metaphor",
    "implicatures",
    "implicit_relations",
    "intent_recognition",
    "international_phonetic_alphabet_nli",
    "international_phonetic_alphabet_transliterate",
    "intersect_geometry",
    "irony_identification",
    "kanji_ascii",
    "kannada",
    "key_value_maps",
    "known_unknowns",
    "language_games",
    "language_identification",
    "linguistic_mappings",
    "linguistics_puzzles",
    "list_functions",
    "logic_grid_puzzle",
    "logical_args",
    "logical_deduction",
    "logical_fallacy_detection",
    "logical_sequence",
    "mathematical_induction",
    "matrixshapes",
    "metaphor_boolean",
    "metaphor_understanding",
    "minute_mysteries_qa",
    "misconceptions",
    "misconceptions_russian",
    "mnist_ascii",
    "modified_arithmetic",
    "moral_permissibility",
    "movie_dialog_same_or_different",
    "movie_recommendation",
    "mult_data_wrangling",
    "multiemo",
    "natural_instructions",
    "navigate",
    "nonsense_words_grammar",
    "novel_concepts",
    "object_counting",
    "odd_one_out",
    "operators",
    "paragraph_segmentation",
    "parsinlu_qa",
    "parsinlu_reading_comprehension",
    "penguins_in_a_table",
    "periodic_elements",
    "persian_idioms",
    "phrase_relatedness",
    "physical_intuition",
    "physics",
    "physics_questions",
    "play_dialog_same_or_different",
    "polish_sequence_labeling",
    "presuppositions_as_nli",
    "qa_wikidata",
    "question_selection",
    "real_or_fake_text",
    "reasoning_about_colored_objects",
    "repeat_copy_logic",
    "rephrase",
    "riddle_sense",
    "ruin_names",
    "salient_translation_error_detection",
    "scientific_press_release",
    "semantic_parsing_in_context_sparc",
    "semantic_parsing_spider",
    "sentence_ambiguity",
    "similarities_abstraction",
    "simp_turing_concept",
    "simple_arithmetic_json",
    "simple_arithmetic_json_multiple_choice",
    "simple_arithmetic_json_subtasks",
    "simple_arithmetic_multiple_targets_json",
    "simple_ethical_questions",
    "simple_text_editing",
    "snarks",
    "social_iqa",
    "social_support",
    "sports_understanding",
    "strange_stories",
    "strategyqa",
    "sufficient_information",
    "suicide_risk",
    "swahili_english_proverbs",
    "swedish_to_german_proverbs",
    "symbol_interpretation",
    "temporal_sequences",
    "tense",
    "timedial",
    "topical_chat",
    "tracking_shuffled_objects",
    "understanding_fables",
    "undo_permutation",
    "unit_conversion",
    "unit_interpretation",
    "unnatural_in_context_learning",
    "vitaminc_fact_verification",
    "what_is_the_tao",
    "which_wiki_edit",
    "winowhy",
    "word_sorting",
    "word_unscrambling",
]
def main() -> None:
    for path, task_type in zip(
        ["multiple_choice", "greedy_until"],
        ["multiple_choice_template_yaml", "greedy_until_template_yaml"],
    ):
        os.makedirs(path, exist_ok=True)
        for task in all_subtasks:
            file_name = f"{task}.yaml"
@@ -184,11 +187,15 @@ def main() -> None:
                yaml.dump(
                    {
                        "include": f"../{task_type}",
                        "task": "bigbench_"
                        + task
                        + "_{}".format(task_type.split("_template_yaml")[0]),
                        "dataset_name": task
                        + "_zero_shot",  # zero-shot version of the dataset
                    },
                    f,
                    width=float("inf"),
                    allow_unicode=True,
                )
            except FileExistsError:
                pass
...
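For a single subtask, one pass through the loop above writes a small YAML stub (e.g. multiple_choice/anachronisms.yaml) that points back at the shared template. A standalone sketch of what gets dumped for that case, shown for illustration only (key order follows yaml.dump's default alphabetical sorting):

# Standalone sketch of one iteration of the generator above, for the
# "anachronisms" subtask and the multiple-choice template (illustrative only).
import yaml

entry = {
    "include": "../multiple_choice_template_yaml",
    "task": "bigbench_anachronisms_multiple_choice",
    "dataset_name": "anachronisms_zero_shot",  # zero-shot version of the dataset
}
print(yaml.dump(entry, width=float("inf"), allow_unicode=True))
# dataset_name: anachronisms_zero_shot
# include: ../multiple_choice_template_yaml
# task: bigbench_anachronisms_multiple_choice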
#!/usr/bin/python
import os
import re
import sys
import math
import subprocess
import xml.sax.saxutils
from typing import List, Pattern, Tuple, Union, Dict, Any, Optional
"""
This script was adapted from the original version by hieuhoang1972 which is part of MOSES.
"""
# $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $
"""Provides:
cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
score_cooked(alltest, n=4): Score a list of cooked test sentences.
score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids.
The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible.
"""
# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
nonorm = 0
preserve_case = False
eff_ref_len = "shortest"
normalize1: List[Tuple[Union[Pattern[str], str], str]] = [
    ("<skipped>", ""),  # strip "skipped" tags
    (r"-\n", ""),  # strip end-of-line hyphenation and join lines
    (r"\n", " "),  # join lines
    # (r'(\d)\s+(?=\d)', r'\1'), # join digits
]
normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1]

normalize2: List[Tuple[Union[Pattern[str], str], str]] = [
    (
        r"([\{-\~\[-\` -\&\(-\+\:-\@\/])",
        r" \1 ",
    ),  # tokenize punctuation. apostrophe is missing
    (
        r"([^0-9])([\.,])",
        r"\1 \2 ",
    ),  # tokenize period and comma unless preceded by a digit
    (
        r"([\.,])([^0-9])",
        r" \1 \2",
    ),  # tokenize period and comma unless followed by a digit
    (r"([0-9])(-)", r"\1 \2 "),  # tokenize dash when preceded by a digit
]
normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2]
def normalize(s):
    """Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl."""
    # Added to bypass NIST-style pre-processing of hyp and ref files -- wade
    if nonorm:
        return s.split()
    if type(s) is not str:
        s = " ".join(s)
    # language-independent part:
    for (pattern, replace) in normalize1:
        s = re.sub(pattern, replace, s)
    s = xml.sax.saxutils.unescape(s, {"&quot;": '"'})
    # language-dependent part (assuming Western languages):
    s = " %s " % s
    if not preserve_case:
        s = s.lower()  # this might not be identical to the original
    for (pattern, replace) in normalize2:
        s = re.sub(pattern, replace, s)
    return s.split()


def count_ngrams(words, n=4):
    counts: Dict[Any, int] = {}
    for k in range(1, n + 1):
        for i in range(len(words) - k + 1):
            ngram = tuple(words[i : i + k])
            counts[ngram] = counts.get(ngram, 0) + 1
    return counts
def cook_refs(refs, n=4):
    """Takes a list of reference sentences for a single segment
    and returns an object that encapsulates everything that BLEU
    needs to know about them."""

    refs = [normalize(ref) for ref in refs]
    maxcounts: Dict[Tuple[str], int] = {}
    for ref in refs:
        counts = count_ngrams(ref, n)
        for (ngram, count) in counts.items():
            maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)
    return ([len(ref) for ref in refs], maxcounts)


def cook_test(test, item, n=4):
    """Takes a test sentence and returns an object that
    encapsulates everything that BLEU needs to know about it."""

    (reflens, refmaxcounts) = item
    test = normalize(test)
    result: Dict[str, Any] = {}
    result["testlen"] = len(test)

    # Calculate effective reference sentence length.
    if eff_ref_len == "shortest":
        result["reflen"] = min(reflens)
    elif eff_ref_len == "average":
        result["reflen"] = float(sum(reflens)) / len(reflens)
    elif eff_ref_len == "closest":
        min_diff: Optional[int] = None
        for reflen in reflens:
            if min_diff is None or abs(reflen - len(test)) < min_diff:
                min_diff = abs(reflen - len(test))
                result["reflen"] = reflen

    result["guess"] = [max(len(test) - k + 1, 0) for k in range(1, n + 1)]
    result["correct"] = [0] * n
    counts = count_ngrams(test, n)
    for (ngram, count) in counts.items():
        result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count)
    return result
def score_cooked(allcomps, n=4, ground=0, smooth=1):
    totalcomps: Dict[str, Any] = {
        "testlen": 0,
        "reflen": 0,
        "guess": [0] * n,
        "correct": [0] * n,
    }
    for comps in allcomps:
        for key in ["testlen", "reflen"]:
            totalcomps[key] += comps[key]
        for key in ["guess", "correct"]:
            for k in range(n):
                totalcomps[key][k] += comps[key][k]
    logbleu = 0.0
    all_bleus: List[float] = []
    for k in range(n):
        correct = totalcomps["correct"][k]
        guess = totalcomps["guess"][k]
        addsmooth = 0
        if smooth == 1 and k > 0:
            addsmooth = 1
        logbleu += math.log(correct + addsmooth + sys.float_info.min) - math.log(
            guess + addsmooth + sys.float_info.min
        )
        if guess == 0:
            all_bleus.append(-10000000.0)
        else:
            all_bleus.append(math.log(correct + sys.float_info.min) - math.log(guess))

    logbleu /= float(n)
    all_bleus.insert(0, logbleu)

    brevPenalty = min(
        0, 1 - float(totalcomps["reflen"] + 1) / (totalcomps["testlen"] + 1)
    )
    for i in range(len(all_bleus)):
        if i == 0:
            all_bleus[i] += brevPenalty
        all_bleus[i] = math.exp(all_bleus[i])
    return all_bleus


def bleu(refs, candidate, ground=0, smooth=1):
    refs = cook_refs(refs)
    test = cook_test(candidate, refs)
    return score_cooked([test], ground=ground, smooth=smooth)


def splitPuncts(line):
    return " ".join(re.findall(r"[\w]+|[^\s\w]", line))
def computeMaps(predictions, goldfile):
    predictionMap: Dict[str, list] = {}
    goldMap: Dict[str, list] = {}
    gf = open(goldfile, "r")

    for row in predictions:
        cols = row.strip().split("\t")
        if len(cols) == 1:
            (rid, pred) = (cols[0], "")
        else:
            (rid, pred) = (cols[0], cols[1])
        predictionMap[rid] = [splitPuncts(pred.strip().lower())]

    for row in gf:
        (rid, pred) = row.split("\t")
        if rid in predictionMap:  # Only insert if the id exists for the method
            if rid not in goldMap:
                goldMap[rid] = []
            goldMap[rid].append(splitPuncts(pred.strip().lower()))

    sys.stderr.write("Total: " + str(len(goldMap)) + "\n")
    return (goldMap, predictionMap)


# m1 is the reference map
# m2 is the prediction map
def bleuFromMaps(m1, m2):
    score = [0] * 5
    num = 0.0

    for key in m1:
        if key in m2:
            bl = bleu(m1[key], m2[key][0])
            score = [score[i] + bl[i] for i in range(0, len(bl))]
            num += 1
    return [s * 100.0 / num for s in score]


def smoothed_bleu_4(references, predictions, **kwargs):
    predictionMap = {}
    goldMap = {}

    for rid, pred in enumerate(predictions):
        predictionMap[rid] = [splitPuncts(pred.strip().lower())]

    for rid, row in enumerate(references):
        goldMap[rid] = [splitPuncts(row.strip().lower())]

    return bleuFromMaps(goldMap, predictionMap)[0]


if __name__ == "__main__":
    reference_file = sys.argv[1]
    predictions = []
    for row in sys.stdin:
        predictions.append(row)
    (goldMap, predictionMap) = computeMaps(predictions, reference_file)
    print(bleuFromMaps(goldMap, predictionMap)[0])
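The code2text task configs below wire this module in as metric: !function bleu.smoothed_bleu_4. A quick standalone sanity check of that entry point (the example strings are invented):

# Quick standalone check of the metric referenced below as
# "!function bleu.smoothed_bleu_4": it takes parallel lists of reference and
# predicted strings and returns a corpus-level smoothed BLEU-4 on a 0-100 scale.
from bleu import smoothed_bleu_4

references = ["returns the sum of two integers"]
predictions = ["return the sum of two ints"]
print(smoothed_bleu_4(references, predictions))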
group:
  - codexglue_code2text
task: code2text_go
dataset_path: CM/codexglue_code2text_go
training_split: train
validation_split: validation
test_split: test
output_type: greedy_until
generation_kwargs:
  num_beams: 10
  max_length: 128
  until:
    - "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
group:
  - codexglue_code2text
task: code2text_java
dataset_path: CM/codexglue_code2text_java
training_split: train
validation_split: validation
test_split: test
output_type: greedy_until
generation_kwargs:
  num_beams: 10
  max_length: 128
  until:
    - "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
  - metric: !function bleu.smoothed_bleu_4
    aggregation: mean
    higher_is_better: True
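Both configs resolve their prompt and target through !function hooks in a utils module that is not shown in this diff. A hypothetical minimal sketch of what those helpers could look like, assuming the CodeXGLUE code-to-text rows expose code_tokens and docstring_tokens fields (the actual utils.py may differ):

# Hypothetical sketch of the utils module referenced above via
# "!function utils.doc_to_text" / "!function utils.doc_to_target".
# Assumes each row has "code_tokens" and "docstring_tokens" fields, as in the
# CodeXGLUE code-to-text data; the real helpers may differ.
def doc_to_text(doc):
    # Prompt the model with the tokenized source code.
    return " ".join(doc["code_tokens"])


def doc_to_target(doc):
    # The target is the reference docstring summary.
    return " ".join(doc["docstring_tokens"])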