Commit 02e841ce authored by lintangsutawika

Merge branch 'main' of https://github.com/EleutherAI/lm-evaluation-harness into t5v2-alt-plus

parents 90ad5db7 e74ec966
test_split: test
fewshot_split: valid
fewshot_config:
  sampler: first_n
group:
- french_bench
- french_bench_mc
task: french_bench_arc_challenge
dataset_path: manu/french_bench_arc_challenge
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Question: {{question}}\nRéponse:"
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answerKey)}}"
doc_to_choice: "{{choices}}"
should_decontaminate: true
doc_to_decontamination_query: "Question: {{question}}\nRéponse:"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
include: "_default_template_yaml"
group:
- french_bench
- french_bench_extra
description: "D'après l'information dans le contexte donné, quelle est la réponse à la question ?"
task: french_bench_boolqa
dataset_path: manu/french_boolq
output_type: multiple_choice
validation_split: valid
doc_to_text: "\nContexte: {{passage}}\n\nQuestion: {{question}}\n"
doc_to_choice: ["Oui", "Non"]
# doc_to_text: "\nContexte: {{passage}}\n\nQuestion: {{question}}\n\nD'après l'information dans le contexte, la réponse est:\nA. Oui \nB. Non\n\nRéponse:"
# doc_to_choice: ["A", "B"]
doc_to_target: "{{[1, 0].index(label)}}"
should_decontaminate: true
doc_to_decontamination_query: passage
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
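The doc_to_target trick above is worth spelling out: with doc_to_choice ["Oui", "Non"], the Jinja expression [1, 0].index(label) maps label 1 (true) to choice 0 and label 0 to choice 1. Evaluated in plain Python:

# the same expression the Jinja template evaluates, for both label values
for label in (1, 0):
    print(label, "->", ["Oui", "Non"][[1, 0].index(label)])
# 1 -> Oui
# 0 -> Non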
include: "_default_template_yaml"
group:
- french_bench
- french_bench_extra
description: "D'après l'information dans le contexte donné, donne la réponse à la question en citant quelques mots du contexte. Si il est impossible de répondre avec les informations du contexte, répond 'Impossible'."
task: french_bench_fquadv2
dataset_path: manu/fquad2_test
output_type: generate_until
validation_split: valid
doc_to_text: "\nContexte: {{context}}\n\nQuestion: {{question}}\n\nRéponse:"
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text[0]}}{% else %}{{['Impossible']}}{% endif %}"
target_delimiter: " "
should_decontaminate: true
doc_to_decontamination_query: context
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
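utils.exact and utils.f1 above point into the task's utils.py, which is not part of this diff. A minimal sketch of what they plausibly compute, modeled on SQuAD-style scoring; the signature, the normalize helper, and the French article list are all assumptions:

import re
import string


def normalize(text):
    # lowercase, drop punctuation and common French articles, collapse spaces
    text = text.lower()
    text = "".join(ch for ch in text if ch not in string.punctuation)
    text = re.sub(r"\b(le|la|les|un|une|des)\b", " ", text)
    return " ".join(text.split())


def exact(references, predictions, **kwargs):
    # 1.0 iff the normalized prediction equals the normalized gold answer
    return float(normalize(predictions[0]) == normalize(references[0]))


def f1(references, predictions, **kwargs):
    # token-level F1 between prediction and gold answer
    gold = normalize(references[0]).split()
    pred = normalize(predictions[0]).split()
    common = sum(min(gold.count(tok), pred.count(tok)) for tok in set(pred))
    if common == 0:
        return 0.0
    precision, recall = common / len(pred), common / len(gold)
    return 2 * precision * recall / (precision + recall)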
include: "_default_template_yaml"
group:
- french_bench
- french_bench_extra
description: "D'après l'information présente dans le contexte, est il possible de répondre à la question ?"
task: french_bench_fquadv2_bool
dataset_path: manu/fquad2_test
output_type: multiple_choice
validation_split: valid
doc_to_text: "\nContexte: {{context}}\n\nQuestion: {{question}}\n\nD'après l'information présente dans le contexte, répondre à la question est:\nA. Possible \nB. Impossible\n\nRéponse:"
doc_to_choice: ["A", "B"]
doc_to_target: "{{[False, True].index(is_impossible)}}"
should_decontaminate: true
doc_to_decontamination_query: context
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
include: "_default_template_yaml"
group:
- french_bench
- french_bench_gen
description: "D'après l'information dans le contexte donné, quelle question a été posée pour obtenir la réponse donnée ?"
task: french_bench_fquadv2_genq
dataset_path: manu/fquad2_test
output_type: generate_until
validation_split: valid_hasAns
test_split: test_hasAns
fewshot_split: valid_hasAns
doc_to_text: "\nContexte: {{context}}\n\nRéponse: {% if answers.text| length > 0 %}{{answers.text[0]}}{% else %}{{['Impossible']}}{% endif %}\n\nQuestion:"
doc_to_target: "{{question}}"
target_delimiter: " "
should_decontaminate: true
doc_to_decontamination_query: question
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.rouge1
    higher_is_better: true
    aggregation: !function utils.rouge1_agg
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
include: "_default_template_yaml"
group:
- french_bench
- french_bench_gen
description: "D'après l'information dans le contexte donné, donne la réponse à la question en citant quelques mots du contexte. Si il est impossible de répondre avec les informations du contexte, répond 'Impossible'."
task: french_bench_fquadv2_hasAns
dataset_path: manu/fquad2_test
output_type: generate_until
validation_split: valid_hasAns
test_split: test_hasAns
fewshot_split: valid_hasAns
doc_to_text: "\nContexte: {{context}}\n\nQuestion: {{question}}\n\nRéponse:"
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text[0]}}{% else %}{{['Impossible']}}{% endif %}"
target_delimiter: " "
should_decontaminate: true
doc_to_decontamination_query: context
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.rouge1
    higher_is_better: true
    aggregation: !function utils.rouge1_agg
include: "_default_template_yaml"
group:
- french_bench
- french_bench_mc
description: "Répond au mieux en complétant la question avec une des réponses proposées."
dataset_path: manu/french-bench-grammar-vocab-reading
output_type: multiple_choice
validation_split: Grammar
fewshot_split: Grammar
test_split: Grammar
#doc_to_text: "Question: {{question.strip()}}\nA: {{answerA}}\nB: {{answerB}}\nC: {{answerC}}\nD: {{answerD}}\nRéponse:"
#doc_to_choice: ["A", "B", "C", "D"]
doc_to_text: "La phrase suivante est correcte grammaticalement:\n"
doc_to_choice: "{{[question.replace('<...>', answerA), question.replace('<...>', answerB), question.replace('<...>', answerC), question.replace('<...>', answerD)]}}"
doc_to_target: '{{["answerA", "answerB", "answerC", "answerD"].index("answer" + answer)}}'
task: french_bench_grammar
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
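Rather than scoring bare letters, the grammar task builds each candidate by substituting the four answers into the `<...>` slot of the question, so the model compares complete sentences. With illustrative values (hypothetical row, not from the dataset):

# hypothetical example row; field names match the YAML templates above
doc = {
    "question": "Il <...> à la gare.",
    "answerA": "va", "answerB": "vont", "answerC": "allez", "answerD": "vas",
    "answer": "A",
}
choices = [doc["question"].replace("<...>", doc[key])
           for key in ("answerA", "answerB", "answerC", "answerD")]
print(choices[0])  # 'Il va à la gare.'
target = ["answerA", "answerB", "answerC", "answerD"].index("answer" + doc["answer"])
print(target)      # 0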
group:
- french_bench
- french_bench_mc
task: french_bench_hellaswag
dataset_path: manu/french_bench_hellaswag
output_type: multiple_choice
training_split: validation
validation_split: validation
test_split: null
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{label}}"
doc_to_choice: "{{choices}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
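utils.process_docs for hellaswag is referenced but not shown in this diff. A plausible sketch, modeled on the harness's English hellaswag preprocessing and assuming the French dataset keeps the same column names (ctx_a, ctx_b, activity_label, endings, label):

import re
import datasets


def preprocess(text):
    # strip "[title]" markers and other bracketed artifacts from the context
    text = text.strip().replace(" [title]", ". ")
    text = re.sub(r"\[.*?\]", "", text)
    return text.replace("  ", " ")


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process_doc(doc):
        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
        return {
            "query": preprocess(doc["activity_label"] + ": " + ctx),
            "choices": [preprocess(ending) for ending in doc["endings"]],
            "label": int(doc["label"]),
        }
    return dataset.map(_process_doc)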
include: "_default_template_yaml"
group:
- french_bench
- french_bench_gen
description: "D'après l'information dans le contexte donné, donne la réponse à la question en citant quelques extraits du contexte."
task: french_bench_multifquad
dataset_path: manu/multifquad_test
output_type: generate_until
validation_split: valid
test_split: test
fewshot_split: valid
doc_to_text: "\nContexte: {{context}}\n\nQuestion: {{question}}\n\nRéponse:"
doc_to_target: "{{', '.join(answers.text)}}"
target_delimiter: " "
should_decontaminate: true
doc_to_decontamination_query: context
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.rouge1
    higher_is_better: true
    aggregation: !function utils.rouge1_agg
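Judging by the metric/aggregation split, utils.rouge1 likely passes the raw pair through and utils.rouge1_agg does the scoring over the whole set. A sketch assuming Google's rouge_score package; only the two names come from the YAML:

from rouge_score import rouge_scorer, scoring


def rouge1(references, predictions, **kwargs):
    # pass the raw pair through; the aggregator computes the actual score
    return references[0], predictions[0]


def rouge1_agg(items):
    scorer = rouge_scorer.RougeScorer(["rouge1"])
    aggregator = scoring.BootstrapAggregator()
    for reference, prediction in items:
        aggregator.add_scores(scorer.score(reference, prediction))
    return aggregator.aggregate()["rouge1"].mid.fmeasure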
group:
- french_bench_perplexity
task: french_bench_opus_perplexity
dataset_path: manu/opus100-en-fr
output_type: loglikelihood_rolling
test_split: test
fewshot_split: validation
validation_split: validation
num_fewshot: 0
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
  - metric: word_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: byte_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: bits_per_byte
    aggregation: bits_per_byte
    higher_is_better: false
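For reference, the built-in weighted_perplexity and bits_per_byte aggregations reduce per-document (loglikelihood, weight) pairs roughly as follows, where the weight is the word or byte count (a sketch of the standard formulas, not the harness source):

import math


def weighted_perplexity(items):
    # items: iterable of (loglikelihood, weight) pairs
    loglikelihoods, weights = zip(*items)
    return math.exp(-sum(loglikelihoods) / sum(weights))


def bits_per_byte(items):
    loglikelihoods, n_bytes = zip(*items)
    return -sum(loglikelihoods) / (sum(n_bytes) * math.log(2))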
include: "_default_template_yaml"
group:
- french_bench
- french_bench_gen
description: "Résume l'article en une phrase."
task: french_bench_orangesum_abstract
dataset_path: orange_sum
dataset_name: abstract
output_type: generate_until
validation_split: validation
fewshot_split: validation
doc_to_text: "\nArticle: {{text}}\n\nRésumé:"
doc_to_target: "{{summary}}"
target_delimiter: " "
should_decontaminate: true
doc_to_decontamination_query: summary
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.rouge1
    higher_is_better: true
    aggregation: !function utils.rouge1_agg
include: "_default_template_yaml"
group:
- french_bench
- french_bench_extra
description: "Trouve le titre de l'article."
task: french_bench_orangesum_title
dataset_path: orange_sum
dataset_name: title
output_type: generate_until
validation_split: validation
fewshot_split: validation
doc_to_text: "\nArticle: {{text}}\n\nTitre:"
doc_to_target: "{{summary}}"
target_delimiter: " "
should_decontaminate: true
doc_to_decontamination_query: summary
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.rouge1
    higher_is_better: true
    aggregation: !function utils.rouge1_agg
include: "_default_template_yaml"
group:
- french_bench
- french_bench_extra
# description: "Répond au mieux en complétant la question avec une des réponses proposées."
dataset_path: manu/french-bench-grammar-vocab-reading
output_type: multiple_choice
validation_split: Reading
fewshot_split: Reading
test_split: Reading
# doc_to_text: "Context: {{context}}\nQuestion: {{question.strip()}}\nA: {{answerA}}\nB: {{answerB}}\nC: {{answerC}}\nD: {{answerD}}\nRéponse:"
# doc_to_choice: "{{['A: '+answerA, 'B: '+answerB, 'C: '+answerC, 'D: '+answerD]}}"
doc_to_text: "Context: {{context}}\n\n"
doc_to_choice: "{{[question.replace('<...>', answerA) if '<...>' in question else question + ' ' +answerA, question.replace('<...>', answerB) if '<...>' in question else question + ' ' + answerB, question.replace('<...>', answerC) if '<...>' in question else question + ' ' + answerC, question.replace('<...>', answerD) if '<...>' in question else question + ' ' + answerD]}}"
doc_to_target: '{{["answerA", "answerB", "answerC", "answerD"].index("answer" + answer)}}'
# doc_to_choice: "{{['A: '+answerA, 'B: '+answerB, 'C: '+answerC, 'D: '+answerD]}}"
# doc_to_target: answer
task: french_bench_reading_comp
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
include: "_default_template_yaml"
group:
- french_bench
- french_bench_extra
description: "A propos du thème spécifié, l'avis client est il positif, négatif, ou neutre ?"
task: french_bench_topic_based_nli
dataset_path: manu/topic_based_nli_test
output_type: multiple_choice
validation_split: valid
# doc_to_text: "\nAvis Client: {{text}}\n\nEn considèrant uniquement le thème \"{{topic}}\", l'avis client est plutot:\nA. Positif \nB. Négatif\nC. Mitigé \nD. Neutre\nE. Absent\n\nRéponse:"
# doc_to_choice: ["A", "B", "C", "D", "E"]
doc_to_text: "\nAvis Client: {{text}}\n\nA propos du thème \"{{topic}}\", l'avis client est"
doc_to_choice: ['positif', 'négatif', 'neutre']
doc_to_target: "{{['positif', 'negatif', 'neutre'].index(polarity)}}"
should_decontaminate: true
doc_to_decontamination_query: text
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
include: "_default_template_yaml"
group:
- french_bench
- french_bench_gen
task: french_bench_trivia
dataset_path: manu/french-trivia
output_type: generate_until
validation_split: train
test_split: train
fewshot_split: train
doc_to_text: "{{Question}}\nAnswer:"
doc_to_target: "{{Answer}}"
target_delimiter: " "
should_decontaminate: true
doc_to_decontamination_query: Question
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.rouge1
    higher_is_better: true
    aggregation: !function utils.rouge1_agg
  - metric: !function utils.is_included
    higher_is_better: true
    aggregation: mean
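utils.is_included is likewise not shown; given the name and the free-form trivia answers, a loose containment check is the likely shape (an assumption):

def is_included(references, predictions, **kwargs):
    # credit the generation if the gold answer appears anywhere inside it
    return float(references[0].lower() in predictions[0].lower())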
include: "_default_template_yaml"
group:
- french_bench
- french_bench_mc
# description: "Répond au mieux en complétant la question avec une des réponses proposées."
dataset_path: manu/french-bench-grammar-vocab-reading
output_type: multiple_choice
validation_split: Vocabulary
fewshot_split: Vocabulary
test_split: Vocabulary
# doc_to_text: "Question: {{question.strip()}}\nA: {{answerA}}\nB: {{answerB}}\nC: {{answerC}}\nD: {{answerD}}\nRéponse:"
# doc_to_choice: ["A", "B", "C", "D"]
doc_to_text: "La phrase suivante est logique sémantiquement:\n"
doc_to_choice: "{{[question.replace('<...>', answerA), question.replace('<...>', answerB), question.replace('<...>', answerC), question.replace('<...>', answerD)]}}"
doc_to_target: '{{["answerA", "answerB", "answerC", "answerD"].index("answer" + answer)}}'
task: french_bench_vocab
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
group:
- french_bench_perplexity
task: french_bench_wikitext_fr
dataset_path: asi/wikitext_fr
dataset_name: wikitext-35
output_type: loglikelihood_rolling
training_split: train
validation_split: validation
test_split: test
num_fewshot: 0
doc_to_text: ""
doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
process_results: !function preprocess_wikitext.process_results
should_decontaminate: true
doc_to_decontamination_query: "{{paragraph}}"
metric_list:
  - metric: word_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: byte_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: bits_per_byte
    aggregation: bits_per_byte
    higher_is_better: false
include: "_default_template_yaml"
group:
- french_bench
- french_bench_extra
description: "La prémisse et l'hypothèse sont elles en accord, neutres en elles, ou en contradiction ?"
dataset_path: xnli
dataset_name: fr
output_type: multiple_choice
validation_split: validation
fewshot_split: validation
test_split: test
# doc_to_text: "\nPrémisse: {{premise}}\n\nHypothèse: {{hypothesis}}\n\nLa prémisse et l'hypothèse sont:\nA. En accord\nB. Neutre\nC. En contradiction\nRéponse:"
# doc_to_choice: "{{['A: En accord', 'B: Neutre', 'C: En contradiction']}}"
doc_to_text: "\nPrémisse: {{premise}}\n\nHypothèse: {{hypothesis}}\n\nLa prémisse et l'hypothèse sont"
doc_to_choice: "{{['en accord', 'neutres entre elles', 'en contradiction']}}"
doc_to_target: label
task: french_bench_xnli
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
# preprocess_wikitext.py (referenced by the wikitext_fr task above)
import re


def wikitext_detokenizer(doc):
    string = doc["paragraph"]
    # contractions
    string = string.replace("s '", "s'")
    string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
    # number separators
    string = string.replace(" @-@ ", "-")
    string = string.replace(" @,@ ", ",")
    string = string.replace(" @.@ ", ".")
    # punctuation
    string = string.replace(" : ", ": ")
    string = string.replace(" ; ", "; ")
    string = string.replace(" . ", ". ")
    string = string.replace(" ! ", "! ")
    string = string.replace(" ? ", "? ")
    string = string.replace(" , ", ", ")
    # double brackets
    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
    # miscellaneous
    string = string.replace("= = = =", "====")
    string = string.replace("= = =", "===")
    string = string.replace("= =", "==")
    string = string.replace(" " + chr(176) + " ", chr(176))
    string = string.replace(" \n", "\n")
    string = string.replace("\n ", "\n")
    string = string.replace(" N ", " 1 ")
    string = string.replace(" 's", "'s")
    return string

def process_results(doc, results):
    (loglikelihood,) = results
    # IMPORTANT: wikitext counts the number of words in the *original* doc,
    # before detokenization
    _words = len(re.split(r"\s+", doc["paragraph"]))
    _bytes = len(doc["paragraph"].encode("utf-8"))
    return {
        "word_perplexity": (loglikelihood, _words),
        "byte_perplexity": (loglikelihood, _bytes),
        "bits_per_byte": (loglikelihood, _bytes),
    }
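A quick, illustrative invocation of the two helpers above (the example paragraph is made up):

if __name__ == "__main__":
    doc = {"paragraph": "Le chat , noir , dort . Il fait 20 @.@ 5 km ."}
    print(wikitext_detokenizer(doc))
    # counts are taken on the raw paragraph, per the IMPORTANT note above
    print(process_results(doc, (-42.0,)))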