Commit 02e841ce authored by lintangsutawika

Merge branch 'main' of https://github.com/EleutherAI/lm-evaluation-harness into t5v2-alt-plus

parents 90ad5db7 e74ec966
test_split: test
fewshot_split: valid
fewshot_config:
  sampler: first_n
group:
- french_bench
- french_bench_mc
task: french_bench_arc_challenge
dataset_path: manu/french_bench_arc_challenge
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Question: {{question}}\nRéponse:"
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answerKey)}}"
doc_to_choice: "{{choices}}"
should_decontaminate: true
doc_to_decontamination_query: "Question: {{question}}\nRéponse:"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
include: "_default_template_yaml"
group:
- french_bench
- french_bench_extra
description: "D'après l'information dans le contexte donné, quelle est la réponse à la question ?"
task: french_bench_boolqa
dataset_path: manu/french_boolq
output_type: multiple_choice
validation_split: valid
doc_to_text: "\nContexte: {{passage}}\n\nQuestion: {{question}}\n"
doc_to_choice: ["Oui", "Non"]
# doc_to_text: "\nContexte: {{passage}}\n\nQuestion: {{question}}\n\nD'après l'information dans le contexte, la réponse est:\nA. Oui \nB. Non\n\nRéponse:"
# doc_to_choice: ["A", "B"]
doc_to_target: "{{[1, 0].index(label)}}"
should_decontaminate: true
doc_to_decontamination_query: passage
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
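The doc_to_target trick above is worth spelling out: with doc_to_choice ["Oui", "Non"], the Jinja expression [1, 0].index(label) maps label 1 (true) to choice 0 and label 0 to choice 1. Evaluated in plain Python:

# the same expression the Jinja template evaluates, for both label values
for label in (1, 0):
    print(label, "->", ["Oui", "Non"][[1, 0].index(label)])
# 1 -> Oui
# 0 -> Non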
include: "_default_template_yaml"
group:
- french_bench
- french_bench_extra
description: "D'après l'information dans le contexte donné, donne la réponse à la question en citant quelques mots du contexte. Si il est impossible de répondre avec les informations du contexte, répond 'Impossible'."
task: french_bench_fquadv2
dataset_path: manu/fquad2_test
output_type: generate_until
validation_split: valid
doc_to_text: "\nContexte: {{context}}\n\nQuestion: {{question}}\n\nRéponse:"
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text[0]}}{% else %}{{['Impossible']}}{% endif %}"
target_delimiter: " "
should_decontaminate: true
doc_to_decontamination_query: context
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
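utils.exact and utils.f1 above point into the task's utils.py, which is not part of this diff. A minimal sketch of what they plausibly compute, modeled on SQuAD-style scoring; the signature, the normalize helper, and the French article list are all assumptions:

import re
import string


def normalize(text):
    # lowercase, drop punctuation and common French articles, collapse spaces
    text = text.lower()
    text = "".join(ch for ch in text if ch not in string.punctuation)
    text = re.sub(r"\b(le|la|les|un|une|des)\b", " ", text)
    return " ".join(text.split())


def exact(references, predictions, **kwargs):
    # 1.0 iff the normalized prediction equals the normalized gold answer
    return float(normalize(predictions[0]) == normalize(references[0]))


def f1(references, predictions, **kwargs):
    # token-level F1 between prediction and gold answer
    gold = normalize(references[0]).split()
    pred = normalize(predictions[0]).split()
    common = sum(min(gold.count(tok), pred.count(tok)) for tok in set(pred))
    if common == 0:
        return 0.0
    precision, recall = common / len(pred), common / len(gold)
    return 2 * precision * recall / (precision + recall)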
include: "_default_template_yaml"
group:
- french_bench
- french_bench_extra
description: "D'après l'information présente dans le contexte, est il possible de répondre à la question ?"
task: french_bench_fquadv2_bool
dataset_path: manu/fquad2_test
output_type: multiple_choice
validation_split: valid
doc_to_text: "\nContexte: {{context}}\n\nQuestion: {{question}}\n\nD'après l'information présente dans le contexte, répondre à la question est:\nA. Possible \nB. Impossible\n\nRéponse:"
doc_to_choice: ["A", "B"]
doc_to_target: "{{[False, True].index(is_impossible)}}"
should_decontaminate: true
doc_to_decontamination_query: context
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
include: "_default_template_yaml"
group:
- french_bench
- french_bench_gen
description: "D'après l'information dans le contexte donné, quelle question a été posée pour obtenir la réponse donnée ?"
task: french_bench_fquadv2_genq
dataset_path: manu/fquad2_test
output_type: generate_until
validation_split: valid_hasAns
test_split: test_hasAns
fewshot_split: valid_hasAns
doc_to_text: "\nContexte: {{context}}\n\nRéponse: {% if answers.text| length > 0 %}{{answers.text[0]}}{% else %}{{['Impossible']}}{% endif %}\n\nQuestion:"
doc_to_target: "{{question}}"
target_delimiter: " "
should_decontaminate: true
doc_to_decontamination_query: question
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.rouge1
    higher_is_better: true
    aggregation: !function utils.rouge1_agg
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
include: "_default_template_yaml"
group:
- french_bench
- french_bench_gen
description: "D'après l'information dans le contexte donné, donne la réponse à la question en citant quelques mots du contexte. Si il est impossible de répondre avec les informations du contexte, répond 'Impossible'."
task: french_bench_fquadv2_hasAns
dataset_path: manu/fquad2_test
output_type: generate_until
validation_split: valid_hasAns
test_split: test_hasAns
fewshot_split: valid_hasAns
doc_to_text: "\nContexte: {{context}}\n\nQuestion: {{question}}\n\nRéponse:"
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text[0]}}{% else %}{{['Impossible']}}{% endif %}"
target_delimiter: " "
should_decontaminate: true
doc_to_decontamination_query: context
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.rouge1
    higher_is_better: true
    aggregation: !function utils.rouge1_agg
include: "_default_template_yaml"
group:
- french_bench
- french_bench_mc
description: "Répond au mieux en complétant la question avec une des réponses proposées."
dataset_path: manu/french-bench-grammar-vocab-reading
output_type: multiple_choice
validation_split: Grammar
fewshot_split: Grammar
test_split: Grammar
#doc_to_text: "Question: {{question.strip()}}\nA: {{answerA}}\nB: {{answerB}}\nC: {{answerC}}\nD: {{answerD}}\nRéponse:"
#doc_to_choice: ["A", "B", "C", "D"]
doc_to_text: "La phrase suivante est correcte grammaticalement:\n"
doc_to_choice: "{{[question.replace('<...>', answerA), question.replace('<...>', answerB), question.replace('<...>', answerC), question.replace('<...>', answerD)]}}"
doc_to_target: '{{["answerA", "answerB", "answerC", "answerD"].index("answer" + answer)}}'
task: french_bench_grammar
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
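Rather than scoring bare letters, the grammar task builds each candidate by substituting the four answers into the `<...>` slot of the question, so the model compares complete sentences. With illustrative values (hypothetical row, not from the dataset):

# hypothetical example row; field names match the YAML templates above
doc = {
    "question": "Il <...> à la gare.",
    "answerA": "va", "answerB": "vont", "answerC": "allez", "answerD": "vas",
    "answer": "A",
}
choices = [doc["question"].replace("<...>", doc[key])
           for key in ("answerA", "answerB", "answerC", "answerD")]
print(choices[0])  # 'Il va à la gare.'
target = ["answerA", "answerB", "answerC", "answerD"].index("answer" + doc["answer"])
print(target)      # 0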
group:
- french_bench
- french_bench_mc
task: french_bench_hellaswag
dataset_path: manu/french_bench_hellaswag
output_type: multiple_choice
training_split: validation
validation_split: validation
test_split: null
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{label}}"
doc_to_choice: "{{choices}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
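utils.process_docs for hellaswag is referenced but not shown in this diff. A plausible sketch, modeled on the harness's English hellaswag preprocessing and assuming the French dataset keeps the same column names (ctx_a, ctx_b, activity_label, endings, label):

import re
import datasets


def preprocess(text):
    # strip "[title]" markers and other bracketed artifacts from the context
    text = text.strip().replace(" [title]", ". ")
    text = re.sub(r"\[.*?\]", "", text)
    return text.replace("  ", " ")


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process_doc(doc):
        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
        return {
            "query": preprocess(doc["activity_label"] + ": " + ctx),
            "choices": [preprocess(ending) for ending in doc["endings"]],
            "label": int(doc["label"]),
        }
    return dataset.map(_process_doc)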
include: "_default_template_yaml"
group:
- french_bench
- french_bench_gen
description: "D'après l'information dans le contexte donné, donne la réponse à la question en citant quelques extraits du contexte."
task: french_bench_multifquad
dataset_path: manu/multifquad_test
output_type: generate_until
validation_split: valid
test_split: test
fewshot_split: valid
doc_to_text: "\nContexte: {{context}}\n\nQuestion: {{question}}\n\nRéponse:"
doc_to_target: "{{', '.join(answers.text)}}"
target_delimiter: " "
should_decontaminate: true
doc_to_decontamination_query: context
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.rouge1
    higher_is_better: true
    aggregation: !function utils.rouge1_agg
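Judging by the metric/aggregation split, utils.rouge1 likely passes the raw pair through and utils.rouge1_agg does the scoring over the whole set. A sketch assuming Google's rouge_score package; only the two names come from the YAML:

from rouge_score import rouge_scorer, scoring


def rouge1(references, predictions, **kwargs):
    # pass the raw pair through; the aggregator computes the actual score
    return references[0], predictions[0]


def rouge1_agg(items):
    scorer = rouge_scorer.RougeScorer(["rouge1"])
    aggregator = scoring.BootstrapAggregator()
    for reference, prediction in items:
        aggregator.add_scores(scorer.score(reference, prediction))
    return aggregator.aggregate()["rouge1"].mid.fmeasure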
group:
- french_bench_perplexity
task: french_bench_opus_perplexity
dataset_path: manu/opus100-en-fr
output_type: loglikelihood_rolling
test_split: test
fewshot_split: validation
validation_split: validation
num_fewshot: 0
doc_to_text: ""
doc_to_target: "{{text}}"
should_decontaminate: true
doc_to_decontamination_query: "{{text}}"
metric_list:
  - metric: word_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: byte_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: bits_per_byte
    aggregation: bits_per_byte
    higher_is_better: false
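For reference, the built-in weighted_perplexity and bits_per_byte aggregations reduce per-document (loglikelihood, weight) pairs roughly as follows, where the weight is the word or byte count (a sketch of the standard formulas, not the harness source):

import math


def weighted_perplexity(items):
    # items: iterable of (loglikelihood, weight) pairs
    loglikelihoods, weights = zip(*items)
    return math.exp(-sum(loglikelihoods) / sum(weights))


def bits_per_byte(items):
    loglikelihoods, n_bytes = zip(*items)
    return -sum(loglikelihoods) / (sum(n_bytes) * math.log(2))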
include: "_default_template_yaml"
group:
- french_bench
- french_bench_gen
description: "Résume l'article en une phrase."
task: french_bench_orangesum_abstract
dataset_path: orange_sum
dataset_name: abstract
output_type: generate_until
validation_split: validation
fewshot_split: validation
doc_to_text: "\nArticle: {{text}}\n\nRésumé:"
doc_to_target: "{{summary}}"
target_delimiter: " "
should_decontaminate: true
doc_to_decontamination_query: summary
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.rouge1
    higher_is_better: true
    aggregation: !function utils.rouge1_agg
include: "_default_template_yaml"
group:
- french_bench
- french_bench_extra
description: "Trouve le titre de l'article."
task: french_bench_orangesum_title
dataset_path: orange_sum
dataset_name: title
output_type: generate_until
validation_split: validation
fewshot_split: validation
doc_to_text: "\nArticle: {{text}}\n\nTitre:"
doc_to_target: "{{summary}}"
target_delimiter: " "
should_decontaminate: true
doc_to_decontamination_query: summary
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.rouge1
    higher_is_better: true
    aggregation: !function utils.rouge1_agg
include: "_default_template_yaml"
group:
- french_bench
- french_bench_extra
# description: "Répond au mieux en complétant la question avec une des réponses proposées."
dataset_path: manu/french-bench-grammar-vocab-reading
output_type: multiple_choice
validation_split: Reading
fewshot_split: Reading
test_split: Reading
# doc_to_text: "Context: {{context}}\nQuestion: {{question.strip()}}\nA: {{answerA}}\nB: {{answerB}}\nC: {{answerC}}\nD: {{answerD}}\nRéponse:"
# doc_to_choice: "{{['A: '+answerA, 'B: '+answerB, 'C: '+answerC, 'D: '+answerD]}}"
doc_to_text: "Context: {{context}}\n\n"
doc_to_choice: "{{[question.replace('<...>', answerA) if '<...>' in question else question + ' ' +answerA, question.replace('<...>', answerB) if '<...>' in question else question + ' ' + answerB, question.replace('<...>', answerC) if '<...>' in question else question + ' ' + answerC, question.replace('<...>', answerD) if '<...>' in question else question + ' ' + answerD]}}"
doc_to_target: '{{["answerA", "answerB", "answerC", "answerD"].index("answer" + answer)}}'
# doc_to_choice: "{{['A: '+answerA, 'B: '+answerB, 'C: '+answerC, 'D: '+answerD]}}"
# doc_to_target: answer
task: french_bench_reading_comp
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
include: "_default_template_yaml"
group:
- french_bench
- french_bench_extra
description: "A propos du thème spécifié, l'avis client est il positif, négatif, ou neutre ?"
task: french_bench_topic_based_nli
dataset_path: manu/topic_based_nli_test
output_type: multiple_choice
validation_split: valid
# doc_to_text: "\nAvis Client: {{text}}\n\nEn considèrant uniquement le thème \"{{topic}}\", l'avis client est plutot:\nA. Positif \nB. Négatif\nC. Mitigé \nD. Neutre\nE. Absent\n\nRéponse:"
# doc_to_choice: ["A", "B", "C", "D", "E"]
doc_to_text: "\nAvis Client: {{text}}\n\nA propos du thème \"{{topic}}\", l'avis client est"
doc_to_choice: ['positif', 'négatif', 'neutre']
doc_to_target: "{{['positif', 'negatif', 'neutre'].index(polarity)}}"
should_decontaminate: true
doc_to_decontamination_query: text
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
include: "_default_template_yaml"
group:
- french_bench
- french_bench_gen
task: french_bench_trivia
dataset_path: manu/french-trivia
output_type: generate_until
validation_split: train
test_split: train
fewshot_split: train
doc_to_text: "{{Question}}\nAnswer:"
doc_to_target: "{{Answer}}"
target_delimiter: " "
should_decontaminate: true
doc_to_decontamination_query: Question
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.rouge1
    higher_is_better: true
    aggregation: !function utils.rouge1_agg
  - metric: !function utils.is_included
    higher_is_better: true
    aggregation: mean
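utils.is_included is likewise not shown; given the name and the free-form trivia answers, a loose containment check is the likely shape (an assumption):

def is_included(references, predictions, **kwargs):
    # credit the generation if the gold answer appears anywhere inside it
    return float(references[0].lower() in predictions[0].lower())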
include: "_default_template_yaml"
group:
- french_bench
- french_bench_mc
# description: "Répond au mieux en complétant la question avec une des réponses proposées."
dataset_path: manu/french-bench-grammar-vocab-reading
output_type: multiple_choice
validation_split: Vocabulary
fewshot_split: Vocabulary
test_split: Vocabulary
# doc_to_text: "Question: {{question.strip()}}\nA: {{answerA}}\nB: {{answerB}}\nC: {{answerC}}\nD: {{answerD}}\nRéponse:"
# doc_to_choice: ["A", "B", "C", "D"]
doc_to_text: "La phrase suivante est logique sémantiquement:\n"
doc_to_choice: "{{[question.replace('<...>', answerA), question.replace('<...>', answerB), question.replace('<...>', answerC), question.replace('<...>', answerD)]}}"
doc_to_target: '{{["answerA", "answerB", "answerC", "answerD"].index("answer" + answer)}}'
task: french_bench_vocab
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
group:
- french_bench_perplexity
task: french_bench_wikitext_fr
dataset_path: asi/wikitext_fr
dataset_name: wikitext-35
output_type: loglikelihood_rolling
training_split: train
validation_split: validation
test_split: test
num_fewshot: 0
doc_to_text: ""
doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
process_results: !function preprocess_wikitext.process_results
should_decontaminate: true
doc_to_decontamination_query: "{{paragraph}}"
metric_list:
  - metric: word_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: byte_perplexity
    aggregation: weighted_perplexity
    higher_is_better: false
  - metric: bits_per_byte
    aggregation: bits_per_byte
    higher_is_better: false
include: "_default_template_yaml"
group:
- french_bench
- french_bench_extra
description: "La prémisse et l'hypothèse sont elles en accord, neutres en elles, ou en contradiction ?"
dataset_path: xnli
dataset_name: fr
output_type: multiple_choice
validation_split: validation
fewshot_split: validation
test_split: test
# doc_to_text: "\nPrémisse: {{premise}}\n\nHypothèse: {{hypothesis}}\n\nLa prémisse et l'hypothèse sont:\nA. En accord\nB. Neutre\nC. En contradiction\nRéponse:"
# doc_to_choice: "{{['A: En accord', 'B: Neutre', 'C: En contradiction']}}"
doc_to_text: "\nPrémisse: {{premise}}\n\nHypothèse: {{hypothesis}}\n\nLa prémisse et l'hypothèse sont"
doc_to_choice: "{{['en accord', 'neutres entre elles', 'en contradiction']}}"
doc_to_target: label
task: french_bench_xnli
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
# preprocess_wikitext.py (referenced by the wikitext_fr task above)
import re


def wikitext_detokenizer(doc):
    string = doc["paragraph"]
    # contractions
    string = string.replace("s '", "s'")
    string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
    # number separators
    string = string.replace(" @-@ ", "-")
    string = string.replace(" @,@ ", ",")
    string = string.replace(" @.@ ", ".")
    # punctuation
    string = string.replace(" : ", ": ")
    string = string.replace(" ; ", "; ")
    string = string.replace(" . ", ". ")
    string = string.replace(" ! ", "! ")
    string = string.replace(" ? ", "? ")
    string = string.replace(" , ", ", ")
    # double brackets
    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
    # miscellaneous
    string = string.replace("= = = =", "====")
    string = string.replace("= = =", "===")
    string = string.replace("= =", "==")
    string = string.replace(" " + chr(176) + " ", chr(176))
    string = string.replace(" \n", "\n")
    string = string.replace("\n ", "\n")
    string = string.replace(" N ", " 1 ")
    string = string.replace(" 's", "'s")
    return string

def process_results(doc, results):
    (loglikelihood,) = results
    # IMPORTANT: wikitext counts the number of words in the *original* doc,
    # before detokenization
    _words = len(re.split(r"\s+", doc["paragraph"]))
    _bytes = len(doc["paragraph"].encode("utf-8"))
    return {
        "word_perplexity": (loglikelihood, _words),
        "byte_perplexity": (loglikelihood, _bytes),
        "bits_per_byte": (loglikelihood, _bytes),
    }
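A quick, illustrative invocation of the two helpers above (the example paragraph is made up):

if __name__ == "__main__":
    doc = {"paragraph": "Le chat , noir , dort . Il fait 20 @.@ 5 km ."}
    print(wikitext_detokenizer(doc))
    # counts are taken on the raw paragraph, per the IMPORTANT note above
    print(process_results(doc, (-42.0,)))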