Commit 948f120f authored by Baber

Merge branch 'main' into autobatchtest

# Conflicts:
#	lm_eval/models/huggingface.py
parents a5b1c7a8 bd80a6c0
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_it-ca
doc_to_text: 'Italian sentence: {{sentence_ita_Latn}}

  Catalan sentence:'
doc_to_target: '{{sentence_cat_Latn}}'
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_pt-ca
doc_to_text: 'Portuguese sentence: {{sentence_por_Latn}}

  Catalan sentence:'
doc_to_target: '{{sentence_cat_Latn}}'
task: mgsm_direct_ca
dataset_path: projecte-aina/mgsm_ca
doc_to_target: '{{answer_number|string}}'
doc_to_text: '{% if answer != None %}{{question + "\nResposta: "}}{% else %}{{"Pregunta: " + question + "\nResposta: "}}{% endif %}'
output_type: generate_until
training_split: train
test_split: test
target_delimiter: ""
generation_kwargs:
  until:
    - "\n\n"
    - "\n"
filter_list:
  - name: remove_whitespace
    filter:
      - function: remove_whitespace
      - function: take_first
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
metadata:
  version: 1.0
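The remove_whitespace -> take_first filter chain above post-processes each generation before exact_match is scored: the prompt already ends with "Resposta: " and target_delimiter is empty, so stripping leading whitespace keeps stray spaces from breaking the match. The following is a hypothetical, standalone sketch of what that pipeline does to a raw generation; these helpers are illustrative stand-ins, not lm-eval's actual filter implementations, and the example strings are invented.

def remove_whitespace(response: str) -> str:
    # strip leading whitespace left at the prompt/answer boundary
    return response.lstrip()

def take_first(responses: list) -> str:
    # keep only the first generation produced for a document
    return responses[0]

raw_generations = [" 42", " quaranta-dos"]  # invented model outputs
prediction = take_first([remove_whitespace(r) for r in raw_generations])
print(prediction)  # "42" -> then compared to doc_to_target via exact_match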
task: openbookqa_ca
dataset_path: projecte-aina/openbookqa_ca
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
doc_to_text: question_stem
doc_to_target: "{{choices.label.index(answerKey.lstrip())}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: question_stem
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
task: parafraseja
dataset_path: projecte-aina/Parafraseja
output_type: multiple_choice
dataset_name: null
test_split: test
training_split: train
validation_split: validation
doc_to_choice: '{{[sentence1+", veritat? No, "+sentence2, sentence1+", veritat? Sí, "+sentence2]}}'
process_docs: !function utils.process_docs_paraphrases
doc_to_text: ''
doc_to_target: label
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
task: paws_ca
dataset_path: projecte-aina/PAWS-ca
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
process_docs: !function utils.process_docs_paraphrases
doc_to_text: ''
doc_to_target: label
doc_to_choice: '{{[sentence1+", veritat? No, "+sentence2, sentence1+", veritat? Sí, "+sentence2]}}'
target_delimiter: ''
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
tag: phrases_va
dataset_path: gplsi/CA-VA_alignment_test
output_type: generate_until
training_split: null
validation_split: null
test_split: test
fewshot_split: test
num_fewshot: 5
target_delimiter: ' '
generation_kwargs:
  until:
    - "\n"
metric_list:
  - metric: bleu
    aggregation: bleu
    higher_is_better: true
  - metric: ter
    aggregation: ter
    higher_is_better: false
  - metric: chrf
    aggregation: chrf
    higher_is_better: true
metadata:
  version: 1.0
# File generated by `create-yamls.py`
include: _phrases_va_common.yaml
task: phrases_ca-va
doc_to_text: 'Oració en català: {{ca}}

  Oració en valencià:'
doc_to_target: '{{va}}'
# File generated by `create-yamls.py`
include: _phrases_va_common.yaml
task: phrases_va-ca
doc_to_text: 'Oració en valencià: {{va}}

  Oració en català:'
doc_to_target: '{{ca}}'
task: piqa_ca
dataset_path: projecte-aina/piqa_ca
dataset_name: null
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: null
doc_to_text: "Pregunta: {{goal}}\nResposta:"
doc_to_target: label
doc_to_choice: "{{[sol1, sol2]}}"
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
task: siqa_ca
dataset_path: projecte-aina/siqa_ca
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: null
doc_to_text: "Pregunta: {{context}} {{question}}\nResposta:"
target_delimiter: " "
doc_to_choice: "{{[answerA, answerB, answerC]}}"
doc_to_target: "{{ (label|int) - 1 }}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
task: teca
dataset_path: projecte-aina/teca
dataset_name: null
training_split: train
validation_split: validation
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_doc_nli
doc_to_text: ""
doc_to_target: label
target_delimiter: ""
doc_to_choice: '{{[premise + ", correcte? Sí, " + hypothesis, premise + ", correcte? A més, " + hypothesis, premise + ", correcte? No, " + hypothesis]}}'
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
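For intuition, the doc_to_choice template above turns each NLI pair into three full continuations that are scored against an empty doc_to_text, so the model effectively picks the most likely connective. The sketch below renders the three choices for one invented example, assuming the usual entailment / neutral / contradiction label order and the preprocessing done by process_doc_nli (premise punctuation stripped, hypothesis lowercased and dot-terminated).

# Illustrative only: the sentences are invented, not taken from projecte-aina/teca.
premise = "El concert es va ajornar per la pluja"       # final "." already removed
hypothesis = "l'actuació no es va fer el dia previst."  # lowercased, ends with "."

choices = [
    premise + ", correcte? Sí, " + hypothesis,     # label 0 (entailment, assumed)
    premise + ", correcte? A més, " + hypothesis,  # label 1 (neutral, assumed)
    premise + ", correcte? No, " + hypothesis,     # label 2 (contradiction, assumed)
]
for choice in choices:
    print(choice)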
import re
from itertools import product

import evaluate
import transformers.data.metrics.squad_metrics as squad_metrics

from lm_eval.utils import general_detokenize


def lowercase_first_letter(text):
    return text[0].lower() + text[1:]
def process_doc_nli(dataset):
    def process_fn(doc):
        # Detokenize (remove extra whitespace)
        doc["premise"] = general_detokenize(doc["premise"]).strip()
        doc["hypothesis"] = general_detokenize(doc["hypothesis"]).strip()
        # Remove the final punctuation mark in the premise
        doc["premise"] = (
            doc["premise"][:-1]
            if doc["premise"].endswith((".", ",", "!", "?"))
            else doc["premise"]
        )
        # Lowercase the first letter in the hypothesis
        doc["hypothesis"] = lowercase_first_letter(doc["hypothesis"])
        # Ensure that the hypothesis ends with a dot
        doc["hypothesis"] = (
            (doc["hypothesis"] + ".")
            if not doc["hypothesis"].endswith(".")
            else doc["hypothesis"]
        )
        return doc

    return dataset.map(process_fn)
def process_results_coqcat(doc, results):
    # Gather every acceptable answer for the last turn
    turn_id = len(doc["questions"])
    answers = [doc["answers"]["input_text"][turn_id - 1]]
    additional_answers_list = doc.get("additional_answers")
    if additional_answers_list:
        for additional_answers in additional_answers_list.values():
            if additional_answers["input_text"][turn_id - 1].lower() not in map(
                str.lower, answers
            ):
                answers.append(additional_answers["input_text"][turn_id - 1])
    gold_list = answers
    pred = results[0].strip().split("\n")[0]

    f1_sum = 0.0
    em_sum = 0.0
    if len(gold_list) > 1:
        for i in range(len(gold_list)):
            # Compare the prediction against all golds except the i-th one and
            # take the maximum score (CoQA-style multi-reference evaluation)
            gold_answers = gold_list[0:i] + gold_list[i + 1 :]
            em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers)
            f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers)
    else:
        em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list)
        f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list)

    return {
        "em": em_sum / max(1, len(gold_list)),
        "f1": f1_sum / max(1, len(gold_list)),
    }
def process_results_qa(doc, results):
    pred = results[0]
    reference = doc["answers"][0]["text"]
    f1 = squad_metrics.compute_f1(reference, pred)
    exact_match = squad_metrics.compute_exact(reference, pred)
    return {"f1": f1, "exact_match": exact_match}
def process_doc_cabreu(dataset):
    def process_fn(doc):
        # Remove duplicate spaces
        doc["content"] = re.sub(r" +", " ", doc["content"])
        # Do the same for every summary (3 summary types x 3 annotators)
        for summary_type, index in product(
            ["abstractive", "extractive", "extreme"], ["a1", "a2", "a3"]
        ):
            doc["summaries"][summary_type][index] = re.sub(
                r" +", " ", doc["summaries"][summary_type][index]
            )
        return doc

    return dataset.map(process_fn)
def process_docs_paraphrases(dataset):
    empty_docs = []

    def _process_doc(doc):
        if doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]:
            doc["sentence1"] = general_detokenize(doc["sentence1"]).strip()
            doc["sentence2"] = general_detokenize(doc["sentence2"]).strip()
            # Remove final punctuation mark in the first sentence
            if doc["sentence1"].endswith((".", ",", ";")):
                doc["sentence1"] = doc["sentence1"][:-1]
            # Start the second sentence in lowercase (to be used after "Yes, ...")
            doc["sentence2"] = lowercase_first_letter(doc["sentence2"])
            return doc
        else:
            empty_docs.append(doc)
            return doc

    return dataset.filter(
        lambda doc: doc["sentence1"] not in [None, ""]
        and doc["sentence2"] not in [None, ""]
    ).map(_process_doc)
def process_docs_copa_ca(dataset):
    def _process_doc(doc):
        doc["choice1"] = lowercase_first_letter(doc["choice1"])
        doc["choice2"] = lowercase_first_letter(doc["choice2"])
        return doc

    return dataset.map(_process_doc)
def rouge1(items):
    """Passthrough: keep the (reference, prediction) pairs and defer scoring to the aggregation."""
    return items


def rouge1_agg(items):
    """Corpus-level ROUGE-1 over all (reference, prediction) pairs. Higher is better."""
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    rouge_scorer = evaluate.load("rouge")
    return rouge_scorer.compute(predictions=preds, references=refs)["rouge1"]
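As a quick sanity check of the passthrough-plus-aggregation pattern above, rouge1_agg can be called directly on a small list of (reference, prediction) pairs; this only relies on the Hugging Face evaluate ROUGE metric already imported in this file. The sentences below are invented, purely for illustration.

items = [
    ("el gat dorm al sofà", "el gat dorm al sofà"),        # perfect overlap
    ("la reunió és demà al matí", "la reunió serà demà"),  # partial overlap
]
# items are (reference, prediction) tuples, matching the order the passthrough metric emits
print(rouge1_agg(items))  # corpus-level ROUGE-1 score between 0 and 1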
task: wnli_ca
dataset_path: projecte-aina/wnli-ca
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: null
doc_to_text: "{{sentence1}}\nPregunta: {{sentence2}} Cert o Fals?\nResposta:"
doc_to_target: label
doc_to_choice: ["Fals", "Cert"]
metric_list:
  - metric: acc
metadata:
  version: 1.0
task: xnli_ca
dataset_path: projecte-aina/xnli-ca
dataset_name: null
include: ../xnli/xnli_common_yaml
output_type: multiple_choice
doc_to_choice: '{{[premise+", correcte? Sí, "+hypothesis,premise+", correcte? A més,
  "+hypothesis,premise+", correcte? No, "+hypothesis]}}'
doc_to_text: ''
target_delimiter: ''
process_docs: !function utils.process_doc_nli
training_split: null
validation_split: validation
doc_to_target: label
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
task: xquad_ca
dataset_path: projecte-aina/xquad-ca
dataset_name: null
output_type: generate_until
doc_to_text: "Context: {{context}}\n\nPregunta: {{question}}\n\nResposta:"
doc_to_target: '{{answers[0]["text"]}}'
validation_split: null
test_split: test
target_delimiter: ' '
process_results: !function utils.process_results_qa
generation_kwargs:
  until:
    - "\n"
  do_sample: false
  temperature: 0.0
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
  - metric: f1
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
task: xstorycloze_ca
dataset_path: projecte-aina/xstorycloze_ca
dataset_name: ca
output_type: multiple_choice
training_split: train
validation_split: eval
doc_to_text: "{{[input_sentence_1, input_sentence_2, input_sentence_3, input_sentence_4]|join(' ')}}"
doc_to_target: "{{answer_right_ending-1}}"
doc_to_choice: "{{[sentence_quiz1, sentence_quiz2]}}"
should_decontaminate: true
doc_to_decontamination_query: "{{[input_sentence_1, input_sentence_2, input_sentence_3, input_sentence_4]|join(' ')}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
aggregate_metric_list:
  - aggregation: mean
    metric: acc
    weight_by_size: true
  - aggregation: mean
    metric: acc_norm
    weight_by_size: true
group: ceval-valid
metadata:
  version: 2.0
task:
- ceval-valid_computer_network
- ceval-valid_operating_system
- ceval-valid_computer_architecture
- ceval-valid_college_programming
- ceval-valid_college_physics
- ceval-valid_college_chemistry
- ceval-valid_advanced_mathematics
- ceval-valid_probability_and_statistics
- ceval-valid_discrete_mathematics
- ceval-valid_electrical_engineer
- ceval-valid_metrology_engineer
- ceval-valid_high_school_mathematics
- ceval-valid_high_school_physics
- ceval-valid_high_school_chemistry
- ceval-valid_high_school_biology
- ceval-valid_middle_school_mathematics
- ceval-valid_middle_school_biology
- ceval-valid_middle_school_physics
- ceval-valid_middle_school_chemistry
- ceval-valid_veterinary_medicine
- ceval-valid_college_economics
- ceval-valid_business_administration
- ceval-valid_marxism
- ceval-valid_mao_zedong_thought
- ceval-valid_education_science
- ceval-valid_teacher_qualification
- ceval-valid_high_school_politics
- ceval-valid_high_school_geography
- ceval-valid_middle_school_politics
- ceval-valid_middle_school_geography
- ceval-valid_modern_chinese_history
- ceval-valid_ideological_and_moral_cultivation
- ceval-valid_logic
- ceval-valid_law
- ceval-valid_chinese_language_and_literature
- ceval-valid_art_studies
- ceval-valid_professional_tour_guide
- ceval-valid_legal_professional
- ceval-valid_high_school_chinese
- ceval-valid_high_school_history
- ceval-valid_middle_school_history
- ceval-valid_civil_servant
- ceval-valid_sports_science
- ceval-valid_plant_protection
- ceval-valid_basic_medicine
- ceval-valid_clinical_medicine
- ceval-valid_urban_and_rural_planner
- ceval-valid_accountant
- ceval-valid_fire_engineer
- ceval-valid_environmental_impact_assessment_engineer
- ceval-valid_tax_accountant
- ceval-valid_physician
-group: ceval-valid
 dataset_path: ceval/ceval-exam
 validation_split: val
 fewshot_split: dev
@@ -16,4 +15,4 @@ metric_list:
     aggregation: mean
     higher_is_better: true
 metadata:
-  version: 1.0
+  version: 2.0
@@ -8,7 +8,7 @@ import os
 import yaml
 from tqdm import tqdm
-from lm_eval.logger import eval_logger
+from lm_eval.utils import eval_logger
 SUBJECTS = {
@@ -117,3 +117,26 @@ if __name__ == "__main__":
                 allow_unicode=True,
                 default_style='"',
             )
+
+    # write group config out
+    group_yaml_dict = {
+        "group": "ceval-valid",
+        "task": [f"ceval-valid_{task_name}" for task_name in SUBJECTS.keys()],
+        "aggregate_metric_list": [
+            {"metric": "acc", "aggregation": "mean", "weight_by_size": True},
+            {"metric": "acc_norm", "aggregation": "mean", "weight_by_size": True},
+        ],
+        "metadata": {"version": 1.0},
+    }
+
+    file_save_path = "_" + args.save_prefix_path + ".yaml"
+    with open(file_save_path, "w", encoding="utf-8") as group_yaml_file:
+        yaml.dump(
+            group_yaml_dict,
+            group_yaml_file,
+            width=float("inf"),
+            allow_unicode=True,
+            default_style='"',
+        )