Commit 25869601 authored by Baber's avatar Baber
Browse files

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/hf_vlms.py
parents 56f40c53 c1d8795d
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_es-gl
doc_to_text: 'Spanish sentence: {{sentence_spa_Latn}}
Galician sentence:'
doc_to_target: '{{sentence_glg_Latn}}'
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_es-it
doc_to_text: 'Spanish sentence: {{sentence_spa_Latn}}
Italian sentence:'
doc_to_target: '{{sentence_ita_Latn}}'
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_es-pt
doc_to_text: 'Spanish sentence: {{sentence_spa_Latn}}
Portuguese sentence:'
doc_to_target: '{{sentence_por_Latn}}'
group: flores_es
task:
- flores_es-en
- flores_en-es
- flores_es-eu
- flores_eu-es
- flores_es-pt
- flores_pt-es
- flores_es-it
- flores_it-es
- flores_es-fr
- flores_fr-es
- flores_es-ca
- flores_ca-es
- flores_es-gl
- flores_gl-es
- flores_es-de
- flores_de-es
aggregate_metric_list:
- metric: bleu
aggregation: mean
weight_by_size: false
metadata:
version: 1.0
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_eu-es
doc_to_text: 'Basque sentence: {{sentence_eus_Latn}}
Spanish sentence:'
doc_to_target: '{{sentence_spa_Latn}}'
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_fr-es
doc_to_text: 'French sentence: {{sentence_fra_Latn}}
Spanish sentence:'
doc_to_target: '{{sentence_spa_Latn}}'
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_gl-es
doc_to_text: 'Galician sentence: {{sentence_glg_Latn}}
Spanish sentence:'
doc_to_target: '{{sentence_spa_Latn}}'
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_it-es
doc_to_text: 'Italian sentence: {{sentence_ita_Latn}}
Spanish sentence:'
doc_to_target: '{{sentence_spa_Latn}}'
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_pt-es
doc_to_text: 'Portuguese sentence: {{sentence_por_Latn}}
Spanish sentence:'
doc_to_target: '{{sentence_spa_Latn}}'
include: ../mgsm/direct/mgsm_direct_es.yaml
doc_to_target: '{{answer_number|string}}'
doc_to_text: '{% if answer is not none %}{{question+"\nRespuesta: "}}{% else %}{{"Pregunta: "+question+"\nRespuesta: "}}{% endif %}'
generation_kwargs:
until:
- "\n\n"
- "\n"
task: mgsm_direct_es_spanish_bench
task: openbookqa_es
dataset_path: BSC-LT/openbookqa-es
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
doc_to_text: question_stem
doc_to_target: "{{choices.label.index(answerKey.lstrip())}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: question_stem
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: paws_es_spanish_bench
dataset_path: paws-x
dataset_name: es
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
process_docs: !function utils.process_docs_paraphrases
doc_to_text: ''
doc_to_target: label
doc_to_choice: '{{[sentence1+", ¿verdad? No, "+sentence2, sentence1+", ¿verdad? Sí, "+sentence2]}}'
target_delimiter: ''
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
tag: phrases_es
dataset_path: gplsi/ES-VA_translation_test
output_type: generate_until
training_split: null
validation_split: null
test_split: test
fewshot_split: test
num_fewshot: 5
target_delimiter: ' '
generation_kwargs:
until:
- "\n"
metric_list:
- metric: bleu
aggregation: bleu
higher_is_better: true
- metric: ter
aggregation: ter
higher_is_better: false
- metric: chrf
aggregation: chrf
higher_is_better: true
metadata:
version: 1.0
# File generated by `create-yamls.py`
include: _phrases_es_common.yaml
task: phrases_es-va
doc_to_text: 'Oració en espanyol: {{es}}
Oració en valencià:'
doc_to_target: '{{va}}'
# File generated by `create-yamls.py`
include: _phrases_es_common.yaml
task: phrases_va-es
doc_to_text: 'Oració en valencià: {{va}}
Oració en espanyol:'
doc_to_target: '{{es}}'
group: spanish_bench
task:
- belebele_spa_Latn
- copa_es
- escola
- openbookqa_es
- wnli_es
- xnli_es_spanish_bench
- xstorycloze_es
- xquad_es
- xlsum_es
- paws_es_spanish_bench
- mgsm_direct_es_spanish_bench
- flores_es
- phrases_es
metadata:
version: 1.0
import re
from itertools import product
import evaluate
import transformers.data.metrics.squad_metrics as squad_metrics
from lm_eval.utils import general_detokenize
def lowercase_first_letter(text):
return text[0].lower() + text[1:]
def process_doc_nli(dataset):
    """Normalize premise/hypothesis pairs so they slot into the NLI prompt.

    The prompt joins ``premise + ", ¿correcto? Sí/No, " + hypothesis``, so
    the premise must not end in punctuation and the hypothesis must start
    lowercase and end with a period.
    """

    def _normalize(doc):
        # Detokenize (collapse extra whitespace) and trim both fields.
        premise = general_detokenize(doc["premise"]).strip()
        hypothesis = general_detokenize(doc["hypothesis"]).strip()

        # Strip a trailing punctuation mark from the premise, if any.
        if premise.endswith((".", ",", "!", "?")):
            premise = premise[:-1]

        # Hypothesis follows "Sí,/No," — lowercase its first letter and
        # make sure it ends with a period.
        hypothesis = lowercase_first_letter(hypothesis)
        if not hypothesis.endswith("."):
            hypothesis += "."

        doc["premise"] = premise
        doc["hypothesis"] = hypothesis
        return doc

    return dataset.map(_normalize)
def process_results_qa(doc, results):
    """Score the first generated answer against the first gold answer.

    Returns SQuAD-style ``f1`` and ``exact_match`` metrics for one doc.
    """
    prediction = results[0]
    gold = doc["answers"]["text"][0]
    return {
        "f1": squad_metrics.compute_f1(gold, prediction),
        "exact_match": squad_metrics.compute_exact(gold, prediction),
    }
def process_xlsum(dataset):
    """Collapse runs of spaces in the ``text`` and ``summary`` fields."""

    def _squeeze_spaces(doc):
        for key in ("text", "summary"):
            doc[key] = re.sub(r" +", " ", doc[key])
        return doc

    return dataset.map(_squeeze_spaces)
def process_docs_paraphrases(dataset):
    """Drop docs with a missing sentence, then normalize the survivors.

    Normalization: detokenize both sentences, strip a trailing ``.``/``,``/``;``
    from the first sentence, and lowercase the first letter of the second
    sentence (it follows "Sí,/No," in the prompt).

    BUG FIX: previously ``empty_docs`` was inspected *before* any mapping
    ran — and the appending branch sat inside the map applied *after*
    filtering — so the empty-document warning could never fire. The
    incomplete docs are now counted up front.
    """

    def _is_complete(doc):
        return doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [
            None,
            "",
        ]

    def _process_doc(doc):
        doc["sentence1"] = general_detokenize(doc["sentence1"]).strip()
        doc["sentence2"] = general_detokenize(doc["sentence2"]).strip()
        # Remove final punctuation mark in the first sentence
        if doc["sentence1"].endswith((".", ",", ";")):
            doc["sentence1"] = doc["sentence1"][:-1]
        # Start the second sentence in lowercase (to be used after "Yes, ...")
        doc["sentence2"] = lowercase_first_letter(doc["sentence2"])
        return doc

    empty_docs = [doc for doc in dataset if not _is_complete(doc)]
    if empty_docs != []:
        len_empty_docs = len(empty_docs)
        print(
            f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}"
        )
    return dataset.filter(_is_complete).map(_process_doc)
def process_docs_copa_es(dataset):
    """Lowercase the leading character of both COPA answer choices."""

    def _lower_choices(doc):
        for key in ("choice1", "choice2"):
            doc[key] = lowercase_first_letter(doc[key])
        return doc

    return dataset.map(_lower_choices)
def rouge1(items):
    """Identity passthrough for efficiency.

    The harness calls this per-item metric hook, but actual ROUGE-1
    scoring is deferred to :func:`rouge1_agg` at aggregation time.
    """
    return items
def rouge1_agg(items):
    """Aggregate (reference, prediction) pairs into a corpus ROUGE-1 score.

    Higher is better.

    :param items: iterable of ``(reference, prediction)`` string pairs as
        collected by the :func:`rouge1` passthrough metric.
    :returns: the ``rouge1`` value reported by the ``evaluate`` scorer.
    """
    # Unzip once instead of materializing the transpose twice
    # (the original built ``list(zip(*items))`` for each column).
    refs, preds = zip(*items)
    rouge_scorer = evaluate.load("rouge")
    return rouge_scorer.compute(predictions=preds, references=refs)["rouge1"]
task: wnli_es
dataset_path: PlanTL-GOB-ES/wnli-es
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: null
doc_to_text: "{{sentence1}}\nPregunta: {{sentence2}} ¿Verdadero o Falso?\nRespuesta:"
doc_to_target: label
doc_to_choice: ["Falso", "Verdadero"]
metric_list:
- metric: acc
metadata:
version: 1.0
task: xlsum_es
dataset_path: csebuetnlp/xlsum
dataset_name: spanish
doc_to_text: 'Texto: {{text}}
Resumen:'
doc_to_target: '{{summary}}'
output_type: generate_until
test_split: test
training_split: train
validation_split: validation
fewshot_split: train
process_docs: !function utils.process_xlsum
metric_list:
- metric: bleu
aggregation: bleu
higher_is_better: true
- metric: !function utils.rouge1
aggregation: !function utils.rouge1_agg
higher_is_better: true
metadata:
version: 1.0
# Task configuration derived from Eleuther AI's implementation as of March 22, 2024, supplemented with an additional preprocessing function
task: xnli_es_spanish_bench
dataset_path: xnli
dataset_name: es
output_type: multiple_choice
doc_to_choice: '{{[premise+", ¿correcto? Sí, "+hypothesis,premise+", ¿correcto? Así
que, "+hypothesis,premise+", ¿correcto? No, "+hypothesis]}}'
doc_to_text: ''
target_delimiter: ''
process_docs: !function utils.process_doc_nli
training_split: null
validation_split: validation
doc_to_target: label
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment