Commit 2106fbeb authored by Baber

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/openai_completions.py
parents 4354fe46 703fbffd
# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_vi_es
dataset_name: mlqa.vi.es
process_results: !function utils.process_results_vi

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_vi_hi
dataset_name: mlqa.vi.hi
process_results: !function utils.process_results_vi

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_vi_vi
dataset_name: mlqa.vi.vi
process_results: !function utils.process_results_vi

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_vi_zh
dataset_name: mlqa.vi.zh
process_results: !function utils.process_results_vi

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_ar
dataset_name: mlqa.zh.ar
process_results: !function utils.process_results_zh

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_de
dataset_name: mlqa.zh.de
process_results: !function utils.process_results_zh

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_en
dataset_name: mlqa.zh.en
process_results: !function utils.process_results_zh

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_es
dataset_name: mlqa.zh.es
process_results: !function utils.process_results_zh

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_hi
dataset_name: mlqa.zh.hi
process_results: !function utils.process_results_zh

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_vi
dataset_name: mlqa.zh.vi
process_results: !function utils.process_results_zh

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_zh
dataset_name: mlqa.zh.zh
process_results: !function utils.process_results_zh
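
Each of the per-pair YAML files above is emitted by a script rather than written by hand, as their `# Generated by generate_tasks.py` headers note. A minimal sketch of what such a generator could look like (hypothetical reconstruction; the real generate_tasks.py may differ):

LANGS = ["ar", "de", "en", "es", "hi", "vi", "zh"]

for ctx_lang in LANGS:
    for q_lang in LANGS:
        # One YAML per (context language, question language) pair; note that
        # process_results is keyed on the context language, as in the files above.
        with open(f"mlqa_{ctx_lang}_{q_lang}.yaml", "w", encoding="utf-8") as f:
            f.write("# Generated by generate_tasks.py\n")
            f.write("include: mlqa_common_yaml\n")
            f.write(f"task: mlqa_{ctx_lang}_{q_lang}\n")
            f.write(f"dataset_name: mlqa.{ctx_lang}.{q_lang}\n")
            f.write(f"process_results: !function utils.process_results_{ctx_lang}\n")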
"""
Code based on Official evaluation script for the MLQA dataset.
Repo: https://github.com/facebookresearch/MLQA/blob/main/mlqa_evaluation_v1.py
"""
import re
import string
import sys
import unicodedata
from collections import Counter
import datasets
PUNCT = {
chr(i)
for i in range(sys.maxunicode)
if unicodedata.category(chr(i)).startswith("P")
}.union(string.punctuation)
WHITESPACE_LANGS = ["en", "es", "hi", "vi", "de", "ar"]
MIXED_SEGMENTATION_LANGS = ["zh"]

def whitespace_tokenize(text):
    return text.split()


def mixed_segmentation(text):
    segs_out = []
    temp_str = ""
    for char in text:
        if re.search(r"[\u4e00-\u9fa5]", char) or char in PUNCT:
            if temp_str != "":
                ss = whitespace_tokenize(temp_str)
                segs_out.extend(ss)
                temp_str = ""
            segs_out.append(char)
        else:
            temp_str += char
    if temp_str != "":
        ss = whitespace_tokenize(temp_str)
        segs_out.extend(ss)
    return segs_out
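
# Example (illustrative): mixed_segmentation treats every CJK character as its
# own token and whitespace-tokenizes the rest, so
#   mixed_segmentation("北京是首都 Beijing")
# returns ['北', '京', '是', '首', '都', 'Beijing'].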

def normalize_answer(s, lang):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text, lang):
        if lang == "en":
            return re.sub(r"\b(a|an|the)\b", " ", text)
        elif lang == "es":
            return re.sub(r"\b(un|una|unos|unas|el|la|los|las)\b", " ", text)
        elif lang == "hi":
            return text  # Hindi does not have formal articles
        elif lang == "vi":
            return re.sub(r"\b(của|là|cái|chiếc|những)\b", " ", text)
        elif lang == "de":
            return re.sub(
                r"\b(ein|eine|einen|einem|eines|einer|der|die|das|den|dem|des)\b",
                " ",
                text,
            )
        elif lang == "ar":
            return re.sub(r"\sال^|ال", " ", text)
        elif lang == "zh":
            return text  # Chinese does not have formal articles
        else:
            raise Exception("Unknown Language {}".format(lang))

    def white_space_fix(text, lang):
        if lang in WHITESPACE_LANGS:
            tokens = whitespace_tokenize(text)
        elif lang in MIXED_SEGMENTATION_LANGS:
            tokens = mixed_segmentation(text)
        else:
            raise Exception("Unknown Language {}".format(lang))
        return " ".join([t for t in tokens if t.strip() != ""])

    def remove_punc(text):
        return "".join(ch for ch in text if ch not in PUNCT)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s)), lang), lang)
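
# Example (illustrative): normalize_answer("The  Cat, sat!", "en") lowercases,
# strips punctuation, removes articles, and collapses whitespace, yielding
# "cat sat".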

def f1_score(prediction, ground_truth, lang):
    prediction_tokens = normalize_answer(prediction, lang).split()
    ground_truth_tokens = normalize_answer(ground_truth, lang).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
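
# Worked example (illustrative): with lang="en", prediction "the cat sat" and
# ground truth "cat sat down" normalize to ['cat', 'sat'] and
# ['cat', 'sat', 'down']; overlap = 2, precision = 2/2, recall = 2/3,
# so f1 = 2 * (1.0 * 2/3) / (1.0 + 2/3) = 0.8.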

def exact_match_score(prediction, ground_truth, lang):
    return normalize_answer(prediction, lang) == normalize_answer(ground_truth, lang)


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths, lang):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth, lang)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)

def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process_doc(doc):
        out_doc = {
            "context": doc["context"],
            "question": doc["question"],
            "answers": doc["answers"]["text"],
        }
        return out_doc

    return dataset.map(_process_doc)
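
# Example (illustrative): an MLQA/SQuAD-style record with
#   doc["answers"] == {"text": ["Paris"], "answer_start": [12]}
# is flattened so doc["answers"] becomes just ["Paris"], the list of answer
# strings that process_results_lang expects below.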

# Base function
def process_results_lang(doc, results, lang):
    ground_truths = doc["answers"]
    prediction = results[0].strip()
    exact_match = metric_max_over_ground_truths(
        exact_match_score, prediction, ground_truths, lang
    )
    f1 = metric_max_over_ground_truths(f1_score, prediction, ground_truths, lang)
    return {"exact_match": exact_match, "f1": f1}


# Language wrapper functions
def process_results_en(doc, results):
    return process_results_lang(doc, results, "en")

def process_results_es(doc, results):
    return process_results_lang(doc, results, "es")

def process_results_hi(doc, results):
    return process_results_lang(doc, results, "hi")

def process_results_vi(doc, results):
    return process_results_lang(doc, results, "vi")

def process_results_de(doc, results):
    return process_results_lang(doc, results, "de")

def process_results_ar(doc, results):
    return process_results_lang(doc, results, "ar")

def process_results_zh(doc, results):
    return process_results_lang(doc, results, "zh")
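
# Editorial aside, not part of the original script: the per-language wrappers
# above could equivalently be built with functools.partial, e.g.
#   process_results_zh = functools.partial(process_results_lang, lang="zh")
# Plain def statements are presumably kept so that each hook referenced by the
# task YAMLs (`!function utils.process_results_<lang>`) is an ordinary,
# grep-able module-level function.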
@@ -12,7 +12,7 @@ filter_list:
       - function: "take_first"
   - name: "flexible-extract"
     filter:
-      - function: !function utils.MultiChoiceRegexFilter
+      - function: "multi_choice_regex"
         group_select: -1
         ignore_case: true
         ignore_punctuation: true
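
This hunk swaps a direct `!function` reference for the filter's registered string name. As a general pattern (a generic sketch only, with illustrative names, not lm-eval's actual internals), a string-keyed registry is what lets a YAML config name a class:

FILTER_REGISTRY = {}

def register_filter(name):
    # Class decorator that records a filter class under a string key.
    def decorate(cls):
        FILTER_REGISTRY[name] = cls
        return cls
    return decorate

@register_filter("multi_choice_regex")
class MultiChoiceRegexFilter:
    def __init__(self, group_select=0, ignore_case=False, ignore_punctuation=False):
        self.group_select = group_select
        self.ignore_case = ignore_case
        self.ignore_punctuation = ignore_punctuation

# A config loader can then instantiate by name:
#   FILTER_REGISTRY["multi_choice_regex"](group_select=-1, ignore_case=True)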
@@ -12,7 +12,7 @@ training_split: null
 validation_split: null
 test_split: test
 fewshot_split: null
-doc_to_text: "Ahora eres una Inteligencia Artificial experta en desmontar titulares sensacionalistas o clickbait. Tu tarea consiste en analizar noticias con titulares sensacionalistas y generar un resumen de una sola frase que revele la verdad detrás del titular.\nEste es el titular de la noticia: {{web_headline}}\nEl titular plantea una pregunta o proporciona información incompleta. Debes buscar en el cuerpo de la noticia una frase que responda lo que se sugiere en el título. Responde siempre que puedas parafraseando el texto original. Usa siempre las mínimas palabras posibles. Recuerda responder siempre en Español.\nEste es el cuerpo de la noticia:\n{{web_text}}"
+doc_to_text: "Ahora eres una Inteligencia Artificial experta en desmontar titulares sensacionalistas o clickbait. Tu tarea consiste en analizar noticias con titulares sensacionalistas y generar un resumen de una sola frase que revele la verdad detrás del titular.\nEste es el titular de la noticia: {{web_headline}}\nEl titular plantea una pregunta o proporciona información incompleta. Debes buscar en el cuerpo de la noticia una frase que responda lo que se sugiere en el título. Siempre que puedas cita el texto original, especialmente si se trata de una frase que alguien ha dicho. Si citas una frase que alguien ha dicho, usa comillas para indicar que es una cita. Usa siempre las mínimas palabras posibles. No es necesario que la respuesta sea una oración completa, puede ser sólo el foco de la pregunta. Recuerda responder siempre en Español.\nEste es el cuerpo de la noticia:\n{{web_text}}"
 doc_to_target: summary
 target_delimiter: " "
 num_fewshot: 0
@@ -77,3 +77,7 @@ If other tasks on this dataset are already supported:
 * [ ] Is the "Main" variant of this task clearly denoted?
 * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
 * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
+
+### Changelog
+
+* v1 (2024-11-05) PR #2434 put the doc_to_choice labels in the correct order
 # Generated by utils.py
 dataset_name: de
-doc_to_choice: '{{[sentence1+", richtig? Ja, "+sentence2, sentence1+", richtig? Nein, "+sentence2]}}'
+doc_to_choice: '{{[sentence1+", richtig? Nein, "+sentence2, sentence1+", richtig? Ja, "+sentence2]}}'
 doc_to_text: ''
 include: pawsx_template_yaml
 # Generated by utils.py
 dataset_name: en
-doc_to_choice: '{{[sentence1+", right? Yes, "+sentence2, sentence1+", right? No, "+sentence2]}}'
+doc_to_choice: '{{[sentence1+", right? No, "+sentence2, sentence1+", right? Yes, "+sentence2]}}'
 doc_to_text: ''
 include: pawsx_template_yaml
 task: paws_en
 # Generated by utils.py
 dataset_name: es
-doc_to_choice: '{{[sentence1+", verdad? Sí, "+sentence2, sentence1+", verdad? No, "+sentence2]}}'
+doc_to_choice: '{{[sentence1+", verdad? No, "+sentence2, sentence1+", verdad? Sí, "+sentence2]}}'
 doc_to_text: ''
 include: pawsx_template_yaml
 # Generated by utils.py
 dataset_name: fr
-doc_to_choice: '{{[sentence1+", n''est-ce pas? Oui, "+sentence2, sentence1+", n''est-ce pas? No, "+sentence2]}}'
+doc_to_choice: '{{[sentence1+", n''est-ce pas? Non, "+sentence2, sentence1+", n''est-ce pas? Oui, "+sentence2]}}'
 doc_to_text: ''
 include: pawsx_template_yaml
 task: paws_fr
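
The doc_to_choice values above are Jinja expressions that render to a two-element list of candidate continuations; after these fixes, index 0 (the negation) lines up with label 0 (not a paraphrase) and index 1 with label 1 (paraphrase). A small render demo (assumes jinja2 is installed; the sentences are made up):

from jinja2 import Template

doc = {"sentence1": "The cat sat.", "sentence2": "A cat was sitting."}
tmpl = '{{[sentence1+", right? No, "+sentence2, sentence1+", right? Yes, "+sentence2]}}'
print(Template(tmpl).render(**doc))
# ['The cat sat., right? No, A cat was sitting.',
#  'The cat sat., right? Yes, A cat was sitting.']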