Commit efb46937 authored by Baber

Merge branch 'main' into convert_gen

# Conflicts:
#	lm_eval/__main__.py
#	lm_eval/evaluator.py
parents 7fbf899c ade01428
tag: evalita-mp_wic_tasks
task: evalita-mp_wic_prompt-4
task_alias: prompt-4
include: _wic_template_yaml
doc_to_text: "Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola '{{sentence1[start1:end1]}}' nella frase '{{sentence1}}' ha lo stesso significato della parola '{{sentence2[start2:end2]}}' nella frase '{{sentence2}}'?\nA: \nB: No\nRisposta:"
doc_to_choice: ["B", "A"]
tag: evalita-mp_wic_tasks
task: evalita-mp_wic_prompt-5
task_alias: prompt-5
include: _wic_template_yaml
doc_to_text: "La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'"
doc_to_choice: ["non hanno lo stesso significato", "hanno lo stesso significato"]
tag: evalita-mp_wic_tasks
task: evalita-mp_wic_prompt-6
task_alias: prompt-6
include: _wic_template_yaml
doc_to_text: "Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'"
doc_to_choice: ["non hanno lo stesso significato", "hanno lo stesso significato"]
group: evalita-mp_wic
group_alias: word-in-context
task:
- evalita-mp_wic_tasks # this has to match the tag in the task yaml file
aggregate_metric_list:
- metric: f1
weight_by_size: True
metadata:
version: 1
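The group config aggregates the per-prompt subtasks into a single `evalita-mp_wic` score; with `weight_by_size: True` the group-level f1 is the sample-size-weighted average over the subtasks. A hypothetical invocation through the harness's Python API (the checkpoint name is a placeholder; CLI usage with `--tasks evalita-mp_wic` is analogous):

```python
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=some-org/some-italian-llm",  # placeholder model
    tasks=["evalita-mp_wic"],
)
print(results["results"])
```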
dataset_path: evalitahf/faq
test_split: test_1
fewshot_split: dev_1
doc_to_target: !function utils.faq_doc_to_target
doc_to_choice: ["A", "B", "C", "D"]
output_type: multiple_choice
metadata:
version: 1
dataset_path: evalitahf/hatespeech_detection
output_type: multiple_choice
test_split: test_all
fewshot_split: dev
validation_split: dev
doc_to_target: hs # 0 = Falso, 1 = Vero
doc_to_choice: ["Falso", "Vero"]
metadata:
version: 1
dataset_path: evalitahf/lexical_substitution
test_split: test
validation_split: dev
fewshot_split: dev
output_type: generate_until
generation_kwargs:
until:
- "</s>"
doc_to_target: !function utils.ls_doc_to_target
process_results: !function utils.ls_process_results
metric_list:
- metric: f1
higher_is_better: True
aggregation: !function metrics._aggreg_ls
metadata:
version: 1
dataset_path: evalitahf/entity_recognition
output_type: generate_until
generation_kwargs:
until:
- "</s>"
- "\n"
doc_to_target: !function utils.ner_doc_to_target
process_results: !function utils.ner_process_results
metric_list:
- metric: f1
higher_is_better: True
aggregation: !function metrics._aggreg_ner
metadata:
version: 1
dataset_path: evalitahf/relation_extraction
test_split: test
output_type: generate_until
generation_kwargs:
until:
- "</s>"
doc_to_target: !function utils.re_doc_to_target
process_results: !function utils.rel_process_results_v3
metric_list:
- metric: f1
higher_is_better: True
aggregation: !function metrics._aggreg_rel
metadata:
version: 1
dataset_path: evalitahf/sentiment_analysis
output_type: multiple_choice
test_split: test
fewshot_split: train
validation_split: test
doc_to_target: !function utils.sa_doc_to_target_v2
doc_to_choice: ["positivo", "negativo", "neutrale", "misto"]
metadata:
version: 1
dataset_path: evalitahf/sentiment_analysis
output_type: multiple_choice
test_split: test
fewshot_split: train
validation_split: test
doc_to_target: !function utils.sa_doc_to_target
doc_to_choice: !function utils.sa_doc_to_choice
metadata:
version: 1
dataset_path: evalitahf/summarization-fp
output_type: generate_until
generation_kwargs:
until:
- "</s>"
test_split: test_100
fewshot_split: dev
doc_to_target: "{{target}}"
metadata:
version: 1
dataset_path: ARTeLab/fanpage
output_type: generate_until
generation_kwargs:
until:
- "</s>"
test_split: test
doc_to_target: "{{target}}"
metadata:
version: 1.0
dataset_path: silvia-casola/WITS
output_type: generate_until
generation_kwargs:
until:
- "</s>"
test_split: test_100
fewshot_split: dev
#test_split: train
doc_to_target: "{{summary}}"
metadata:
version: 1
dataset_path: evalitahf/textual_entailment
output_type: multiple_choice
test_split: test
fewshot_split: dev
validation_split: dev
doc_to_target: "{{ 0 if entailment == 'SI' else 1 }}"
doc_to_choice: ["Sì", "No"]
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 1
dataset_path: evalitahf/word_in_context
dataset_name: default
output_type: multiple_choice
test_split: test
fewshot_split: dev
validation_split: dev
doc_to_target: label # 0: No, 1: Si
doc_to_choice: ["No", "Sì"]
metric_list:
- metric: f1
higher_is_better: true
aggregation: f1
metadata:
version: 1.0
import torch
from sklearn.metrics import f1_score, precision_score, recall_score

# use torch.inference_mode from torch 2.0 onwards, torch.no_grad otherwise;
# compare parsed version components rather than raw strings, since
# lexicographically "10.0.0" would sort before "2.0.0"
inference_decorator = (
    torch.inference_mode
    if tuple(int(p) for p in torch.__version__.split("+")[0].split(".")[:2]) >= (2, 0)
    else torch.no_grad
)
def _aggreg_ls(predictions):
"""
Custom aggregation to compute corpus level metrics for the lexical substitution task
predictions is a list of tuples (prec, has_answ, has_annotation)
prec is the precision before dividing by |A|
has_answ is 0 if the model did not produce any answer
has_annotation is 0 if the gold answer is empty: no synonyms from annotators
"""
# get |A| and |T| to compute the final precision and recall
A = sum([p[1] for p in predictions])
T = sum([p[2] for p in predictions])
# compute the final precision and recall
if A == 0:
prec = sum([p[0] for p in predictions]) / 1
else:
prec = sum([p[0] for p in predictions]) / A
if T == 0:
rec = sum([p[0] for p in predictions]) / 1
else:
rec = sum([p[0] for p in predictions]) / T
# compute the final F1 score
f1 = 0
if prec + rec != 0:
f1 = (2 * prec * rec) / (prec + rec)
return f1
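# A tiny worked sketch of the aggregation above, on invented tuples of
# (prec, has_answ, has_annotation): here |A| = 1 and |T| = 2, so
# prec = 0.5 / 1, rec = 0.5 / 2, and F1 = 2 * 0.5 * 0.25 / 0.75 = 1/3
assert abs(_aggreg_ls([(0.5, 1, 1), (0.0, 0, 1)]) - 1 / 3) < 1e-9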
def _aggreg_sa_v2(predictions):
"""
This aggregation treats the sentiment analysis task as a multiple-choice one with four classes;
the F1 score is computed as the average of the per-class F1 scores, weighted by the number of samples.
See sklearn.metrics.f1_score for more details
"""
predictions, references = zip(*predictions)
f1 = f1_score(references, predictions, average="weighted")
return f1
def _aggreg_sa(predictions):
"""
Custom aggregation function for the sentiment analysis task
The original task computes the F1 score for each class and then averages them.
Since the prompt casts the task to a multiple-choice one, we need to aggregate the results differently.
"""
# split the predictions and references in two lists (pred is a tuple)
predictions, references = zip(*predictions)
"""
Class 0: positivo -> 'opos': 1, 'oneg': 0
Class 1: negativo -> 'opos': 0, 'oneg': 1
etc.
"""
def _map_to_original_labels(x):
"""
Return two separate list of labels for opos and oneg
x is a list of integers
"""
opos = []
oneg = []
for i in x:
if i == 0:
# positive
opos.append(1)
oneg.append(0)
elif i == 1:
# negative
opos.append(0)
oneg.append(1)
elif i == 2:
# neutral
opos.append(0)
oneg.append(0)
elif i == 3:
# mixed
opos.append(1)
oneg.append(1)
else:
pass
return opos, oneg
pred_opos, pred_oneg = _map_to_original_labels(predictions)
ref_opos, ref_oneg = _map_to_original_labels(references)
opos_f1 = f1_score(ref_opos, pred_opos, average=None)
opos_f1_c0 = opos_f1[0]
opos_f1_c1 = opos_f1[1] if len(opos_f1) > 1 else 0
# oneg class
oneg_f1 = f1_score(ref_oneg, pred_oneg, average=None)
oneg_f1_c0 = oneg_f1[0]
oneg_f1_c1 = oneg_f1[1] if len(oneg_f1) > 1 else 0
# average f1 score for each class (opos and oneg)
f1_score_opos = (opos_f1_c0 + opos_f1_c1) / 2
f1_score_oneg = (oneg_f1_c0 + oneg_f1_c1) / 2
# average f1 score for the two classes
f1_final = (f1_score_opos + f1_score_oneg) / 2
return f1_final
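# A small worked sketch on invented (prediction, reference) index pairs
# (0 = positivo, 1 = negativo, 2 = neutrale, 3 = misto): each four-way label
# is projected onto the two binary opos/oneg tasks before averaging, and the
# pairs below give 11/15 for both opos and oneg, hence 11/15 overall
assert abs(_aggreg_sa([(0, 0), (1, 1), (2, 3), (3, 3)]) - 11 / 15) < 1e-9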
def _aggreg_ner(predictions):
pred, ref = zip(*predictions)
# concat all the predictions and references
all_pred = []
for p in pred:
all_pred.extend(p)
all_ref = []
for r in ref:
all_ref.extend(r)
# compute the F1 score
f1 = f1_score(all_ref, all_pred, average=None)
# average the per-class F1 scores, excluding the last (highest) label,
# i.e. the catch-all no-entity class in the NER label mapping
if len(f1) > 1:
f1_sum = sum(f1[:-1]) / (len(f1) - 1)
else:
f1_sum = f1[0]
return f1_sum
def _aggreg_rel(predictions):
pred, ref = zip(*predictions)
# concat all the predictions and references
all_pred = []
for p in pred:
all_pred.extend(p)
all_ref = []
for r in ref:
all_ref.extend(r)
# compute the F1 score
f1 = f1_score(all_ref, all_pred, average="macro")
return f1
# ------------------------ DOCUMENT DATING ---------------------------
def _aggreg_dd(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="macro")
return fscore
import logging
from evaluate import load
from sklearn.metrics import f1_score
eval_logger = logging.getLogger("lm-eval")
# ---------------------- SENTIMENT ANALYSIS ----------------------
def sa_doc_to_target(x):
"""
Function to extract the target from the dataset for sentiment analysis
"""
opos = x["opos"]
oneg = x["oneg"]
# the returned indexes match the choices in sa_doc_to_choice
if opos == "1" and oneg == "0":
return 0
elif opos == "0" and oneg == "1":
return 1
elif opos == "0" and oneg == "0":
return 2
elif opos == "1" and oneg == "1":
return 3
else:
# unexpected opos/oneg combination: warn and implicitly return None
eval_logger.warning(f"unexpected opos/oneg combination: opos={opos}, oneg={oneg}")
def sa_doc_to_target_v2(x):
"""
Function to extract the target from the dataset for sentiment analysis
"""
opos = x["opos"]
oneg = x["oneg"]
# the returned indexes match the choices in sa_doc_to_choice
if opos == "1" and oneg == "0":
return 0
elif opos == "0" and oneg == "1":
return 1
elif opos == "0" and oneg == "0":
return 2
elif opos == "1" and oneg == "1":
return 3
else:
# unexpected opos/oneg combination: warn and implicitly return None
eval_logger.warning(f"unexpected opos/oneg combination: opos={opos}, oneg={oneg}")
def sa_doc_to_choice(x):
"""
Function to return the choices from the dataset for sentiment analysis
"""
return ["Positivo", "Negativo", "Neutrale", "Misto"]
# ---------------------- LEXICAL SUBSTITUTION ----------------------
NO_SYN_STRING = "&&NOSYN&&"
def _ls_gold_to_target(x):
"""
Generate the gold target string for the lexical substitution task
"""
# all_answers = [(i["word"], i["count"]) for i in x["answers"]]
if len(x["answers"]) == 0:
return NO_SYN_STRING
ans_str = ""
for i in x["answers"]:
ans_str += i["word"] + "$$" + str(i["count"]) + "::"
if len(ans_str) != 0 and ans_str[-2] == ":":
ans_str = ans_str[:-2]
# print(ans_str)
return ans_str
def ls_doc_to_target(x):
"""
Generate the target for the lexical substitution task
"""
if len(x["answers"]) == 0:
return NO_SYN_STRING
ans_str = ""
for i in x["answers"]:
ans_str += i["word"] + ", "
if len(ans_str) != 0 and ans_str[-2] == ",":
ans_str = ans_str[:-2]
return ans_str
def _ls_split_gold(x):
"""
Split the gold string into a list of tuples
"""
if x == NO_SYN_STRING:
return [], []
answers = x.split("::")
words = []
freqs = []
if len(answers) != 0:
for a in answers:
if "$$" in a:
word, count = a.split("$$")
words.append(word)
try:
freqs.append(int(count))
except ValueError:
freqs.append(0)
return words, freqs
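# Round-trip sketch for the gold encoding used above, "word$$count" pairs
# joined by "::" (the document below is invented for illustration):
_example_doc = {"answers": [{"word": "auto", "count": 3}, {"word": "vettura", "count": 1}]}
assert _ls_gold_to_target(_example_doc) == "auto$$3::vettura$$1"
assert _ls_split_gold("auto$$3::vettura$$1") == (["auto", "vettura"], [3, 1])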
def ls_process_results(doc, results):
"""
Process the results of the evaluation for the lexical substitution task
look at coqa for another example
"""
gold_to_target = _ls_gold_to_target(doc)
words, freqs = _ls_split_gold(gold_to_target)
prec = 0
# Considering a maximum of the first 10 synonyms
results = split_text_with_regex(results[0], LS_SPLIT_REGEX)
results = results[: min(10, len(results))]
# Remove non-alphabetic characters from the word at the end of the list
if results: # Check if results is not empty
results[-1] = "".join(char for char in results[-1] if char.isalpha())
has_answ = 0 if len(results) == 0 else 1 # so we can compute |A|
has_annotation = 0 if len(words) == 0 else 1 # so we can compute |T|
matching_res = [] # for debugging
for r in results:
if r in words:
# get frequency of the synonyms from annotators
idx = words.index(r.strip())
prec += freqs[idx]
matching_res.append(r)
# In the case of the OOT (out of ten) subtask, this normalization should not be applied
# ai = len(results) if len(results) != 0 else 1
# prec = prec / ai
Hi = sum(freqs)
if Hi != 0:
prec = prec / Hi
else:
eval_logger.debug("H_i is 0")
return {"f1": (prec, has_answ, has_annotation)}
# ---------------------- NER ----------------------
NO_ENT_STRING = "&&NOENT&&"
NER_ENTITY_SEPARATOR = ","
NER_TYPE_SEPARATOR = "$"
NER_MAPPING_V2 = {"PER": 0, "LOC": 1, "ORG": 2, NO_ENT_STRING: 3, "O": 4}
NER_MAPPING = {"PER": 0, "LOC": 1, "ORG": 2, "O": 3}
def _ner_gold_to_target(x: list) -> list:
"""
Convert the gold entities to the target format according to the NER_MAPPING
"""
res = [NER_MAPPING[e["type"]] for e in x]
return res
def _ner_gold_to_target_v2(x: list) -> list:
"""
Convert the gold entities to the target format; PER, LOC and ORG share the same indices in NER_MAPPING and NER_MAPPING_V2, so the base mapping is reused here
"""
res = [NER_MAPPING[e["type"]] for e in x]
return res
def ner_doc_to_target(doc):
ents = doc["entities"]
targ_str = ""
# Entità$Tipo,Entità$Tipo.
if ents == []:
return NO_ENT_STRING
else:
for e in ents:
targ_str += (
e["entity_text"] + NER_TYPE_SEPARATOR + e["type"] + NER_ENTITY_SEPARATOR
)
return targ_str[:-1]
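# Encoding sketch on an invented document: entities are rendered as
# "text$TYPE" pairs joined by the "," entity separator defined above
_example_doc = {"entities": [{"entity_text": "Roma", "type": "LOC"}, {"entity_text": "FIAT", "type": "ORG"}]}
assert ner_doc_to_target(_example_doc) == "Roma$LOC,FIAT$ORG"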
def ner_process_results(doc, results):
"""
Process the results of the Named Entity Recognition task
"""
# each document has a list of entities with the following format:
# [{"entity_text": "string", "type": "string"}]
gold = doc["entities"]
raw_results = results[0]
results = _ner_process_raw_output(raw_results)
gold_labels = _ner_gold_to_target(gold)
res_labels = [0] * len(gold_labels)
matched_gold_idx = []
if len(results) > len(gold):
for r in results:
r_text = r[0]
r_type = r[1]
for i in range(len(gold)):
if r_text == gold[i]["entity_text"] and r_type == gold[i]["type"]:
res_labels[i] = NER_MAPPING[r_type]
matched_gold_idx.append(i)
# Since we have more results than gold, we artificially set to false positive the remaining labels
# extend gold label list
for i in range(len(results) - len(gold)):
gold_labels.append(3)
res_labels.append(2)
elif len(results) == 0 and len(gold) == 0:
res_labels = [3]
gold_labels = res_labels
else: # len(results) <= len(gold)
for r in results:
r_text = r[0]
r_type = r[1]
for i in range(len(gold)):
if r_text == gold[i]["entity_text"] and r_type == gold[i]["type"]:
res_labels[i] = NER_MAPPING[r_type]
matched_gold_idx.append(i)
# we map all wrong predictions to the "O" class
# (every branch assigned the same label, so the per-class cases collapse)
for i in range(len(gold_labels)):
if i in matched_gold_idx:
continue
res_labels[i] = 3
assert len(gold_labels) == len(res_labels)
return {"f1": (res_labels, gold_labels)}
def ner_process_results_v2(doc, results):
"""
Process the results of the Named Entity Recognition task
This version explicitly scores the case in which the model responds that there are no entities
"""
# each document has a list of entities with the following format:
# [{"entity_text": "string", "type": "string"}]
gold = doc["entities"]
raw_results = results[0]
results = _ner_process_raw_output_v2(raw_results)
# eval_logger.debug(f"results {results}")
# eval_logger.debug(f"gold {gold}")
gold_labels = _ner_gold_to_target_v2(gold)
res_labels = [0] * len(gold_labels)
matched_gold_idx = []
if len(results) > len(gold):
for r in results:
# print(r)
r_text = r[0]
r_type = r[1]
for i in range(len(gold)):
if r_text == gold[i]["entity_text"] and r_type == gold[i]["type"]:
res_labels[i] = NER_MAPPING[r_type]
matched_gold_idx.append(i)
# Since we have more results than gold, we artificially set to false positive the remaining labels
# extend gold label list
for i in range(len(results) - len(gold)):
# gold_labels.append(3)
# res_labels.append(2)
gold_labels.append(4)
res_labels.append(3)
elif len(results) == 0 and len(gold) == 0:
# res_labels = [random.choice([0, 1, 2, 3])]
res_labels = [3]
gold_labels = res_labels
elif len(results) == 1 and results[0] == NO_ENT_STRING:
# res_labels = [3]
res_labels = [4]
gold_labels = res_labels
else: # len(results) <= len(gold)
for r in results:
r_text = r[0]
r_type = r[1]
for i in range(len(gold)):
if r_text == gold[i]["entity_text"] and r_type == gold[i]["type"]:
res_labels[i] = NER_MAPPING[r_type]
matched_gold_idx.append(i)
# we map all wrong predictions to the "O" class (label 4 in NER_MAPPING_V2;
# every branch assigned the same label, so the per-class cases collapse)
for i in range(len(gold_labels)):
if i in matched_gold_idx:
continue
res_labels[i] = 4
assert len(gold_labels) == len(res_labels)
return {"f1": (res_labels, gold_labels)}
def _ner_process_raw_output(llm_result: str) -> list[tuple]:
if NO_ENT_STRING in llm_result:
return []
if llm_result == "":
return ["WRONG"]
tmp_results = llm_result.split(NER_ENTITY_SEPARATOR)
results = []
for res in tmp_results:
r = res.strip()
# split on type separator
r_text = ""
r_type = ""
r_splitted = r.split(NER_TYPE_SEPARATOR)
if len(r_splitted) < 2:
r_text = r_splitted[0]
r_type = ""
else:
r_text = r_splitted[0]
r_type = r_splitted[1]
if r_text != "":
results.append((r_text, r_type.upper()))
return results
def _ner_process_raw_output_v2(llm_result: str) -> list[tuple]:
if NO_ENT_STRING in llm_result:
return [NO_ENT_STRING]
if llm_result == "":
return ["WRONG"]
tmp_results = llm_result.split(NER_ENTITY_SEPARATOR)
results = []
for res in tmp_results:
r = res.strip()
# split on type separator
r_text = ""
r_type = ""
r_splitted = r.split(NER_TYPE_SEPARATOR)
if len(r_splitted) < 2:
r_text = r_splitted[0]
r_type = ""
else:
r_text = r_splitted[0]
r_type = r_splitted[1]
if r_text != "":
results.append((r_text, r_type.upper()))
return results
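# Parsing sketch on invented completions: entity types are upper-cased, and
# the no-entity sentinel short-circuits to an empty list (v1) or to the
# sentinel itself (v2)
assert _ner_process_raw_output("Roma$LOC, FIAT$org") == [("Roma", "LOC"), ("FIAT", "ORG")]
assert _ner_process_raw_output("&&NOENT&&") == []
assert _ner_process_raw_output_v2("&&NOENT&&") == ["&&NOENT&&"]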
# ---------------------- RELATION EXTRACTION ----------------------
def _rel_process_raw_output(llm_result: str) -> list[str]:
if NO_REL_STRING in llm_result:
return []
if llm_result == "":
return ["WRONG"]
tmp_results = llm_result.split(INTER_REL_SEPARATOR)
relations = []
for res in tmp_results:
r_text1 = ""
r_text2 = ""
r_splitted = res.split(INTRA_REL_SEPARATOR)
if len(r_splitted) < 2:
r_text1 = r_splitted[0].strip()
r_text2 = ""
else:
r_text1 = r_splitted[0].strip()
r_text2 = r_splitted[1].strip()
relations.append((r_text1, r_text2))
assert len(relations) == len(tmp_results)
return relations
INTER_REL_SEPARATOR = "%"
INTRA_REL_SEPARATOR = "$"
NO_REL_STRING = "&&NOREL&&"
def re_doc_to_target(doc):
ents = doc["relations"]
targ_str = ""
# misura1$result1%misura2$result2.
if ents == []:
# use the relation sentinel: the scoring code checks for NO_REL_STRING
# (NO_ENT_STRING belongs to the NER task)
return NO_REL_STRING
else:
for e in ents:
targ_str += e[0] + INTRA_REL_SEPARATOR + e[1] + INTER_REL_SEPARATOR
return targ_str[:-1]
def _rel_gold_to_target(x: list) -> list:
if x == []:
return [0]
else:
return [1] * len(x)
def rel_doc_to_target(doc):
rel = doc["relations"]
targ_str = ""
# misura1$result1%misura2$result2.
if rel == []:
return NO_REL_STRING
else:
for r in rel:
targ_str += r[0] + "$" + r[1] + "%"
return targ_str[:-1]
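# Encoding sketch on an invented document: relation pairs are joined with
# "$" inside a relation and "%" between relations
_example_doc = {"relations": [["pressione", "120/80"], ["febbre", "38.5"]]}
assert rel_doc_to_target(_example_doc) == "pressione$120/80%febbre$38.5"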
def _extract_relations(results):
relations = []
for r in results:
r_text1 = ""
r_text2 = ""
r_splitted = r.split(INTRA_REL_SEPARATOR)
if len(r_splitted) < 2:
r_text1 = r_splitted[0]
r_text2 = ""
else:
r_text1 = r_splitted[0]
r_text2 = r_splitted[1]
relations.append((r_text1, r_text2))
assert len(relations) == len(results)
return relations
def rel_process_results_v3(doc, results):
"""
Process the results of the relation extraction task, ignoring the order of the extracted relations
"""
# each document has a list of relation with the following format:
# [[text1, text2], [text3, text4]]
gold = doc["relations"]
raw_results = results[0]
has_results = 0 if NO_REL_STRING in raw_results else 1
has_gold = 1 if gold != [] else 0
res_labels = []
gold_labels = []
if has_results == 0 and has_gold:
# False negative
gold_labels = _rel_gold_to_target(gold)
res_labels = [0] * len(gold_labels)
elif has_results == 0 and has_gold == 0:
# True negative
gold_labels = _rel_gold_to_target(gold)
res_labels = gold_labels
elif has_results and has_gold == 0:
# False positive
gold_labels = _rel_gold_to_target(gold)
res_labels = [1] * len(gold_labels)
else:
results = _rel_process_raw_output(raw_results)
# results = raw_results.split(INTER_REL_SEPARATOR)
gold_labels = _rel_gold_to_target(gold)
res_labels = [0] * len(gold_labels)
assert len(gold) > 0
for i in range(len(gold)):
for j in range(len(results)):
r_text1 = results[j][0]
r_text2 = results[j][1]
if r_text1 == gold[i][0] and r_text2 == gold[i][1]: # list of lists
res_labels[i] = 1
results[j] = ("DELETED", "DELETED")
elif r_text1 == "DELETED" and r_text2 == "DELETED":
continue
else:
pass
# if there are more predictions than gold, the surplus unmatched predictions
# are counted as false positives (matched predictions were overwritten with
# the ("DELETED", "DELETED") sentinel above)
surplus = len(results) - len(gold)
if surplus > 0:
unmatched = [r for r in results if r != ("DELETED", "DELETED")]
for _ in range(min(surplus, len(unmatched))):
res_labels.append(1)
gold_labels.append(0)
assert len(gold_labels) == len(res_labels)
return {"f1": (res_labels, gold_labels)}
LS_SPLIT_REGEX = r"[^,]+"
def split_text_with_regex(text, pattern):
"""
pattern: str - a regex pattern to match the text
text: str - the text to split
"""
import re
# Get text with model-generated words for comparison with the gold standard
text = text.split("\n")[0]
# Find all matches for the pattern
matches = re.findall(pattern, text)
# Split each matched segment further if it contains a comma and is quoted
result = []
for match in matches:
if match.startswith('"') and match.endswith('"'):
# Remove the quotes and split inside the quoted string
inner_matches = re.findall(r"[^,]+", match[1:-1])
result.extend(inner_matches)
else:
result.append(match)
# Strip leading and trailing whitespaces from each element
result = [element.strip().replace('"', "") for element in result]
return result
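# Splitting sketch on an invented completion: only the first line is kept,
# the text is comma-split via LS_SPLIT_REGEX, and stray quotes are stripped
assert split_text_with_regex('auto, vettura, "macchina"\nextra line', LS_SPLIT_REGEX) == ["auto", "vettura", "macchina"]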
# ---------------------- SUMMARIZATION ----------------------
def rouge1_score(references, predictions, **kwargs):
"""
suboptimal way of computing ROUGE, needed because of the following issue:
https://github.com/EleutherAI/lm-evaluation-harness/issues/1302
"""
rouge = load("rouge")
return rouge.compute(predictions=predictions, references=references, **kwargs)[
"rouge1"
]
def process_results_sum(doc, results):
"""
Process the results of the Evalita summarization task
"""
ref = doc["summary"] if "summary" in doc.keys() else doc["target"]
rouge_scorer = load("rouge", keep_in_memory=True)
r1score = rouge_scorer.compute(predictions=results, references=[ref])["rouge1"]
return {
"rouge1": r1score,
}
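# Usage sketch with invented strings; loading the HF "rouge" metric needs the
# `evaluate` package and may download data on first use, so it is left
# commented out here:
# print(process_results_sum({"summary": "Il governo approva la riforma."},
#                           ["Il governo ha approvato la riforma."]))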
def faq_doc_to_target(x):
if x["correct_answer"] == "A":
return 0
elif x["correct_answer"] == "B":
return 1
elif x["correct_answer"] == "C":
return 2
elif x["correct_answer"] == "D":
return 3
else:
eval_logger.warning(
'WARNING: correct answer not found or not in ["A", "B", "C", "D"]'
)
def ht_doc_to_target(x):
if x["source"] == "ilgiornale":
return 0
elif x["source"] == "repubblica":
return 1
else:
eval_logger.warning(
'WARNING: source not found or not in ["ilgiornale", "repubblica"]'
)
@@ -33,7 +33,9 @@ class FDA(ConfigurableTask):
    def doc_to_target(self, doc):
        return doc["value"]

-    def construct_requests(self, doc, ctx, **kwargs):
+    def construct_requests(
+        self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
+    ):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.
...
@@ -26,7 +26,40 @@ The datasets included in GalicianBench that have been made public in previous pu

### Citation

Paper for GalicianBench coming soon.
```
@inproceedings{baucells-etal-2025-iberobench,
title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
author = "Baucells, Irene and
Aula-Blasco, Javier and
de-Dios-Flores, Iria and
Paniagua Su{\'a}rez, Silvia and
Perez, Naiara and
Salles, Anna and
Sotelo Docio, Susana and
Falc{\~a}o, J{\'u}lia and
Saiz, Jose Javier and
Sepulveda Torres, Robiert and
Barnes, Jeremy and
Gamallo, Pablo and
Gonzalez-Agirre, Aitor and
Rigau, German and
Villegas, Marta",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.699/",
pages = "10491--10519",
}
```
### Groups and Tasks
...