Commit 2106fbeb authored by Baber

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/openai_completions.py
parents 4354fe46 703fbffd
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import glob
import json
import os
from datetime import datetime
from itertools import combinations
from pathlib import Path
from typing import List
import pandas as pd
from lm_eval.tasks.score.math.math_grader import math_equal
from lm_eval.utils import handle_non_serializable, make_table
N_SEEDS = 5
def load_json_logs(file_paths, subtasks):
"""
Loads JSON logs of jsonl format from file paths into a single DataFrame.
Args:
file_paths: List of file paths to the JSON logs.
Returns:
A DataFrame containing the logs.
"""
per_seed_df = {
"question_id": [],
"final_answer_seed_": [],
"gt": [],
"category": [],
}
_search_key = None
for i in range(len(file_paths)):
file_path = file_paths[i]
with open(file_path, "r") as f:
for line in f:
datapoint = json.loads(line)
if _search_key is None:
if "non_greedy_macro_accuracy" in datapoint:
_search_key = "non_greedy_macro_accuracy"
elif "non_greedy_accuracy" in datapoint:
_search_key = "non_greedy_accuracy"
question_id, final_answer, gt, category = datapoint[_search_key]
if subtasks is not None:
category = subtasks[i]
per_seed_df["question_id"].append(question_id)
per_seed_df["final_answer_seed_"].append(final_answer)
per_seed_df["gt"].append(gt)
per_seed_df["category"].append(category)
df = pd.DataFrame(per_seed_df)
return df
def calculate_consistency_rate(responses: List[List[str]]) -> float:
"""
Calculate the Consistency Rate (CR) for a given set of responses.
Args:
responses: List of lists, where each inner list contains responses to the same question.
Returns:
The consistency rate as a float.
"""
total_similarity = 0
total_combinations = 0
for response_set in responses:
pairs = combinations(response_set, 2)
num_pairs = len(response_set) * (len(response_set) - 1) / 2
total_combinations += num_pairs
for answer1, answer2 in pairs:
total_similarity += int(answer1 == answer2)
return total_similarity / total_combinations if total_combinations > 0 else 0.0
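# Worked example (hypothetical values): for responses [["A", "A", "B"]], the three
# pairs (A, A), (A, B) and (A, B) contain one agreement, so the consistency rate is 1/3.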
def calculate_math_consistency_rate(responses: List[List[str]]) -> float:
"""
Calculate the Consistency Rate (CR) for a given set of responses.
Args:
responses: List of lists, where each inner list contains responses to the same question.
Returns:
The consistency rate as a float.
"""
total_similarity = 0
total_combinations = 0
for response_set in responses:
pairs = combinations(response_set, 2)
num_pairs = len(response_set) * (len(response_set) - 1) / 2
total_combinations += num_pairs
for answer1, answer2 in pairs:
total_similarity += int(math_equal(answer1, answer2))
return total_similarity / total_combinations if total_combinations > 0 else 0.0
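# Worked example (hypothetical values): for responses [["1/2", "0.5"]], math_equal is
# expected to treat the two answers as equivalent, giving a consistency rate of 1.0,
# whereas the plain string comparison above would give 0.0.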
def main():
parser = argparse.ArgumentParser(
description="Calculate consistency rate from JSON logs."
)
parser.add_argument(
"--log_dir", help="Path to the directory containing the JSON log files."
)
parser.add_argument("--dataset", help="Dataset name: agieval, mmlu_pro or math")
args = parser.parse_args()
for seed in range(1, N_SEEDS + 1):
# Checking if directories exist
seed_log_dir = os.path.join(args.log_dir, f"seed_{seed}")
assert os.path.exists(
seed_log_dir
), f"No logs found for seed={seed}. No directory found at {seed_log_dir}"
subtasks = None
if args.dataset == "agieval":
agieval_subtasks = [
"aqua_rat",
"logiqa_en",
"lsat_ar",
"lsat_lr",
"lsat_rc",
"sat_en",
"sat_math",
]
subtasks = agieval_subtasks
file_paths = []
for subtask in agieval_subtasks:
log_path = os.path.join(
seed_log_dir,
f"*/samples_non_greedy_robustness_agieval_{subtask}_*.jsonl",
)
subtask_logs = glob.glob(log_path)
if len(subtask_logs) == 0:
raise FileNotFoundError(
f"No logs found for agieval subtask {subtask} for seed={seed} in the path {log_path}."
)
elif len(subtask_logs) > 1:
raise FileExistsError(
f"Multiple logs found for agieval subtask {subtask} for seed={seed}."
)
file_paths.append(subtask_logs[0])
elif args.dataset == "mmlu_pro":
task_logs = glob.glob(
os.path.join(
seed_log_dir,
"*/samples_score_non_greedy_robustness_mmlu_pro_*.jsonl",
)
)
file_paths = []
if len(task_logs) == 0:
raise FileNotFoundError(
f"No logs found for mmlu_pro for seed={seed}. PATH: {seed_log_dir}"
)
elif len(task_logs) > 1:
raise FileExistsError(
f"Multiple logs found for mmlu_pro for seed={seed}."
)
file_paths.append(task_logs[0])
elif args.dataset == "math":
math_subtasks = [
"algebra",
"counting_and_prob",
"geometry",
"intermediate_algebra",
"num_theory",
"prealgebra",
"precalc",
]
subtasks = math_subtasks
file_paths = []
for subtask in math_subtasks:
log_path = os.path.join(
seed_log_dir,
f"*/samples_non_greedy_robustness_math_{subtask}_*.jsonl",
)
subtask_logs = glob.glob(log_path)
if len(subtask_logs) == 0:
raise FileNotFoundError(
f"No logs found for math subtask {subtask} for seed={seed} in the path {log_path}."
)
elif len(subtask_logs) > 1:
raise FileExistsError(
f"Multiple logs found for math subtask {subtask} for seed={seed}."
)
file_paths.append(subtask_logs[0])
else:
raise ValueError(
"Invalid dataset name. only agieval, mmlu_pro and math are supported."
)
df = load_json_logs(file_paths, subtasks)
# merge all per-seed DataFrames on question_id and category
if seed == 1:
df_all = df
df_all[f"final_answer_seed_{seed}"] = df["final_answer_seed_"]
else:
df_all = df_all.merge(
df, on=["question_id", "category"], suffixes=("", seed)
)
responses = df_all[
[f"final_answer_seed_{seed}" for seed in range(1, N_SEEDS + 1)]
].values.tolist()
# calculate the consistency rate and per-seed accuracy
if args.dataset == "math":
consistency_rate = calculate_math_consistency_rate(responses)
results = {"alias": f"score_non_greedy_robustness_{args.dataset}"}
results.update(
{
"consistency_rate,none": consistency_rate,
"consistency_rate_stderr,none": "N/A",
}
)
for seed in range(1, N_SEEDS + 1):
df_all[f"accuracy_seed_{seed}"] = df_all[
[f"final_answer_seed_{seed}", "gt"]
].apply(lambda x: math_equal(*x), axis=1)
accuracy = df_all[f"accuracy_seed_{seed}"].mean()
results[f"seed_{seed}_accuracy,none"] = accuracy
results[f"seed_{seed}_accuracy_stderr,none"] = "N/A"
else:
consistency_rate = calculate_consistency_rate(responses)
results = {"alias": f"score_non_greedy_robustness_{args.dataset}"}
results.update(
{
"consistency_rate,none": consistency_rate,
"consistency_rate_stderr,none": "N/A",
}
)
for seed in range(1, N_SEEDS + 1):
df_all[f"accuracy_seed_{seed}"] = (
df_all[f"final_answer_seed_{seed}"] == df_all["gt"]
)
accuracy = df_all[f"accuracy_seed_{seed}"].mean()
results[f"seed_{seed}_accuracy,none"] = accuracy
results[f"seed_{seed}_accuracy_stderr,none"] = "N/A"
metrics = [f"seed_{seed}_accuracy" for seed in range(1, N_SEEDS + 1)] + [
"consistency_rate"
]
higher_is_better = {metric: True for metric in metrics}
results_dict = {
"results": {f"score_non_greedy_robustness_{args.dataset}": results},
"group_subtasks": {f"score_non_greedy_robustness_{args.dataset}": []},
"configs": None,
"versions": {f"score_non_greedy_robustness_{args.dataset}": 1},
"n-shot": {f"score_non_greedy_robustness_{args.dataset}": 0},
"higher_is_better": {
f"score_non_greedy_robustness_{args.dataset}": higher_is_better
},
"n-samples": None,
}
dumped = json.dumps(
results_dict,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
path = Path(args.log_dir)
path.mkdir(parents=True, exist_ok=True)
date_id = datetime.now().isoformat().replace(":", "-")
file_results_aggregated = path.joinpath(f"{args.dataset}_results_{date_id}.json")
file_results_aggregated.open("w", encoding="utf-8").write(dumped)
print(make_table(results_dict))
if __name__ == "__main__":
main()
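# Example invocation (script name and log directory are illustrative):
#   python consistency_rate.py --log_dir outputs/score_logs --dataset math
# The log directory is expected to contain seed_1 ... seed_5 subdirectories produced by
# the non-greedy robustness runs.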
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
group: score_robustness
task:
- score_robustness_agieval
- score_robustness_mmlu_pro
- score_robustness_math
metadata:
version: 1.0
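# Example (model arguments are illustrative): the whole robustness group can be run with
#   lm_eval --model hf --model_args pretrained=<model> --tasks score_robustness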
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import json
import re
import string
import sys
from functools import partial
from itertools import combinations
from typing import Any, Dict, List
import numpy as np
from datasets import Dataset
from lm_eval.utils import eval_logger
NUMERALS = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
ROMAN_NUMERALS = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"]
def __repeat_elements(lst, n):
result = []
for element in lst:
result.extend([element] * n)
return result
def process_docs_add_prompts(
doc: Dataset,
templates_key: str,
template_file_path: str,
dataset_specific_preprocess: callable = None,
) -> Dataset:
try:
with open(template_file_path) as f:
prompt_templates = json.load(f)[templates_key]
except FileNotFoundError:
eval_logger.error(f"Prompt templates not found at {template_file_path}")
sys.exit()
if dataset_specific_preprocess is not None:
doc = dataset_specific_preprocess(doc)
def process_batch(batch):
n = len(prompt_templates)
initial_len = len(next(iter(batch.values())))
result = {key: __repeat_elements(values, n) for key, values in batch.items()}
result["prompt_id"] = list(range(n)) * initial_len
result["prompt"] = [prompt_templates[i]["prompt"] for i in result["prompt_id"]]
if "options_format" in prompt_templates[0]:
result["options_format"] = [
prompt_templates[i]["options_format"] for i in result["prompt_id"]
]
return result
return doc.map(process_batch, batched=True)
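# Illustration (hypothetical numbers): with 5 prompt templates, every source document is
# repeated 5 times, and each copy gets its own prompt_id, prompt and, when present,
# options_format column, so downstream metrics can group answers by question and compare
# them across prompt variations.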
def option_order_robustness_process_docs(
doc: Dataset,
template_file_path: str,
templates_key: str,
labels: list,
dataset_specific_preprocess: callable = None,
) -> Dataset:
try:
with open(template_file_path) as f:
prompt_template = json.load(f)[templates_key]
prompt = prompt_template["prompt"]
options_format = prompt_template["options_format"]
except FileNotFoundError:
eval_logger.error(f"Prompt templates not found at {template_file_path}")
sys.exit()
if dataset_specific_preprocess is not None:
doc = dataset_specific_preprocess(doc)
def repeat_doc_swap_correct_answer(batched_docs):
initial_len = len(next(iter(batched_docs.values())))
keys = list(batched_docs.keys())
new_batched_docs = {key: [] for key in keys}
new_batched_docs["always_same_option"] = []
new_batched_docs["prompt"] = []
new_batched_docs["options_format"] = []
new_batched_docs["original_answer_index"] = []
for doc_ind in range(initial_len):
for label_ind, label in enumerate(labels):
new_batched_docs["original_answer_index"].append(
batched_docs["answer_index"][doc_ind]
)
for key in keys:
new_batched_docs[key].append(
copy.deepcopy(batched_docs[key][doc_ind])
)
if label_ind < len(batched_docs["options"][doc_ind]):
if key == "options":
# Swap correct answer with label_ind option
new_batched_docs[key][-1][label_ind] = batched_docs[
"options"
][doc_ind][batched_docs["answer_index"][doc_ind]]
new_batched_docs[key][-1][
batched_docs["answer_index"][doc_ind]
] = batched_docs["options"][doc_ind][label_ind]
if key == "answer_index":
new_batched_docs[key][-1] = label_ind
if key == "answer":
new_batched_docs[key][-1] = label
new_batched_docs["always_same_option"].append(label)
new_batched_docs["prompt"].append(prompt)
new_batched_docs["options_format"].append(options_format)
return new_batched_docs
return doc.map(repeat_doc_swap_correct_answer, batched=True)
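# Illustration (hypothetical document): for a question whose correct option originally
# sits at index 1 ("B") and labels ["A", "B", "C", "D"], the document is expanded into
# four copies, one per label, with the correct option swapped into position A, B, C and D
# respectively; always_same_option records which position holds the correct answer in
# each copy, and original_answer_index keeps the original position.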
def non_greedy_robustness_process_docs(
doc: Dataset,
templates_key: str,
template_file_path: str,
dataset_specific_preprocess: callable = None,
) -> Dataset:
try:
with open(template_file_path) as f:
prompt_template = json.load(f)[templates_key]
prompt = prompt_template["prompt"]
options_format = prompt_template.get("options_format", None)
except FileNotFoundError:
eval_logger.error(f"Prompt templates not found at {template_file_path}")
sys.exit()
if dataset_specific_preprocess is not None:
doc = dataset_specific_preprocess(doc)
def add_prompt_col(batched_docs):
initial_len = len(next(iter(batched_docs.values())))
new_batched_docs = copy.deepcopy(batched_docs)
new_batched_docs["prompt"] = [prompt] * initial_len
if options_format is not None:
new_batched_docs["options_format"] = [options_format] * initial_len
return new_batched_docs
return doc.map(add_prompt_col, batched=True)
def robustness_doc_to_text(doc: Dataset) -> str:
upper_case = string.ascii_uppercase
lower_case = string.ascii_lowercase
prompt = doc["prompt"]
options_format = doc.get("options_format", "")
question = doc["question"]
category = doc.get("category", "")
options = None
if options_format:
options = "".join(
[
options_format.format(
letter=upper_case[i],
option=doc["options"][i],
numeral=NUMERALS[i],
roman_numeral=ROMAN_NUMERALS[i],
lower_case_letter=lower_case[i],
)
for i in range(len(doc["options"]))
]
)
return prompt.format(question=question, options=options, category=category)
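# Illustration (template strings are hypothetical; the real ones come from the prompt
# templates JSON): with options_format "{letter}. {option}\n" and options ["Paris", "Rome"],
# the options block becomes "A. Paris\nB. Rome\n", which is then substituted into the
# prompt together with the question and category.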
def __postprocess_pred(pred):
if "the best answer is" not in pred.lower():
return pred
pred_proc = (
pred.lower().split("the best answer is ")[-1].split("\n")[0].split(" ")[0]
)
pred_proc = re.sub(r"[^a-zA-Z0-9]", "", pred_proc).strip()
return pred_proc.upper()
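# Example: for the prediction "The best answer is (C) because ...", the text after
# "the best answer is" is reduced to its first whitespace-delimited token, non-alphanumeric
# characters are stripped, and the result is upper-cased, yielding "C".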
def translate_model_answer_to_labels(answer, labels, option_format=None):
answer = answer.upper()
if option_format is None:
return answer
elif "numeral" in option_format:
if "roman" in option_format:
if answer not in ROMAN_NUMERALS:
return answer
else:
return labels[ROMAN_NUMERALS.index(answer)]
if answer not in NUMERALS:
return answer
else:
return labels[NUMERALS.index(answer)]
return answer
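# Example: with option_format containing "roman_numeral", labels ["A", "B", "C", "D"] and
# answer "III", the function returns "C"; answers that are not valid (Roman) numerals are
# returned unchanged.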
def calculate_consistency_rate(responses: List[List[str]]) -> float:
"""
Calculate the Consistency Rate (CR) for a given set of responses.
Args:
responses: List of lists, where each inner list contains responses to the same question.
Returns:
The consistency rate as a float.
"""
total_similarity = 0
total_combinations = 0
for response_set in responses:
pairs = combinations(response_set, 2)
num_pairs = len(response_set) * (len(response_set) - 1) / 2
total_combinations += num_pairs
for answer1, answer2 in pairs:
total_similarity += int(answer1 == answer2)
return total_similarity / total_combinations if total_combinations > 0 else 0.0
def prompt_consistency_rate(results: List[Dict[str, Any]]) -> float:
"""
Calculate the Consistency Rate (CR) for a given set of responses.
Args:
responses: List of lists, where each inner list contains responses to the same question.
Returns:
The consistency rate as a float.
"""
question_answers_dict = {}
for result in results:
question_id, prompt_id, final_answer, gt = result
if question_id not in question_answers_dict:
question_answers_dict[question_id] = []
question_answers_dict[question_id].append(final_answer)
question_answers_list = [answers for answers in question_answers_dict.values()]
return calculate_consistency_rate(question_answers_list)
def options_consistency_rate(results: List[Dict[str, Any]], labels) -> float:
"""
Calculate the Consistency Rate (CR) for a given set of responses.
Args:
responses: List of lists, where each inner list contains responses to the same question.
Returns:
The consistency rate as a float.
"""
question_answers_dict = {}
for result in results:
(
question_id,
always_same_option,
final_answer,
original_answer_index,
answer_index,
) = result
# Swap the two labels involved in the option reordering; elif prevents the first
# remapping from being immediately undone by the second check.
if final_answer == labels[original_answer_index]:
final_answer = always_same_option
elif final_answer == always_same_option:
final_answer = labels[original_answer_index]
if question_id not in question_answers_dict:
question_answers_dict[question_id] = []
question_answers_dict[question_id].append(final_answer)
question_answers_list = [answers for answers in question_answers_dict.values()]
return calculate_consistency_rate(question_answers_list)
......@@ -4,7 +4,8 @@ from functools import reduce
import numpy as np
import transformers.data.metrics.squad_metrics as squad_metrics
from datasets import Dataset, load_metric
from datasets import Dataset
from evaluate import load
from transformers import AutoTokenizer
from lm_eval.api.instance import Instance
......@@ -48,7 +49,10 @@ def _download_metric():
from huggingface_hub import hf_hub_download
scrolls_metric_path = hf_hub_download(
repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
repo_id="tau/scrolls",
repo_type="dataset",
filename="metrics/scrolls.py",
revision="refs/pr/5",
)
updated_scrolls_metric_path = (
os.path.dirname(scrolls_metric_path)
......@@ -119,7 +123,7 @@ class _SCROLLSTask(ConfigurableTask):
def __init__(self, config=None):
super().__init__(config={"metadata": {"version": self.VERSION}})
if self.DATASET_NAME is not None:
self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
self.metric = load(_download_metric(), config_name=self.DATASET_NAME)
def has_training_docs(self):
return True
......@@ -253,11 +257,14 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
}
def construct_requests(self, doc, ctx, **kwargs):
apply_chat_template = kwargs.pop("apply_chat_template", False)
request_list = [
Instance(
request_type="loglikelihood",
doc=doc,
arguments=(ctx, " {}".format(choice)),
arguments=(ctx, " {}".format(choice))
if not apply_chat_template
else (ctx, "{}".format(choice)),
idx=i,
**kwargs,
)
......@@ -285,6 +292,7 @@ class _SCROLLSSummaryTask(_SCROLLSTask):
}
def construct_requests(self, doc, ctx, **kwargs):
kwargs.pop("apply_chat_template", False)
return Instance(
request_type="generate_until",
doc=doc,
......@@ -327,19 +335,22 @@ class Qasper(_SCROLLSTask):
return {"f1": (prediction, doc["outputs"])}
def construct_requests(self, doc, ctx, **kwargs):
apply_chat_template = kwargs.pop("apply_chat_template", False)
if doc["is_yes_no"]:
return [
Instance(
request_type="loglikelihood",
doc=doc,
arguments=(ctx, " yes"),
arguments=(ctx, " yes")
if not apply_chat_template
else (ctx, "yes"),
idx=0,
**kwargs,
),
Instance(
request_type="loglikelihood",
doc=doc,
arguments=(ctx, " no"),
arguments=(ctx, " no") if not apply_chat_template else (ctx, "no"),
idx=1,
**kwargs,
),
......@@ -406,6 +417,7 @@ class NarrativeQA(_SCROLLSTask):
return {"f1": (results[0], doc["outputs"])}
def construct_requests(self, doc, ctx, **kwargs):
kwargs.pop("apply_chat_template", False)
return Instance(
request_type="generate_until",
doc=doc,
......
# File generated by `create-yamls.py`
include: _phrases_es_common.yaml
include: _phrases_es_common
task: phrases_es-va
doc_to_text: 'Oració en espanyol: {{es}}
......
# File generated by `create-yamls.py`
include: _phrases_es_common.yaml
include: _phrases_es_common
task: phrases_va-es
doc_to_text: 'Oració en valencià: {{va}}
......
......@@ -35,15 +35,6 @@ def process_doc_nli(dataset):
return dataset.map(process_fn)
def process_results_qa(doc, results):
preds = results[0]
reference = doc["answers"]["text"][0]
# import code; code.interact(local=dict(globals(), **locals()))
f1_sum = squad_metrics.compute_f1(reference, preds)
exact_match = squad_metrics.compute_exact(reference, preds)
return {"f1": f1_sum, "exact_match": exact_match}
def process_xlsum(dataset):
def _process_doc(doc):
# Remove double spaces
......
tag: storycloze
task: storycloze_2016
dataset_path: story_cloze
dataset_name: 2016
dataset_name: "2016"
output_type: multiple_choice
validation_split: validation
test_split: test
......
tag: storycloze
task: storycloze_2018
dataset_path: story_cloze
dataset_name: 2018
dataset_name: "2018"
output_type: multiple_choice
validation_split: validation
test_split: test
......
......@@ -6,6 +6,8 @@ The full Unitxt catalog can be viewed in an [online explorer](https://unitxt.rea
Read more about Unitxt at [www.unitxt.ai](https://www.unitxt.ai/).
To use a Unitxt dataset with lm-eval, first install unitxt via 'pip install unitxt'.
### Paper
Title: `Unitxt: Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative AI`
......
......@@ -6,11 +6,11 @@ Addressing this need, we present Unitxt, an innovative library for customizable
import importlib.util
import re
from collections.abc import Callable
from functools import partial
from typing import Any, Dict, Optional
import datasets
import evaluate
from lm_eval.api.instance import Instance
from lm_eval.api.task import ConfigurableTask
......@@ -28,16 +28,21 @@ _CITATION = """
"""
def is_unitxt_installed() -> bool:
return importlib.util.find_spec("unitxt") is not None
def assert_unitxt_installed():
if importlib.util.find_spec("unitxt") is None:
raise Exception(
"Please install unitxt via 'pip install unitxt'. For more information see: https://www.unitxt.ai/"
)
def score(items, metric):
predictions, references = zip(*items)
evaluator = evaluate.load("unitxt/metric")
assert_unitxt_installed()
from unitxt import evaluate
for reference in references:
reference["metrics"] = [metric]
results = evaluator.compute(predictions=predictions, references=references)
results = evaluate(predictions, references)
return results[0]["score"]["global"]["score"]
......@@ -61,16 +66,10 @@ class Unitxt(ConfigurableTask):
self.metrics = self.dataset["test"][0]["metrics"]
def download(self, dataset_kwargs: Optional[Dict[str, Any]] = None) -> None:
if is_unitxt_installed():
from unitxt import load_dataset
assert_unitxt_installed()
from unitxt import load_dataset
self.dataset = load_dataset(self.DATASET_NAME)
else:
self.dataset = datasets.load_dataset(
name=self.DATASET_NAME,
path="unitxt/data",
trust_remote_code=True,
)
self.dataset = load_dataset(self.DATASET_NAME, disable_cache=False)
def has_training_docs(self):
return "train" in self.dataset
......@@ -102,6 +101,27 @@ class Unitxt(ConfigurableTask):
def get_arguments(self, doc, ctx):
return (ctx, {"until": ["\n"]})
def fewshot_context(
self,
doc: str,
num_fewshot: int,
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
chat_template: Optional[Callable] = None,
) -> str:
source = self.doc_to_text(doc)
if isinstance(source, list):
if apply_chat_template:
formatted_source = chat_template(self.doc_to_text(doc))
return formatted_source
else:
raise Exception(
"Got chat template format from Unitxt, but apply_chat_template is false. Add '--apply_chat_template' to command line."
)
else:
return source
def construct_requests(self, doc, ctx, **kwargs):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
......@@ -113,6 +133,7 @@ class Unitxt(ConfigurableTask):
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
kwargs.pop("apply_chat_template", False) # Not used by unitxt
return [
Instance(
request_type="generate_until",
......
# XQuAD
### Paper
Title: `On the Cross-lingual Transferability of Monolingual Representations`
Abstract: https://aclanthology.org/2020.acl-main.421.pdf
XQuAD (Cross-lingual Question Answering Dataset) is a benchmark dataset for evaluating cross-lingual question answering performance. The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their professional translations into ten languages: Spanish, German, Greek, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, and Hindi. Consequently, the dataset is entirely parallel across 11 languages.
Homepage: https://github.com/deepmind/xquad
### Citation
```
@article{Artetxe:etal:2019,
author = {Mikel Artetxe and Sebastian Ruder and Dani Yogatama},
title = {On the cross-lingual transferability of monolingual representations},
journal = {CoRR},
volume = {abs/1910.11856},
year = {2019},
archivePrefix = {arXiv},
eprint = {1910.11856}
}
```
### Groups and Tasks
#### Groups
* `xquad`: All available languages.
#### Tasks
Perform extractive question answering for each language's subset of XQuAD.
* `xquad_ar`: Arabic
* `xquad_de`: German
* `xquad_el`: Greek
* `xquad_en`: English
* `xquad_es`: Spanish
* `xquad_hi`: Hindi
* `xquad_ro`: Romanian
* `xquad_ru`: Russian
* `xquad_th`: Thai
* `xquad_tr`: Turkish
* `xquad_vi`: Vietnamese
* `xquad_zh`: Chinese
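As a quick sanity check, one language subset can be evaluated through the Python API. The sketch below is illustrative: the model name is a placeholder and the exact keyword arguments may vary slightly between harness versions.

```python
import lm_eval

# Evaluate a (placeholder) Hugging Face model on the Spanish XQuAD subset.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["xquad_es"],
)

# F1 and exact-match scores for the subset.
print(results["results"]["xquad_es"])
```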
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import re
from itertools import product
import evaluate
import transformers.data.metrics.squad_metrics as squad_metrics
from lm_eval.utils import general_detokenize
def process_results_qa(doc, results):
preds = results[0]
reference = doc["answers"]["text"][0]
f1_sum = squad_metrics.compute_f1(reference, preds)
exact_match = squad_metrics.compute_exact(reference, preds)
return {"f1": f1_sum, "exact_match": exact_match}
include: xquad_common_yaml
task: xquad_ar
dataset_name: xquad.ar
doc_to_text: "سيا: {{context}}\n\nسؤال: {{question}}\n\nإجابة:"
task: xquad_es
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
tag: xquad
task: null
dataset_path: xquad
dataset_name: xquad.es
dataset_name: null
output_type: generate_until
doc_to_text: "Contexto: {{context}}\n\nPregunta: {{question}}\n\nRespuesta:"
doc_to_target: '{{answers["text"][0]}}'
validation_split: validation
target_delimiter: ' '
doc_to_text: null
doc_to_target: '{{answers["text"][0]}}'
process_results: !function utils.process_results_qa
target_delimiter: ' '
generation_kwargs:
until:
- "\n"
......
include: xquad_common_yaml
task: xquad_de
dataset_name: xquad.de
doc_to_text: "Kontext: {{context}}\n\nFrage: {{question}}\n\nAntwort:"
include: xquad_common_yaml
task: xquad_el
dataset_name: xquad.el
doc_to_text: "Συμφραζόμενα: {{context}}\n\nΕρώτηση: {{question}}\n\nΑπάντηση:"
include: xquad_common_yaml
task: xquad_en
dataset_name: xquad.en
doc_to_text: "Context: {{context}}\n\nQuestion: {{question}}\n\nAnswer:"
include: xquad_common_yaml
task: xquad_es
dataset_name: xquad.es
doc_to_text: "Contexto: {{context}}\n\nPregunta: {{question}}\n\nRespuesta:"