Commit 2106fbeb authored by Baber

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/openai_completions.py
parents 4354fe46 703fbffd
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import glob
import json
import os
from datetime import datetime
from itertools import combinations
from pathlib import Path
from typing import List
import pandas as pd
from lm_eval.tasks.score.math.math_grader import math_equal
from lm_eval.utils import handle_non_serializable, make_table
N_SEEDS = 5
def load_json_logs(file_paths, subtasks):
"""
Loads JSON logs of jsonl format from file paths into a single DataFrame.
Args:
file_paths: List of file paths to the JSON logs.
Returns:
A DataFrame containing the logs.
"""
per_seed_df = {
"question_id": [],
"final_answer_seed_": [],
"gt": [],
"category": [],
}
_search_key = None
for i in range(len(file_paths)):
file_path = file_paths[i]
with open(file_path, "r") as f:
for line in f:
datapoint = json.loads(line)
if _search_key is None:
if "non_greedy_macro_accuracy" in datapoint:
_search_key = "non_greedy_macro_accuracy"
elif "non_greedy_accuracy" in datapoint:
_search_key = "non_greedy_accuracy"
question_id, final_answer, gt, category = datapoint[_search_key]
if subtasks is not None:
category = subtasks[i]
per_seed_df["question_id"].append(question_id)
per_seed_df["final_answer_seed_"].append(final_answer)
per_seed_df["gt"].append(gt)
per_seed_df["category"].append(category)
df = pd.DataFrame(per_seed_df)
return df
def calculate_consistency_rate(responses: List[List[str]]) -> float:
"""
Calculate the Consistency Rate (CR) for a given set of responses.
Args:
responses: List of lists, where each inner list contains responses to the same question.
Returns:
The consistency rate as a float.
"""
total_similarity = 0
total_combinations = 0
for response_set in responses:
pairs = combinations(response_set, 2)
num_pairs = len(response_set) * (len(response_set) - 1) / 2
total_combinations += num_pairs
for answer1, answer2 in pairs:
total_similarity += int(answer1 == answer2)
return total_similarity / total_combinations if total_combinations > 0 else 0.0
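# Worked example (hypothetical values): for responses [["A", "A", "B"]], the three
# pairs (A, A), (A, B) and (A, B) contain one agreement, so the consistency rate is 1/3.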
def calculate_math_consistency_rate(responses: List[List[str]]) -> float:
"""
Calculate the Consistency Rate (CR) for a given set of responses.
Args:
responses: List of lists, where each inner list contains responses to the same question.
Returns:
The consistency rate as a float.
"""
total_similarity = 0
total_combinations = 0
for response_set in responses:
pairs = combinations(response_set, 2)
num_pairs = len(response_set) * (len(response_set) - 1) / 2
total_combinations += num_pairs
for answer1, answer2 in pairs:
total_similarity += int(math_equal(answer1, answer2))
return total_similarity / total_combinations if total_combinations > 0 else 0.0
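# Worked example (hypothetical values): for responses [["1/2", "0.5"]], math_equal is
# expected to treat the two answers as equivalent, giving a consistency rate of 1.0,
# whereas the plain string comparison above would give 0.0.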
def main():
parser = argparse.ArgumentParser(
description="Calculate consistency rate from JSON logs."
)
parser.add_argument(
"--log_dir", help="Path to the directory containing the JSON log files."
)
parser.add_argument("--dataset", help="Dataset name: agieval, mmlu_pro or math")
args = parser.parse_args()
for seed in range(1, N_SEEDS + 1):
# Checking if directories exist
seed_log_dir = os.path.join(args.log_dir, f"seed_{seed}")
assert os.path.exists(
seed_log_dir
), f"No logs found for seed={seed}. No directory found at {seed_log_dir}"
subtasks = None
if args.dataset == "agieval":
agieval_subtasks = [
"aqua_rat",
"logiqa_en",
"lsat_ar",
"lsat_lr",
"lsat_rc",
"sat_en",
"sat_math",
]
subtasks = agieval_subtasks
file_paths = []
for subtask in agieval_subtasks:
log_path = os.path.join(
seed_log_dir,
f"*/samples_non_greedy_robustness_agieval_{subtask}_*.jsonl",
)
subtask_logs = glob.glob(log_path)
if len(subtask_logs) == 0:
raise FileNotFoundError(
f"No logs found for agieval subtask {subtask} for seed={seed} in the path {log_path}."
)
elif len(subtask_logs) > 1:
raise FileExistsError(
f"Multiple logs found for agieval subtask {subtask} for seed={seed}."
)
file_paths.append(subtask_logs[0])
elif args.dataset == "mmlu_pro":
task_logs = glob.glob(
os.path.join(
seed_log_dir,
"*/samples_score_non_greedy_robustness_mmlu_pro_*.jsonl",
)
)
file_paths = []
if len(task_logs) == 0:
raise FileNotFoundError(
f"No logs found for mmlu_pro for seed={seed}. PATH: {seed_log_dir}"
)
elif len(task_logs) > 1:
raise FileExistsError(
f"Multiple logs found for mmlu_pro for seed={seed}."
)
file_paths.append(task_logs[0])
elif args.dataset == "math":
math_subtasks = [
"algebra",
"counting_and_prob",
"geometry",
"intermediate_algebra",
"num_theory",
"prealgebra",
"precalc",
]
subtasks = math_subtasks
file_paths = []
for subtask in math_subtasks:
log_path = os.path.join(
seed_log_dir,
f"*/samples_non_greedy_robustness_math_{subtask}_*.jsonl",
)
subtask_logs = glob.glob(log_path)
if len(subtask_logs) == 0:
raise FileNotFoundError(
f"No logs found for math subtask {subtask} for seed={seed} in the path {log_path}."
)
elif len(subtask_logs) > 1:
raise FileExistsError(
f"Multiple logs found for math subtask {subtask} for seed={seed}."
)
file_paths.append(subtask_logs[0])
else:
raise ValueError(
"Invalid dataset name. only agieval, mmlu_pro and math are supported."
)
df = load_json_logs(file_paths, subtasks)
# merge all per-seed DataFrames on question_id and category
if seed == 1:
df_all = df
df_all[f"final_answer_seed_{seed}"] = df["final_answer_seed_"]
else:
df_all = df_all.merge(
df, on=["question_id", "category"], suffixes=("", seed)
)
responses = df_all[
[f"final_answer_seed_{seed}" for seed in range(1, N_SEEDS + 1)]
].values.tolist()
# calculate the consistency rate and per-seed accuracy
if args.dataset == "math":
consistency_rate = calculate_math_consistency_rate(responses)
results = {"alias": f"score_non_greedy_robustness_{args.dataset}"}
results.update(
{
"consistency_rate,none": consistency_rate,
"consistency_rate_stderr,none": "N/A",
}
)
for seed in range(1, N_SEEDS + 1):
df_all[f"accuracy_seed_{seed}"] = df_all[
[f"final_answer_seed_{seed}", "gt"]
].apply(lambda x: math_equal(*x), axis=1)
accuracy = df_all[f"accuracy_seed_{seed}"].mean()
results[f"seed_{seed}_accuracy,none"] = accuracy
results[f"seed_{seed}_accuracy_stderr,none"] = "N/A"
else:
consistency_rate = calculate_consistency_rate(responses)
results = {"alias": f"score_non_greedy_robustness_{args.dataset}"}
results.update(
{
"consistency_rate,none": consistency_rate,
"consistency_rate_stderr,none": "N/A",
}
)
for seed in range(1, N_SEEDS + 1):
df_all[f"accuracy_seed_{seed}"] = (
df_all[f"final_answer_seed_{seed}"] == df_all["gt"]
)
accuracy = df_all[f"accuracy_seed_{seed}"].mean()
results[f"seed_{seed}_accuracy,none"] = accuracy
results[f"seed_{seed}_accuracy_stderr,none"] = "N/A"
metrics = [f"seed_{seed}_accuracy" for seed in range(1, N_SEEDS + 1)] + [
"consistency_rate"
]
higher_is_better = {metric: True for metric in metrics}
results_dict = {
"results": {f"score_non_greedy_robustness_{args.dataset}": results},
"group_subtasks": {f"score_non_greedy_robustness_{args.dataset}": []},
"configs": None,
"versions": {f"score_non_greedy_robustness_{args.dataset}": 1},
"n-shot": {f"score_non_greedy_robustness_{args.dataset}": 0},
"higher_is_better": {
f"score_non_greedy_robustness_{args.dataset}": higher_is_better
},
"n-samples": None,
}
dumped = json.dumps(
results_dict,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
path = Path(args.log_dir)
path.mkdir(parents=True, exist_ok=True)
date_id = datetime.now().isoformat().replace(":", "-")
file_results_aggregated = path.joinpath(f"{args.dataset}_results_{date_id}.json")
file_results_aggregated.open("w", encoding="utf-8").write(dumped)
print(make_table(results_dict))
if __name__ == "__main__":
main()
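# Example invocation (script name and log directory are illustrative):
#   python consistency_rate.py --log_dir outputs/score_logs --dataset math
# The log directory is expected to contain seed_1 ... seed_5 subdirectories produced by
# the non-greedy robustness runs.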
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
group: score_robustness
task:
- score_robustness_agieval
- score_robustness_mmlu_pro
- score_robustness_math
metadata:
version: 1.0
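# Example (model arguments are illustrative): the whole robustness group can be run with
#   lm_eval --model hf --model_args pretrained=<model> --tasks score_robustness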
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import json
import re
import string
import sys
from functools import partial
from itertools import combinations
from typing import Any, Dict, List
import numpy as np
from datasets import Dataset
from lm_eval.utils import eval_logger
NUMERALS = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
ROMAN_NUMERALS = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"]
def __repeat_elements(lst, n):
result = []
for element in lst:
result.extend([element] * n)
return result
def process_docs_add_prompts(
doc: Dataset,
templates_key: str,
template_file_path: str,
dataset_specific_preprocess: callable = None,
) -> Dataset:
try:
with open(template_file_path) as f:
prompt_templates = json.load(f)[templates_key]
except FileNotFoundError:
eval_logger.error(f"Prompt templates not found at {template_file_path}")
sys.exit()
if dataset_specific_preprocess is not None:
doc = dataset_specific_preprocess(doc)
def process_batch(batch):
n = len(prompt_templates)
initial_len = len(next(iter(batch.values())))
result = {key: __repeat_elements(values, n) for key, values in batch.items()}
result["prompt_id"] = list(range(n)) * initial_len
result["prompt"] = [prompt_templates[i]["prompt"] for i in result["prompt_id"]]
if "options_format" in prompt_templates[0]:
result["options_format"] = [
prompt_templates[i]["options_format"] for i in result["prompt_id"]
]
return result
return doc.map(process_batch, batched=True)
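# Illustration (hypothetical numbers): with 5 prompt templates, every source document is
# repeated 5 times, and each copy gets its own prompt_id, prompt and, when present,
# options_format column, so downstream metrics can group answers by question and compare
# them across prompt variations.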
def option_order_robustness_process_docs(
doc: Dataset,
template_file_path: str,
templates_key: str,
labels: list,
dataset_specific_preprocess: callable = None,
) -> Dataset:
try:
with open(template_file_path) as f:
prompt_template = json.load(f)[templates_key]
prompt = prompt_template["prompt"]
options_format = prompt_template["options_format"]
except FileNotFoundError:
eval_logger.error(f"Prompt templates not found at {template_file_path}")
sys.exit()
if dataset_specific_preprocess is not None:
doc = dataset_specific_preprocess(doc)
def repeat_doc_swap_correct_answer(batched_docs):
initial_len = len(next(iter(batched_docs.values())))
keys = list(batched_docs.keys())
new_batched_docs = {key: [] for key in keys}
new_batched_docs["always_same_option"] = []
new_batched_docs["prompt"] = []
new_batched_docs["options_format"] = []
new_batched_docs["original_answer_index"] = []
for doc_ind in range(initial_len):
for label_ind, label in enumerate(labels):
new_batched_docs["original_answer_index"].append(
batched_docs["answer_index"][doc_ind]
)
for key in keys:
new_batched_docs[key].append(
copy.deepcopy(batched_docs[key][doc_ind])
)
if label_ind < len(batched_docs["options"][doc_ind]):
if key == "options":
# Swap correct answer with label_ind option
new_batched_docs[key][-1][label_ind] = batched_docs[
"options"
][doc_ind][batched_docs["answer_index"][doc_ind]]
new_batched_docs[key][-1][
batched_docs["answer_index"][doc_ind]
] = batched_docs["options"][doc_ind][label_ind]
if key == "answer_index":
new_batched_docs[key][-1] = label_ind
if key == "answer":
new_batched_docs[key][-1] = label
new_batched_docs["always_same_option"].append(label)
new_batched_docs["prompt"].append(prompt)
new_batched_docs["options_format"].append(options_format)
return new_batched_docs
return doc.map(repeat_doc_swap_correct_answer, batched=True)
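# Illustration (hypothetical document): for a question whose correct option originally
# sits at index 1 ("B") and labels ["A", "B", "C", "D"], the document is expanded into
# four copies, one per label, with the correct option swapped into position A, B, C and D
# respectively; always_same_option records which position holds the correct answer in
# each copy, and original_answer_index keeps the original position.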
def non_greedy_robustness_process_docs(
doc: Dataset,
templates_key: str,
template_file_path: str,
dataset_specific_preprocess: callable = None,
) -> Dataset:
try:
with open(template_file_path) as f:
prompt_template = json.load(f)[templates_key]
prompt = prompt_template["prompt"]
options_format = prompt_template.get("options_format", None)
except FileNotFoundError:
eval_logger.error(f"Prompt templates not found at {template_file_path}")
sys.exit()
if dataset_specific_preprocess is not None:
doc = dataset_specific_preprocess(doc)
def add_prompt_col(batched_docs):
initial_len = len(next(iter(batched_docs.values())))
new_batched_docs = copy.deepcopy(batched_docs)
new_batched_docs["prompt"] = [prompt] * initial_len
if options_format is not None:
new_batched_docs["options_format"] = [options_format] * initial_len
return new_batched_docs
return doc.map(add_prompt_col, batched=True)
def robustness_doc_to_text(doc: Dataset) -> str:
upper_case = string.ascii_uppercase
lower_case = string.ascii_lowercase
prompt = doc["prompt"]
options_format = doc.get("options_format", "")
question = doc["question"]
category = doc.get("category", "")
options = None
if options_format:
options = "".join(
[
options_format.format(
letter=upper_case[i],
option=doc["options"][i],
numeral=NUMERALS[i],
roman_numeral=ROMAN_NUMERALS[i],
lower_case_letter=lower_case[i],
)
for i in range(len(doc["options"]))
]
)
return prompt.format(question=question, options=options, category=category)
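# Illustration (template strings are hypothetical; the real ones come from the prompt
# templates JSON): with options_format "{letter}. {option}\n" and options ["Paris", "Rome"],
# the options block becomes "A. Paris\nB. Rome\n", which is then substituted into the
# prompt together with the question and category.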
def __postprocess_pred(pred):
if "the best answer is" not in pred.lower():
return pred
pred_proc = (
pred.lower().split("the best answer is ")[-1].split("\n")[0].split(" ")[0]
)
pred_proc = re.sub(r"[^a-zA-Z0-9]", "", pred_proc).strip()
return pred_proc.upper()
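# Example: for the prediction "The best answer is (C) because ...", the text after
# "the best answer is" is reduced to its first whitespace-delimited token, non-alphanumeric
# characters are stripped, and the result is upper-cased, yielding "C".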
def translate_model_answer_to_labels(answer, labels, option_format=None):
answer = answer.upper()
if option_format is None:
return answer
elif "numeral" in option_format:
if "roman" in option_format:
if answer not in ROMAN_NUMERALS:
return answer
else:
return labels[ROMAN_NUMERALS.index(answer)]
if answer not in NUMERALS:
return answer
else:
return labels[NUMERALS.index(answer)]
return answer
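# Example: with option_format containing "roman_numeral", labels ["A", "B", "C", "D"] and
# answer "III", the function returns "C"; answers that are not valid (Roman) numerals are
# returned unchanged.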
def calculate_consistency_rate(responses: List[List[str]]) -> float:
"""
Calculate the Consistency Rate (CR) for a given set of responses.
Args:
responses: List of lists, where each inner list contains responses to the same question.
Returns:
The consistency rate as a float.
"""
total_similarity = 0
total_combinations = 0
for response_set in responses:
pairs = combinations(response_set, 2)
num_pairs = len(response_set) * (len(response_set) - 1) / 2
total_combinations += num_pairs
for answer1, answer2 in pairs:
total_similarity += int(answer1 == answer2)
return total_similarity / total_combinations if total_combinations > 0 else 0.0
def prompt_consistency_rate(results: List[Dict[str, Any]]) -> float:
"""
Calculate the Consistency Rate (CR) for a given set of responses.
Args:
responses: List of lists, where each inner list contains responses to the same question.
Returns:
The consistency rate as a float.
"""
question_answers_dict = {}
for result in results:
question_id, prompt_id, final_answer, gt = result
if question_id not in question_answers_dict:
question_answers_dict[question_id] = []
question_answers_dict[question_id].append(final_answer)
question_answers_list = [answers for answers in question_answers_dict.values()]
return calculate_consistency_rate(question_answers_list)
def options_consistency_rate(results: List[Dict[str, Any]], labels) -> float:
"""
Calculate the Consistency Rate (CR) for a given set of responses.
Args:
responses: List of lists, where each inner list contains responses to the same question.
Returns:
The consistency rate as a float.
"""
question_answers_dict = {}
for result in results:
(
question_id,
always_same_option,
final_answer,
original_answer_index,
answer_index,
) = result
# Swap the two labels involved in the option reordering; elif prevents the first
# remapping from being immediately undone by the second check.
if final_answer == labels[original_answer_index]:
final_answer = always_same_option
elif final_answer == always_same_option:
final_answer = labels[original_answer_index]
if question_id not in question_answers_dict:
question_answers_dict[question_id] = []
question_answers_dict[question_id].append(final_answer)
question_answers_list = [answers for answers in question_answers_dict.values()]
return calculate_consistency_rate(question_answers_list)
......@@ -4,7 +4,8 @@ from functools import reduce
import numpy as np
import transformers.data.metrics.squad_metrics as squad_metrics
from datasets import Dataset, load_metric
from datasets import Dataset
from evaluate import load
from transformers import AutoTokenizer
from lm_eval.api.instance import Instance
......@@ -48,7 +49,10 @@ def _download_metric():
from huggingface_hub import hf_hub_download
scrolls_metric_path = hf_hub_download(
repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
repo_id="tau/scrolls",
repo_type="dataset",
filename="metrics/scrolls.py",
revision="refs/pr/5",
)
updated_scrolls_metric_path = (
os.path.dirname(scrolls_metric_path)
......@@ -119,7 +123,7 @@ class _SCROLLSTask(ConfigurableTask):
def __init__(self, config=None):
super().__init__(config={"metadata": {"version": self.VERSION}})
if self.DATASET_NAME is not None:
self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
self.metric = load(_download_metric(), config_name=self.DATASET_NAME)
def has_training_docs(self):
return True
......@@ -253,11 +257,14 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
}
def construct_requests(self, doc, ctx, **kwargs):
apply_chat_template = kwargs.pop("apply_chat_template", False)
request_list = [
Instance(
request_type="loglikelihood",
doc=doc,
arguments=(ctx, " {}".format(choice)),
arguments=(ctx, " {}".format(choice))
if not apply_chat_template
else (ctx, "{}".format(choice)),
idx=i,
**kwargs,
)
......@@ -285,6 +292,7 @@ class _SCROLLSSummaryTask(_SCROLLSTask):
}
def construct_requests(self, doc, ctx, **kwargs):
kwargs.pop("apply_chat_template", False)
return Instance(
request_type="generate_until",
doc=doc,
......@@ -327,19 +335,22 @@ class Qasper(_SCROLLSTask):
return {"f1": (prediction, doc["outputs"])}
def construct_requests(self, doc, ctx, **kwargs):
apply_chat_template = kwargs.pop("apply_chat_template", False)
if doc["is_yes_no"]:
return [
Instance(
request_type="loglikelihood",
doc=doc,
arguments=(ctx, " yes"),
arguments=(ctx, " yes")
if not apply_chat_template
else (ctx, "yes"),
idx=0,
**kwargs,
),
Instance(
request_type="loglikelihood",
doc=doc,
arguments=(ctx, " no"),
arguments=(ctx, " no") if not apply_chat_template else (ctx, "no"),
idx=1,
**kwargs,
),
......@@ -406,6 +417,7 @@ class NarrativeQA(_SCROLLSTask):
return {"f1": (results[0], doc["outputs"])}
def construct_requests(self, doc, ctx, **kwargs):
kwargs.pop("apply_chat_template", False)
return Instance(
request_type="generate_until",
doc=doc,
......
# File generated by `create-yamls.py`
include: _phrases_es_common.yaml
include: _phrases_es_common
task: phrases_es-va
doc_to_text: 'Oració en espanyol: {{es}}
......
# File generated by `create-yamls.py`
include: _phrases_es_common.yaml
include: _phrases_es_common
task: phrases_va-es
doc_to_text: 'Oració en valencià: {{va}}
......
......@@ -35,15 +35,6 @@ def process_doc_nli(dataset):
return dataset.map(process_fn)
def process_results_qa(doc, results):
preds = results[0]
reference = doc["answers"]["text"][0]
# import code; code.interact(local=dict(globals(), **locals()))
f1_sum = squad_metrics.compute_f1(reference, preds)
exact_match = squad_metrics.compute_exact(reference, preds)
return {"f1": f1_sum, "exact_match": exact_match}
def process_xlsum(dataset):
def _process_doc(doc):
# Remove double spaces
......
tag: storycloze
task: storycloze_2016
dataset_path: story_cloze
dataset_name: 2016
dataset_name: "2016"
output_type: multiple_choice
validation_split: validation
test_split: test
......
tag: storycloze
task: storycloze_2018
dataset_path: story_cloze
dataset_name: 2018
dataset_name: "2018"
output_type: multiple_choice
validation_split: validation
test_split: test
......
......@@ -6,6 +6,8 @@ The full Unitxt catalog can be viewed in an [online explorer](https://unitxt.rea
Read more about Unitxt at [www.unitxt.ai](https://www.unitxt.ai/).
To use a Unitxt dataset with lm-eval, first install unitxt via 'pip install unitxt'.
### Paper
Title: `Unitxt: Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative AI`
......
......@@ -6,11 +6,11 @@ Addressing this need, we present Unitxt, an innovative library for customizable
import importlib.util
import re
from collections.abc import Callable
from functools import partial
from typing import Any, Dict, Optional
import datasets
import evaluate
from lm_eval.api.instance import Instance
from lm_eval.api.task import ConfigurableTask
......@@ -28,16 +28,21 @@ _CITATION = """
"""
def is_unitxt_installed() -> bool:
return importlib.util.find_spec("unitxt") is not None
def assert_unitxt_installed():
if importlib.util.find_spec("unitxt") is None:
raise Exception(
"Please install unitxt via 'pip install unitxt'. For more information see: https://www.unitxt.ai/"
)
def score(items, metric):
predictions, references = zip(*items)
evaluator = evaluate.load("unitxt/metric")
assert_unitxt_installed()
from unitxt import evaluate
for reference in references:
reference["metrics"] = [metric]
results = evaluator.compute(predictions=predictions, references=references)
results = evaluate(predictions, references)
return results[0]["score"]["global"]["score"]
......@@ -61,16 +66,10 @@ class Unitxt(ConfigurableTask):
self.metrics = self.dataset["test"][0]["metrics"]
def download(self, dataset_kwargs: Optional[Dict[str, Any]] = None) -> None:
if is_unitxt_installed():
from unitxt import load_dataset
assert_unitxt_installed()
from unitxt import load_dataset
self.dataset = load_dataset(self.DATASET_NAME)
else:
self.dataset = datasets.load_dataset(
name=self.DATASET_NAME,
path="unitxt/data",
trust_remote_code=True,
)
self.dataset = load_dataset(self.DATASET_NAME, disable_cache=False)
def has_training_docs(self):
return "train" in self.dataset
......@@ -102,6 +101,27 @@ class Unitxt(ConfigurableTask):
def get_arguments(self, doc, ctx):
return (ctx, {"until": ["\n"]})
def fewshot_context(
self,
doc: str,
num_fewshot: int,
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
chat_template: Optional[Callable] = None,
) -> str:
source = self.doc_to_text(doc)
if isinstance(source, list):
if apply_chat_template:
formatted_source = chat_template(self.doc_to_text(doc))
return formatted_source
else:
raise Exception(
"Got chat template format from Unitxt, but apply_chat_template is false. Add '--apply_chat_template' to command line."
)
else:
return source
def construct_requests(self, doc, ctx, **kwargs):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
......@@ -113,6 +133,7 @@ class Unitxt(ConfigurableTask):
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
kwargs.pop("apply_chat_template", False) # Not used by unitxt
return [
Instance(
request_type="generate_until",
......
# XQuAD
### Paper
Title: `On the Cross-lingual Transferability of Monolingual Representations`
Abstract: https://aclanthology.org/2020.acl-main.421.pdf
XQuAD (Cross-lingual Question Answering Dataset) is a benchmark dataset for evaluating cross-lingual question answering performance. The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their professional translations into ten languages: Spanish, German, Greek, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, and Hindi. Consequently, the dataset is entirely parallel across 11 languages.
Homepage: https://github.com/deepmind/xquad
### Citation
```
@article{Artetxe:etal:2019,
author = {Mikel Artetxe and Sebastian Ruder and Dani Yogatama},
title = {On the cross-lingual transferability of monolingual representations},
journal = {CoRR},
volume = {abs/1910.11856},
year = {2019},
archivePrefix = {arXiv},
eprint = {1910.11856}
}
```
### Groups and Tasks
#### Groups
* `xquad`: All available languages.
#### Tasks
Perform extractive question answering for each language's subset of XQuAD.
* `xquad_ar`: Arabic
* `xquad_de`: German
* `xquad_el`: Greek
* `xquad_en`: English
* `xquad_es`: Spanish
* `xquad_hi`: Hindi
* `xquad_ro`: Romanian
* `xquad_ru`: Russian
* `xquad_th`: Thai
* `xquad_tr`: Turkish
* `xquad_vi`: Vietnamese
* `xquad_zh`: Chinese
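As a quick sanity check, one language subset can be evaluated through the Python API. The sketch below is illustrative: the model name is a placeholder and the exact keyword arguments may vary slightly between harness versions.

```python
import lm_eval

# Evaluate a (placeholder) Hugging Face model on the Spanish XQuAD subset.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["xquad_es"],
)

# F1 and exact-match scores for the subset.
print(results["results"]["xquad_es"])
```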
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import re
from itertools import product
import evaluate
import transformers.data.metrics.squad_metrics as squad_metrics
from lm_eval.utils import general_detokenize
def process_results_qa(doc, results):
preds = results[0]
reference = doc["answers"]["text"][0]
f1_sum = squad_metrics.compute_f1(reference, preds)
exact_match = squad_metrics.compute_exact(reference, preds)
return {"f1": f1_sum, "exact_match": exact_match}
include: xquad_common_yaml
task: xquad_ar
dataset_name: xquad.ar
doc_to_text: "سيا: {{context}}\n\nسؤال: {{question}}\n\nإجابة:"
task: xquad_es
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
tag: xquad
task: null
dataset_path: xquad
dataset_name: xquad.es
dataset_name: null
output_type: generate_until
doc_to_text: "Contexto: {{context}}\n\nPregunta: {{question}}\n\nRespuesta:"
doc_to_target: '{{answers["text"][0]}}'
validation_split: validation
target_delimiter: ' '
doc_to_text: null
doc_to_target: '{{answers["text"][0]}}'
process_results: !function utils.process_results_qa
target_delimiter: ' '
generation_kwargs:
until:
- "\n"
......
include: xquad_common_yaml
task: xquad_de
dataset_name: xquad.de
doc_to_text: "Kontext: {{context}}\n\nFrage: {{question}}\n\nAntwort:"
include: xquad_common_yaml
task: xquad_el
dataset_name: xquad.el
doc_to_text: "Συμφραζόμενα: {{context}}\n\nΕρώτηση: {{question}}\n\nΑπάντηση:"
include: xquad_common_yaml
task: xquad_en
dataset_name: xquad.en
doc_to_text: "Context: {{context}}\n\nQuestion: {{question}}\n\nAnswer:"
include: xquad_common_yaml
task: xquad_es
dataset_name: xquad.es
doc_to_text: "Contexto: {{context}}\n\nPregunta: {{question}}\n\nRespuesta:"