Commit d3f429ac authored by lintangsutawika

removed yaml version of squad

parent 9817e7c2
@@ -18,7 +18,7 @@ import logging
eval_logger = logging.getLogger("lm-eval")
# import python tasks
-from .squadv2.squad import SQuAD2
+from .squad import SQuAD2
# SQuAD 2.0
### Paper
Title: `Know What You Don’t Know: Unanswerable Questions for SQuAD`
Abstract: https://arxiv.org/abs/1806.03822
Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
consisting of questions posed by crowdworkers on a set of Wikipedia articles,
where the answer to every question is a segment of text, or span, from the
corresponding reading passage, or the question might be unanswerable.
SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable
questions written adversarially by crowdworkers to look similar to answerable ones.
To do well on SQuAD2.0, systems must not only answer questions when possible, but
also determine when no answer is supported by the paragraph and abstain from answering.
Homepage: https://rajpurkar.github.io/SQuAD-explorer/
### Citation
```
@misc{rajpurkar2018know,
title={Know What You Don't Know: Unanswerable Questions for SQuAD},
author={Pranav Rajpurkar and Robin Jia and Percy Liang},
year={2018},
eprint={1806.03822},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
* `squadv2_complete`: Runs both `squadv2` and `squadv2_noans_loglikelihood`
#### Tasks
* `squadv2`: Default SQuAD 2.0 task (generates an answer and scores it with exact match and F1).
* `squadv2_noans_loglikelihood`: Additional task that measures the log-likelihood the model assigns to the no-answer continuation `" unanswerable"`; a rough usage sketch follows below.
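The sketch below is a rough illustration of running the default task programmatically; the exact entry point and keyword arguments vary across harness versions, and `gpt2` is only a placeholder checkpoint.

```python
# Hedged sketch: import path and argument names may differ by lm-eval version.
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf",                    # HuggingFace causal-LM backend
    model_args="pretrained=gpt2",  # placeholder checkpoint; substitute your model
    tasks=["squadv2"],
    num_fewshot=0,
)
print(results["results"])
```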
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: squad_v2
training_split: train
validation_split: validation
doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
target_delimiter: ""
should_decontaminate: true
doc_to_decontamination_query: context
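For intuition, the sketch below shows what the two Jinja templates in the shared YAML expand to for a made-up `squad_v2`-style document (assuming `jinja2` is installed; the harness performs its own template handling, so this only illustrates the raw rendering):

```python
from jinja2 import Template

# Made-up document following the squad_v2 schema (values are illustrative only).
doc = {
    "title": "Normans",
    "context": "The Normans gave their name to Normandy, a region in France.",
    "question": "In what country is Normandy located?",
    "answers": {"text": ["France"]},
}

doc_to_text = Template(
    "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
)
doc_to_target = Template(
    "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
)

print(doc_to_text.render(**doc))    # Title/Background/Question prompt
print(doc_to_target.render(**doc))  # -> ['France']; an unanswerable doc renders as ['']
```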
include: _template_yaml
task: squadv2_generate_until
output_type: generate_until
generation_kwargs:
  until:
    - "\n"
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
include: _template_yaml
task: squadv2_noans_loglikelihood
output_type: loglikelihood
doc_to_target: " unanswerable"
metric_list:
  - metric: perplexity
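The loglikelihood task above scores the fixed continuation `" unanswerable"`. As a point of reference, the Python task further down converts that loglikelihood into a no-answer probability by exponentiating it; a one-line sketch with an illustrative value:

```python
from math import exp

logprob_unanswerable = -2.3                        # illustrative loglikelihood of " unanswerable"
no_answer_probability = exp(logprob_unanswerable)  # ~0.10
```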
import datasets

from math import exp
from functools import partial
from packaging import version

from lm_eval.api.task import Task
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_task


def _squad_metric(predictions, references):
    squad_metric = datasets.load_metric("squad_v2")
    return squad_metric.compute(predictions=predictions, references=references)


def _squad_agg(key, items):
    predictions, references = zip(*items)
    return _squad_metric(predictions=predictions, references=references).get(key, 0)


@register_task("squadv2")
class SQuAD2(Task):
    VERSION = 1
    DATASET_PATH = "squad_v2"
    DATASET_NAME = None

    # HF changed squad on us so we have to make sure we aren't running the old one
    assert version.parse(datasets.__version__) >= version.parse(
        "1.11.0"
    ), "datasets v1.11.0 or later required for SQuAD"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        return self.dataset["train"]

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        return (
            "Title: "
            + doc["title"]
            + "\n\n"
            + "Background: "
            + doc["context"]
            + "\n\n"
            + "Question: "
            + doc["question"]
            + "\n\n"
            + "Answer:"
        )

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["context"]

    def doc_to_target(self, doc):
        answer_list = doc["answers"]["text"]
        if len(answer_list) > 0:
            answer = answer_list[0]
        else:
            answer = "unanswerable"
        return " " + answer

    def construct_requests(self, doc, ctx, **kwargs):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        return [
            Instance(
                request_type="generate_until",
                doc=doc,
                arguments=(ctx, {"until": ["\n"]}),
                idx=0,
                **kwargs
            ),
            Instance(
                request_type="loglikelihood",
                doc=doc,
                arguments=(ctx, " " + "unanswerable"),
                idx=0,
                **kwargs
            ),
        ]

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        continuation, (logprob_unanswerable, _) = results

        no_answer_probability = exp(logprob_unanswerable)

        predictions = {
            "id": doc["id"],
            "prediction_text": continuation,
            "no_answer_probability": no_answer_probability,
        }

        references = {
            "id": doc["id"],
            "answers": doc["answers"],
        }

        return {
            "exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly matches the gold answer)
            "f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly matches the gold answer)
            "HasAns_f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": (
                predictions,
                references,
            ),  # Exact match (the normalized answer exactly matches the gold answer)
            "NoAns_f1": (
                predictions,
                references,
            ),  # The F-score of predicted tokens versus the gold answer
            "best_exact": (
                predictions,
                references,
            ),  # Best exact match (with varying threshold)
            "best_f1": (predictions, references),  # Best F1 (with varying threshold)
        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {
            "exact": partial(
                _squad_agg, "exact"
            ),  # Exact match (the normalized answer exactly matches the gold answer)
            "f1": partial(
                _squad_agg, "f1"
            ),  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": partial(
                _squad_agg, "HasAns_exact"
            ),  # Exact match (the normalized answer exactly matches the gold answer)
            "HasAns_f1": partial(
                _squad_agg, "HasAns_f1"
            ),  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": partial(
                _squad_agg, "NoAns_exact"
            ),  # Exact match (the normalized answer exactly matches the gold answer)
            "NoAns_f1": partial(
                _squad_agg, "NoAns_f1"
            ),  # The F-score of predicted tokens versus the gold answer
            "best_exact": partial(
                _squad_agg, "best_exact"
            ),  # Best exact match (with varying threshold)
            "best_f1": partial(
                _squad_agg, "best_f1"
            ),  # Best F1 (with varying threshold)
        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "exact": True,  # Exact match (the normalized answer exactly matches the gold answer)
            "f1": True,  # The F-score of predicted tokens versus the gold answer
            "HasAns_exact": True,  # Exact match (the normalized answer exactly matches the gold answer)
            "HasAns_f1": True,  # The F-score of predicted tokens versus the gold answer
            "NoAns_exact": True,  # Exact match (the normalized answer exactly matches the gold answer)
            "NoAns_f1": True,  # The F-score of predicted tokens versus the gold answer
            "best_exact": True,  # Best exact match (with varying threshold)
            "best_f1": True,  # Best F1 (with varying threshold)
        }
\ No newline at end of file
import re
import string
import collections


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()


# Exact match (the normalized answer exactly matches the gold answer)
def exact(predictions, references):
    return int(normalize_answer(references[0]) == normalize_answer(predictions[0]))


# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
    gold_toks = get_tokens(references[0])
    pred_toks = get_tokens(predictions[0])
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
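As a quick illustration of the two metric helpers above (toy strings, not drawn from the dataset; the import path for `exact` and `f1` depends on where `utils.py` sits in the task tree):

```python
# Assumes `exact` and `f1` are the helpers defined above.
print(exact(["The Denver Broncos"], ["Denver Broncos"]))  # -> 1; case, punctuation and articles are normalized away
print(f1(["Denver Broncos won"], ["Denver Broncos"]))     # -> 0.8; partial token overlap
```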
group: squadv2_complete
task:
  - squadv2_generate_until
  - squadv2_noans_loglikelihood