gaoqiong / lm-evaluation-harness · Commits

Commit 38a240ce (Unverified)
Authored Apr 27, 2022 by Stella Biderman; committed Apr 27, 2022 by GitHub

    Merge branch 'master' into PytestUpdate

Parents: c9011859, 33f5572a
Changes: 23

Showing 3 changed files with 168 additions and 2 deletions:

    setup.py                 +7   -2
    templates/new_task.py   +128   -0
    tests/test_gpt2.py       +33   -0
setup.py  (view file @ 38a240ce)

...
@@ -18,8 +18,13 @@ setuptools.setup(
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",
     ],
-    python_requires='>=3.6',
+    python_requires=">=3.6",
     install_requires=[
+        "promptsource @ git+https://github.com/bigscience-workshop/promptsource@eval-hackathon",
+        "wrapt",
+        "nltk",
+        "jinja2",
+        "black",
         "datasets==2.0.0",
         "click>=7.1",
         "scikit-learn>=0.24.1",
...
@@ -40,7 +45,7 @@ setuptools.setup(
         "openai==0.6.4",
         "jieba==0.42.1",
         "nagisa==0.2.7",
-        "bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt"
+        "bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
     ],
     dependency_links=[
         "https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
...
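The hunks above pin several dependencies to exact versions (e.g. datasets==2.0.0, jieba==0.42.1, nagisa==0.2.7) and pull promptsource and bleurt from direct URLs. As a minimal sketch that is not part of this commit, the exact pins can be checked after installation using only the standard library; the package names are taken from the diff, and Python 3.8+ is assumed for importlib.metadata:

# Sketch only: verify a few of the exact pins declared in setup.py after
# installation (e.g. after `pip install -e .`). Package names come from the
# diff above; assumes Python 3.8+ for importlib.metadata.
from importlib.metadata import version, PackageNotFoundError

pinned = {
    "datasets": "2.0.0",
    "jieba": "0.42.1",
    "nagisa": "0.2.7",
}

for name, expected in pinned.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: not installed")
        continue
    status = "OK" if installed == expected else f"expected {expected}"
    print(f"{name}: {installed} ({status})")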
templates/new_task.py  (new file, 0 → 100644; view file @ 38a240ce)

# TODO: Remove all TODO comments once the implementation is complete.
"""
TODO: Add the Paper Title on this line.
TODO: Add the paper's PDF URL (preferably from arXiv) on this line.

TODO: Write a Short Description of the task.

Homepage: TODO: Add the URL to the task's Homepage here.
"""
from lm_eval.base import PromptSourceTask


# TODO: Add the BibTeX citation for the task.
_CITATION = """
"""


# TODO: Replace `NewTask` with the name of your Task.
class NewTask(PromptSourceTask):
    VERSION = 0
    # TODO: Add the `DATASET_PATH` string. This will be the name of the `Task`
    # dataset as denoted in HuggingFace `datasets`.
    DATASET_PATH = ""
    # TODO: Add the `DATASET_NAME` string. This is the name of a subset within
    # `DATASET_PATH`. If there aren't specific subsets you need, leave this as `None`.
    DATASET_NAME = None

    def has_training_docs(self):
        # TODO: Fill in the return with `True` if the Task has training data; else `False`.
        return False

    def has_validation_docs(self):
        # TODO: Fill in the return with `True` if the Task has validation data; else `False`.
        return False

    def has_test_docs(self):
        # TODO: Fill in the return with `True` if the Task has test data; else `False`.
        return False

    def training_docs(self):
        if self.has_training_docs():
            # We cache training documents in `self._training_docs` for faster
            # few-shot processing. If the data is too large to fit in memory,
            # return the training data as a generator instead of a list.
            if self._training_docs is None:
                # TODO: Return the training document generator from `self.dataset`.
                # If you need to process the data, `map` over the documents with
                # the custom processing function, `self._process_doc`. E.g.
                # `map(self._process_doc, self.dataset["train"])`
                # In most cases you can leave this as is unless the dataset split is
                # named differently than the default `"train"`.
                self._training_docs = list(self.dataset["train"])
            return self._training_docs

    def validation_docs(self):
        if self.has_validation_docs():
            # TODO: Return the validation document generator from `self.dataset`.
            # If you need to process the data, `map` over the documents with the
            # custom processing function, `self._process_doc`. E.g.
            # `map(self._process_doc, self.dataset["validation"])`
            # In most cases you can leave this as is unless the dataset split is
            # named differently than the default `"validation"`.
            return self.dataset["validation"]

    def test_docs(self):
        if self.has_test_docs():
            # TODO: Return the test document generator from `self.dataset`.
            # If you need to process the data, `map` over the documents with the
            # custom processing function, `self._process_doc`. E.g.
            # `map(self._process_doc, self.dataset["test"])`
            # In most cases you can leave this as is unless the dataset split is
            # named differently than the default `"test"`.
            return self.dataset["test"]

    def stopping_criteria(self):
        # TODO: Denote the string where the generation should be split.
        # For example, for `coqa`, this is '\nQ:' and for `drop` '.'.
        # NOTE: You may delete this function if the task does not require generation.
        return None

    def construct_requests(self, doc, ctx):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or
            test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        # TODO: Construct your language model requests with the request factory, `rf`,
        # and return them as an iterable.
        return []

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluate them, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        # TODO: For each (sub)metric in the task evaluation, add a key-value pair
        # with the metric name as key and the corresponding metric result as value
        # for the current `doc`.
        return {}

    def aggregation(self):
        """
        :returns: {str: [metric_score] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metric scores.
        """
        # TODO: For each (sub)metric in the task evaluation, add a key-value pair
        # with the metric name as key and an aggregation function as value which
        # determines how to combine results from each document in the dataset.
        # Check `lm_eval.metrics` to find built-in aggregation functions.
        return {}

    def higher_is_better(self):
        # TODO: For each (sub)metric in the task evaluation, add a key-value pair
        # with the metric name as key and a `bool` value determining whether or
        # not higher values of that metric are deemed better.
        return {}
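To illustrate how the template is typically filled in, here is a hypothetical, partial completion for an imagined dataset. The dataset path and split choices below are placeholders rather than part of this commit; only the overridden methods mirror the template above.

# Hypothetical example of a partially completed template. `my_org/my_dataset`
# is a placeholder HuggingFace `datasets` path; the method bodies follow the
# template's own suggestions.
from lm_eval.base import PromptSourceTask


class MyDatasetTask(PromptSourceTask):
    VERSION = 0
    DATASET_PATH = "my_org/my_dataset"  # placeholder dataset name
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self.has_training_docs():
            # Cache the training split for faster few-shot sampling.
            if self._training_docs is None:
                self._training_docs = list(self.dataset["train"])
            return self._training_docs

    def validation_docs(self):
        if self.has_validation_docs():
            return self.dataset["validation"]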
tests/test_gpt2.py  (new file, 0 → 100644; view file @ 38a240ce)

import random

import lm_eval.models as models
import pytest
import torch
from transformers import StoppingCriteria


@pytest.mark.parametrize(
    "eos_token,test_input,expected",
    [
        ("not", "i like", "i like to say that I'm not"),
        ("say that", "i like", "i like to say that"),
        ("great", "big science is", "big science is a great"),
        (
            "<|endoftext|>",
            "big science has",
            "big science has been done in the past, but it's not the same as the science of the",
        ),
    ],
)
def test_stopping_criteria(eos_token, test_input, expected):
    random.seed(42)
    torch.random.manual_seed(42)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    gpt2 = models.get_model("gpt2")(device=device)

    context = torch.tensor([gpt2.tokenizer.encode(test_input)])
    stopping_criteria_ids = gpt2.tokenizer.encode(eos_token)

    generations = gpt2._model_generate(
        context,
        max_length=20,
        stopping_criteria_ids=stopping_criteria_ids,
    )
    generations = gpt2.tokenizer.decode(generations[0])
    assert generations == expected
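The test drives generation through gpt2._model_generate with stopping_criteria_ids and checks that the output ends at the requested stop string. As a rough sketch of the underlying idea only, not the harness's actual implementation, a multi-token stop sequence can be expressed through the transformers StoppingCriteria interface that the test imports; the class and variable names below are illustrative:

# Sketch only: one way to stop generation once a given token-id sequence
# appears at the end of the output. The harness's real implementation may
# differ; names here are illustrative, not taken from this commit.
import torch
from transformers import StoppingCriteria, StoppingCriteriaList


class StopOnTokenSequence(StoppingCriteria):
    def __init__(self, stop_ids):
        self.stop_ids = torch.tensor(stop_ids)

    def __call__(self, input_ids, scores, **kwargs):
        # Compare the tail of the generated ids against the stop sequence.
        if input_ids.shape[1] < len(self.stop_ids):
            return False
        tail = input_ids[0, -len(self.stop_ids):].cpu()
        return bool(torch.equal(tail, self.stop_ids))


# Hypothetical usage with a HuggingFace model and tokenizer:
# criteria = StoppingCriteriaList([StopOnTokenSequence(tokenizer.encode("say that"))])
# model.generate(input_ids, max_length=20, stopping_criteria=criteria)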