Update stopping criteria for few-shot

b62d1bec · Tian Yun · 5e59320b · b62d1bec · b62d1bec · b62d1bec
Commit b62d1bec authored Apr 28, 2022 by Tian Yun
8 changed files
--- a/lm_eval/base.py
+++ b/lm_eval/base.py
@@ -694,11 +694,9 @@ class PromptSourceTask(Task):
    def stopping_criteria(self) -> Optional[str]:
        """Denote where the generation should end.
-        For example, for coqa, this is '\nQ:' and for drop '.'.
+        By default, it's "\n###\n".
-        By default, its None, meaning to generate up to max or EOT, whichever comes first.
        """
-        return None
+        return "\n###\n"
    def max_generation_length(self) -> Optional[int]:
        """Denote where the max length of the generation if it is obvious from the task."""

--- a/lm_eval/tasks/coqa.py
+++ b/lm_eval/tasks/coqa.py
@@ -90,8 +90,8 @@ class CoQA(PromptSourceTask):
            "f1": f1_sum / max(1, len(gold_list)),
        }
-    def stopping_criteria(self):
+    # def stopping_criteria(self):
-        return "\n\n"
+    #     return "\n\n"
    # def construct_requests(self, doc, ctx):
    #     """Uses RequestFactory to construct Requests and returns an iterable of

--- a/lm_eval/tasks/drop.py
+++ b/lm_eval/tasks/drop.py
@@ -92,8 +92,8 @@ class DROP(PromptSourceTask):
    #     """
    #     conts = [rf.greedy_until(ctx, ["."])]
    #     return conts
-    def stopping_criteria(self):
+    # def stopping_criteria(self):
-        return "."
+    #     return "."
    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a

--- a/lm_eval/tasks/gem_asset_turk.py
+++ b/lm_eval/tasks/gem_asset_turk.py
@@ -78,8 +78,8 @@ class AssetTurk(PromptSourceTask):
    def test_docs(self):
        return self.dataset[str(self.SPLIT)]
-    def stopping_criteria(self):
+    # def stopping_criteria(self):
-        return None
+    #     return None
    def max_generation_length(self):
        return 200

--- a/lm_eval/tasks/gem_webnlg.py
+++ b/lm_eval/tasks/gem_webnlg.py
@@ -70,8 +70,8 @@ class WebNLG(PromptSourceTask):
            else:
                return self.dataset["test"]
-    def stopping_criteria(self):
+    # def stopping_criteria(self):
-        return None
+    #     return None
    def max_generation_length(self):
        return 250

--- a/lm_eval/tasks/glue.py
+++ b/lm_eval/tasks/glue.py
@@ -236,8 +236,8 @@ class MRPC(PromptSourceTask):
    def has_test_docs(self):
        return False
-    def stopping_criteria(self):
+    # def stopping_criteria(self):
-        return "\n"
+    #     return "\n###\n"
    def training_docs(self):
        if self._training_docs is None:

--- a/lm_eval/tasks/wino_bias.py
+++ b/lm_eval/tasks/wino_bias.py
@@ -54,8 +54,8 @@ class WinoBias(PromptSourceTask):
    def test_docs(self):
        return self.dataset["test"]
-    def stopping_criteria(self):
+    # def stopping_criteria(self):
-        return "\n"
+    #     return "\n"
    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a

--- a/templates/new_task.py
+++ b/templates/new_task.py
@@ -73,10 +73,10 @@ class NewTask(PromptSourceTask):
            return self.dataset["test"]
    def stopping_criteria(self):
-        # TODO: Denote the string where the generation should be split.
+        # Only define this method when you want to control few-shot generations on specific tokens.
-        # For example, for `coqa`, this is '\nQ:' and for `drop` '.'.
+        # The default is set to '\n###\n'.
        # NOTE: You may delete this function if the task does not require generation.
-        return None
+        return "\n###\n"
    def construct_requests(self, doc, ctx):
        """Uses RequestFactory to construct Requests and returns an iterable of