"backend/vscode:/vscode.git/clone" did not exist on "58bead039892136ac16e601d37e0dd87a3a75bf3"
Commit 34eb121f authored by Anthony DiPofi

add webqs evaluation and fallback to test set when validation is unavailable

parent 6598967b
...
@@ -17,6 +17,7 @@ from . import lambada
 from . import race
 from . import piqa
 from . import triviaqa
+from . import webqs

 TASK_REGISTRY = {
...
@@ -55,7 +56,7 @@ TASK_REGISTRY = {
     # "squad": squad.SQuAD, # not implemented yet
     "race": race.RACE,
     # "naturalqs": naturalqs.NaturalQs, # not implemented yet
-    # "webqs": webqs.WebQs, # not implemented yet
+    "webqs": webqs.WebQs,
     # "wsc273": wsc273.WinogradSchemaChallenge273, # not implemented yet
     # "winogrande": winogrande.Winogrande, # not implemented yet
     "anli_r1": anli.ANLIRound1,
...
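
With this registry entry, "webqs" resolves to the WebQs task class and can be requested by name. A minimal usage sketch, assuming the lm_eval layout implied by the diff (tasks.get_task_dict and the split predicates used in main() below); this is illustrative, not code from the commit:

from lm_eval import tasks

# get_task_dict resolves each requested name through TASK_REGISTRY.
task_dict = tasks.get_task_dict(["webqs"])
webqs_task = task_dict["webqs"]

# WebQuestions ships only train/test splits, which is why the runner below
# needs a fallback for tasks that have no validation docs.
print(webqs_task.has_validation_docs(), webqs_task.has_test_docs())
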
 from . common import HFTask
+from lm_eval.base import mean, rf

 class WebQs(HFTask):
     DATASET_PATH = "web_questions"
...
@@ -18,7 +19,6 @@ class WebQs(HFTask):
         return ""

     def doc_to_text(self, doc):
-        print(doc)
         return "Q: " + doc['question'] + '\nA:'

     def doc_to_target(self, doc):
...
@@ -26,48 +26,37 @@ class WebQs(HFTask):
         # multiple correct answers being possible.
         # TODO: make sure we're actually handling multi-answer correctly
         return " " + doc['answers'][0]

+    def _remove_prefixes(self, aliases):
+        # Optimization: Remove any alias that has a strict prefix elsewhere in the list
+        # we can do this because if the prefix is acceptable by isgreedy, we can stop looking
+        aliases.sort()
+        ret = [aliases[0]]
+        for alias in aliases[1:]:
+            if not alias.startswith(ret[-1]):
+                ret.append(alias)
+        return ret
+
     def construct_requests(self, doc, ctx):
-        """ Uses RequestFactory to construct Requests and returns an iterable of
-        Requests which will be sent to the LM.
-
-        :param doc:
-            The document as returned from training_docs, validation_docs, or test_docs.
-        :param ctx: str
-            The context string, generated by fewshot_context. This includes the natural
-            language description, as well as the few shot examples, and the question
-            part of the document for `doc`.
-        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        ret = []
+        for alias in self._remove_prefixes(doc['answers']):
+            _, is_prediction = rf.loglikelihood(ctx, " " + alias)
+            ret.append(is_prediction)
+        return ret

     def process_results(self, doc, results):
-        """Take a single document and the LM results and evaluates, returning a
-        dict where keys are the names of submetrics and values are the values of
-        the metric for that one document
-
-        :param doc:
-            The document as returned from training_docs, validation_docs, or test_docs.
-        :param results:
-            The results of the requests created in construct_requests.
-        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": float(any(results))
+        }

     def aggregation(self):
-        """
-        :returns: {str: [float] -> float}
-            A dictionary where keys are the names of submetrics and values are
-            functions that aggregate a list of metrics
-        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": mean,
+        }

     def higher_is_better(self):
-        """
-        :returns: {str: bool}
-            A dictionary where keys are the names of submetrics and values are
-            whether a higher value of the submetric is better
-        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
+        return {
+            "acc": True
+        }
\ No newline at end of file
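
The prefix trick in _remove_prefixes is easiest to see on concrete data: after sorting, any alias that extends an already-kept alias is dropped, the idea being that if the shorter prefix is accepted by the greedy-match check, every extension of it is redundant. A standalone sketch of the same logic (function name and sample aliases are illustrative, not from the commit):

def remove_prefixes(aliases):
    # Sorting places each prefix immediately before the aliases that extend it,
    # so a single pass can drop every alias that extends the last kept one.
    aliases = sorted(aliases)
    ret = [aliases[0]]
    for alias in aliases[1:]:
        if not alias.startswith(ret[-1]):
            ret.append(alias)
    return ret

# "New York City" extends "New York", so only the shorter alias survives.
print(remove_prefixes(["New York City", "New York", "NYC"]))
# -> ['NYC', 'New York']

# process_results then scores a document 1.0 if any surviving alias was the
# model's greedy continuation, mirroring "acc": float(any(results)).
print(float(any([False, True])))  # -> 1.0

The optimization only trims redundant loglikelihood requests; since an extension can only be a greedy continuation when its prefix is as well, dropping extensions should not change which documents count as correct.
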
...
@@ -32,8 +32,7 @@ def main():
     task_names = args.tasks.split(",")
     task_dict = tasks.get_task_dict(task_names)
-    # TODO: fall back to test docs
-    task_dict_items = [(name, task) for name, task in task_dict.items() if task.has_validation_docs()]
+    task_dict_items = [(name, task) for name, task in task_dict.items() if(task.has_validation_docs() or task.has_test_docs())]

     results = collections.defaultdict(dict)
...
@@ -50,7 +49,13 @@ def main():
     # get lists of each type of requeste
     for task_name, task in task_dict_items:
-        for doc_id, doc in enumerate(itertools.islice(task.validation_docs(), 0, args.limit)):
+        #default to validation doc, fall back to test doc if validation unavailable
+        if task.has_validation_docs():
+            task_doc_func = task.validation_docs
+        elif task.has_test_docs():
+            task_doc_func = task.test_docs
+
+        for doc_id, doc in enumerate(itertools.islice(task_doc_func(), 0, args.limit)):
             docs[(task_name, doc_id)] = doc

             ctx = task.fewshot_context(
...
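
The validation-to-test fallback needs only the two has_*_docs predicates, and the earlier filter on task_dict_items guarantees at least one branch fires, so task_doc_func is always bound before the loop. A minimal sketch of the selection logic in isolation (StubTask is an invented stand-in, not part of the harness):

import itertools

class StubTask:
    # Invented stand-in for a task that, like WebQs, has no validation split.
    def has_validation_docs(self): return False
    def has_test_docs(self): return True
    def validation_docs(self): return iter(())
    def test_docs(self): return ({"question": "q%d" % i} for i in range(5))

task = StubTask()
# Prefer validation docs; fall back to test docs, mirroring the diff above.
if task.has_validation_docs():
    task_doc_func = task.validation_docs
elif task.has_test_docs():
    task_doc_func = task.test_docs

# islice caps the number of docs, as args.limit does in main().
for doc_id, doc in enumerate(itertools.islice(task_doc_func(), 0, 3)):
    print(doc_id, doc)
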