gaoqiong / lm-evaluation-harness
Commit 12852d5c, authored Oct 05, 2020 by Leo Gao (parent 9e29c2a8)

Implement ARC (data)
Showing 3 changed files with 231 additions and 0 deletions:

    lm_eval/tasks/__init__.py    +2    -0
    lm_eval/tasks/race.py        +69   -0
    lm_eval/utils_stream.py      +160  -0
lm_eval/tasks/__init__.py

from . import superglue
from . import glue
from . import arc
from . import race

TASK_REGISTRY = {
    "cola": glue.CoLA,
    ...
@@ -19,6 +20,7 @@ TASK_REGISTRY = {
    "wsc": superglue.WinogradSchemaChallenge,
    "arc_easy": arc.ARCEasy,
    "arc_challenge": arc.ARCChallenge,
    "race": race.RACE,
}
...
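For context only, not part of this diff: a name-keyed registry like TASK_REGISTRY is typically consumed by looking the task class up and instantiating it. The helper below is a hypothetical sketch of that pattern; get_task is not a function defined in this commit.

def get_task(task_name):
    # Hypothetical helper, for illustration only: look up a task class by name
    # and instantiate it, e.g. get_task("race") -> a race.RACE instance.
    try:
        return TASK_REGISTRY[task_name]()
    except KeyError:
        raise KeyError("Unknown task: " + task_name)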
lm_eval/tasks/race.py (new file, mode 100644)

from .common import HFNLPTask
from ..utils_stream import X, each, apply, join, filt, one

import collections
import nlp


class RACE(HFNLPTask):
    NLP_PATH = "race"
    NLP_NAME = "high"

    cache = {}

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def _collate_data(self, set):
        if set in self.cache:
            return self.cache[set]

        # One big issue with HF's implementation of this dataset: it makes a
        # separate document for each question; meanwhile, in the GPT3 paper it
        # is shown that one document is made per passage.
        r = collections.defaultdict(list)
        for item in nlp.load_dataset(path=self.NLP_PATH, name=self.NLP_NAME)[set]:
            r[item['article']].append(item)

        res = list(r.values() >> each(lambda x: {
            'article': x[0]['article'],
            'problems': x >> each(lambda y: {
                'question': y['question'],
                'answer': y['answer'],
                'options': y['options'],
            })
        }))

        self.cache[set] = res
        return res

    def training_docs(self):
        return self._collate_data("train")

    def validation_docs(self):
        return self._collate_data("validation")

    def test_docs(self):
        return self._collate_data("test")

    def fewshot_description(self):
        # TODO: figure out description
        return ""

    def doc_to_text(self, doc, include_target=True):
        print(doc)
        r = "Article:\n" + doc['article'] + '\n\n'
        r += doc['problems'] >> each(
            lambda x: 'Q: ' + x['question'] + '\n\nA: '
            + x['options'][['A', 'B', 'C', 'D'].index(x['answer'])]) \
            >> join('\n\n')
        return r

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()
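To make the collated format concrete, here is a toy document in the shape _collate_data produces (one entry per passage, carrying all of its questions), together with the prompt doc_to_text would render from it. The article, question, and options are invented for illustration only.

# Invented example document, for illustration only.
doc = {
    'article': "The quick brown fox jumps over the lazy dog.",
    'problems': [
        {'question': "What does the fox jump over?",
         'answer': 'B',
         'options': ["the fence", "the lazy dog", "the river", "the moon"]},
    ],
}

# doc_to_text(doc) would then return (ignoring the debug print):
#
# Article:
# The quick brown fox jumps over the lazy dog.
#
# Q: What does the fox jump over?
#
# A: the lazy dog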
lm_eval/utils_stream.py (new file, mode 100644)

import os
from functools import reduce
import operator
import lm_dataformat as lmd
from tqdm import tqdm
import json


class ExitCodeError(Exception):
    pass


def sh(x):
    if os.system(x):
        raise ExitCodeError()


def ls(x):
    return [x + '/' + fn for fn in os.listdir(x)]


def lsr(x):
    if os.path.isdir(x):
        return reduce(operator.add, map(lsr, ls(x)), [])
    else:
        return [x]


def fwrite(fname, content):
    with open(fname, 'w') as fh:
        fh.write(content)


def fread(fname):
    with open(fname) as fh:
        return fh.read()


class each:
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return list(map(self.f, other))


class filt:
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return list(filter(self.f, other))


class apply:
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return self.f(other)


class one:
    def __rrshift__(self, other):
        try:
            if isinstance(other, list):
                assert len(other) == 1
                return other[0]
            return next(other)
        except:
            return None


class join:
    def __init__(self, sep):
        self.sep = sep

    def __rrshift__(self, other):
        if other is None:
            return
        try:
            return self.sep.join(other)
        except:
            return None


Y = object()


def id(x):
    return x


class Reflective:
    def __getattribute__(self, f):
        def _fn(*args, **kwargs):
            return lambda x: x.__getattribute__(f)(*args, **kwargs)
        return _fn

    def __getitem__(self, a):
        return lambda x: x[a]

    def __mul__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return x * y
            return _f
        return lambda x: x * other

    def __rmul__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return y * x
            return _f
        return lambda x: other * x

    def __add__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return x + y
            return _f
        return lambda x: x + other

    def __radd__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return y + x
            return _f
        return lambda x: other + x


# (b -> a -> b) -> b -> [a] -> b
def foldl(f, init, arr):
    curr = init
    for elem in arr:
        curr = f(curr, elem)
    return curr


# (a -> b -> b) -> b -> [a] -> b
def foldr(f, init, arr):
    curr = init
    for elem in arr[::-1]:
        curr = f(elem, curr)
    return curr


def comp(*fs):
    if len(fs) == 1:
        return fs[0]

    def _f(x):
        for f in fs[::-1]:
            x = f(x)
        return x
    return _f


X = Reflective()
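As a quick illustration of how these >> pipeline helpers compose (the values below are made up, not taken from the repository):

# Illustration only: composing the >> helpers defined above on made-up values.
words = ["alpha", "beta", "gamma"]

words >> each(str.upper)               # ['ALPHA', 'BETA', 'GAMMA']
words >> filt(lambda w: len(w) < 5)    # ['beta']
words >> join(", ")                    # 'alpha, beta, gamma'
["only"] >> one()                      # 'only'

# X builds small accessor lambdas: X['k'] behaves like lambda d: d['k'],
# and X.upper() like lambda s: s.upper().
[{"name": "a"}, {"name": "b"}] >> each(X["name"])   # ['a', 'b']
["a", "b"] >> each(X.upper())                       # ['A', 'B']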