from itertools import islice

import pytest

import lm_eval.tasks as tasks


@pytest.fixture()
def task_class(task_name="arc_easy"):
    # look up the task class by name; equivalent to scanning TASK_REGISTRY.items()
    return tasks.TASK_REGISTRY[task_name]


@pytest.fixture()
def limit(limit=10):
    return limit
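
# A sweep over every registered task would look like the hedged sketch below;
# it mirrors the commented-out parametrized tests at the bottom of this file
# and stays commented out because exercising the full registry is slow:
#
# @pytest.mark.parametrize("task_name,task_cls", tasks.TASK_REGISTRY.items())
# def test_registry_entry(task_name, task_cls):
#     assert task_cls is not None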


def test_download(task_class):
    task = task_class()
    task.download()
    assert task.dataset is not None
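# `download()` is expected to populate `task.dataset` as a side effect
# (in this codebase typically by way of HuggingFace `datasets`), which is
# why the test asserts on the attribute rather than on a return value.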


def test_has_training_docs(task_class):
    assert task_class().has_training_docs() in [True, False]


def test_check_training_docs(task_class):
    assert task_class().has_training_docs()


def test_has_validation_docs(task_class):
    assert task_class().has_validation_docs() in [True, False]


def test_check_validation_docs(task_class):
    assert task_class().has_validation_docs()


def test_has_test_docs(task_class):
    assert task_class().has_test_docs() in [True, False]


def test_check_test_docs(task_class):
    assert task_class().has_test_docs()


def test_should_decontaminate(task_class):
    assert task_class().should_decontaminate() in [True, False]


def test_doc_to_text(task_class, limit):
    task = task_class()
    arr = list(islice(task.test_docs(), limit)) if limit else list(task.test_docs())
    _array = [task.doc_to_text(doc) for doc in arr]
    # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
    assert all(
        isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True) for x in _array
    )


def test_doc_to_target(task_class, limit):
    task = task_class()
    arr = list(islice(task.test_docs(), limit)) if limit else list(task.test_docs())
    _array_target = [task.doc_to_target(doc) for doc in arr]
    assert all(isinstance(tgt, str) for tgt in _array_target)
    # Not working yet: the paired text/target whitespace check
    # _array_text = [task.doc_to_text(doc) for doc in arr]
    # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
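
# To illustrate the whitespace convention the two tests above enforce, a
# prompt/target pair might look like this (hypothetical doc; the exact
# strings are task-specific):
#   task.doc_to_text(doc)   -> "Question: Why is the sky blue?\nAnswer:"
#   task.doc_to_target(doc) -> " Rayleigh scattering"
# The prompt never ends in a space; the leading space lives in the target
# (or the prompt ends in a newline instead).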


def test_build_all_requests(task_class, limit):
    task = task_class()
    task.build_all_requests(rank=1, limit=limit, world_size=1)
    assert task.instances is not None
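# `rank`, `limit` and `world_size` mirror the evaluator's distributed
# arguments; with world_size=1 there is only a single shard, so the limited
# document set is expected to populate `task.instances`.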


def test_construct_requests(task_class, limit):
    task = task_class()
    arr = list(islice(task.test_docs(), limit)) if limit else list(task.test_docs())
    requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
    assert all(isinstance(req, list) for req in requests)
    if limit:
        assert len(requests) == limit
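# For a multiple-choice task such as arc_easy, each entry in `requests` is
# expected to hold one request per answer choice (scoring the loglikelihood
# of each candidate continuation); only the list structure is asserted here
# since the request type is task-dependent.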


def test_create_choices(task_class):
    task = task_class()
    arr = list(islice(task.test_docs(), 1))
    choices = task.create_choices(arr[0])
    assert choices is not None
    # TODO: check that the number of choices matches the doc
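    # For arc_easy under the HF ai2_arc schema one would expect something
    # like the following (hypothetical field names, hence not asserted):
    #   assert len(choices) == len(arr[0]["choices"]["text"])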


# @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
# def test_basic_interface(taskname, task_class):
#     print("Evaluating task", taskname)
#     task = task_class()
#
#     assert task.has_training_docs() in [True, False]
#     assert task.has_validation_docs() in [True, False]
#     assert task.has_test_docs() in [True, False]
#
#     assert isinstance(task.aggregation(), dict)
#     assert isinstance(task.higher_is_better(), dict)
#     assert task.aggregation().keys() == task.higher_is_better().keys()
#
#     for v in task.higher_is_better().values():
#         assert v in [True, False]
#
#     assert isinstance(task.VERSION, int)
#
#     # test deterministic docs
#     # (don't test train because it's slow)
#
#     task2 = task_class()
#
#     limit = None
#
#     if taskname in ["triviaqa"] or taskname.startswith("pile_"):
#         limit = 10000
#     if task.has_validation_docs():
#         arr = list(islice(task.validation_docs(), limit))
#         arr2 = list(islice(task2.validation_docs(), limit))
#
#         assert arr == arr2
#
#         reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
#         reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
#
#         assert reqs == reqs2
#
#     if task.has_test_docs():
#         arr = list(islice(task.test_docs(), limit))
#         arr2 = list(islice(task2.test_docs(), limit))
#
#         assert arr == arr2
#
#         reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
#         reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
#
#         assert reqs == reqs2
#
#     if task.has_training_docs():
#         arr = list(islice(task.training_docs(), limit))
#         arr2 = list(islice(task2.training_docs(), limit))
#
#         assert arr == arr2
#
#         reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
#         reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
#
#         assert reqs == reqs2
#
#
# @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
# def test_documents_and_requests(taskname, task_class):
#     print("Evaluating task", taskname)
#     task = task_class()
#     fns = []
#     if task.has_training_docs():
#         fns.append(task.training_docs)
#     if task.has_validation_docs():
#         fns.append(task.validation_docs)
#     # test doc might not have labels
#     # if task.has_test_docs(): fns.append(task.test_docs)
#
#     for fn in fns:
#         # print(list(islice(fn(), 10)))
#         for doc in islice(fn(), 10):
#
#             txt = task.doc_to_text(doc)
#             tgt = task.doc_to_target(doc)
#
#             assert isinstance(txt, str)
#             assert isinstance(tgt, str)
#
#             # space convention
#             # allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
#             if len(txt) != 0:
#                 assert txt[-1] != " "
#                 assert tgt[0] == " " or txt[-1] == "\n"
#
#             reqs = task.construct_requests(doc, txt)
#
#             # construct_requests can return just one request
#             if not isinstance(reqs, (list, tuple)):
#                 reqs = [reqs]
#
#             # todo: mock lm after refactoring evaluator.py to not be a mess
#             # for req in reqs:
#             #     assert isinstance(req, base.Request)