test_evaluator.py
import os
import lm_eval.base as base
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import random
import pytest


# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces

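# run the full evaluation pipeline once per registered task, with every model
# call replaced by a deterministic stub, and check that a repeated run backed
# by the on-disk cache reproduces the first run's results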
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, Task):
    task_dict = tasks.get_task_dict([taskname])

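    # start from a fresh cache file and wrap the dummy model in a CachingLM
    # backed by test_cache.db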
    if os.path.exists("test_cache.db"):
        os.remove("test_cache.db")
    lm = base.CachingLM(models.get_model('dummy')(), "test_cache.db")

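    # stub loglikelihood: validate the whitespace convention on each
    # (context, continuation) request and return deterministic pseudo-random
    # (logprob, is_greedy) pairs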
    def ll_fn(reqs):
        for ctx, cont in reqs:
            # empty contexts are exempt from the whitespace convention below
            if len(ctx) == 0:
                continue
            # space convention: the context must not end with a space; any
            # leading space belongs to the continuation instead, unless the
            # context ends with a newline
            assert ctx[-1] != ' '
            assert cont[0] == ' ' or ctx[-1] == '\n'
        
        res = []
        
        random.seed(42)
        for _ in reqs:
            res.append((-random.random(), False))

        return res

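    # stub loglikelihood_rolling: each request is a 1-tuple holding a document
    # string; return one deterministic pseudo-random log-likelihood per request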
    def ll_perp_fn(reqs):
        for string, in reqs:
            assert isinstance(string, str)

        res = []
        random.seed(42)
        for _ in reqs:
            res.append(-random.random())

        return res

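    # replace the request methods with the deterministic stubs so both runs
    # are reproducible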
    lm.loglikelihood = ll_fn
    lm.loglikelihood_rolling = ll_perp_fn

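    # evaluate the same tasks twice with a small per-task limit (and few
    # bootstrap iterations, to keep stderr estimation cheap); the two runs
    # should produce identical results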
    limit = 10
    e1 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
    e2 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)

    # check that caching is working: the second run must reproduce the first exactly
    assert e1 == e2