test_evaluator.py

import os
import lm_eval.base as base
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import random
import pytest


# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces

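# run every registered task end-to-end against a deterministic dummy model,
# exercising request construction, metric aggregation, and the response cache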
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, task_class):
    task_dict = tasks.get_task_dict([taskname])

    # remove any stale cache so each run starts fresh; os.remove is more
    # portable than shelling out to `rm`
    if os.path.exists("test_cache.db"):
        os.remove("test_cache.db")
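    # wrap the dummy model so responses are cached on disk in test_cache.db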
    lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")

    def ll_fn(reqs):
        for ctx, cont in reqs:
            if len(ctx) == 0:
                continue
            # space convention: contexts never end with a space; continuations
            # start with one unless the context ends with a newline
            assert ctx[-1] != " "
            assert cont[0] == " " or ctx[-1] == "\n"

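        # one (log-likelihood, is_greedy) pair per request; the fixed seed
        # keeps the fake values reproducible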
        res = []

        random.seed(42)
        for _ in reqs:
            res.append((-random.random(), False))

        return res

    def ll_perp_fn(reqs):
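        # rolling-loglikelihood requests are 1-tuples holding a full document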
        for (string,) in reqs:
            assert isinstance(string, str)

        res = []
        random.seed(42)
        for _ in reqs:
            res.append(-random.random())

        return res

    # route evaluator requests to the deterministic stubs defined above
    lm.loglikelihood = ll_fn
    lm.loglikelihood_rolling = ll_perp_fn

    limit = 10
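    # evaluate twice with identical arguments; deterministic stub responses
    # mean the two result dicts must match exactly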
    e1 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
        description_dict=None,
    )
    e2 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
        description_dict=None,
    )

    # check that caching is working
    assert e1 == e2