test_evaluator.py 1.7 KB
Newer Older
1
import os
haileyschoelkopf's avatar
haileyschoelkopf committed
2
3
4

# import lm_eval.base as base
import lm_eval.api.registry as registry
Leo Gao's avatar
Leo Gao committed
5
import lm_eval.tasks as tasks
haileyschoelkopf's avatar
haileyschoelkopf committed
6
7
8

# import lm_eval.models as models

Leo Gao's avatar
Leo Gao committed
9
import lm_eval.evaluator as evaluator
10
import random
Leo Gao's avatar
Leo Gao committed
11
12
13
14
15
16
import pytest


# TODO: more fine-grained unit tests rather than this big honking integration
# test, once we break evaluator into smaller, more manageable pieces

Fabrizio Milo's avatar
Fabrizio Milo committed
17

18
19
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, task_class):
    """Evaluate one registered task twice and require equal results.

    The "dummy" model's ``loglikelihood`` and ``loglikelihood_rolling``
    methods are replaced with stubs that sanity-check the incoming requests
    and return pseudo-random scores drawn from a fixed seed.

    Args:
        taskname: Key from ``tasks.TASK_REGISTRY``; passed to
            ``tasks.get_task_dict`` to build the task dict under test.
        task_class: Registry value paired with ``taskname``. Unused in the
            body; present because the parametrization iterates ``.items()``.
    """
    task_dict = tasks.get_task_dict([taskname])

    # TODO: re-add cachingLM
    # os.system("rm test_cache.db")
    # lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
    lm = registry.get_model("dummy")()

    def ll_fn(reqs):
        """Check each (context, continuation) pair; return seeded fake scores."""
        for ctx, cont in reqs:
            if not ctx:
                # Nothing to check against when the context is empty.
                continue
            # space convention: the context never ends with a space, and the
            # continuation starts with one unless the context ends in a newline
            assert ctx[-1] != " "
            assert cont[0] == " " or ctx[-1] == "\n"

        # Reseeding on every call makes the returned scores depend only on
        # len(reqs). Note this resets the module-level RNG shared with any
        # other user of ``random``.
        random.seed(42)
        return [(-random.random(), False) for _ in reqs]

    def ll_perp_fn(reqs):
        """Check each request is a 1-tuple holding a str; return fake scores."""
        for (string,) in reqs:
            assert isinstance(string, str)

        random.seed(42)
        return [-random.random() for _ in reqs]

    lm.loglikelihood = ll_fn
    lm.loglikelihood_rolling = ll_perp_fn

    limit = 10

    def run_eval():
        """Run the evaluator with the fixed settings shared by both passes."""
        return evaluator.evaluate(
            lm=lm,
            task_dict=task_dict,
            num_fewshot=0,
            limit=limit,
            bootstrap_iters=10,
        )

    e1 = run_eval()
    e2 = run_eval()

    # The caching wrapper is disabled above (see TODO), so this currently
    # checks that two identical runs produce equal results.
    assert e1 == e2