import pytest
import unittest.mock as mock
import lm_eval.models as models


def test_gpt2():
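    """Smoke test for the GPT-2 model wrapper: checks loglikelihood scoring,
    is-greedy flags, and greedy generation with stop sequences, and compares
    the returned loglikelihoods against stored reference values."""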
    gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu")
    (
        (ll_dog, ig_dog),
        (ll_cat, ig_cat),
        (_, ll_max_0),
        (_, ll_max_1),
        (_, ll_max_2),
        *vals,
    ) = gpt2.loglikelihood(
        [
            ("The quick brown fox jumps over the lazy", " dog"),
            ("The quick brown fox jumps over the lazy", " cat"),
            ("The quick brown fox jumps over the lazy", ", lazy dog"),
            ("The quick brown fox jumps over the lazy", ", lazy fox"),
            (
                "The quick brown fox jumps over the lazy",
                ", lazy fox and they both fall to the ground",
            ),
            (
                """A mult""",
                """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
            ),
            (
                """The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""",
                """ (with threshold activation); see § Terminology""",
            ),
            (
                """Multilayer perceptrons are sometimes coll""",
                """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]""",
            ),
            (
                """An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""",
                """ activation function.""",
            ),
            (
                """MLP utilizes a supervised""",
                """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]""",
            ),
            (
                """Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""",
                """ in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """,
            ),
            (
                """Specifically, we train GPT-3, an autoregressive language model with 175""",
                """ billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.""",
            ),
            (
                """A mult""",
                """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
            ),
            ("""Hello""", """ World"""),
        ]
    )

    # " dog" should be more likely than " cat", and " cat" should not be the greedy continuation
    assert ll_dog > ll_cat
    assert not ig_cat

    # the second element of each result is the is-greedy flag: ", lazy dog" is not
    # the greedy continuation, while both ", lazy fox" continuations are
    assert not ll_max_0
    assert ll_max_1
    assert ll_max_2

    # test empty context
    gpt2.loglikelihood([("", "test")])

    # greedy generation with "." and "\n" as stop sequences
    (gen,) = gpt2.greedy_until(
        [("The quick brown fox jumps over the lazy", [".", "\n"])]
    )

    assert gen == ", lazy fox and they both fall to the ground"

    # reference loglikelihoods for the remaining requests
    targets = [
        -61.60536193847656,
        -56.57843780517578,
        -62.131004333496094,
        -9.799489974975586,
        -153.96334838867188,
        -341.222900390625,
        -731.1475830078125,
        -61.60536193847656,
        -8.682319641113281,
    ]

    for (pred, _), tgt in zip(vals, targets):
        assert pred == pytest.approx(tgt, rel=1e-3)


def test_gpt2_perplexity():
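    """Check GPT-2 rolling loglikelihood (the sum of per-token logprobs) against
    stored reference values, both with the default context window and with
    max_length mocked to 5 so the text must be scored across multiple windows."""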
    gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu")
    test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
    perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
    tgt = sum(
        [
            -4.9599953,
            -8.069298,
            -8.308624,
            -10.178513,
            -8.906924,
            -1.9318912,
            -7.745445,
            -7.146077,
            -5.2072,
            -3.5882986,
            -1.9957212,
            -8.044922,
            -0.20841774,
            -5.1096807,
            -0.099879116,
            -8.888423,
            -4.6180487,
        ]
    )
    assert perplexity == pytest.approx(tgt, rel=1e-3)

    # shrink max_length so the text no longer fits in a single context window
    with mock.patch.object(
        models.gpt2.HFLM, "max_length", new_callable=mock.PropertyMock
    ) as mock_max_length:
        mock_max_length.return_value = 5
        gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu")
        perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
    tgt = sum(
        [
            -4.96001,
            -8.069275,
            -8.308612,
            -10.178482,
            -8.90691,
            -4.037338,
            -8.09261,
            -11.662385,
            -10.206891,
            -4.425003,
            -2.2563353,
            -7.909143,
            -1.9304147,
            -7.3610134,
            -2.3120654,
            -7.3229,
            -2.1643813,
        ]
    )
    assert perplexity == pytest.approx(tgt, rel=1e-3)