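"""Zero-shot evaluation of GPT-2-large on the StoryCloze (Spring 2016) test set.

For each story, the four context sentences are paired with each of the two
candidate endings, and the ending with the higher average per-token
log-likelihood is taken as the model's answer.

Example invocation (assuming the StoryCloze test CSV lives in ./data):

    python main.py ./data
"""
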
import csv
import os
import time

import click
import torch
import torch.nn.functional as F
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer


@click.command()
@click.argument("datadir", required=True)
def main(datadir):
    model_runner = ModelRunner.create()

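    # Each CSV row is one StoryCloze example: InputSentence1-4 hold the
    # four-sentence context, RandomFifthSentenceQuiz1/2 hold the two candidate
    # endings, and AnswerRightEnding ("1" or "2") marks the correct one.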
    with open(
        os.path.join(datadir, "cloze_test_test__spring2016 - cloze_test_ALL_test.csv")
    ) as f:
        storycloze_test_examples = list(csv.DictReader(f))

    start_time = time.time()
    example_evaluations = evaluate_examples(model_runner, storycloze_test_examples)
    end_time = time.time()
    print(
        f"Total time for {len(storycloze_test_examples)} examples: {end_time - start_time}"
    )
    fraction_correct = len(
        [
            evaluation
            for evaluation in example_evaluations
            if evaluation["was_model_correct"]
        ]
    ) / float(len(example_evaluations))
    print(f"Fraction correct: {fraction_correct}")


def evaluate_examples(model_runner, examples):
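    """
    Score both candidate endings for every example and pick the likelier one.

    Returns one dict per example with the chosen ending ("model_answer") and
    whether it matched the labeled ending ("was_model_correct").
    """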
    prompts = [
        "{} {} {} {}".format(
            example["InputSentence1"],
            example["InputSentence2"],
            example["InputSentence3"],
            example["InputSentence4"],
        )
        for example in examples
    ]

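    # Each candidate continuation is the four-sentence context followed by one
    # of the two candidate fifth sentences.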
    inputs_for_sentence_1 = [
        prompt + " " + example["RandomFifthSentenceQuiz1"]
        for prompt, example in zip(prompts, examples)
    ]
    inputs_for_sentence_2 = [
        prompt + " " + example["RandomFifthSentenceQuiz2"]
        for prompt, example in zip(prompts, examples)
    ]

    average_token_loglikelihoods_with_sentence_1 = (
        model_runner.compute_average_token_loglikelihoods_on_batch(inputs_for_sentence_1)
    )
    average_token_loglikelihoods_with_sentence_2 = (
        model_runner.compute_average_token_loglikelihoods_on_batch(inputs_for_sentence_2)
    )

    evaluation_results = []
    for i in range(len(examples)):
        if (
            average_token_loglikelihoods_with_sentence_1[i]
            > average_token_loglikelihoods_with_sentence_2[i]
        ):
            model_answer = examples[i]["RandomFifthSentenceQuiz1"]
            model_answer_code = "1"
        else:
            model_answer = examples[i]["RandomFifthSentenceQuiz2"]
            model_answer_code = "2"

        evaluation_results.append(
            {
                "model_answer": model_answer,
                "was_model_correct": model_answer_code
                == examples[i]["AnswerRightEnding"],
            }
        )
    return evaluation_results


class ModelRunner:
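    """Wraps a GPT-2-large model and tokenizer for batched log-likelihood scoring."""
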
    def __init__(self):
        self.inference_requests = []
        self.num_inferences = 0

        self.model = None
        self.tokenizer = None

    @classmethod
    def create(cls):
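        """Load gpt2-large and its tokenizer, then run a generation sanity check."""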
        model_runner = cls()

        model_runner.model = AutoModelForCausalLM.from_pretrained(
            # 774M parameters
            pretrained_model_name_or_path="gpt2-large",
            config=AutoConfig.from_pretrained(
                "gpt2-large",
                # <|endoftext|>
                pad_token_id=50256,
            ),
        ).to("cuda")
        model_runner.model = model_runner.model.eval()
        model_runner.tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
        model_runner.tokenizer.pad_token = "<|endoftext|>"

        prompt = "The quick brown fox jumps over"
        encoded_prompt = model_runner.tokenizer.encode(
            prompt, add_special_tokens=False, return_tensors="pt"
        ).to("cuda")

        # Sanity check the model
        [output_token_ids] = model_runner.model.generate(
            input_ids=encoded_prompt,
            max_length=100,
            temperature=0,
            do_sample=False,
            num_return_sequences=1,
        )
        decoded_output = model_runner.tokenizer.decode(output_token_ids.tolist())
        # Next word should be "the" ("The quick brown fox jumps over *the*...")
        assert decoded_output[len(prompt + " ") :].startswith("the")

        return model_runner

    def compute_average_token_loglikelihoods_on_batch(self, input_texts):
        """
        For each input text in the batch, compute the average log-likelihood over all tokens.

        For example, if an input sequence is 3 tokens long, and the token loglikelihoods are [-1, -2, -3], the "average token loglikelihood" is -2.
        """
        # The ModelRunner can take a big batch of input_texts, and it can be as large as the caller wants.
        # But to prevent the GPU from running out of memory, we need to subdivide the overall batch
        # into "GPU batches", and the "GPU batch size" depends on the model and hardware.
        # For GPT-2-117M, a GPU can process a batch of roughly 10 or so inputs before the inference latency starts to increase.
        gpu_batch_size = 20

        average_token_loglikelihoods = []
        for i in range(0, len(input_texts), gpu_batch_size):
            average_token_loglikelihoods.extend(
                self._average_token_loglikelihoods_on_gpu_batch(
                    input_texts[i : i + gpu_batch_size]
                )
            )
        return average_token_loglikelihoods

    def _average_token_loglikelihoods_on_gpu_batch(self, input_texts):
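        """
        Run one forward pass over a single GPU-sized batch of texts.

        Returns one average token log-likelihood (a float) per input text.
        """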
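        # padding="longest" pads the shorter texts in the batch with the
        # <|endoftext|> pad token; those positions are masked out of the
        # average further down.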
        tokenized_inputs = self.tokenizer(
            input_texts,
            add_special_tokens=False,
            return_tensors="pt",
            padding="longest",
        )[
            # https://github.com/huggingface/transformers/issues/5480#issuecomment-653259416
            "input_ids"
        ].to(
            "cuda"
        )

        start_time = time.time()
        output_logits = self.model(tokenized_inputs).logits
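        # output_logits has shape (batch_size, sequence_length, vocab_size):
        # one vector of next-token logits per input position.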
        self.num_inferences += 1

        # Normalize probabilities - at each position, the token likelihoods should add up to 1
        output_loglikelihoods = F.log_softmax(
            output_logits,
            # The vocabulary dimension
            dim=-1,
        )

        # Align the output loglikelihoods to the input tokens.
        loglikelihoods_for_input_positions = output_loglikelihoods[
            # The batch dimension
            :,
            # The position dimension
            # The last loglikelihood needs to be dropped, because it's predicting the "next token", and it doesn't correspond to any input token
            :-1,
            # The vocabulary dimension
            :,
        ]
        input_tokens_at_positions_with_loglikelihoods = tokenized_inputs[
            # The batch dimension
            :,
            # The position dimension
            # The model does not predict the first input token, so the first token needs to be dropped.
            1:,
        ]
        # At each position, the model outputs ~50k loglikelihoods, one for every possible token.
        # To get the loglikelihoods of the tokens that were actually provided, we need to select the right loglikelihood at each position.
        loglikelihoods_for_provided_tokens = torch.gather(
            loglikelihoods_for_input_positions,
            2,
            input_tokens_at_positions_with_loglikelihoods.unsqueeze(2),
        ).squeeze(2)
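        # After the gather + squeeze, the shape is (batch_size, sequence_length - 1):
        # one log-likelihood per provided (non-first) input token.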

        # Exclude padded positions (pad token 50256, <|endoftext|>) from the average.
        mask_for_non_padded_positions = (
            input_tokens_at_positions_with_loglikelihoods != 50256
        )
        average_token_loglikelihoods = (
            loglikelihoods_for_provided_tokens * mask_for_non_padded_positions
        ).sum(1) / mask_for_non_padded_positions.sum(1)
        average_token_loglikelihoods = average_token_loglikelihoods.tolist()

        end_time = time.time()
        print(
            f"Time to evaluate once (inference #{self.num_inferences}): {end_time - start_time}"
        )
        return average_token_loglikelihoods


if __name__ == "__main__":
    main()