import csv
import os
import time

import click
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer


@click.command()
@click.argument("datadir", required=True)
def main(datadir):
    model_runner = ModelRunner.create()

    with open(
        os.path.join(datadir, "cloze_test_test__spring2016 - cloze_test_ALL_test.csv")
    ) as f:
        storycloze_test_examples = list(csv.DictReader(f))

    start_time = time.time()
    example_evaluations = evaluate_examples(model_runner, storycloze_test_examples)
    end_time = time.time()
    print(
        f"Total time for {len(storycloze_test_examples)} examples: {end_time - start_time}"
    )

    fraction_correct = len(
        [
            evaluation
            for evaluation in example_evaluations
            if evaluation["was_model_correct"]
        ]
    ) / float(len(example_evaluations))
    print(f"Fraction correct: {fraction_correct}")


def evaluate_examples(model_runner, examples):
    # Each prompt is the four context sentences of a story, joined with spaces.
    prompts = [
        "{} {} {} {}".format(
            example["InputSentence1"],
            example["InputSentence2"],
            example["InputSentence3"],
            example["InputSentence4"],
        )
        for example in examples
    ]
    inputs_for_sentence_1 = [
        prompt + " " + example["RandomFifthSentenceQuiz1"]
        for prompt, example in zip(prompts, examples)
    ]
    inputs_for_sentence_2 = [
        prompt + " " + example["RandomFifthSentenceQuiz2"]
        for prompt, example in zip(prompts, examples)
    ]

    average_token_logits_with_sentence_1 = (
        model_runner.compute_average_token_logits_on_batch(inputs_for_sentence_1)
    )
    average_token_logits_with_sentence_2 = (
        model_runner.compute_average_token_logits_on_batch(inputs_for_sentence_2)
    )

    # The model's answer is whichever candidate ending got the higher average token logit.
    evaluation_results = []
    for i in range(len(examples)):
        if (
            average_token_logits_with_sentence_1[i]
            > average_token_logits_with_sentence_2[i]
        ):
            model_answer = examples[i]["RandomFifthSentenceQuiz1"]
            model_answer_code = "1"
        else:
            model_answer = examples[i]["RandomFifthSentenceQuiz2"]
            model_answer_code = "2"
        evaluation_results.append(
            {
                "model_answer": model_answer,
                "was_model_correct": model_answer_code
                == examples[i]["AnswerRightEnding"],
            }
        )
    return evaluation_results


class ModelRunner:
    def __init__(self):
        self.inference_requests = []
        self.num_inferences = 0
        self.model = None
        self.tokenizer = None

    @classmethod
    def create(cls):
        model_runner = cls()
        model_runner.model = AutoModelForCausalLM.from_pretrained(
            # GPT-2 small (117M parameters)
            pretrained_model_name_or_path="gpt2",
            config=AutoConfig.from_pretrained(
                "gpt2",
                # 50256 is the <|endoftext|> token id
                pad_token_id=50256,
            ),
        ).to("cuda")
        model_runner.model = model_runner.model.eval()
        model_runner.tokenizer = AutoTokenizer.from_pretrained("gpt2")
        model_runner.tokenizer.pad_token = "<|endoftext|>"

        # Sanity-check the model with a short greedy generation.
        prompt = "The quick brown fox jumps over"
        encoded_prompt = model_runner.tokenizer.encode(
            prompt, add_special_tokens=False, return_tensors="pt"
        ).to("cuda")
        [output_token_ids] = model_runner.model.generate(
            input_ids=encoded_prompt,
            max_length=100,
            # Greedy decoding; no temperature is needed when sampling is disabled.
            do_sample=False,
            num_return_sequences=1,
        )
        decoded_output = model_runner.tokenizer.decode(output_token_ids.tolist())
        # Next word should be "the" ("The quick brown fox jumps over *the*...")
        assert decoded_output[len(prompt + " ") :].startswith("the")
        return model_runner

    def compute_average_token_logits_on_batch(self, input_texts):
        """
        For each input text in the batch, compute the average logit (unnormalized
        log-probability) over all of its tokens.

        For example, if an input sequence is 3 tokens long and the token logits are
        [-1, -2, -3], the "average token logit" is -2.
        """
        # The caller can pass as large a batch of input_texts as it wants. But to
        # prevent the GPU from running out of memory, we subdivide the overall batch
        # into "GPU batches", whose size depends on the model and hardware. For
        # GPT-2 (117M), a GPU can process roughly 10 inputs at a time before the
        # inference latency starts to increase.
        gpu_batch_size = 20
        average_token_logits = []
        for i in range(0, len(input_texts), gpu_batch_size):
            average_token_logits.extend(
                self._average_token_logits_on_gpu_batch(
                    input_texts[i : i + gpu_batch_size]
                )
            )
        return average_token_logits

    def _average_token_logits_on_gpu_batch(self, input_texts):
        tokenized_inputs = self.tokenizer(
            input_texts,
            add_special_tokens=False,
            return_tensors="pt",
            padding="longest",
        )[
            # https://github.com/huggingface/transformers/issues/5480#issuecomment-653259416
            "input_ids"
        ].to("cuda")

        start_time = time.time()
        with torch.no_grad():
            output_logits = self.model(tokenized_inputs).logits
        self.num_inferences += 1

        # Align the output logits to the input tokens.
        logits_for_input_positions = output_logits[
            # The batch dimension
            :,
            # The position dimension. The last logit is dropped because it predicts
            # the "next token" and doesn't correspond to any input token.
            :-1,
            # The vocabulary dimension
            :,
        ]
        input_tokens_at_positions_with_logits = tokenized_inputs[
            # The batch dimension
            :,
            # The position dimension. The model does not predict the first input
            # token, so the first token is dropped.
            1:,
        ]

        # At each position, the model outputs one logit per vocabulary entry (~50k
        # for GPT-2). To get the logits of the tokens that were actually provided,
        # we select the right logit at each position.
        logits_for_provided_tokens = torch.gather(
            logits_for_input_positions,
            2,
            input_tokens_at_positions_with_logits.unsqueeze(2),
        ).squeeze(2)

        # Padding positions (token id 50256, <|endoftext|>) are excluded from the average.
        mask_for_non_padded_positions = input_tokens_at_positions_with_logits != 50256

        average_token_logits = (
            logits_for_provided_tokens * mask_for_non_padded_positions
        ).sum(1) / mask_for_non_padded_positions.sum(1)
        average_token_logits = average_token_logits.tolist()

        end_time = time.time()
        print(
            f"Time to evaluate once (inference #{self.num_inferences}): {end_time - start_time}"
        )

        return average_token_logits


if __name__ == "__main__":
    main()
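

# Usage sketch (the filename and the example sentences below are illustrative
# assumptions, not part of the script or the dataset). If this module is saved
# as evaluate_storycloze.py and the Story Cloze Spring 2016 test CSV lives in
# ./data, the evaluation can be run with:
#
#   python evaluate_storycloze.py ./data
#
# ModelRunner can also be used on its own to score arbitrary candidate endings:
#
#   runner = ModelRunner.create()
#   scores = runner.compute_average_token_logits_on_batch(
#       ["She smiled. She was happy.", "She smiled. She was furious."]
#   )
#   # `scores` holds one float per input; the higher average token logit marks
#   # the continuation the model considers more plausible.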