"...targets/git@developer.sourcefind.cn:gaoqiong/migraphx.git" did not exist on "d1a1a28b581678267088acbe2eb9dbdb0f4e9a92"
Commit 599045ba authored by uyhcire

Batch model inputs to speed things up

parent 622f17ce
import csv
import os
import time

import click
import torch

@@ -9,46 +10,19 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
@click.command()
@click.argument("datadir", required=True)
def main(datadir):
    model_runner = ModelRunner.create()

    with open(
        os.path.join(datadir, "cloze_test_test__spring2016 - cloze_test_ALL_test.csv")
    ) as f:
        storycloze_test_examples = list(csv.DictReader(f))

    start_time = time.time()
    example_evaluations = evaluate_examples(model_runner, storycloze_test_examples)
    end_time = time.time()
    print(
        f"Total time for {len(storycloze_test_examples)} examples: {end_time - start_time}"
    )

    fraction_correct = len(
        [
            evaluation

@@ -59,58 +33,174 @@ def main(datadir):
    print(f"Fraction correct: {fraction_correct}")


def evaluate_examples(model_runner, examples):
    prompts = [
        "{} {} {} {}".format(
            example["InputSentence1"],
            example["InputSentence2"],
            example["InputSentence3"],
            example["InputSentence4"],
        )
        for example in examples
    ]

    inputs_for_sentence_1 = [
        prompt + " " + example["RandomFifthSentenceQuiz1"]
        for prompt, example in zip(prompts, examples)
    ]
    inputs_for_sentence_2 = [
        prompt + " " + example["RandomFifthSentenceQuiz2"]
        for prompt, example in zip(prompts, examples)
    ]

    # Calculate *per-token* likelihoods, as the paper did
    average_token_logits_with_sentence_1 = (
        model_runner.compute_average_token_logits_on_batch(inputs_for_sentence_1)
    )
    average_token_logits_with_sentence_2 = (
        model_runner.compute_average_token_logits_on_batch(inputs_for_sentence_2)
    )

    evaluation_results = []
    for i in range(len(examples)):
        if (
            average_token_logits_with_sentence_1[i]
            > average_token_logits_with_sentence_2[i]
        ):
            model_answer = examples[i]["RandomFifthSentenceQuiz1"]
            model_answer_code = "1"
        else:
            model_answer = examples[i]["RandomFifthSentenceQuiz2"]
            model_answer_code = "2"
        evaluation_results.append(
            {
                "model_answer": model_answer,
                "was_model_correct": model_answer_code
                == examples[i]["AnswerRightEnding"],
            }
        )
    return evaluation_results


class ModelRunner:
    def __init__(self):
        self.inference_requests = []
        self.num_inferences = 0

        self.model = None
        self.tokenizer = None

    @classmethod
    def create(cls):
        model_runner = cls()

        model_runner.model = AutoModelForCausalLM.from_pretrained(
            # 117M
            pretrained_model_name_or_path="gpt2",
            config=AutoConfig.from_pretrained(
                "gpt2",
                # <|endoftext|>
                pad_token_id=50256,
            ),
        ).to("cuda")
        model_runner.model = model_runner.model.eval()

        model_runner.tokenizer = AutoTokenizer.from_pretrained("gpt2")
        model_runner.tokenizer.pad_token = "<|endoftext|>"

        prompt = "The quick brown fox jumps over"
        encoded_prompt = model_runner.tokenizer.encode(
            prompt, add_special_tokens=False, return_tensors="pt"
        ).to("cuda")

        # Sanity check the model
        [output_token_ids] = model_runner.model.generate(
            input_ids=encoded_prompt,
            max_length=100,
            temperature=0,
            do_sample=False,
            num_return_sequences=1,
        )
        decoded_output = model_runner.tokenizer.decode(output_token_ids.tolist())
        # Next word should be "the" ("The quick brown fox jumps over *the*...")
        assert decoded_output[len(prompt + " ") :].startswith("the")

        return model_runner

    def compute_average_token_logits_on_batch(self, input_texts):
        """
        For each input text in the batch, compute the average logit (an unnormalized log-likelihood) over all tokens.
        For example, if an input sequence is 3 tokens long, and the token logits are [-1, -2, -3], the "average token logit" is -2.
        """
        # The ModelRunner can take a big batch of input_texts, and it can be as large as the caller wants.
        # But to prevent the GPU from running out of memory, we need to subdivide the overall batch
        # into "GPU batches", and the "GPU batch size" depends on the model and hardware.
        # For GPT-2-117M, a GPU can process a batch of roughly 10 or so inputs before the inference latency starts to increase.
        gpu_batch_size = 20

        average_token_logits = []
        for i in range(0, len(input_texts), gpu_batch_size):
            average_token_logits.extend(
                self._average_token_logits_on_gpu_batch(
                    input_texts[i : i + gpu_batch_size]
                )
            )
        return average_token_logits

    def _average_token_logits_on_gpu_batch(self, input_texts):
        tokenized_inputs = self.tokenizer(
            input_texts,
            add_special_tokens=False,
            return_tensors="pt",
            # https://github.com/huggingface/transformers/issues/5480#issuecomment-653259416
            padding="longest",
        )["input_ids"].to("cuda")

        start_time = time.time()

        output_logits = self.model(tokenized_inputs).logits
        self.num_inferences += 1

        # Align the output logits to the input tokens.
        logits_for_input_positions = output_logits[
            # The batch dimension
            :,
            # The position dimension.
            # The last logit needs to be dropped, because it's predicting the "next token", and it doesn't correspond to any input token.
            :-1,
            # The vocabulary dimension (one logit for every possible token)
            :,
        ]
        input_tokens_at_positions_with_logits = tokenized_inputs[
            # The batch dimension
            :,
            # The position dimension.
            # The model does not predict the first input token, so the first token needs to be dropped.
            1:,
        ]
        # At each position, the model outputs ~50k logits, one for every possible token.
        # To get the logits of the tokens that were actually provided, we need to select the right logit at each position.
        logits_for_provided_tokens = torch.gather(
            logits_for_input_positions,
            2,
            input_tokens_at_positions_with_logits.unsqueeze(2),
        ).squeeze(2)

        # Ignore padded positions when averaging (50256 is the <|endoftext|> pad token).
        mask_for_non_padded_positions = input_tokens_at_positions_with_logits != 50256
        average_token_logits = (
            logits_for_provided_tokens * mask_for_non_padded_positions
        ).sum(1) / mask_for_non_padded_positions.sum(1)
        average_token_logits = average_token_logits.tolist()

        end_time = time.time()
        print(
            f"Time to evaluate once (inference #{self.num_inferences}): {end_time - start_time}"
        )

        return average_token_logits


if __name__ == "__main__":
    main()
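
For readers skimming the diff, here is a minimal, self-contained sketch (not part of the commit) of the gather-and-mask step in _average_token_logits_on_gpu_batch, using a made-up five-token vocabulary and random logits so the tensor shapes are easy to follow:

# Toy illustration of "align logits to tokens, gather, then mask padding".
import torch

pad_token_id = 4  # hypothetical pad id for the toy vocabulary
# Batch of 2 sequences, 4 positions each; the second sequence ends with one pad.
input_ids = torch.tensor(
    [
        [2, 3, 1, 0],
        [1, 2, 0, pad_token_id],
    ]
)
# Pretend model output: one logit per vocabulary entry at every position.
logits = torch.randn(2, 4, 5)

# Drop the last position's logits (they predict a token we never see) ...
logits_for_input_positions = logits[:, :-1, :]
# ... and drop the first input token (nothing predicts it).
targets = input_ids[:, 1:]

# Pick out, at each position, the logit of the token that actually appeared.
token_logits = torch.gather(
    logits_for_input_positions, 2, targets.unsqueeze(2)
).squeeze(2)

# Average only over non-padded positions.
mask = (targets != pad_token_id).float()
average_token_logits = (token_logits * mask).sum(1) / mask.sum(1)
print(average_token_logits.shape)  # torch.Size([2])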
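
A rough, optional way to sanity-check the speedup this commit is after (not part of the change itself). It assumes a CUDA device, downloaded gpt2 weights, and that the script above is importable as evaluate_storycloze, which is a hypothetical module name; absolute timings will vary with hardware:

import time

# Hypothetical module name for the script in this commit.
from evaluate_storycloze import ModelRunner

model_runner = ModelRunner.create()
texts = ["The quick brown fox jumps over the lazy dog."] * 40

start = time.time()
for text in texts:
    # One forward pass per input, as the pre-commit code effectively did.
    model_runner.compute_average_token_logits_on_batch([text])
unbatched_seconds = time.time() - start

start = time.time()
# One call over the whole batch, split into GPU-sized chunks internally.
model_runner.compute_average_token_logits_on_batch(texts)
batched_seconds = time.time() - start

print(f"unbatched: {unbatched_seconds:.2f}s, batched: {batched_seconds:.2f}s")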