"vscode:/vscode.git/clone" did not exist on "372ac40a37c7d0f8e5b8cc7ed5c502109ef6ce53"
Unverified Commit 6803e647 authored by Leo Gao, committed by GitHub

Merge pull request #79 from EleutherAI/bmk_refactor

Bmk refactor
parents 2e1b05d2 041ea8a7
@@ -7,8 +7,6 @@ The goal of this project is to build a set of tools for evaluating LMs on typica
 2. Removing task val/test data from LM training set
 3. Adding task training data to LM training set
-
-The raw Google doc can be found here: https://docs.google.com/document/d/177dwJpH8GHebISXYZSn4NL98sXdCtQMH82b7O5F7jmw/edit?usp=sharing
 
 ## Usage
 
 ### Evaluate a task
@@ -99,6 +97,3 @@ With the data downloader in place, we simply need to (1) expose the val/test exa
 ### 3. Adding task training data to LM training set
 This part is the easiest. I guess we just write out some text files containing the training data? We can let the usual LM preprocessing pipeline handle it from there.
-
-## Summary (need to convert from google docs at some point):
-https://docs.google.com/document/d/177dwJpH8GHebISXYZSn4NL98sXdCtQMH82b7O5F7jmw/edit?usp=sharing
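The "write out some text files" idea in the hunk above can be sketched in a few lines. This is only an illustration and not part of this commit; `dump_task_training_data` and the output path are made up, while `training_docs`, `doc_to_text`, and `doc_to_target` are the `Dataset` hooks introduced later in this diff.

# Hypothetical sketch: dump a task's training docs as plain text so the usual
# LM preprocessing pipeline can pick the file up from there.
def dump_task_training_data(task, out_path="task_train.txt"):
    with open(out_path, "w") as f:
        for doc in task.training_docs():
            # same doc rendering that fewshot_context uses for labeled examples
            f.write(task.doc_to_text(doc) + task.doc_to_target(doc) + "\n\n")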
import csv
import os
import time
import click
import torch
import torch.nn.functional as F
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
@click.command()
@click.argument("datadir", required=True)
def main(datadir):
model_runner = ModelRunner.create()
with open(
os.path.join(datadir, "cloze_test_test__spring2016 - cloze_test_ALL_test.csv")
) as f:
storycloze_test_examples = list(csv.DictReader(f))
start_time = time.time()
example_evaluations = evaluate_examples(model_runner, storycloze_test_examples)
end_time = time.time()
print(
f"Total time for {len(storycloze_test_examples)} examples: {end_time - start_time}"
)
fraction_correct = len(
[
evaluation
for evaluation in example_evaluations
if evaluation["was_model_correct"]
]
) / float(len(example_evaluations))
print(f"Fraction correct: {fraction_correct}")
def evaluate_examples(model_runner, examples):
prompts = [
"{} {} {} {}".format(
example["InputSentence1"],
example["InputSentence2"],
example["InputSentence3"],
example["InputSentence4"],
)
for example in examples
]
inputs_for_sentence_1 = [
prompt + " " + example["RandomFifthSentenceQuiz1"]
for prompt, example in zip(prompts, examples)
]
inputs_for_sentence_2 = [
prompt + " " + example["RandomFifthSentenceQuiz2"]
for prompt, example in zip(prompts, examples)
]
average_token_loglikelihoods_with_sentence_1 = (
model_runner.compute_average_token_loglikelihoods_on_batch(inputs_for_sentence_1)
)
average_token_loglikelihoods_with_sentence_2 = (
model_runner.compute_average_token_loglikelihoods_on_batch(inputs_for_sentence_2)
)
evaluation_results = []
for i in range(len(examples)):
if (
average_token_loglikelihoods_with_sentence_1[i]
> average_token_loglikelihoods_with_sentence_2[i]
):
model_answer = examples[i]["RandomFifthSentenceQuiz1"]
model_answer_code = "1"
else:
model_answer = examples[i]["RandomFifthSentenceQuiz2"]
model_answer_code = "2"
evaluation_results.append(
{
"model_answer": model_answer,
"was_model_correct": model_answer_code
== examples[i]["AnswerRightEnding"],
}
)
return evaluation_results
class ModelRunner:
def __init__(self):
self.inference_requests = []
self.num_inferences = 0
self.model = None
self.tokenizer = None
@classmethod
def create(cls):
model_runner = cls()
model_runner.model = AutoModelForCausalLM.from_pretrained(
            # gpt2-large (774M parameters)
pretrained_model_name_or_path="gpt2-large",
config=AutoConfig.from_pretrained(
"gpt2-large",
# <|endoftext|>
pad_token_id=50256,
),
).to("cuda")
model_runner.model = model_runner.model.eval()
model_runner.tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
model_runner.tokenizer.pad_token = "<|endoftext|>"
prompt = "The quick brown fox jumps over"
encoded_prompt = model_runner.tokenizer.encode(
prompt, add_special_tokens=False, return_tensors="pt"
).to("cuda")
# Sanity check the model
[output_token_ids] = model_runner.model.generate(
input_ids=encoded_prompt,
max_length=100,
            temperature=0,
do_sample=False,
num_return_sequences=1,
)
decoded_output = model_runner.tokenizer.decode(output_token_ids.tolist())
# Next word should be "the" ("The quick brown fox jumps over *the*...")
assert decoded_output[len(prompt + " ") :].startswith("the")
return model_runner
def compute_average_token_loglikelihoods_on_batch(self, input_texts):
"""
For each input text in the batch, compute the average log-likelihood over all tokens.
For example, if an input sequence is 3 tokens long, and the token loglikelihoods are [-1, -2, -3], the "average token loglikelihood" is -2.
"""
        # The ModelRunner can take a big batch of input_texts, and it can be as large as the caller wants.
# But to prevent the GPU from running out of memory, we need to subdivide the overall batch
# into "GPU batches", and the "GPU batch size" depends on the model and hardware.
# For GPT-2-117M, a GPU can process a batch of roughly 10 or so inputs before the inference latency starts to increase.
gpu_batch_size = 20
average_token_loglikelihoods = []
for i in range(0, len(input_texts), gpu_batch_size):
average_token_loglikelihoods.extend(
self._average_token_loglikelihoods_on_gpu_batch(
input_texts[i : i + gpu_batch_size]
)
)
return average_token_loglikelihoods
def _average_token_loglikelihoods_on_gpu_batch(self, input_texts):
tokenized_inputs = self.tokenizer(
input_texts,
add_special_tokens=False,
return_tensors="pt",
padding="longest",
)[
# https://github.com/huggingface/transformers/issues/5480#issuecomment-653259416
"input_ids"
].to(
"cuda"
)
start_time = time.time()
output_logits = self.model(tokenized_inputs).logits
self.num_inferences += 1
# Normalize probabilities - at each position, the token likelihoods should add up to 1
output_loglikelihoods = F.log_softmax(
output_logits,
            # The vocabulary dimension
dim=-1,
)
# Align the output loglikelihoods to the input tokens.
loglikelihoods_for_input_positions = output_loglikelihoods[
# The batch dimension
:,
# The position dimension
# The last loglikelihood needs to be dropped, because it's predicting the "next token", and it doesn't correspond to any input token
:-1,
            # The vocabulary dimension
:,
]
input_tokens_at_positions_with_loglikelihoods = tokenized_inputs[
# The batch dimension
:,
# The position dimension
# The model does not predict the first input token, so the first token needs to be dropped.
1:,
]
# At each position, the model outputs ~50k loglikelihoods, one for every possible token.
# To get the loglikelihoods of the tokens that were actually provided, we need to select the right loglikelihood at each position.
loglikelihoods_for_provided_tokens = torch.gather(
loglikelihoods_for_input_positions,
2,
input_tokens_at_positions_with_loglikelihoods.unsqueeze(2),
).squeeze(2)
mask_for_non_padded_positions = input_tokens_at_positions_with_loglikelihoods != 50256
average_token_loglikelihoods = (
loglikelihoods_for_provided_tokens * mask_for_non_padded_positions
).sum(1) / mask_for_non_padded_positions.sum(1)
average_token_loglikelihoods = average_token_loglikelihoods.tolist()
end_time = time.time()
print(
f"Time to evaluate once (inference #{self.num_inferences}): {end_time - start_time}"
)
return average_token_loglikelihoods
if __name__ == "__main__":
main()
 import abc
 import random
+import collections
 
 
 class LM(abc.ABC):
     @abc.abstractmethod
-    def loglikelihood(self, context, continuation):
-        """Compute log-likelihood of generating a continuation from a context
-
-        :param context: str
-            Context string
-        :param continuation: str
-            The continuation over which log likelihood will be calculated. If
-            there is a word boundary, the space should be in the continuation.
-            For example, context="hello" continuation=" world" is correct.
-        :return: float
+    def loglikelihood(self, requests):
+        """Compute log-likelihood of generating a continuation from a context.
+        Downstream tasks should attempt to use loglikelihood instead of other
+        LM calls whenever possible.
+
+        :param requests: list
+            A list of pairs (context, continuation)
+            context: str
+                Context string
+            continuation: str
+                The continuation over which log likelihood will be calculated. If
+                there is a word boundary, the space should be in the continuation.
+                For example, context="hello" continuation=" world" is correct.
+        :return: list
+            A list of pairs (logprob, isgreedy)
+            logprob: float
+                The log probability of `continuation`
+            isgreedy:
+                Whether `continuation` would be generated by greedy sampling from `context`
+        """
+        pass
+
+    @abc.abstractmethod
+    def greedy_until(self, requests):
+        """Generate greedily until a stopping sequence
+        :param requests: list
+            A list of pairs (context, until)
+            context: str
+                Context string
+            until: str
+                The string sequence to generate until. This string sequence may
+                span across multiple tokens, or may be part of one token.
+        :return: list
+            A list of strings continuation
+            continuation: str
+                The generated continuation.
         """
         pass
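To make the new batched interface concrete, here is a hedged sketch of calling it. `GPT2LM` is the concrete implementation later in this diff; the example strings, device choice, and printed fields are illustrative only, and note that `isgreedy` is still a TODO in `GPT2LM` in this commit.

# Illustrative only: one (context, continuation) pair per candidate ending.
lm = GPT2LM(device="cuda")
requests = [
    ("The quick brown fox jumps over", " the lazy dog"),
    ("The quick brown fox jumps over", " a cliff"),
]
# loglikelihood returns one (logprob, isgreedy) pair per request, in order.
for (context, continuation), (logprob, isgreedy) in zip(requests, lm.loglikelihood(requests)):
    print(f"{continuation!r}: logprob={logprob:.2f} isgreedy={isgreedy}")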
@@ -78,22 +106,47 @@ class Dataset(abc.ABC):
         return random.sample(self._traindocs, k)
 
     @abc.abstractmethod
-    def doc_to_text(self, doc, include_target=True):
+    def doc_to_text(self, doc):
+        pass
+
+    @abc.abstractmethod
+    def doc_to_target(self, doc):
+        pass
+
+    @abc.abstractmethod
+    def construct_requests(self, ctx):
+        """Uses RequestFactory to construct Requests and returns an iterable of
+        Requests which will be sent to the LM.
+
+        :param ctx: str
+            The context string, generated by fewshot_context.
+        """
         pass
 
     @abc.abstractmethod
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        """Take iterable of docs and evaluates, returning a dict with the following format:
+    def process_results(self, doc, results):
+        """Take a single document and the LM results and evaluates, returning a
+        list of dicts, each with the following format:
         {
-            "major": float,
-            "minor": dict,
+            "submetric": str,
+            "value": float,
             "higher_is_better": bool,
+            "aggregation": ([float] -> float),
         }
-        * `major` should be a single, representative number, for programmatic comparison
-        * `minor` should be a dictionary containing all relevant sub-metrics
+        * `submetric` should be the name of the metric
+        * `value` should be the value of the metric
         * `higher_is_better` determines whether a higher metric is better
+        * `aggregation` should be a function that takes a list of floats and
+          aggregates them into one float. This should be the same for all
+          submetrics of the same name; if it differs, an error should be
+          raised.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param results:
+            The results of the requests created in construct_requests.
         """
         pass
@@ -103,8 +156,30 @@ class Dataset(abc.ABC):
     def fewshot_context(self, doc, num_fewshot, provide_description):
         raw_description = self.fewshot_description()
         description = (raw_description + "\n===\n\n") if provide_description and raw_description else ""
         labeled_examples = "\n\n".join(
-            map(self.doc_to_text, self.fewshot_examples(k=num_fewshot))
+            [self.doc_to_text(doc) + self.doc_to_target(doc) for doc in self.fewshot_examples(k=num_fewshot)]
         ) + "\n\n"
-        example = self.doc_to_text(doc, include_target=False).strip()
-        return description + labeled_examples + example
+        example = self.doc_to_text(doc).strip()
+        return description + labeled_examples + example
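For readers skimming the refactored `fewshot_context`, this is the kind of string it assembles; the sentiment task, description, and reviews below are invented purely for illustration.

# Illustrative only: what fewshot_context produces for a made-up sentiment task
# with provide_description=True and num_fewshot=2.
prompt = (
    "Classify the sentiment of each review."      # fewshot_description()
    "\n===\n\n"                                    # description separator
    "Review: Great movie. Sentiment: positive"     # doc_to_text(ex1) + doc_to_target(ex1)
    "\n\n"
    "Review: Dull and slow. Sentiment: negative"   # doc_to_text(ex2) + doc_to_target(ex2)
    "\n\n"
    "Review: I loved every minute. Sentiment:"     # doc_to_text(doc) for the query, no target
)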
def mean(arr):
return sum(arr) / len(arr)
def median(arr):
    # middle element of the sorted list (no interpolation for even-length inputs)
    return sorted(arr)[len(arr) // 2]
Request = collections.namedtuple('Request', ('type', 'args'))
class RequestFactory:
def __getattr__(self, attr):
def fn(*args):
return Request(attr, args)
return fn
rf = RequestFactory()
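Here is a hedged sketch of how a task might use these pieces together. The `ToyBooleanTask` class, its document fields, and the "acc" submetric are hypothetical; `rf`, `Request`, `mean`, and the `Dataset` hooks are the ones defined above, and only the methods relevant here are shown.

# Illustrative only. rf.<name>(*args) just records a Request namedtuple, e.g.
#   rf.loglikelihood("Q: 2+2 =", " 4") == Request(type="loglikelihood", args=("Q: 2+2 =", " 4"))
class ToyBooleanTask(Dataset):  # hypothetical; other Dataset hooks omitted
    def doc_to_text(self, doc):
        return "Question: " + doc["question"] + "\nAnswer:"

    def doc_to_target(self, doc):
        return " " + doc["answer"]  # " yes" or " no"

    def construct_requests(self, ctx):
        # one loglikelihood request per candidate answer
        return [rf.loglikelihood(ctx, " yes"), rf.loglikelihood(ctx, " no")]

    def process_results(self, doc, results):
        # results arrive in the same order as the requests above
        (ll_yes, _), (ll_no, _) = results
        prediction = " yes" if ll_yes > ll_no else " no"
        return [{
            "submetric": "acc",
            "value": float(prediction == self.doc_to_target(doc)),
            "higher_is_better": True,
            "aggregation": mean,
        }]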
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 from lm_eval.base import LM
 from . import MODEL_REGISTRY
...
@@ -3,6 +3,7 @@ import torch
 import torch.nn.functional as F
 from lm_eval.base import LM
 from lm_eval import utils
+from tqdm import tqdm
 
 
 class GPT2LM(LM):
@@ -17,14 +18,24 @@ class GPT2LM(LM):
         args = utils.simple_parse_args_string(arg_string)
         return cls(device=args.get("device", "cpu"))
 
-    def loglikelihood(self, context, continuation, truncate=True):
-        # when too long to fit in context, truncate from the left
-        context_enc = self.tokenizer.encode(context)
-        continuation_enc = self.tokenizer.encode(continuation)
-        inp = torch.tensor([(context_enc + continuation_enc)[-1024:]], dtype=torch.long).to(self.device)
-        ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024)
-        cont_toks = inp[:, ctxlen:]  # [batch, seq]
-        logits = F.log_softmax(self.gpt2(inp)[0], dim=-1)[:, ctxlen - 1:-1]  # [batch, seq, vocab]
-        return torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1)
+    def loglikelihood(self, requests):
+        res = []
+        # TODO: vectorize properly
+        for context, continuation in tqdm(requests):
+            # when too long to fit in context, truncate from the left
+            context_enc = self.tokenizer.encode(context)
+            continuation_enc = self.tokenizer.encode(continuation)
+            inp = torch.tensor([(context_enc + continuation_enc)[-1024:]], dtype=torch.long).to(self.device)
+            ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024)
+            cont_toks = inp[:, ctxlen:]  # [batch, seq]
+            logits = F.log_softmax(self.gpt2(inp)[0], dim=-1)[:, ctxlen - 1:-1]  # [batch, seq, vocab]
+            # TODO: implement isgreedy
+            # sum the per-token logprobs of the continuation; without .sum(),
+            # float() would fail on multi-token continuations
+            res.append((float(torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1).sum()), False))
+        return res
+
+    def greedy_until(self, requests):
+        # TODO: implement
+        pass
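The left-truncation arithmetic in `loglikelihood` above is compact, so here is a small self-contained check of what it does; the token counts are made up for illustration.

# With a 1024-token window, a 1020-token context plus a 10-token continuation
# overflows by 6 tokens, so 6 context tokens are dropped from the left and
# ctxlen (the number of surviving context tokens) becomes 1014.
context_len, continuation_len, window = 1020, 10, 1024
ctxlen = context_len - max(0, context_len + continuation_len - window)
assert ctxlen == 1014
# The continuation then occupies positions [ctxlen:] of the truncated input, and
# its logits are read from positions [ctxlen - 1 : -1], since each token is
# predicted from the position before it.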
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import os
 import transformers
 from lm_eval.base import LM
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 from . common import HFTask
 class ANLIBase(HFTask):
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 from . common import HFTask
 class ARCEasy(HFTask):
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import json
 import random
 from lm_eval.base import Dataset
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import numpy as np
 import json
 from scipy.stats import pearsonr, spearmanr
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import numpy as np
 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import f1_score, matthews_corrcoef
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import numpy as np
 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import f1_score, matthews_corrcoef
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 from lm_eval.base import Dataset
 from lm_eval.utils import sh
 import json
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 from . common import HFTask
+from itertools import islice
 class NaturalQs(HFTask):
     DATASET_PATH = "natural_questions"
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import numpy as np
 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import f1_score, matthews_corrcoef
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import json
 import random
 from lm_eval.base import Dataset
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import json
 import random
 import os
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 from . common import HFTask
 from ..utils_stream import X, each, apply, join, filt, one
 import collections
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import json
 import random
 import os
...