"vscode:/vscode.git/clone" did not exist on "372ac40a37c7d0f8e5b8cc7ed5c502109ef6ce53"
Unverified Commit 6803e647 authored by Leo Gao, committed by GitHub

Merge pull request #79 from EleutherAI/bmk_refactor

Bmk refactor
parents 2e1b05d2 041ea8a7
@@ -7,8 +7,6 @@ The goal of this project is to build a set of tools for evaluating LMs on typica
 2. Removing task val/test data from LM training set
 3. Adding task training data to LM training set
-
-The raw Google doc can be found here: https://docs.google.com/document/d/177dwJpH8GHebISXYZSn4NL98sXdCtQMH82b7O5F7jmw/edit?usp=sharing
 
 ## Usage
 
 ### Evaluate a task
@@ -99,6 +97,3 @@ With the data downloader in place, we simply need to (1) expose the val/test exa
 ### 3. Adding task training data to LM training set
 This part is the easiest. I guess we just write out some text files containing the training data? We can let the usual LM preprocessing pipeline handle it from there.
-
-## Summary (need to convert from google docs at some point):
-https://docs.google.com/document/d/177dwJpH8GHebISXYZSn4NL98sXdCtQMH82b7O5F7jmw/edit?usp=sharing
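The "write out some text files" idea in the hunk above can be sketched in a few lines. This is only an illustration and not part of this commit; `dump_task_training_data` and the output path are made up, while `training_docs`, `doc_to_text`, and `doc_to_target` are the `Dataset` hooks introduced later in this diff.

# Hypothetical sketch: dump a task's training docs as plain text so the usual
# LM preprocessing pipeline can pick the file up from there.
def dump_task_training_data(task, out_path="task_train.txt"):
    with open(out_path, "w") as f:
        for doc in task.training_docs():
            # same doc rendering that fewshot_context uses for labeled examples
            f.write(task.doc_to_text(doc) + task.doc_to_target(doc) + "\n\n")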
import csv
import os
import time
import click
import torch
import torch.nn.functional as F
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
@click.command()
@click.argument("datadir", required=True)
def main(datadir):
model_runner = ModelRunner.create()
with open(
os.path.join(datadir, "cloze_test_test__spring2016 - cloze_test_ALL_test.csv")
) as f:
storycloze_test_examples = list(csv.DictReader(f))
start_time = time.time()
example_evaluations = evaluate_examples(model_runner, storycloze_test_examples)
end_time = time.time()
print(
f"Total time for {len(storycloze_test_examples)} examples: {end_time - start_time}"
)
fraction_correct = len(
[
evaluation
for evaluation in example_evaluations
if evaluation["was_model_correct"]
]
) / float(len(example_evaluations))
print(f"Fraction correct: {fraction_correct}")
def evaluate_examples(model_runner, examples):
prompts = [
"{} {} {} {}".format(
example["InputSentence1"],
example["InputSentence2"],
example["InputSentence3"],
example["InputSentence4"],
)
for example in examples
]
inputs_for_sentence_1 = [
prompt + " " + example["RandomFifthSentenceQuiz1"]
for prompt, example in zip(prompts, examples)
]
inputs_for_sentence_2 = [
prompt + " " + example["RandomFifthSentenceQuiz2"]
for prompt, example in zip(prompts, examples)
]
average_token_loglikelihoods_with_sentence_1 = (
model_runner.compute_average_token_loglikelihoods_on_batch(inputs_for_sentence_1)
)
average_token_loglikelihoods_with_sentence_2 = (
model_runner.compute_average_token_loglikelihoods_on_batch(inputs_for_sentence_2)
)
evaluation_results = []
for i in range(len(examples)):
if (
average_token_loglikelihoods_with_sentence_1[i]
> average_token_loglikelihoods_with_sentence_2[i]
):
model_answer = examples[i]["RandomFifthSentenceQuiz1"]
model_answer_code = "1"
else:
model_answer = examples[i]["RandomFifthSentenceQuiz2"]
model_answer_code = "2"
evaluation_results.append(
{
"model_answer": model_answer,
"was_model_correct": model_answer_code
== examples[i]["AnswerRightEnding"],
}
)
return evaluation_results
class ModelRunner:
def __init__(self):
self.inference_requests = []
self.num_inferences = 0
self.model = None
self.tokenizer = None
@classmethod
def create(cls):
model_runner = cls()
model_runner.model = AutoModelForCausalLM.from_pretrained(
            # gpt2-large (774M parameters)
pretrained_model_name_or_path="gpt2-large",
config=AutoConfig.from_pretrained(
"gpt2-large",
# <|endoftext|>
pad_token_id=50256,
),
).to("cuda")
model_runner.model = model_runner.model.eval()
model_runner.tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
model_runner.tokenizer.pad_token = "<|endoftext|>"
prompt = "The quick brown fox jumps over"
encoded_prompt = model_runner.tokenizer.encode(
prompt, add_special_tokens=False, return_tensors="pt"
).to("cuda")
# Sanity check the model
[output_token_ids] = model_runner.model.generate(
input_ids=encoded_prompt,
max_length=100,
            temperature=0,
do_sample=False,
num_return_sequences=1,
)
decoded_output = model_runner.tokenizer.decode(output_token_ids.tolist())
# Next word should be "the" ("The quick brown fox jumps over *the*...")
assert decoded_output[len(prompt + " ") :].startswith("the")
return model_runner
def compute_average_token_loglikelihoods_on_batch(self, input_texts):
"""
For each input text in the batch, compute the average log-likelihood over all tokens.
For example, if an input sequence is 3 tokens long, and the token loglikelihoods are [-1, -2, -3], the "average token loglikelihood" is -2.
"""
        # The ModelRunner can take a big batch of input_texts, and it can be as large as the caller wants.
# But to prevent the GPU from running out of memory, we need to subdivide the overall batch
# into "GPU batches", and the "GPU batch size" depends on the model and hardware.
# For GPT-2-117M, a GPU can process a batch of roughly 10 or so inputs before the inference latency starts to increase.
gpu_batch_size = 20
average_token_loglikelihoods = []
for i in range(0, len(input_texts), gpu_batch_size):
average_token_loglikelihoods.extend(
self._average_token_loglikelihoods_on_gpu_batch(
input_texts[i : i + gpu_batch_size]
)
)
return average_token_loglikelihoods
def _average_token_loglikelihoods_on_gpu_batch(self, input_texts):
tokenized_inputs = self.tokenizer(
input_texts,
add_special_tokens=False,
return_tensors="pt",
padding="longest",
)[
# https://github.com/huggingface/transformers/issues/5480#issuecomment-653259416
"input_ids"
].to(
"cuda"
)
start_time = time.time()
output_logits = self.model(tokenized_inputs).logits
self.num_inferences += 1
# Normalize probabilities - at each position, the token likelihoods should add up to 1
output_loglikelihoods = F.log_softmax(
output_logits,
            # The vocabulary dimension
dim=-1,
)
# Align the output loglikelihoods to the input tokens.
loglikelihoods_for_input_positions = output_loglikelihoods[
# The batch dimension
:,
# The position dimension
# The last loglikelihood needs to be dropped, because it's predicting the "next token", and it doesn't correspond to any input token
:-1,
            # The vocabulary dimension
:,
]
input_tokens_at_positions_with_loglikelihoods = tokenized_inputs[
# The batch dimension
:,
# The position dimension
# The model does not predict the first input token, so the first token needs to be dropped.
1:,
]
# At each position, the model outputs ~50k loglikelihoods, one for every possible token.
# To get the loglikelihoods of the tokens that were actually provided, we need to select the right loglikelihood at each position.
loglikelihoods_for_provided_tokens = torch.gather(
loglikelihoods_for_input_positions,
2,
input_tokens_at_positions_with_loglikelihoods.unsqueeze(2),
).squeeze(2)
mask_for_non_padded_positions = input_tokens_at_positions_with_loglikelihoods != 50256
average_token_loglikelihoods = (
loglikelihoods_for_provided_tokens * mask_for_non_padded_positions
).sum(1) / mask_for_non_padded_positions.sum(1)
average_token_loglikelihoods = average_token_loglikelihoods.tolist()
end_time = time.time()
print(
f"Time to evaluate once (inference #{self.num_inferences}): {end_time - start_time}"
)
return average_token_loglikelihoods
if __name__ == "__main__":
main()
 import abc
 import random
+import collections
 
 
 class LM(abc.ABC):
     @abc.abstractmethod
-    def loglikelihood(self, context, continuation):
-        """Compute log-likelihood of generating a continuation from a context
-
-        :param context: str
-            Context string
-        :param continuation: str
-            The continuation over which log likelihood will be calculated. If
-            there is a word boundary, the space should be in the continuation.
-            For example, context="hello" continuation=" world" is correct.
-        :return: float
+    def loglikelihood(self, requests):
+        """Compute log-likelihood of generating a continuation from a context.
+        Downstream tasks should attempt to use loglikelihood instead of other
+        LM calls whenever possible.
+
+        :param requests: list
+            A list of pairs (context, continuation)
+            context: str
+                Context string
+            continuation: str
+                The continuation over which log likelihood will be calculated. If
+                there is a word boundary, the space should be in the continuation.
+                For example, context="hello" continuation=" world" is correct.
+        :return: list
+            A list of pairs (logprob, isgreedy)
+            logprob: float
+                The log probability of `continuation`
+            isgreedy:
+                Whether `continuation` would be generated by greedy sampling from `context`
+        """
+        pass
+
+    @abc.abstractmethod
+    def greedy_until(self, requests):
+        """Generate greedily until a stopping sequence
+        :param requests: list
+            A list of pairs (context, until)
+            context: str
+                Context string
+            until: str
+                The string sequence to generate until. This string sequence may
+                span across multiple tokens, or may be part of one token.
+        :return: list
+            A list of strings continuation
+            continuation: str
+                The generated continuation.
         """
         pass
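To make the new batched interface concrete, here is a hedged sketch of calling it. `GPT2LM` is the concrete implementation later in this diff; the example strings, device choice, and printed fields are illustrative only, and note that `isgreedy` is still a TODO in `GPT2LM` in this commit.

# Illustrative only: one (context, continuation) pair per candidate ending.
lm = GPT2LM(device="cuda")
requests = [
    ("The quick brown fox jumps over", " the lazy dog"),
    ("The quick brown fox jumps over", " a cliff"),
]
# loglikelihood returns one (logprob, isgreedy) pair per request, in order.
for (context, continuation), (logprob, isgreedy) in zip(requests, lm.loglikelihood(requests)):
    print(f"{continuation!r}: logprob={logprob:.2f} isgreedy={isgreedy}")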
@@ -78,22 +106,47 @@ class Dataset(abc.ABC):
         return random.sample(self._traindocs, k)
 
     @abc.abstractmethod
-    def doc_to_text(self, doc, include_target=True):
+    def doc_to_text(self, doc):
+        pass
+
+    @abc.abstractmethod
+    def doc_to_target(self, doc):
+        pass
+
+    @abc.abstractmethod
+    def construct_requests(self, ctx):
+        """Uses RequestFactory to construct Requests and returns an iterable of
+        Requests which will be sent to the LM.
+
+        :param ctx: str
+            The context string, generated by fewshot_context.
+        """
         pass
 
     @abc.abstractmethod
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        """Take iterable of docs and evaluates, returning a dict with the following format:
+    def process_results(self, doc, results):
+        """Take a single document and the LM results and evaluates, returning a
+        list of dicts, each with the following format:
         {
-            "major": float,
-            "minor": dict,
+            "submetric": str,
+            "value": float,
             "higher_is_better": bool,
+            "aggregation": ([float] -> float),
         }
-        * `major` should be a single, representative number, for programmatic comparison
-        * `minor` should be a dictionary containing all relevant sub-metrics
+        * `submetric` should be the name of the metric
+        * `value` should be the value of the metric
         * `higher_is_better` determines whether a higher metric is better
+        * `aggregation` should be a function that takes a list of floats and
+          aggregates them into one float. This should be the same for all
+          submetrics of the same name; if it differs, an error should be
+          raised.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param results:
+            The results of the requests created in construct_requests.
         """
         pass
@@ -103,8 +156,30 @@ class Dataset(abc.ABC):
     def fewshot_context(self, doc, num_fewshot, provide_description):
         raw_description = self.fewshot_description()
         description = (raw_description + "\n===\n\n") if provide_description and raw_description else ""
         labeled_examples = "\n\n".join(
-            map(self.doc_to_text, self.fewshot_examples(k=num_fewshot))
+            [self.doc_to_text(doc) + self.doc_to_target(doc) for doc in self.fewshot_examples(k=num_fewshot)]
         ) + "\n\n"
-        example = self.doc_to_text(doc, include_target=False).strip()
-        return description + labeled_examples + example
+        example = self.doc_to_text(doc).strip()
+        return description + labeled_examples + example
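For readers skimming the refactored `fewshot_context`, this is the kind of string it assembles; the sentiment task, description, and reviews below are invented purely for illustration.

# Illustrative only: what fewshot_context produces for a made-up sentiment task
# with provide_description=True and num_fewshot=2.
prompt = (
    "Classify the sentiment of each review."      # fewshot_description()
    "\n===\n\n"                                    # description separator
    "Review: Great movie. Sentiment: positive"     # doc_to_text(ex1) + doc_to_target(ex1)
    "\n\n"
    "Review: Dull and slow. Sentiment: negative"   # doc_to_text(ex2) + doc_to_target(ex2)
    "\n\n"
    "Review: I loved every minute. Sentiment:"     # doc_to_text(doc) for the query, no target
)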
def mean(arr):
return sum(arr) / len(arr)
def median(arr):
    # middle element of the sorted list (no interpolation for even-length inputs)
    return sorted(arr)[len(arr) // 2]
Request = collections.namedtuple('Request', ('type', 'args'))
class RequestFactory:
def __getattr__(self, attr):
def fn(*args):
return Request(attr, args)
return fn
rf = RequestFactory()
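Here is a hedged sketch of how a task might use these pieces together. The `ToyBooleanTask` class, its document fields, and the "acc" submetric are hypothetical; `rf`, `Request`, `mean`, and the `Dataset` hooks are the ones defined above, and only the methods relevant here are shown.

# Illustrative only. rf.<name>(*args) just records a Request namedtuple, e.g.
#   rf.loglikelihood("Q: 2+2 =", " 4") == Request(type="loglikelihood", args=("Q: 2+2 =", " 4"))
class ToyBooleanTask(Dataset):  # hypothetical; other Dataset hooks omitted
    def doc_to_text(self, doc):
        return "Question: " + doc["question"] + "\nAnswer:"

    def doc_to_target(self, doc):
        return " " + doc["answer"]  # " yes" or " no"

    def construct_requests(self, ctx):
        # one loglikelihood request per candidate answer
        return [rf.loglikelihood(ctx, " yes"), rf.loglikelihood(ctx, " no")]

    def process_results(self, doc, results):
        # results arrive in the same order as the requests above
        (ll_yes, _), (ll_no, _) = results
        prediction = " yes" if ll_yes > ll_no else " no"
        return [{
            "submetric": "acc",
            "value": float(prediction == self.doc_to_target(doc)),
            "higher_is_better": True,
            "aggregation": mean,
        }]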
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 from lm_eval.base import LM
 from . import MODEL_REGISTRY
...
@@ -3,6 +3,7 @@ import torch
 import torch.nn.functional as F
 from lm_eval.base import LM
 from lm_eval import utils
+from tqdm import tqdm
 
 
 class GPT2LM(LM):
@@ -17,14 +18,24 @@ class GPT2LM(LM):
         args = utils.simple_parse_args_string(arg_string)
         return cls(device=args.get("device", "cpu"))
 
-    def loglikelihood(self, context, continuation, truncate=True):
-        # when too long to fit in context, truncate from the left
-        context_enc = self.tokenizer.encode(context)
-        continuation_enc = self.tokenizer.encode(continuation)
-        inp = torch.tensor([(context_enc + continuation_enc)[-1024:]], dtype=torch.long).to(self.device)
-        ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024)
-        cont_toks = inp[:, ctxlen:]  # [batch, seq]
-        logits = F.log_softmax(self.gpt2(inp)[0], dim=-1)[:, ctxlen - 1:-1]  # [batch, seq, vocab]
-        return torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1)
+    def loglikelihood(self, requests):
+        res = []
+        # TODO: vectorize properly
+        for context, continuation in tqdm(requests):
+            # when too long to fit in context, truncate from the left
+            context_enc = self.tokenizer.encode(context)
+            continuation_enc = self.tokenizer.encode(continuation)
+            inp = torch.tensor([(context_enc + continuation_enc)[-1024:]], dtype=torch.long).to(self.device)
+            ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024)
+            cont_toks = inp[:, ctxlen:]  # [batch, seq]
+            logits = F.log_softmax(self.gpt2(inp)[0], dim=-1)[:, ctxlen - 1:-1]  # [batch, seq, vocab]
+            # TODO: implement isgreedy
+            # sum the per-token logprobs of the continuation; without .sum(),
+            # float() would fail on multi-token continuations
+            res.append((float(torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1).sum()), False))
+        return res
+
+    def greedy_until(self, requests):
+        # TODO: implement
+        pass
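The left-truncation arithmetic in `loglikelihood` above is compact, so here is a small self-contained check of what it does; the token counts are made up for illustration.

# With a 1024-token window, a 1020-token context plus a 10-token continuation
# overflows by 6 tokens, so 6 context tokens are dropped from the left and
# ctxlen (the number of surviving context tokens) becomes 1014.
context_len, continuation_len, window = 1020, 10, 1024
ctxlen = context_len - max(0, context_len + continuation_len - window)
assert ctxlen == 1014
# The continuation then occupies positions [ctxlen:] of the truncated input, and
# its logits are read from positions [ctxlen - 1 : -1], since each token is
# predicted from the position before it.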
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import os
 import transformers
 from lm_eval.base import LM
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 from . common import HFTask
 class ANLIBase(HFTask):
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 from . common import HFTask
 class ARCEasy(HFTask):
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import json
 import random
 from lm_eval.base import Dataset
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import numpy as np
 import json
 from scipy.stats import pearsonr, spearmanr
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import numpy as np
 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import f1_score, matthews_corrcoef
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import numpy as np
 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import f1_score, matthews_corrcoef
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 from lm_eval.base import Dataset
 from lm_eval.utils import sh
 import json
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 from . common import HFTask
+from itertools import islice
 class NaturalQs(HFTask):
     DATASET_PATH = "natural_questions"
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import numpy as np
 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import f1_score, matthews_corrcoef
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import json
 import random
 from lm_eval.base import Dataset
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import json
 import random
 import os
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 from . common import HFTask
 from ..utils_stream import X, each, apply, join, filt, one
 import collections
...
+# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
 import json
 import random
 import os
...