Commit 7f24a08b authored by Leo Gao

Refactor LM organization for more reuse

parent e5066c69
import abc
import random
from typing import Iterable
import numpy as np
import re
from tqdm import tqdm
from lm_eval.metrics import mean, perplexity, weighted_perplexity, weighted_mean
from lm_eval import utils
class LM(abc.ABC):
@@ -96,20 +99,70 @@ class LM(abc.ABC):
pass
@classmethod
def create_from_arg_string(cls, arg_string):
"""Constructor method, in case models need additional arguments
e.g. OpenAI API engine, paths for loading, other params
:param arg_string: str
Left up to individual model class to handle
"""
return cls()
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
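# Illustrative sketch, not part of this commit: assuming simple_parse_args_string
# turns a comma-separated "key=value" string into a dict, a caller could do e.g.
#   lm = HFLM.create_from_arg_string(
#       "pretrained=gpt2,device=cuda",
#       additional_config={"batch_size": None},  # None entries are filtered out above
#   )
# which would be roughly equivalent to HFLM(pretrained="gpt2", device="cuda").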
def set_cache_hook(self, cache_hook):
self.cache_hook = cache_hook
class TokenizedLM(LM):
@abc.abstractmethod
def tok_encode(self, string: str): pass
@abc.abstractmethod
def tok_decode(self, tokens: Iterable[int]): pass
@abc.abstractmethod
def _loglikelihood_tokens(self, requests, disable_tqdm=False): pass
# subclass must implement properties vocab_size, eot_token_id, max_gen_toks, max_length.
# TODO: enforce this somehow
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [self.eot_token_id]
else:
context_enc = self.tok_encode(context)
continuation_enc = self.tok_encode(continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
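# For illustration: a request such as ("The capital of France is", " Paris") is
# forwarded as ((context, continuation), context tokens, continuation tokens),
# and an empty context is replaced by the single token [self.eot_token_id] so the
# model still conditions on something.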
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
for string, in tqdm(requests):
rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
)))
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that
string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
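# Rough illustration of the windowing (exact splits are defined by
# utils.get_rolling_token_windows / utils.make_disjoint_window): with
# max_length=4 and a 6-token string [t0 .. t5], the (context, continuation)
# windows come out approximately as
#   ([eot],          [t0, t1, t2, t3])
#   ([t1, t2, t3],   [t4, t5])
# so every token of the string is scored exactly once, with earlier tokens
# reused as context where they fit.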
class Task(abc.ABC):
"""A task represents an entire benchmark including its dataset, problems,
answers, and evaluation methods. See BoolQ for a simple example implementation
......
@@ -3,6 +3,7 @@ from . import gpt3
from . import dummy
MODEL_REGISTRY = {
"hf": gpt2.HFLM,
"gpt2": gpt2.GPT2LM,
"gpt3": gpt3.GPT3LM,
"dummy": dummy.DummyLM,
......
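# Sketch of how the registry is typically used by the evaluator (the exact call
# site is not shown in this diff):
#   lm_class = MODEL_REGISTRY["hf"]  # -> gpt2.HFLM
#   lm = lm_class.create_from_arg_string("pretrained=gpt2,device=cuda")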
@@ -2,115 +2,36 @@ import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
from lm_eval.base import LM
from lm_eval.base import LM, TokenizedLM
from lm_eval import utils
from tqdm import tqdm
import numpy as np
from abc import ABC, abstractmethod
from typing import Iterable
class GPT2LM(LM):
MAX_GEN_TOKS = 256
class TorchLM(TokenizedLM):
@abstractmethod
def _model_generate(self, context, max_length, eos_token_id):
pass
def __init__(self, device='cuda', pretrained='gpt2', revision='main', subfolder=None, tokenizer=None, batch_size=1):
super().__init__()
assert isinstance(device, str)
assert isinstance(pretrained, str)
assert isinstance(batch_size, int)
if device:
self.device = torch.device(device)
else:
self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# TODO: update this to be less of a hack once subfolder is fixed in HF
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(pretrained, revision=revision +("/" + subfolder if subfolder is not None else "")).to(self.device)
self.gpt2.eval()
# pretrained tokenizer for neo is broken for now so just hardcoding this to gpt2
self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder)
assert isinstance(self.tokenizer, (
transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast,
transformers.T5Tokenizer, transformers.T5TokenizerFast,
)), "this tokenizer has not been checked for compatibility yet!"
self.VOCAB_SIZE = self.tokenizer.vocab_size
self.EOT_TOKEN_ID = self.tokenizer.eos_token_id
print(self.EOT_TOKEN_ID)
try:
self.max_length = self.gpt2.config.n_ctx
except AttributeError:
# gptneoconfig doesn't have n_ctx apparently
self.max_length = self.gpt2.config.max_position_embeddings
if isinstance(self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)):
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373]
# multithreading and batching
gpus = torch.cuda.device_count()
batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu
self.batch_size = batch_size_per_gpu# * gpus
# TODO: fix multi-gpu
# if gpus > 1:
# self.gpt2 = nn.DataParallel(self.gpt2)
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [self.EOT_TOKEN_ID]
else:
context_enc = self.tokenizer.encode(context, add_special_tokens=False)
continuation_enc = self.tokenizer.encode(continuation, add_special_tokens=False)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
with torch.no_grad():
for string, in tqdm(requests):
rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
token_list=self.tokenizer.encode(string, add_special_tokens=False),
prefix_token=self.EOT_TOKEN_ID,
max_seq_len=self.max_length,
context_len=1,
)))
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that
string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
@abstractmethod
def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
pass
# subclass must implement properties batch_size, vocab_size, eot_token_id, max_gen_toks, device.
# TODO: enforce this somehow
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
with torch.no_grad():
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
@@ -145,7 +66,7 @@ class GPT2LM(LM):
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# gpt2 \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.vocab_size] slice
# cont_toks 4 5 6 7 8 9
# when too long to fit in context, truncate from the left
@@ -197,23 +118,15 @@ class GPT2LM(LM):
return reord.get_original(res)
def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
return self.gpt2(inps)[0][:, :, :50257]
def greedy_until(self, requests):
# TODO: implement fully general `until` that handles untils that are
# multiple tokens or that span multiple tokens correctly
# TODO: extract to TokenizedLM?
res = []
def _collate(x):
toks = self.tokenizer.encode(x[0], add_special_tokens=False)
toks = self.tok_encode(x[0])
return (len(toks), x[0])
reord = utils.Reorderer(requests, _collate)
@@ -221,18 +134,13 @@ class GPT2LM(LM):
for context, until in tqdm(reord.get_reordered()):
if isinstance(until, str): until = [until]
context_enc = torch.tensor([self.tokenizer.encode(context, add_special_tokens=False)[self.MAX_GEN_TOKS - self.max_length:]]).to(self.device)
primary_until, = self.tok_encode(until[0])
primary_until, = self.tokenizer.encode(until[0], add_special_tokens=False)
context_enc = torch.tensor([self.tok_encode(context)[self.max_gen_toks - self.max_length:]]).to(self.device)
cont = self.gpt2.generate(
context_enc,
max_length=context_enc.shape[1] + self.MAX_GEN_TOKS,
eos_token_id=primary_until,
do_sample=False
)
cont = self._model_generate(context_enc, context_enc.shape[1] + self.max_gen_toks, primary_until)
s = self.tokenizer.decode(cont[0].tolist()[context_enc.shape[1]:])
s = self.tok_decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
@@ -243,3 +151,83 @@ class GPT2LM(LM):
res.append(s)
return reord.get_original(res)
class HFLM(TorchLM):
def __init__(self, device='cuda', pretrained='gpt2', revision='main', subfolder=None, tokenizer=None, batch_size=1):
super().__init__()
assert isinstance(device, str)
assert isinstance(pretrained, str)
assert isinstance(batch_size, int)
if device:
self.device = torch.device(device)
else:
self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# TODO: update this to be less of a hack once subfolder is fixed in HF
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(pretrained, revision=revision +("/" + subfolder if subfolder is not None else "")).to(self.device)
self.gpt2.eval()
# pretrained tokenizer for neo is broken for now so just hardcoding this to gpt2
self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder)
assert isinstance(self.tokenizer, (
transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast,
transformers.T5Tokenizer, transformers.T5TokenizerFast,
)), "this tokenizer has not been checked for compatibility yet!"
self.vocab_size = self.tokenizer.vocab_size
self.eot_token_id = self.tokenizer.eos_token_id # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
self.max_gen_toks = 256
try:
self.max_length = self.gpt2.config.n_ctx
except AttributeError:
# gptneoconfig doesn't have n_ctx apparently
self.max_length = self.gpt2.config.max_position_embeddings
if isinstance(self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)):
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373], self.tokenizer.encode('hello\n\nhello')
# multithreading and batching
gpus = torch.cuda.device_count()
batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu
self.batch_size = batch_size_per_gpu# * gpus
# TODO: fix multi-gpu
# if gpus > 1:
# self.gpt2 = nn.DataParallel(self.gpt2)
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
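# Sketch of the expected round trip (GPT-2 BPE ids, matching the assertion above):
#   self.tok_encode("hello\n\nhello")          # -> [31373, 198, 198, 31373]
#   self.tok_decode([31373, 198, 198, 31373])  # -> "hello\n\nhello"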
def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
with torch.no_grad():
return self.gpt2(inps)[0][:, :, :50257]
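# In _loglikelihood_tokens (collapsed in this diff) these logits are consumed
# roughly along these lines (a sketch, not the literal implementation):
#   logits = F.log_softmax(self._model_call(batched_inps), dim=-1)   # [batch, seq, vocab]
#   cont_logits = logits[:, -len(continuation_enc):, :]              # continuation positions only
#   logprobs = torch.gather(cont_logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1)
#   answer = float(logprobs.sum())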
def _model_generate(self, context, max_length, eos_token_id):
return self.gpt2.generate(
context,
max_length=max_length,
eos_token_id=eos_token_id,
do_sample=False
)
# for backwards compatibility
GPT2LM = HFLM
\ No newline at end of file
import os
import numpy as np
import transformers
from lm_eval.base import LM
from lm_eval.base import LM, TokenizedLM
from lm_eval import utils
from tqdm import tqdm
import time
@@ -35,11 +35,8 @@ def oa_completion(**kwargs):
backoff_time *= 1.5
class GPT3LM(LM):
MAX_LENGTH = 2048
class GPT3LM(TokenizedLM):
REQ_CHUNK_SIZE = 20
MAX_GEN_TOKS = 256
def __init__(self, engine, truncate=False):
"""
@@ -50,10 +47,15 @@ class GPT3LM(LM):
Truncate input if too long (if False and input is too long, throw error)
"""
super().__init__()
import openai
self.engine = engine
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
self.vocab_size = self.tokenizer.vocab_size
self.eot_token_id = self.tokenizer.eos_token_id
self.max_gen_toks = 256
self.max_length = 2048
# to make the annoying "Using pad_token, but it is not set yet." error go away
self.tokenizer.pad_token = "<|endoftext|>"
@@ -64,26 +66,11 @@ class GPT3LM(LM):
# Read from environment variable OPENAI_API_SECRET_KEY
openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [50256]
else:
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def loglikelihood_rolling(self, requests):
# TODO: switch implementation to use _loglikelihood_tokens rather than having it do its own thing
@@ -94,7 +81,7 @@ class GPT3LM(LM):
rolling_token_windows = utils.get_rolling_token_windows(
token_list=encoded,
prefix_token=self.end_of_text_token_id,
max_seq_len=self.MAX_LENGTH,
max_seq_len=self.max_length,
context_len=1,
)
string_loglikelihoods = []
@@ -109,8 +96,28 @@ class GPT3LM(LM):
return loglikelihoods
def _loglikelihood_tokens(self, requests):
import openai
def get_token_logprobs(self, input_tokens, pred_tokens):
pred_start = len(input_tokens) - len(pred_tokens) + 1
# We're going to stitch together the input_tokens and pred_tokens
# In the longest case, this gets us to length = max_seq_len+1 (which the API works with)
assert input_tokens[pred_start:] == pred_tokens[:-1]
token_ids = input_tokens + [pred_tokens[-1]]
response = oa_completion(
engine=self.engine,
prompt=token_ids,
max_tokens=0,
temperature=0.0,
logprobs=0,
echo=True,
)
logprobs = np.array(response["choices"][0]["logprobs"]["token_logprobs"][pred_start:])
positions = np.arange(pred_start-1, pred_start-1 + len(token_ids[pred_start:]))
return {
"logprobs": logprobs,
"positions": positions,
}
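# Toy illustration of the stitching (token ids are made up):
#   input_tokens = [50256, 11, 22, 33]   # prefix token + context window
#   pred_tokens  = [22, 33, 44]          # tokens whose logprobs we want
#   pred_start   = 4 - 3 + 1 = 2
#   token_ids    = [50256, 11, 22, 33, 44]
# The API echoes logprobs for every position; slicing from pred_start keeps
# exactly the three entries that score 22, 33 and 44 given their prefixes.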
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
res = []
def _collate(x):
@@ -122,12 +129,12 @@ class GPT3LM(LM):
reord = utils.Reorderer(requests, _collate)
for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)), disable=disable_tqdm):
inps = []
ctxlens = []
for cache_key, context_enc, continuation_enc in chunk:
inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:]
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH)
inp = (context_enc + continuation_enc)[-self.max_length:]
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.max_length)
inps.append(inp)
ctxlens.append(ctxlen)
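# Worked example of the truncation (numbers illustrative): with max_length=2048,
# a 2000-token context and a 100-token continuation give 2100 tokens in total, so
# inp keeps the last 2048 tokens and ctxlen = 2000 - max(0, 2100 - 2048) = 1948,
# i.e. the oldest 52 context tokens are dropped and the continuation is kept intact.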
@@ -151,34 +158,13 @@ class GPT3LM(LM):
return reord.get_original(res)
def get_token_logprobs(self, input_tokens, pred_tokens):
pred_start = len(input_tokens) - len(pred_tokens) + 1
# We're going to stitch together the input_tokens and pred_tokens
# In the longest case, this gets us to length = max_seq_len+1 (which the API works with)
assert input_tokens[pred_start:] == pred_tokens[:-1]
token_ids = input_tokens + [pred_tokens[-1]]
response = oa_completion(
engine=self.engine,
prompt=token_ids,
max_tokens=0,
temperature=0.0,
logprobs=0,
echo=True,
)
logprobs = np.array(response["choices"][0]["logprobs"]["token_logprobs"][pred_start:])
positions = np.arange(pred_start-1, pred_start-1 + len(token_ids[pred_start:]))
return {
"logprobs": logprobs,
"positions": positions,
}
def greedy_until(self, requests):
if not requests: return []
import openai
res = []
def _collate(x):
toks = self.tokenizer.encode(x[0])
toks = self.tok_encode(x[0])
return (len(toks), x[0])
reord = utils.Reorderer(requests, _collate)
@@ -199,14 +185,14 @@ class GPT3LM(LM):
for chunk, until in tqdm(list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
inps = []
for context, _ in chunk:
context_enc = self.tokenizer.encode(context)
inp = context_enc[-(self.MAX_LENGTH - self.MAX_GEN_TOKS):]
context_enc = self.tok_encode(context)
inp = context_enc[-(self.max_length - self.max_gen_toks):]
inps.append(inp)
response = oa_completion(
engine=self.engine,
prompt=inps,
max_tokens=self.MAX_GEN_TOKS,
max_tokens=self.max_gen_toks,
temperature=0.,
logprobs=10,
stop=until
......
@@ -85,7 +85,7 @@ def test_gpt3_perplexity():
assert perplexity == pytest.approx(tgt, rel=1e-3)
# Hack: modify gpt3 to have shorter context length to induce rolling windows
gpt3.MAX_LENGTH = 5
gpt3.max_length = 5
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
tgt = -101.93490880000002
assert perplexity == pytest.approx(tgt, rel=1e-3)