Commit f16c301e authored by Leo Gao's avatar Leo Gao
Browse files

Minor changes

parent 1e7f884d
...@@ -346,8 +346,7 @@ class PerplexityTask(Task, abc.ABC): ...@@ -346,8 +346,7 @@ class PerplexityTask(Task, abc.ABC):
def count_words(self, s): def count_words(self, s):
""" Downstream tasks with custom word boundaries should override this! """ """ Downstream tasks with custom word boundaries should override this! """
return len(re.split(r"\s+", s)) return len(re.split(r"\s+", s))
def
req_ret_lens = { req_ret_lens = {
'loglikelihood': 2, 'loglikelihood': 2,
......
...@@ -63,6 +63,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit): ...@@ -63,6 +63,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit):
# only in index. We could implement some kind of caching, but that would be more of a bandaid # only in index. We could implement some kind of caching, but that would be more of a bandaid
# solution. we could also implement some kind of autogrouping here; they should end up next to each other. # solution. we could also implement some kind of autogrouping here; they should end up next to each other.
print("Running", reqtype, "requests")
resps = getattr(lm, reqtype)([req.args for req in reqs]) resps = getattr(lm, reqtype)([req.args for req in reqs])
resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)] resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
......
...@@ -71,7 +71,7 @@ class GPT2LM(LM): ...@@ -71,7 +71,7 @@ class GPT2LM(LM):
rolling_token_windows = [(None,) + x for x in rolling_token_windows] rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that # TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that
string_nll = self._loglikelihood_tokens(rolling_token_windows) string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy # discard is_greedy
string_nll = [x[0] for x in string_nll] string_nll = [x[0] for x in string_nll]
...@@ -81,7 +81,7 @@ class GPT2LM(LM): ...@@ -81,7 +81,7 @@ class GPT2LM(LM):
return loglikelihoods return loglikelihoods
def _loglikelihood_tokens(self, requests): def _loglikelihood_tokens(self, requests, disable_tqdm=False):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = [] res = []
with torch.no_grad(): with torch.no_grad():
...@@ -93,7 +93,7 @@ class GPT2LM(LM): ...@@ -93,7 +93,7 @@ class GPT2LM(LM):
return (len(toks), tuple(toks)) return (len(toks), tuple(toks))
reord = utils.Reorderer(requests, _collate) reord = utils.Reorderer(requests, _collate)
for cache_key, context_enc, continuation_enc in tqdm(reord.get_reordered()): for cache_key, context_enc, continuation_enc in tqdm(reord.get_reordered(), disable=disable_tqdm):
assert len(context_enc) > 0 assert len(context_enc) > 0
assert len(continuation_enc) > 0 assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length assert len(continuation_enc) <= self.max_length
......
...@@ -85,8 +85,7 @@ class GPT3LM(LM): ...@@ -85,8 +85,7 @@ class GPT3LM(LM):
return self._loglikelihood_tokens(new_reqs) return self._loglikelihood_tokens(new_reqs)
def loglikelihood_perplexity(self, requests): def loglikelihood_perplexity(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation # TODO: switch implementation to use _loglikelihood_tokens rather than having it do its own thing
# TODO: Add chunking
loglikelihoods = [] loglikelihoods = []
for string, in tqdm(requests): for string, in tqdm(requests):
...@@ -104,7 +103,7 @@ class GPT3LM(LM): ...@@ -104,7 +103,7 @@ class GPT3LM(LM):
pred_tokens=pred_tokens, pred_tokens=pred_tokens,
) )
string_loglikelihoods.append(block_output["logprobs"]) string_loglikelihoods.append(block_output["logprobs"])
string_loglikelihoods = np.concatenate(string_loglikelihoods) string_loglikelihoods = np.concatenate(string_loglikelihoods).sum()
loglikelihoods.append(string_loglikelihoods) loglikelihoods.append(string_loglikelihoods)
return loglikelihoods return loglikelihoods
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment