add manual tqdm disabling management (#1569)

* add manual tqdm disabling management
* add typing to all new args
* apply precommit changes

---------

Co-authored-by: haileyschoelkopf <hailey@eleuther.ai>

add manual tqdm disabling management (#1569)
* add manual tqdm disabling management
* add typing to all new args
* apply precommit changes

---------

Co-authored-by: haileyschoelkopf <hailey@eleuther.ai>
e74ec966 · achervyakov · GitHub · 49695e8d · e74ec966 · e74ec966
Unverified Commit e74ec966 authored Mar 13, 2024 by achervyakov Committed by GitHub Mar 13, 2024
10 changed files
--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
@@ -304,7 +304,9 @@ class TemplateLM(LM):
        return context_enc, continuation_enc
-    def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
+    def loglikelihood(
+        self, requests, disable_tqdm: bool = False
+    ) -> List[Tuple[float, bool]]:
        new_reqs = []
        for context, continuation in [req.args for req in requests]:
            if context == "":
@@ -318,12 +320,14 @@ class TemplateLM(LM):
            new_reqs.append(((context, continuation), context_enc, continuation_enc))
-        return self._loglikelihood_tokens(new_reqs)
+        return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm)
    @abc.abstractmethod
-    def loglikelihood_rolling(self, requests) -> List[Tuple[float, bool]]:
+    def loglikelihood_rolling(
+        self, requests, disable_tqdm: bool = False
+    ) -> List[Tuple[float, bool]]:
        pass
    @abc.abstractmethod
-    def generate_until(self, requests) -> List[str]:
+    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
        pass
--- a/lm_eval/models/anthropic_llms.py
+++ b/lm_eval/models/anthropic_llms.py
@@ -147,7 +147,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
    def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError("No support for logits.")
-    def generate_until(self, requests) -> List[str]:
+    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
        try:
            import anthropic
        except ModuleNotFoundError:
@@ -162,7 +162,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
        _requests: List[Tuple[str, dict]] = [req.args for req in requests]
        res = []
-        for request in tqdm(_requests):
+        for request in tqdm(_requests, disable=disable_tqdm):
            try:
                inp = request[0]
                request_args = request[1]
@@ -199,8 +199,8 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
        # Isn't used because we override generate_until
        raise NotImplementedError()
-    def loglikelihood(self, requests):
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError("No support for logits.")
-    def loglikelihood_rolling(self, requests):
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError("No support for logits.")
--- a/lm_eval/models/dummy.py
+++ b/lm_eval/models/dummy.py
 import random
+from tqdm import tqdm
 from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
@@ -13,27 +15,27 @@ class DummyLM(LM):
    def create_from_arg_string(cls, arg_string, additional_config=None):
        return cls()
-    def loglikelihood(self, requests):
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
        res = []
-        for _ in requests:
+        for _ in tqdm(requests, disable=disable_tqdm):
            res.append((-random.random(), False))
        return res
-    def generate_until(self, requests):
+    def generate_until(self, requests, disable_tqdm: bool = False):
        res = []
-        for ctx, _ in requests:
+        for ctx, _ in tqdm(requests, disable=disable_tqdm):
            res.append("lol")
            assert ctx.strip() != ""
        return res
-    def loglikelihood_rolling(self, requests):
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
        res = []
-        for _ in requests:
+        for _ in tqdm(requests, disable=disable_tqdm):
            res.append(-random.random())
        return res
--- a/lm_eval/models/gguf.py
+++ b/lm_eval/models/gguf.py
@@ -70,11 +70,13 @@ class GGUFLM(LM):
        else:
            raise Exception(f"Failed to get a valid response after {retries} retries.")
-    def loglikelihood(self, requests):
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
        if not requests:
            return []
        res = []
-        for context, continuation in tqdm([req.args for req in requests]):
+        for context, continuation in tqdm(
+            [req.args for req in requests], disable=disable_tqdm
+        ):
            response = self.gguf_completion(context=context, continuation=continuation)
            if response and "choices" in response and response["choices"]:
                choice = response["choices"][0]
@@ -97,12 +99,12 @@ class GGUFLM(LM):
                assert False
        return res
-    def generate_until(self, requests):
+    def generate_until(self, requests, disable_tqdm: bool = False):
        if not requests:
            return []
        res = []
-        for request in tqdm([req.args for req in requests]):
+        for request in tqdm([req.args for req in requests], disable=disable_tqdm):
            inp = request[0]
            request_args = request[1]
            until = request_args.get("until", ["</s>"])
@@ -122,7 +124,7 @@ class GGUFLM(LM):
                res.append(None)  # Add default value in case of error
        return res
-    def loglikelihood_rolling(self, requests):
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError(
            "loglikelihood_rolling not yet supported for GGUF models"
        )
--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -790,7 +790,9 @@ class HFLM(TemplateLM):
        return logits
-    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
+    def loglikelihood_rolling(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[float]:
        loglikelihoods = []
        adaptive_batch_size = None
@@ -801,7 +803,9 @@ class HFLM(TemplateLM):
            print(f"Determined Largest batch size: {batch_size}")
            adaptive_batch_size = batch_size
-        for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)):
+        for (string,) in tqdm(
+            [req.args for req in requests], disable=(disable_tqdm or (self.rank != 0))
+        ):
            rolling_token_windows = list(
                map(
                    utils.make_disjoint_window,
@@ -1079,7 +1083,9 @@ class HFLM(TemplateLM):
        return re_ord.get_original(res)
-    def generate_until(self, requests: List[Instance]) -> List[str]:
+    def generate_until(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[str]:
        res = []
        def _collate(req: Tuple[str, dict]):
@@ -1095,7 +1101,7 @@ class HFLM(TemplateLM):
        pbar = tqdm(
            total=len(requests),
-            disable=(self.rank != 0),
+            disable=(disable_tqdm or (self.rank != 0)),
            desc="Running generate_until requests",
        )
        adaptive_batch_size = None

--- a/lm_eval/models/neuron_optimum.py
+++ b/lm_eval/models/neuron_optimum.py
@@ -447,12 +447,14 @@ class NEURON_HF(TemplateLM):
        return logits
-    def loglikelihood_rolling(self, requests):
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
        loglikelihoods = []
        adaptive_batch_size = None
-        for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)):
+        for (string,) in tqdm(
+            [req.args for req in requests], disable=(disable_tqdm or (self.rank != 0))
+        ):
            rolling_token_windows = list(
                map(
                    utils.make_disjoint_window,
@@ -616,7 +618,7 @@ class NEURON_HF(TemplateLM):
        return re_ord.get_original(res)
-    def generate_until(self, requests):
+    def generate_until(self, requests, disable_tqdm: bool = False):
        res = defaultdict(list)
        re_ords = {}
@@ -638,7 +640,7 @@ class NEURON_HF(TemplateLM):
            # within each set of reqs for given kwargs, we reorder by token length, descending.
            re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)
-        pbar = tqdm(total=len(requests), disable=(self.rank != 0))
+        pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
        # for each different set of kwargs, we execute all requests, by batch.
        for key, re_ord in re_ords.items():

--- a/lm_eval/models/openai_completions.py
+++ b/lm_eval/models/openai_completions.py
@@ -231,7 +231,7 @@ class OpenaiCompletionsLM(TemplateLM):
                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
        return re_ord.get_original(res)
-    def generate_until(self, requests) -> List[str]:
+    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
        if not requests:
            return []
        res = []
@@ -258,7 +258,8 @@ class OpenaiCompletionsLM(TemplateLM):
        # todo: more intelligent batching for heterogeneous `until`
        for chunk, request_args in tqdm(
-            list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size))
+            list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)),
+            disable=disable_tqdm,
        ):
            inps = []
            self._max_gen_toks = request_args.get("max_gen_toks", self.max_gen_toks)
@@ -308,10 +309,12 @@ class OpenaiCompletionsLM(TemplateLM):
        # Isn't used because we override generate_until
        raise NotImplementedError()
-    def loglikelihood_rolling(self, requests) -> List[float]:
+    def loglikelihood_rolling(
+        self, requests, disable_tqdm: bool = False
+    ) -> List[float]:
        loglikelihoods = []
-        for (string,) in tqdm([req.args for req in requests]):
+        for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
            rolling_token_windows = list(
                map(
                    utils.make_disjoint_window,
@@ -398,7 +401,7 @@ class OpenaiChatCompletionsLM(LM):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()
-    def generate_until(self, requests) -> List[str]:
+    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
        res = defaultdict(list)
        re_ords = {}
@@ -412,7 +415,7 @@ class OpenaiChatCompletionsLM(LM):
                [req.args for req in reqs], lambda x: (-len(x[0]), x[0])
            )
-        pbar = tqdm(total=len(requests), disable=(self.rank != 0))
+        pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
        for key, re_ord in re_ords.items():
            # n needs to be 1 because messages in
            # chat completion are not batch but
@@ -471,8 +474,8 @@ class OpenaiChatCompletionsLM(LM):
        return grouper.get_original(res)
-    def loglikelihood(self, requests):
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError("No support for logits.")
-    def loglikelihood_rolling(self, requests):
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError("No support for logits.")
--- a/lm_eval/models/textsynth.py
+++ b/lm_eval/models/textsynth.py
@@ -95,9 +95,9 @@ class TextSynthLM(LM):
        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
        raise NotImplementedError()
-    def loglikelihood(self, requests):
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
        res = []
-        for context, continuation in tqdm(requests):
+        for context, continuation in tqdm(requests, disable=disable_tqdm):
            response = textsynth_completion(
                url=self.api_url + "/v1/engines/" + self.engine + "/logprob",
                headers={"Authorization": "Bearer " + self.api_key},
@@ -119,7 +119,7 @@ class TextSynthLM(LM):
                assert False
        return res
-    def loglikelihood_rolling(self, requests):
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
        # TODO: The TextSynth API does not support tokenized inputs so we cannot
        # manually partition long contexts into smaller rolling windows as
        # done for other models derived from `BaseLM`. Override this method
@@ -129,12 +129,12 @@ class TextSynthLM(LM):
            "input tokenization support from TextSynth."
        )
-    def generate_until(self, requests):
+    def generate_until(self, requests, disable_tqdm: bool = False):
        if not requests:
            return []
        res = []
-        for request in tqdm(requests):
+        for request in tqdm(requests, disable=disable_tqdm):
            inp = request[0]
            request_args = request[1]
            until = request_args["until"]

--- a/lm_eval/models/vllm_causallms.py
+++ b/lm_eval/models/vllm_causallms.py
@@ -215,10 +215,12 @@ class VLLM(TemplateLM):
        )
        return outputs
-    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
+    def loglikelihood_rolling(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[float]:
        loglikelihoods = []
-        for (string,) in tqdm([req.args for req in requests]):
+        for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
            rolling_token_windows = list(
                map(
                    make_disjoint_window,
@@ -244,7 +246,9 @@ class VLLM(TemplateLM):
            loglikelihoods.append(string_nll)
        return loglikelihoods
-    def generate_until(self, requests: List[Instance]) -> List[str]:
+    def generate_until(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[str]:
        res = []
        # batch tokenize contexts
@@ -273,7 +277,7 @@ class VLLM(TemplateLM):
        pbar = tqdm(
            total=len(requests),
-            disable=(self.rank != 0),
+            disable=(disable_tqdm or (self.rank != 0)),
            desc="Running generate_until requests",
        )
        # for each different set of kwargs, we execute all requests, by batch.

--- a/lm_eval/tasks/agieval/utils.py
+++ b/lm_eval/tasks/agieval/utils.py
 # Answer parsing and normalization code, from
 # https://github.com/ruixiangcui/AGIEval/blob/main/src/
 # math_equivalence.py and post_process.py
-from typing import Dict, List
 import re
+from typing import Dict, List
 import numpy as np
-def parse_math_answer(raw_string):
+def parse_math_answer(raw_string):
    def remove_boxed(s):
        left = "\\boxed{"
        try:
-            assert s[:len(left)] == left
+            assert s[: len(left)] == left
            assert s[-1] == "}"
-            answer = s[len(left):-1]
+            answer = s[len(left) : -1]
            if "=" in answer:
                answer = answer.split("=")[-1].lstrip(" ")
            return answer
-        except:
+        except Exception:
            return None
    def last_boxed_only_string(string):
@@ -40,10 +39,10 @@ def parse_math_answer(raw_string):
                    break
            i += 1
-        if right_brace_idx == None:
+        if right_brace_idx is None:
            retval = None
        else:
-            retval = string[idx:right_brace_idx + 1]
+            retval = string[idx : right_brace_idx + 1]
        return retval
@@ -92,7 +91,7 @@ def _fix_fracs(string):
            else:
                try:
                    assert len(substr) >= 2
-                except:
+                except Exception:
                    return string
                a = substr[0]
                b = substr[1]
@@ -123,7 +122,7 @@ def _fix_a_slash_b(string):
        assert string == "{}/{}".format(a, b)
        new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
        return new_string
-    except:
+    except Exception:
        return string
@@ -237,7 +236,7 @@ def is_equiv(str1, str2, verbose=False):
        if verbose:
            print(ss1, ss2)
        return ss1 == ss2
-    except:
+    except Exception:
        return str1 == str2
@@ -258,18 +257,18 @@ def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
    }
    return results
 # use a custom process_results() function, because AGIEval can have multiple valid answers
 def process_results_mcqa(doc, results):
-        results = [result[0] for result in results]
-        gold = doc["gold"]
-        acc = 1.0 if int(np.argmax(results)) in gold else 0.0
-        completion_len = np.array([float(len(i)) for i in doc["choices"]])
-        acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0
-        return {
-            "acc": acc,
-            "acc_norm": acc_norm,
-        }
+    results = [result[0] for result in results]
+    gold = doc["gold"]
+    acc = 1.0 if int(np.argmax(results)) in gold else 0.0
+    completion_len = np.array([float(len(i)) for i in doc["choices"]])
+    acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0
+    return {
+        "acc": acc,
+        "acc_norm": acc_norm,
+    }