Unverified commit e74ec966, authored by achervyakov, committed by GitHub
Browse files

add manual tqdm disabling management (#1569)



* add manual tqdm disabling management

* add typing to all new args

* apply precommit changes

---------
Co-authored-by: haileyschoelkopf <hailey@eleuther.ai>
parent 49695e8d
...@@ -304,7 +304,9 @@ class TemplateLM(LM): ...@@ -304,7 +304,9 @@ class TemplateLM(LM):
return context_enc, continuation_enc return context_enc, continuation_enc
def loglikelihood(self, requests) -> List[Tuple[float, bool]]: def loglikelihood(
self, requests, disable_tqdm: bool = False
) -> List[Tuple[float, bool]]:
new_reqs = [] new_reqs = []
for context, continuation in [req.args for req in requests]: for context, continuation in [req.args for req in requests]:
if context == "": if context == "":
...@@ -318,12 +320,14 @@ class TemplateLM(LM): ...@@ -318,12 +320,14 @@ class TemplateLM(LM):
new_reqs.append(((context, continuation), context_enc, continuation_enc)) new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs) return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm)
@abc.abstractmethod @abc.abstractmethod
def loglikelihood_rolling(self, requests) -> List[Tuple[float, bool]]: def loglikelihood_rolling(
self, requests, disable_tqdm: bool = False
) -> List[Tuple[float, bool]]:
pass pass
@abc.abstractmethod @abc.abstractmethod
def generate_until(self, requests) -> List[str]: def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
pass pass
...@@ -147,7 +147,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e ...@@ -147,7 +147,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False): def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.") raise NotImplementedError("No support for logits.")
def generate_until(self, requests) -> List[str]: def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
try: try:
import anthropic import anthropic
except ModuleNotFoundError: except ModuleNotFoundError:
...@@ -162,7 +162,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e ...@@ -162,7 +162,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
_requests: List[Tuple[str, dict]] = [req.args for req in requests] _requests: List[Tuple[str, dict]] = [req.args for req in requests]
res = [] res = []
for request in tqdm(_requests): for request in tqdm(_requests, disable=disable_tqdm):
try: try:
inp = request[0] inp = request[0]
request_args = request[1] request_args = request[1]
...@@ -199,8 +199,8 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e ...@@ -199,8 +199,8 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
# Isn't used because we override generate_until # Isn't used because we override generate_until
raise NotImplementedError() raise NotImplementedError()
def loglikelihood(self, requests): def loglikelihood(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.") raise NotImplementedError("No support for logits.")
def loglikelihood_rolling(self, requests): def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.") raise NotImplementedError("No support for logits.")
import random import random
from tqdm import tqdm
from lm_eval.api.model import LM from lm_eval.api.model import LM
from lm_eval.api.registry import register_model from lm_eval.api.registry import register_model
...@@ -13,27 +15,27 @@ class DummyLM(LM): ...@@ -13,27 +15,27 @@ class DummyLM(LM):
def create_from_arg_string(cls, arg_string, additional_config=None): def create_from_arg_string(cls, arg_string, additional_config=None):
return cls() return cls()
def loglikelihood(self, requests): def loglikelihood(self, requests, disable_tqdm: bool = False):
res = [] res = []
for _ in requests: for _ in tqdm(requests, disable=disable_tqdm):
res.append((-random.random(), False)) res.append((-random.random(), False))
return res return res
def generate_until(self, requests): def generate_until(self, requests, disable_tqdm: bool = False):
res = [] res = []
for ctx, _ in requests: for ctx, _ in tqdm(requests, disable=disable_tqdm):
res.append("lol") res.append("lol")
assert ctx.strip() != "" assert ctx.strip() != ""
return res return res
def loglikelihood_rolling(self, requests): def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
res = [] res = []
for _ in requests: for _ in tqdm(requests, disable=disable_tqdm):
res.append(-random.random()) res.append(-random.random())
return res return res
...@@ -70,11 +70,13 @@ class GGUFLM(LM): ...@@ -70,11 +70,13 @@ class GGUFLM(LM):
else: else:
raise Exception(f"Failed to get a valid response after {retries} retries.") raise Exception(f"Failed to get a valid response after {retries} retries.")
def loglikelihood(self, requests): def loglikelihood(self, requests, disable_tqdm: bool = False):
if not requests: if not requests:
return [] return []
res = [] res = []
for context, continuation in tqdm([req.args for req in requests]): for context, continuation in tqdm(
[req.args for req in requests], disable=disable_tqdm
):
response = self.gguf_completion(context=context, continuation=continuation) response = self.gguf_completion(context=context, continuation=continuation)
if response and "choices" in response and response["choices"]: if response and "choices" in response and response["choices"]:
choice = response["choices"][0] choice = response["choices"][0]
...@@ -97,12 +99,12 @@ class GGUFLM(LM): ...@@ -97,12 +99,12 @@ class GGUFLM(LM):
assert False assert False
return res return res
def generate_until(self, requests): def generate_until(self, requests, disable_tqdm: bool = False):
if not requests: if not requests:
return [] return []
res = [] res = []
for request in tqdm([req.args for req in requests]): for request in tqdm([req.args for req in requests], disable=disable_tqdm):
inp = request[0] inp = request[0]
request_args = request[1] request_args = request[1]
until = request_args.get("until", ["</s>"]) until = request_args.get("until", ["</s>"])
...@@ -122,7 +124,7 @@ class GGUFLM(LM): ...@@ -122,7 +124,7 @@ class GGUFLM(LM):
res.append(None) # Add default value in case of error res.append(None) # Add default value in case of error
return res return res
def loglikelihood_rolling(self, requests): def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
raise NotImplementedError( raise NotImplementedError(
"loglikelihood_rolling not yet supported for GGUF models" "loglikelihood_rolling not yet supported for GGUF models"
) )
...@@ -790,7 +790,9 @@ class HFLM(TemplateLM): ...@@ -790,7 +790,9 @@ class HFLM(TemplateLM):
return logits return logits
def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = [] loglikelihoods = []
adaptive_batch_size = None adaptive_batch_size = None
...@@ -801,7 +803,9 @@ class HFLM(TemplateLM): ...@@ -801,7 +803,9 @@ class HFLM(TemplateLM):
print(f"Determined Largest batch size: {batch_size}") print(f"Determined Largest batch size: {batch_size}")
adaptive_batch_size = batch_size adaptive_batch_size = batch_size
for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)): for (string,) in tqdm(
[req.args for req in requests], disable=(disable_tqdm or (self.rank != 0))
):
rolling_token_windows = list( rolling_token_windows = list(
map( map(
utils.make_disjoint_window, utils.make_disjoint_window,
...@@ -1079,7 +1083,9 @@ class HFLM(TemplateLM): ...@@ -1079,7 +1083,9 @@ class HFLM(TemplateLM):
return re_ord.get_original(res) return re_ord.get_original(res)
def generate_until(self, requests: List[Instance]) -> List[str]: def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
res = [] res = []
def _collate(req: Tuple[str, dict]): def _collate(req: Tuple[str, dict]):
...@@ -1095,7 +1101,7 @@ class HFLM(TemplateLM): ...@@ -1095,7 +1101,7 @@ class HFLM(TemplateLM):
pbar = tqdm( pbar = tqdm(
total=len(requests), total=len(requests),
disable=(self.rank != 0), disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests", desc="Running generate_until requests",
) )
adaptive_batch_size = None adaptive_batch_size = None
......
...@@ -447,12 +447,14 @@ class NEURON_HF(TemplateLM): ...@@ -447,12 +447,14 @@ class NEURON_HF(TemplateLM):
return logits return logits
def loglikelihood_rolling(self, requests): def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
loglikelihoods = [] loglikelihoods = []
adaptive_batch_size = None adaptive_batch_size = None
for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)): for (string,) in tqdm(
[req.args for req in requests], disable=(disable_tqdm or (self.rank != 0))
):
rolling_token_windows = list( rolling_token_windows = list(
map( map(
utils.make_disjoint_window, utils.make_disjoint_window,
...@@ -616,7 +618,7 @@ class NEURON_HF(TemplateLM): ...@@ -616,7 +618,7 @@ class NEURON_HF(TemplateLM):
return re_ord.get_original(res) return re_ord.get_original(res)
def generate_until(self, requests): def generate_until(self, requests, disable_tqdm: bool = False):
res = defaultdict(list) res = defaultdict(list)
re_ords = {} re_ords = {}
...@@ -638,7 +640,7 @@ class NEURON_HF(TemplateLM): ...@@ -638,7 +640,7 @@ class NEURON_HF(TemplateLM):
# within each set of reqs for given kwargs, we reorder by token length, descending. # within each set of reqs for given kwargs, we reorder by token length, descending.
re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate) re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)
pbar = tqdm(total=len(requests), disable=(self.rank != 0)) pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
# for each different set of kwargs, we execute all requests, by batch. # for each different set of kwargs, we execute all requests, by batch.
for key, re_ord in re_ords.items(): for key, re_ord in re_ords.items():
......
...@@ -231,7 +231,7 @@ class OpenaiCompletionsLM(TemplateLM): ...@@ -231,7 +231,7 @@ class OpenaiCompletionsLM(TemplateLM):
self.cache_hook.add_partial("loglikelihood", cache_key, answer) self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res) return re_ord.get_original(res)
def generate_until(self, requests) -> List[str]: def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
if not requests: if not requests:
return [] return []
res = [] res = []
...@@ -258,7 +258,8 @@ class OpenaiCompletionsLM(TemplateLM): ...@@ -258,7 +258,8 @@ class OpenaiCompletionsLM(TemplateLM):
# todo: more intelligent batching for heterogeneous `until` # todo: more intelligent batching for heterogeneous `until`
for chunk, request_args in tqdm( for chunk, request_args in tqdm(
list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)) list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)),
disable=disable_tqdm,
): ):
inps = [] inps = []
self._max_gen_toks = request_args.get("max_gen_toks", self.max_gen_toks) self._max_gen_toks = request_args.get("max_gen_toks", self.max_gen_toks)
...@@ -308,10 +309,12 @@ class OpenaiCompletionsLM(TemplateLM): ...@@ -308,10 +309,12 @@ class OpenaiCompletionsLM(TemplateLM):
# Isn't used because we override generate_until # Isn't used because we override generate_until
raise NotImplementedError() raise NotImplementedError()
def loglikelihood_rolling(self, requests) -> List[float]: def loglikelihood_rolling(
self, requests, disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = [] loglikelihoods = []
for (string,) in tqdm([req.args for req in requests]): for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list( rolling_token_windows = list(
map( map(
utils.make_disjoint_window, utils.make_disjoint_window,
...@@ -398,7 +401,7 @@ class OpenaiChatCompletionsLM(LM): ...@@ -398,7 +401,7 @@ class OpenaiChatCompletionsLM(LM):
# Isn't used because we override _loglikelihood_tokens # Isn't used because we override _loglikelihood_tokens
raise NotImplementedError() raise NotImplementedError()
def generate_until(self, requests) -> List[str]: def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
res = defaultdict(list) res = defaultdict(list)
re_ords = {} re_ords = {}
...@@ -412,7 +415,7 @@ class OpenaiChatCompletionsLM(LM): ...@@ -412,7 +415,7 @@ class OpenaiChatCompletionsLM(LM):
[req.args for req in reqs], lambda x: (-len(x[0]), x[0]) [req.args for req in reqs], lambda x: (-len(x[0]), x[0])
) )
pbar = tqdm(total=len(requests), disable=(self.rank != 0)) pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
for key, re_ord in re_ords.items(): for key, re_ord in re_ords.items():
# n needs to be 1 because messages in # n needs to be 1 because messages in
# chat completion are not batch but # chat completion are not batch but
...@@ -471,8 +474,8 @@ class OpenaiChatCompletionsLM(LM): ...@@ -471,8 +474,8 @@ class OpenaiChatCompletionsLM(LM):
return grouper.get_original(res) return grouper.get_original(res)
def loglikelihood(self, requests): def loglikelihood(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.") raise NotImplementedError("No support for logits.")
def loglikelihood_rolling(self, requests): def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.") raise NotImplementedError("No support for logits.")
...@@ -95,9 +95,9 @@ class TextSynthLM(LM): ...@@ -95,9 +95,9 @@ class TextSynthLM(LM):
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError() raise NotImplementedError()
def loglikelihood(self, requests): def loglikelihood(self, requests, disable_tqdm: bool = False):
res = [] res = []
for context, continuation in tqdm(requests): for context, continuation in tqdm(requests, disable=disable_tqdm):
response = textsynth_completion( response = textsynth_completion(
url=self.api_url + "/v1/engines/" + self.engine + "/logprob", url=self.api_url + "/v1/engines/" + self.engine + "/logprob",
headers={"Authorization": "Bearer " + self.api_key}, headers={"Authorization": "Bearer " + self.api_key},
...@@ -119,7 +119,7 @@ class TextSynthLM(LM): ...@@ -119,7 +119,7 @@ class TextSynthLM(LM):
assert False assert False
return res return res
def loglikelihood_rolling(self, requests): def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
# TODO: The TextSynth API does not support tokenized inputs so we cannot # TODO: The TextSynth API does not support tokenized inputs so we cannot
# manually partition long contexts into smaller rolling windows as # manually partition long contexts into smaller rolling windows as
# done for other models derived from `BaseLM`. Override this method # done for other models derived from `BaseLM`. Override this method
...@@ -129,12 +129,12 @@ class TextSynthLM(LM): ...@@ -129,12 +129,12 @@ class TextSynthLM(LM):
"input tokenization support from TextSynth." "input tokenization support from TextSynth."
) )
def generate_until(self, requests): def generate_until(self, requests, disable_tqdm: bool = False):
if not requests: if not requests:
return [] return []
res = [] res = []
for request in tqdm(requests): for request in tqdm(requests, disable=disable_tqdm):
inp = request[0] inp = request[0]
request_args = request[1] request_args = request[1]
until = request_args["until"] until = request_args["until"]
......
...@@ -215,10 +215,12 @@ class VLLM(TemplateLM): ...@@ -215,10 +215,12 @@ class VLLM(TemplateLM):
) )
return outputs return outputs
def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = [] loglikelihoods = []
for (string,) in tqdm([req.args for req in requests]): for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list( rolling_token_windows = list(
map( map(
make_disjoint_window, make_disjoint_window,
...@@ -244,7 +246,9 @@ class VLLM(TemplateLM): ...@@ -244,7 +246,9 @@ class VLLM(TemplateLM):
loglikelihoods.append(string_nll) loglikelihoods.append(string_nll)
return loglikelihoods return loglikelihoods
def generate_until(self, requests: List[Instance]) -> List[str]: def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
res = [] res = []
# batch tokenize contexts # batch tokenize contexts
...@@ -273,7 +277,7 @@ class VLLM(TemplateLM): ...@@ -273,7 +277,7 @@ class VLLM(TemplateLM):
pbar = tqdm( pbar = tqdm(
total=len(requests), total=len(requests),
disable=(self.rank != 0), disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests", desc="Running generate_until requests",
) )
# for each different set of kwargs, we execute all requests, by batch. # for each different set of kwargs, we execute all requests, by batch.
......
# Answer parsing and normalization code, from # Answer parsing and normalization code, from
# https://github.com/ruixiangcui/AGIEval/blob/main/src/ # https://github.com/ruixiangcui/AGIEval/blob/main/src/
# math_equivalence.py and post_process.py # math_equivalence.py and post_process.py
from typing import Dict, List
import re import re
from typing import Dict, List
import numpy as np import numpy as np
def parse_math_answer(raw_string):
def parse_math_answer(raw_string):
def remove_boxed(s): def remove_boxed(s):
left = "\\boxed{" left = "\\boxed{"
try: try:
assert s[:len(left)] == left assert s[: len(left)] == left
assert s[-1] == "}" assert s[-1] == "}"
answer = s[len(left):-1] answer = s[len(left) : -1]
if "=" in answer: if "=" in answer:
answer = answer.split("=")[-1].lstrip(" ") answer = answer.split("=")[-1].lstrip(" ")
return answer return answer
except: except Exception:
return None return None
def last_boxed_only_string(string): def last_boxed_only_string(string):
...@@ -40,10 +39,10 @@ def parse_math_answer(raw_string): ...@@ -40,10 +39,10 @@ def parse_math_answer(raw_string):
break break
i += 1 i += 1
if right_brace_idx == None: if right_brace_idx is None:
retval = None retval = None
else: else:
retval = string[idx:right_brace_idx + 1] retval = string[idx : right_brace_idx + 1]
return retval return retval
...@@ -92,7 +91,7 @@ def _fix_fracs(string): ...@@ -92,7 +91,7 @@ def _fix_fracs(string):
else: else:
try: try:
assert len(substr) >= 2 assert len(substr) >= 2
except: except Exception:
return string return string
a = substr[0] a = substr[0]
b = substr[1] b = substr[1]
...@@ -123,7 +122,7 @@ def _fix_a_slash_b(string): ...@@ -123,7 +122,7 @@ def _fix_a_slash_b(string):
assert string == "{}/{}".format(a, b) assert string == "{}/{}".format(a, b)
new_string = "\\frac{" + str(a) + "}{" + str(b) + "}" new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
return new_string return new_string
except: except Exception:
return string return string
...@@ -237,7 +236,7 @@ def is_equiv(str1, str2, verbose=False): ...@@ -237,7 +236,7 @@ def is_equiv(str1, str2, verbose=False):
if verbose: if verbose:
print(ss1, ss2) print(ss1, ss2)
return ss1 == ss2 return ss1 == ss2
except: except Exception:
return str1 == str2 return str1 == str2
...@@ -258,18 +257,18 @@ def process_results(doc: dict, results: List[str]) -> Dict[str, int]: ...@@ -258,18 +257,18 @@ def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
} }
return results return results
# use a custom process_results() function, because AGIEval can have multiple valid answers # use a custom process_results() function, because AGIEval can have multiple valid answers
def process_results_mcqa(doc, results): def process_results_mcqa(doc, results):
results = [result[0] for result in results]
results = [result[0] for result in results] gold = doc["gold"]
gold = doc["gold"] acc = 1.0 if int(np.argmax(results)) in gold else 0.0
completion_len = np.array([float(len(i)) for i in doc["choices"]])
acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0
acc = 1.0 if int(np.argmax(results)) in gold else 0.0 return {
completion_len = np.array([float(len(i)) for i in doc["choices"]]) "acc": acc,
acc_norm = 1.0 if int(np.argmax(results / completion_len)) in gold else 0.0 "acc_norm": acc_norm,
}
return {
"acc": acc,
"acc_norm": acc_norm,
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment