Commit 470fb31c authored by lintangsutawika

resolved git conflict

parents dfb41835 4d10ad56
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from typing import Any, List, Tuple
from tqdm import tqdm
import time
from lm_eval import utils
from typing import List, Any, Tuple
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.utils import retry_on_specific_exceptions
eval_logger = utils.eval_logger
......@@ -45,26 +48,30 @@ def anthropic_completion(
please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`",
)
backoff_time: float = 3
while True:
try:
response = client.completions.create(
prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}",
model=model,
# NOTE: Claude really likes to do CoT, and overly aggressive stop sequences
# (e.g. gsm8k's ":") may truncate a lot of the input.
stop_sequences=[anthropic.HUMAN_PROMPT] + stop,
max_tokens_to_sample=max_tokens_to_sample,
temperature=temperature,
**kwargs,
)
return response.completion
except anthropic.RateLimitError as e:
eval_logger.warning(
f"RateLimitError occurred: {e.__cause__}\n Retrying in {backoff_time} seconds"
)
time.sleep(backoff_time)
backoff_time *= 1.5
def _exception_callback(e: Exception, sleep_time: float) -> None:
eval_logger.warning(
f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds"
)
@retry_on_specific_exceptions(
on_exceptions=[anthropic.RateLimitError],
max_retries=None, # retry forever, consider changing
on_exception_callback=_exception_callback,
)
def completion():
response = client.completions.create(
prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}",
model=model,
# NOTE: Claude really likes to do CoT, and overly aggressive stop sequences
# (e.g. gsm8k's ":") may truncate a lot of the input.
stop_sequences=[anthropic.HUMAN_PROMPT] + stop,
max_tokens_to_sample=max_tokens_to_sample,
temperature=temperature,
**kwargs,
)
return response.completion
return completion()
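
The hand-rolled while/try back-off loop above is replaced by the `retry_on_specific_exceptions` decorator from `lm_eval.utils`. Below is a minimal sketch of what such a decorator factory could look like, inferred only from the keyword arguments visible in this diff (`on_exceptions`, `max_retries`, `on_exception_callback`); the 3-second start and 1.5x growth mirror the old loop, and any parameter beyond those three is an assumption, not the library's actual implementation.

import functools
import time
from typing import Callable, List, Optional, Type


def retry_on_specific_exceptions(
    on_exceptions: List[Type[Exception]],
    max_retries: Optional[int] = None,
    backoff_time: float = 3.0,        # assumed default, mirrors the old loop
    backoff_multiplier: float = 1.5,  # assumed default, mirrors the old loop
    on_exception_callback: Optional[Callable[[Exception, float], None]] = None,
):
    """Retry the wrapped callable whenever one of `on_exceptions` is raised;
    `max_retries=None` retries forever."""
    def decorator(func: Callable):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            sleep_time = backoff_time
            attempt = 0
            while True:
                try:
                    return func(*args, **kwargs)
                except tuple(on_exceptions) as e:
                    attempt += 1
                    if max_retries is not None and attempt > max_retries:
                        raise
                    if on_exception_callback is not None:
                        on_exception_callback(e, sleep_time)
                    time.sleep(sleep_time)
                    sleep_time *= backoff_multiplier
        return wrapper
    return decorator
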
@register_model("anthropic")
......@@ -141,6 +148,14 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
raise NotImplementedError("No support for logits.")
def generate_until(self, requests) -> List[str]:
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`",
)
if not requests:
return []
......
import random
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
......
import requests
import logging
import time
from tqdm import tqdm
import requests
from requests.exceptions import RequestException
from tqdm import tqdm
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
logger = logging.getLogger(__name__)
......
import copy
import os
from packaging import version
from pathlib import Path
from typing import List, Literal, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import transformers
from accelerate import Accelerator, DistributedType, find_executable_batch_size
from packaging import version
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
from tqdm import tqdm
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
)
from peft import __version__ as PEFT_VERSION, PeftModel
import copy
from collections import defaultdict
from tqdm import tqdm
from pathlib import Path
import torch.nn.functional as F
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.utils import Collator, stop_sequences_criteria
from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
from accelerate import Accelerator, find_executable_batch_size, DistributedType
from typing import List, Optional, Union, Tuple, Literal
eval_logger = utils.eval_logger
......@@ -107,9 +105,7 @@ class HFLM(LM):
eval_logger.warning(
"`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way."
)
assert (
not parallelize
), "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
assert not parallelize, "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
self._model = pretrained
self._device = self._model.device
......@@ -145,9 +141,7 @@ class HFLM(LM):
+ [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+ ["mps", "mps:0"]
)
if device:
if device not in device_list:
device = int(device)
if device and device in device_list:
self._device = torch.device(device)
eval_logger.info(f"Using device '{device}'")
if device in ("mps", "mps:0") and version.parse(
......@@ -170,7 +164,7 @@ class HFLM(LM):
f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
)
# TODO: include in warning that `load_in_8bit` etc. affect this too
self._device = device
self._device = torch.device(device)
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
......@@ -207,7 +201,7 @@ class HFLM(LM):
self.model.eval()
self.model.tie_weights()
if (gpus >= 1 or self.device.type == "mps") and isinstance(pretrained, str):
if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"):
if not (parallelize or autogptq or ("device_map" in kwargs)):
# place model onto device requested manually,
# if not using HF Accelerate or device_map
......@@ -238,7 +232,7 @@ class HFLM(LM):
elif self.tokenizer.eos_token:
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
else:
if "Qwen" in pretrained:
if self.config.model_type == "qwen":
# Qwen's trust_remote_code tokenizer does not allow for adding special tokens
self.tokenizer.pad_token = "<|endoftext|>"
else:
......@@ -279,10 +273,13 @@ class HFLM(LM):
"with 'accelerate launch *script*'. "
f"Current run will proceed with {accelerator.num_processes} devices."
)
assert accelerator.distributed_type in [
DistributedType.FSDP,
DistributedType.MULTI_GPU,
], "Unsupported distributed type provided. Only DDP and FSDP are supported."
assert (
accelerator.distributed_type
in [
DistributedType.FSDP,
DistributedType.MULTI_GPU,
]
), "Unsupported distributed type provided. Only DDP and FSDP are supported."
if accelerator.distributed_type == DistributedType.FSDP:
self._model = accelerator.prepare(self.model)
else:
......@@ -417,7 +414,6 @@ class HFLM(LM):
revision: str = "main",
trust_remote_code: bool = False,
) -> None:
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
......@@ -635,7 +631,7 @@ class HFLM(LM):
padding_side: str = "left",
left_truncate_len: int = None,
truncation: bool = False,
) -> Tuple[List[int], List[int]]:
) -> Tuple[torch.Tensor, torch.Tensor]:
# encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
old_padding_side = self.tokenizer.padding_side
self.tokenizer.padding_side = padding_side
......@@ -751,8 +747,9 @@ class HFLM(LM):
for context, continuation in [req.args for req in requests]:
if context == "":
# end of text as context
context_enc, continuation_enc = [self.eot_token_id], self.tok_encode(
continuation
context_enc, continuation_enc = (
[self.eot_token_id],
self.tok_encode(continuation),
)
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
......@@ -844,6 +841,7 @@ class HFLM(LM):
res = []
def _collate(x):
"""Defines the key for the sorted method"""
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
......@@ -854,26 +852,27 @@ class HFLM(LM):
toks = x[1] + x[2]
return -len(toks), tuple(toks)
re_ord = utils.Reorderer(requests, _collate)
re_ord = Collator(requests, sort_fn=_collate)
n_reordered_requests = len(re_ord.get_reordered())
# automatic (variable) batch size detection for vectorization
# pull longest context sample from request
chunks = utils.chunks(
re_ord.get_reordered(),
n=self.batch_size
n_reordered_requests = len(re_ord)
batch_size = (
self.batch_size
if self.batch_size != "auto"
else override_bs
if override_bs is not None
else 0,
fn=self._batch_scheduler
else 0
)
batch_fn = (
self._batch_scheduler
if self.batch_size == "auto"
and n_reordered_requests > 0
and not override_bs
else None,
else None
)
chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn)
pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
for chunk in chunks:
inps = []
......@@ -995,9 +994,7 @@ class HFLM(LM):
greedy_tokens = logits.argmax(dim=-1)
cont_toks = torch.tensor(
cont_toks, dtype=torch.long, device=self.device
).unsqueeze(
0
) # [1, seq]
).unsqueeze(0) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
# Obtain log-probs at the corresponding continuation token indices
......@@ -1019,10 +1016,10 @@ class HFLM(LM):
return re_ord.get_original(res)
def generate_until(self, requests: List[Instance]) -> List[str]:
res = defaultdict(list)
re_ords = {}
res = []
def _collate(x):
"""Defines the key for the sorted method"""
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
......@@ -1032,14 +1029,6 @@ class HFLM(LM):
toks = self.tok_encode(x[0])
return -len(toks), x[0]
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
grouper = utils.Grouper(requests, lambda x: str(x.args[1]))
for key, reqs in grouper.get_grouped().items():
# within each set of reqs for given kwargs, we reorder by token length, descending.
re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)
pbar = tqdm(total=len(requests), disable=(self.rank != 0))
if self.batch_size == "auto":
# using rolling window with maximum context
......@@ -1048,98 +1037,102 @@ class HFLM(LM):
print(f"Determined Largest batch size: {batch_size}")
adaptive_batch_size = batch_size
# for each different set of kwargs, we execute all requests, by batch.
for key, re_ord in re_ords.items():
chunks = utils.chunks(
re_ord.get_reordered(),
n=self.batch_size
if self.batch_size != "auto"
else adaptive_batch_size
if adaptive_batch_size is not None
else 0,
fn=self._batch_scheduler
if self.batch_size == "auto" and not adaptive_batch_size
else None,
)
for chunk in chunks:
contexts, all_gen_kwargs = zip(*chunk)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
until = None
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {kwargs}"
)
if not until:
until = [self.tok_decode(self.eot_token_id)]
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
batch_size = (
self.batch_size
if self.batch_size != "auto"
else adaptive_batch_size
if adaptive_batch_size is not None
else 0
)
batch_fn = (
self._batch_scheduler
if self.batch_size == "auto" and not adaptive_batch_size
else None
)
# set the max length in tokens of inputs ("context_enc")
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
# max len for inputs = encoder's whole max_length
max_ctx_len = self.max_length
# encode, pad, and truncate contexts for this batch
context_enc, attn_masks = self.tok_batch_encode(
contexts,
left_truncate_len=max_ctx_len,
truncation=self.truncation,
)
context_enc = context_enc.to(self.device)
attn_masks = attn_masks.to(self.device)
if "max_length" not in kwargs:
kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
# perform batched generation
cont = self._model_generate(
context=context_enc,
attention_mask=attn_masks,
stop=until,
**kwargs,
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
re_ords = Collator([reg.args for reg in requests], _collate, grouping=True)
chunks = re_ords.get_batched(n=batch_size, batch_fn=batch_fn)
for chunk in chunks:
contexts, all_gen_kwargs = zip(*chunk)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
until = None
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {kwargs}"
)
if not until:
until = [self.tok_decode(self.eot_token_id)]
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
cont_toks_list = cont.tolist()
for cont_toks, context in zip(cont_toks_list, contexts):
# discard context + left-padding toks if using causal decoder-only LM
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
cont_toks = cont_toks[context_enc.shape[1] :]
# set the max length in tokens of inputs ("context_enc")
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
# max len for inputs = encoder's whole max_length
max_ctx_len = self.max_length
# encode, pad, and truncate contexts for this batch
context_enc, attn_masks = self.tok_batch_encode(
contexts,
left_truncate_len=max_ctx_len,
truncation=self.truncation,
)
context_enc = context_enc.to(self.device)
attn_masks = attn_masks.to(self.device)
s = self.tok_decode(cont_toks)
if "max_length" not in kwargs:
kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
for term in until:
if len(term) > 0:
# ignore '' separator,
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
s = s.split(term)[0]
# perform batched generation
cont = self._model_generate(
context=context_enc,
attention_mask=attn_masks,
stop=until,
**kwargs,
)
res[key].append(s)
cont_toks_list = cont.tolist()
for cont_toks, context in zip(cont_toks_list, contexts):
# discard context + left-padding toks if using causal decoder-only LM
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
cont_toks = cont_toks[context_enc.shape[1] :]
self.cache_hook.add_partial(
"generate_until", (context, gen_kwargs), s
)
pbar.update(1)
# reorder this group of results back to original unsorted form
res[key] = re_ord.get_original(res[key])
s = self.tok_decode(cont_toks)
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
for term in until:
if len(term) > 0:
# ignore '' separator,
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
s = s.split(term)[0]
res.append(s)
self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
pbar.update(1)
# reorder this group of results back to original unsorted form
res = re_ords.get_original(res)
pbar.close()
return grouper.get_original(res)
return res
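
Both the loglikelihood and generation paths above now route sorting, grouping, and batching through the `Collator` helper from `lm_eval.utils` (replacing `utils.Grouper`, `utils.Reorderer`, and `utils.chunks`). The following is a self-contained stand-in for that sort, batch, restore-order flow, written only from the calls visible in this diff (`Collator(arr, sort_fn, grouping=...)`, `len(...)`, `get_batched(n, batch_fn)`, `get_original(...)`); it is not the library's actual implementation, and the batch-scheduler signature is a placeholder.

from typing import Any, Callable, Iterator, List, Optional


class SimpleCollator:
    """Sort requests by a key, hand them out in batches, then map results back
    to their original order (grouping by generation kwargs is omitted here)."""

    def __init__(self, arr: List[Any], sort_fn: Callable[[Any], Any], grouping: bool = False):
        # remember each item's original index so results can be re-ordered later
        self._indexed = sorted(enumerate(arr), key=lambda pair: sort_fn(pair[1]))
        self._grouping = grouping

    def __len__(self) -> int:
        return len(self._indexed)

    def get_batched(
        self, n: int = 1, batch_fn: Optional[Callable[[int, List[Any]], int]] = None
    ) -> Iterator[List[Any]]:
        items = [x for _, x in self._indexed]
        i = 0
        while i < len(items):
            # a batch scheduler may grow the batch size as the (sorted) sequences get shorter
            size = batch_fn(i, items) if batch_fn is not None else n
            size = max(size, 1)
            yield items[i : i + size]
            i += size

    def get_original(self, results: List[Any]) -> List[Any]:
        ordered: List[Any] = [None] * len(results)
        for (orig_idx, _), res in zip(self._indexed, results):
            ordered[orig_idx] = res
        return ordered
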
from typing import Optional, Union
import torch
from lm_eval import utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
@register_model("mamba_ssm")
class MambaLMWrapper(HFLM):
def __init__(
self,
pretrained="state-spaces/mamba-130m",
**kwargs,
) -> None:
"""
Mamba (via the `mamba_ssm` package) supports the following args:
```
d_model: int,
n_layer: int,
vocab_size: int,
initializer_cfg=None,
pad_vocab_size_multiple: int = 1,
ssm_cfg=None,
norm_epsilon: float = 1e-5,
rms_norm: bool = False,
fused_add_norm=False,
residual_in_fp32=False,
```
See https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py#L175 for more info.
The above can all be passed via `--model_args` or to this __init__() directly
but we recommend placing many of these within the config.json file uploaded alongside your
Mamba model to the HF Hub instead.
All other HuggingFace from_pretrained() kwargs
such as those related to
`parallelize=True`, PEFT, autoGPTQ,
or any sub-configurations of these advanced args,
are unsupported by the `mamba_ssm` package.
The HFLM arguments
`backend`, `revision`, `subfolder`, `tokenizer`, `truncation`, `max_length`,
`device`, `dtype`, `batch_size`, `max_batch_size`, `trust_remote_code`, `use_fast_tokenizer`
are all supported by Mamba where they do not conflict
with Mamba-specific restrictions such as causal LMs only.
"""
if "backend" in kwargs:
# mamba currently only supports causal models
assert kwargs["backend"] == "causal"
super().__init__(
pretrained=pretrained,
# set appropriate defaults for tokenizer, max length, etc
backend=kwargs.get("backend", "causal"),
tokenizer=kwargs.get("tokenizer", "EleutherAI/gpt-neox-20b"),
max_length=kwargs.get("max_length", 2048),
**kwargs,
)
def _get_config(
self,
pretrained: str,
**kwargs,
) -> None:
try:
from mamba_ssm.utils.hf import load_config_hf # noqa: F811
except ModuleNotFoundError:
raise Exception(
"attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
)
self._config = load_config_hf(pretrained)
def _create_model(
self,
pretrained: str,
dtype: Optional[Union[str, torch.dtype]] = "float16",
# no `parallelize=True` options
# no PEFT and quantization options
# Mamba does not support arbitrary HF from_pretrained() args
**kwargs,
) -> None:
try:
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel # noqa: F811
except ModuleNotFoundError:
raise Exception(
"attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
)
self._model = MambaLMHeadModel.from_pretrained(
pretrained,
device=self._device,
dtype=torch.float16 if dtype == "auto" else utils.get_dtype(dtype),
**kwargs,
)
def _model_generate(self, context, max_length, stop, **generation_kwargs):
for key in ("do_sample", "attention_mask"):
if key in generation_kwargs:
generation_kwargs.pop(key)
# mamba's custom GenerationMixin currently does not support
# passing stopping criteria.
# for the time being, we simply generate to max length,
# then truncate (equivalent result)
# -- this should be revisited to speed up generation
# stopping_criteria = stop_sequences_criteria(
# self.tokenizer, stop, 1, context.shape[0]
# )
return self.model.generate(
input_ids=context,
max_length=max_length,
# stopping_criteria=stopping_criteria,
# pad_token_id=self.tokenizer.pad_token_id,
# use_cache=True,
**generation_kwargs,
)
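
Since `mamba_ssm`'s generation loop cannot take stopping criteria (per the comment above), generation runs to `max_length` and stop sequences are applied afterwards, just as `generate_until` trims decoded text with `until`. A small hypothetical helper illustrating that post-hoc truncation (not part of this diff):

from typing import List


def truncate_at_stop_sequences(text: str, stop: List[str]) -> str:
    """Keep only the text generated before the earliest stop sequence."""
    for term in stop:
        if term:  # ignore '' separators (e.g. an EOT token that decodes to '')
            text = text.split(term)[0]
    return text


# truncate_at_stop_sequences("42\nQ: next question", ["\n", "Q:"]) returns "42"
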
import os
import time
from typing import List, Tuple, Optional
import copy
import os
from collections import defaultdict
from importlib.util import find_spec
from typing import List, Optional, Tuple
from tqdm import tqdm
from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.utils import retry_on_specific_exceptions
def get_result(response, ctxlen: int) -> Tuple[float, bool]:
......@@ -44,24 +45,28 @@ def oa_completion(**kwargs):
Retry with back-off until they respond
"""
try:
import openai, tiktoken # noqa: E401
except ModuleNotFoundError:
if not find_spec("openai") or not find_spec("tiktoken"):
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. "
"Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`"
)
else:
import openai
backoff_time = 3
while True:
try:
return openai.completions.create(**kwargs)
except openai.OpenAIError:
import traceback
def _exception_callback(e: Exception, sleep_time: float) -> None:
import traceback
traceback.print_exc()
traceback.print_exc()
time.sleep(backoff_time)
backoff_time *= 1.5
@retry_on_specific_exceptions(
on_exceptions=[openai.OpenAIError],
max_retries=None, # retry forever, consider changing
on_exception_callback=_exception_callback,
)
def completion():
return openai.completions.create(**kwargs)
return completion()
@register_model("openai-completions")
......@@ -88,7 +93,8 @@ class OpenaiCompletionsLM(LM):
super().__init__()
self.seed = seed
try:
import openai, tiktoken # noqa: E401
import openai # noqa: E401
import tiktoken
except ModuleNotFoundError:
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
......@@ -102,7 +108,7 @@ class OpenaiCompletionsLM(LM):
self._max_gen_toks = max_gen_toks
self._max_length = max_length
# Read from environment variable OPENAI_API_SECRET_KEY
# Read from environment variable OPENAI_API_KEY
openai.api_key = os.environ["OPENAI_API_KEY"]
@property
......@@ -154,8 +160,9 @@ class OpenaiCompletionsLM(LM):
for context, continuation in [req.args for req in requests]:
if context == "":
# end of text as context
context_enc, continuation_enc = [self.eot_token_id], self.tok_encode(
continuation
context_enc, continuation_enc = (
[self.eot_token_id],
self.tok_encode(continuation),
)
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
......@@ -247,6 +254,7 @@ class OpenaiCompletionsLM(LM):
list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE))
):
inps = []
self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks)
for context, _ in chunk:
context_enc = self.tok_encode(context)
inp = context_enc[-(self.max_length - self.max_gen_toks) :]
......@@ -326,68 +334,68 @@ def oa_chat_completion(client, **kwargs):
Retry with back-off until they respond
"""
try:
import openai, tiktoken # noqa: E401
except ModuleNotFoundError:
if not find_spec("openai") or not find_spec("tiktoken"):
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. "
"Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`"
)
else:
import openai
async def _get_completions(**kwargs):
chat_completions = await client.chat.completions.create(**kwargs)
return chat_completions
def _exception_callback(e: Exception, sleep_time: float) -> None:
import traceback
backoff_time = 3
while True:
try:
return client.chat.completions.create(**kwargs)
except openai.OpenAIError:
import traceback
traceback.print_exc()
@retry_on_specific_exceptions(
on_exceptions=[openai.OpenAIError],
max_retries=None, # retry forever, consider changing
on_exception_callback=_exception_callback,
)
def completion():
return client.chat.completions.create(**kwargs)
traceback.print_exc()
time.sleep(backoff_time)
backoff_time *= 1.5
return completion()
@register_model("openai-chat-completions")
@register_model("openai-chat-completions", "local-chat-completions")
class OpenaiChatCompletionsLM(LM):
def __init__(
self, model: str = "gpt-3.5-turbo", truncate: bool = False, batch_size: int = 1
self,
model: str = "gpt-3.5-turbo", # GPT model or Local model using HuggingFace model paths
base_url: str = None,
truncate: bool = False,
**kwargs,
) -> None:
"""
:param model: str
OpenAI API model name (e.g. gpt-3.5-turbo), or a HuggingFace model path when
targeting a locally-hosted, OpenAI-compatible server. Implements an
OpenAI-style chat completion API for both OpenAI and local models,
using the **gen_kwargs passed on init
:param truncate: bool
Truncate input if too long (if False and input is too long, throw error)
"""
super().__init__()
try:
import openai, tiktoken # noqa: E401
import openai # noqa: E401
except ModuleNotFoundError:
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
)
self.model = model
self.frequency_penalty = 0
self.logit_bias = None
self.n = 1
self.presence_penalty = 0
self.temperature = 1
self.top_p = 1
self.tokenizer = tiktoken.encoding_for_model(self.model)
self.vocab_size = self.tokenizer.n_vocab
self.base_url = base_url
self.truncate = truncate
self.end_of_text_token_id = self.tokenizer.eot_token
# Read from environment variable OPENAI_API_KEY
self.client = openai.OpenAI() # openai.AsyncOpenAI()
@property
def eot_token_id(self):
return self.end_of_text_token_id
# Set to EMPTY for local
if self.base_url:
self.client = openai.OpenAI(base_url=self.base_url)
else:
self.client = openai.OpenAI() # openai.AsyncOpenAI()
@property
def max_length(self) -> int:
......@@ -408,53 +416,19 @@ class OpenaiChatCompletionsLM(LM):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def tok_encode(self, string: str) -> List[int]:
return self.tokenizer.encode(string)
def tok_decode(self, tokens: List[int]) -> str:
return self.tokenizer.decode(tokens)
def _encode_pair(
self, context: str, continuation: str
) -> Tuple[List[int], List[int]]:
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
def generate_until(self, requests) -> List[str]:
res = defaultdict(list)
re_ords = {}
def _collate(x):
toks = self.tok_encode(x[0])
return -len(toks), x[0]
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
grouper = utils.Grouper(requests, lambda x: str(x.args[1]))
for key, reqs in grouper.get_grouped().items():
# within each set of reqs for given kwargs, we reorder by token length, descending.
re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)
def sameuntil_chunks(xs, size):
ret = []
lastuntil = xs[0][1]
for x in xs:
if len(ret) >= size or x[1] != lastuntil:
yield ret, lastuntil
ret = []
lastuntil = x[1]
ret.append(x)
if ret:
yield ret, lastuntil
re_ords[key] = utils.Reorderer(
[req.args for req in reqs], lambda x: (-len(x[0]), x[0])
)
pbar = tqdm(total=len(requests), disable=(self.rank != 0))
for key, re_ord in re_ords.items():
......@@ -468,37 +442,26 @@ class OpenaiChatCompletionsLM(LM):
gen_kwargs = all_gen_kwargs[0]
until = None
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
if isinstance(kwargs := copy.deepcopy(gen_kwargs), dict):
if "do_sample" in kwargs.keys():
kwargs.pop("do_sample")
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
f"Expected repr(kwargs['until']) to be of type Union[str, list] but got {until}"
)
kwargs["stop"] = until
kwargs["max_tokens"] = kwargs.pop("max_gen_toks", self.max_gen_toks)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {kwargs}"
f"Expected repr(kwargs) to be of type repr(dict) but got {kwargs}"
)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
response = oa_chat_completion(
client=self.client,
messages=inps,
model=self.model,
frequency_penalty=self.frequency_penalty,
# logit_bias=self.logit_bias,
max_tokens=max_gen_toks,
n=self.n,
presence_penalty=self.presence_penalty,
temperature=self.temperature,
top_p=self.top_p,
client=self.client, messages=inps, model=self.model, **kwargs
)
for resp, (context, args_) in zip(response.choices, chunk):
......
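
With the dual registration `@register_model("openai-chat-completions", "local-chat-completions")` and the new `base_url` argument, the same class can target any OpenAI-compatible server. A hedged usage sketch follows; `get_model` is assumed to be the registry accessor, and the endpoint, key value, and model name are placeholders, not values from this diff.

import os

from lm_eval.api.registry import get_model  # assumed accessor for registered model classes

# Local OpenAI-compatible servers typically accept any non-empty key ("EMPTY" by convention).
os.environ["OPENAI_API_KEY"] = "EMPTY"

LocalChat = get_model("local-chat-completions")
lm = LocalChat(
    model="my-org/my-local-model",        # placeholder HuggingFace-style model name
    base_url="http://localhost:8000/v1",  # placeholder OpenAI-compatible endpoint
)
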
......@@ -13,11 +13,13 @@ Homepage: https://textsynth.com/index.html
"""
import logging
import os
import requests as _requests
import time
from tqdm import tqdm
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.utils import retry_on_specific_exceptions
logger = logging.getLogger(__name__)
......@@ -27,21 +29,26 @@ def textsynth_completion(**kwargs):
"""Query TextSynth API for completion.
Retry with back-off until they respond.
"""
backoff_time = 3
while True:
try:
return _requests.post(**kwargs)
except _requests.exceptions.RequestException:
import traceback
traceback.print_exc()
time.sleep(backoff_time)
backoff_time *= 1.5
def _exception_callback(e: Exception, sleep_time: float) -> None:
import traceback
traceback.print_exc()
@retry_on_specific_exceptions(
on_exceptions=[_requests.exceptions.RequestException],
max_retries=None, # retry forever, consider changing
on_exception_callback=_exception_callback,
)
def completion():
return _requests.post(**kwargs)
return completion()
@register_model("textsynth")
class TextSynthLM(LM):
def __init__(self, engine, truncate: bool = False) -> None:
def __init__(self, engine, truncate: bool = False, **kwargs) -> None:
"""
:param engine: str
TextSynth API engine (e.g. `gptj_6B`)
......@@ -149,7 +156,7 @@ class TextSynthLM(LM):
self.cache_hook.add_partial("generate_until", (inp, request_args), s)
else:
logger.error(
f"The following response does not contain generated `text`. "
"The following response does not contain generated `text`. "
"Got:\n{resp}"
)
assert False
......
import copy
from collections import defaultdict
from typing import List, Tuple, Optional, Literal, Union, Any
from transformers import AutoTokenizer
from importlib.util import find_spec
from typing import List, Literal, Optional, Tuple, Union
from tqdm import tqdm
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
import copy
from tqdm import tqdm
from lm_eval.api.registry import register_model
from lm_eval import utils
try:
from vllm import LLM, SamplingParams
from ray.util.multiprocessing import Pool
from vllm import LLM, SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer
except ModuleNotFoundError:
pass
......@@ -54,12 +57,10 @@ class VLLM(LM):
):
super().__init__()
try:
import vllm
except ModuleNotFoundError:
if not find_spec("vllm"):
raise Exception(
"attempted to use 'vllm' LM type, but package `vllm` is not installed. \
please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`",
"attempted to use 'vllm' LM type, but package `vllm` is not installed. "
"Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
)
assert "cuda" in device or device is None, "vLLM only supports CUDA"
......@@ -95,7 +96,8 @@ please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
trust_remote_code=trust_remote_code,
tokenizer_revision=tokenizer_revision,
)
self.batch_size = batch_size
self.batch_size = "auto" if batch_size.startswith("auto:") else batch_size
self._max_gen_toks = max_gen_toks
@property
......@@ -193,8 +195,9 @@ please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
for context, continuation in [req.args for req in requests]:
if context == "":
# end of text as context
context_enc, continuation_enc = [self.eot_token_id], self.tok_encode(
continuation
context_enc, continuation_enc = (
[self.eot_token_id],
self.tok_encode(continuation),
)
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
......
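
The HF, OpenAI-completions, and vLLM backends above all share the same request-encoding pattern: an empty context is replaced by the EOT token, and otherwise `_encode_pair` tokenizes context and continuation jointly so that trailing whitespace stays attached to the continuation (most BPE tokenizers fold a leading space into the following token). A hedged sketch of that pairing logic with a caller-supplied `tok_encode`:

from typing import Callable, List, Tuple


def encode_pair(
    context: str, continuation: str, tok_encode: Callable[[str], List[int]]
) -> Tuple[List[int], List[int]]:
    # move trailing spaces from the context onto the continuation so the joint
    # encoding matches how the model will actually see the text at inference time
    n_spaces = len(context) - len(context.rstrip())
    if n_spaces > 0:
        continuation = context[-n_spaces:] + continuation
        context = context[:-n_spaces]

    whole_enc = tok_encode(context + continuation)
    context_enc = tok_encode(context)
    # the continuation's token ids are whatever the joint encoding adds after the context
    continuation_enc = whole_enc[len(context_enc):]
    return context_enc, continuation_enc
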
......@@ -69,7 +69,6 @@ def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None
def load_prompt_list(
use_prompt: str, dataset_name=None, subset_name=None, yaml_path=None, **kwargs
):
category_name, prompt_name = use_prompt.split(":")
if category_name == "promptsource":
......@@ -113,7 +112,6 @@ class PromptString:
self.prompt_string = prompt_string
def apply(self, doc):
doc_to_text = self.prompt_string["doc_to_text"]
doc_to_target = self.prompt_string["doc_to_target"]
......
......@@ -131,6 +131,9 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None:
"""
Walk `task_dir` and register every task config found; configs that fail to load are logged and skipped.
"""
# Track whether any tasks failed during loading
import_fail = False
for root, subdirs, file_list in os.walk(task_dir):
# if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
for f in file_list:
......@@ -155,20 +158,27 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None:
# Log this silently and show it only when
# the user defines the appropriate verbosity.
except ModuleNotFoundError as e:
except (ImportError, ModuleNotFoundError) as e:
import_fail = True
eval_logger.debug(
f"{yaml_path}: {e}. Config will not be added to registry."
)
except Exception as error:
import traceback
eval_logger.debug(
"Failed to load config in\n"
eval_logger.warning(
"Unexpected error loading config in\n"
f" {yaml_path}\n"
" Config will not be added to registry\n"
f" Error: {error}\n"
f" Traceback: {traceback.format_exc()}"
)
if import_fail:
eval_logger.warning(
"Some tasks could not be loaded due to missing dependencies."
" Run with `--verbosity DEBUG` for full details."
)
return 0
......@@ -180,7 +190,6 @@ def include_path(task_dir):
def initialize_tasks(verbosity="INFO"):
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
......
......@@ -23,4 +23,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
version: 1.0
......@@ -20,4 +20,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
version: 1.0
......@@ -13,4 +13,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
version: 1.0
......@@ -11,4 +11,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
version: 1.0
......@@ -17,4 +17,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
- version: 0.0
version: 0.0
......@@ -24,7 +24,6 @@ def parse_args():
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
......@@ -37,7 +36,6 @@ if __name__ == "__main__":
dataset_path = "lukaemon/bbh"
for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()):
resp = requests.get(
f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/cot-prompts/{task}.txt"
).content.decode("utf-8")
......
......@@ -27,4 +27,4 @@ filter_list:
- function: "take_first"
num_fewshot: 0
metadata:
- version: 1.0
version: 1.0
......@@ -24,4 +24,4 @@ filter_list:
- function: "take_first"
num_fewshot: 0
metadata:
- version: 0
version: 0
......@@ -18,4 +18,4 @@ generation_kwargs:
temperature: 0.0
num_fewshot: 0
metadata:
- version: 0
version: 0
......@@ -18,4 +18,4 @@ generation_kwargs:
temperature: 0.0
num_fewshot: 0
metadata:
- version: 0
version: 0