Unverified commit 904bba12 authored by Baber Abbasi, committed by GitHub

Pin datasets < 4.0.0 (#3172)

* Fix: pin datasets < 4.0

* fix

* update type hints in HF

* fix hellaswag path
parent 2eea3f50
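
The compatibility pattern applied in both hunks below (the CLI entrypoint and the task downloader) is to gate version-specific behaviour on the installed `datasets` version rather than relying on the pin alone. A minimal standalone sketch of the same idea, with an illustrative helper name that is not part of this commit:

    import datasets
    from datasets import load_dataset
    from packaging.version import parse as vparse

    def load_task_dataset(path: str, **dataset_kwargs):
        # Mirror the download() hunk: on datasets >= 4.0.0 (no loading-script
        # support), drop trust_remote_code before forwarding the kwargs.
        if vparse(datasets.__version__) >= vparse("4.0.0"):
            dataset_kwargs.pop("trust_remote_code", None)
        return load_dataset(path, **dataset_kwargs)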
@@ -433,7 +433,9 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
         # because it's already been determined based on the prior env var before launching our
         # script--`datasets` gets imported by lm_eval internally before these lines can update the env.
         import datasets
+        from packaging.version import parse as vparse
+        if vparse(datasets.__version__) < vparse("4.0.0"):
-        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
+            datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
     if isinstance(args.model_args, dict):
...
@@ -981,6 +981,10 @@ class ConfigurableTask(Task):
     def download(
         self, dataset_kwargs: Optional[Dict[str, Any]] = None, **kwargs
     ) -> None:
+        from packaging.version import parse as vparse
+
+        if dataset_kwargs and vparse(datasets.__version__) >= vparse("4.0.0"):
+            dataset_kwargs.pop("trust_remote_code", None)
         if isinstance(self.config.custom_dataset, Callable):
             eval_logger.warning(
                 f"{self.config.task}: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager."
...
@@ -3,9 +3,10 @@ from __future__ import annotations
 import copy
 import logging
 import os
+from collections.abc import Iterator, Sequence
 from datetime import timedelta
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Literal
 import jinja2
 import torch
@@ -19,6 +20,7 @@ from accelerate import (
 from accelerate.utils import get_max_memory
 from huggingface_hub import HfApi
 from packaging import version
+from packaging.version import parse as vparse
 from tqdm import tqdm
 from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
@@ -26,7 +28,6 @@ from transformers.models.auto.modeling_auto import (
 )
 from lm_eval import utils
-from lm_eval.api.instance import Instance
 from lm_eval.api.model import TemplateLM
 from lm_eval.api.registry import register_model
 from lm_eval.models.utils import (
@@ -42,15 +43,17 @@ from lm_eval.models.utils import (
 if TYPE_CHECKING:
-    from transformers.quantizers import AutoQuantizationConfig
+    from transformers.quantizers.auto import AutoQuantizationConfig
+
+    from lm_eval.api.instance import Instance
 eval_logger = logging.getLogger(__name__)
+TOKENIZER_INFINITY = 1000000000000000019884624838656
 @register_model("hf-auto", "hf", "huggingface")
 class HFLM(TemplateLM):
-    """
-    An abstracted Huggingface model class. Enables usage with both models of
+    """An abstracted Huggingface model class. Enables usage with both models of
     `transformers.AutoModelForCausalLM` and `transformers.AutoModelForSeq2SeqLM` classes.
     Supports data-parallel multi-GPU with HF Accelerate.
@@ -61,48 +64,45 @@ class HFLM(TemplateLM):
     def __init__(
         self,
-        pretrained: Union[str, transformers.PreTrainedModel],
+        pretrained: str | transformers.PreTrainedModel,
         backend: Literal["default", "causal", "seq2seq"] = "default",
         # override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
-        revision: Optional[str] = "main",
+        revision: str | None = "main",
         subfolder: str = "",
-        tokenizer: Optional[
-            Union[
-                str,
-                transformers.PreTrainedTokenizer,
-                transformers.PreTrainedTokenizerFast,
-            ]
-        ] = None,
-        truncation: Optional[bool] = False,
+        tokenizer: str
+        | transformers.PreTrainedTokenizer
+        | transformers.PreTrainedTokenizerFast
+        | None = None,
+        truncation: bool | None = False,
         logits_cache: bool = True,
-        max_length: Optional[int] = None,
-        device: Optional[str] = "cuda",
-        dtype: Optional[Union[str, torch.dtype]] = "auto",
-        softmax_dtype: Optional[Union[str, torch.dtype]] = None,
-        mixed_precision_dtype: Optional[Union[str, torch.dtype]] = None,
-        batch_size: Optional[Union[int, str]] = 1,
-        max_batch_size: Optional[int] = 64,
-        trust_remote_code: Optional[bool] = False,
-        use_fast_tokenizer: Optional[bool] = True,
-        add_bos_token: Optional[bool] = False,
-        prefix_token_id: Optional[int] = None,
+        max_length: int | None = None,
+        device: str | None = "cuda",
+        dtype: str | torch.dtype | None = "auto",
+        softmax_dtype: str | torch.dtype | None = None,
+        mixed_precision_dtype: str | torch.dtype | None = None,
+        batch_size: int | str | None = 1,
+        max_batch_size: int | None = 64,
+        trust_remote_code: bool | None = False,
+        use_fast_tokenizer: bool | None = True,
+        add_bos_token: bool | None = False,
+        prefix_token_id: int | None = None,
         # arguments used for splitting a model across GPUs naively.
         # only used if `parallelize=True`.
-        parallelize: Optional[bool] = False,
-        max_memory_per_gpu: Optional[Union[int, str]] = None,
-        max_cpu_memory: Optional[Union[int, str]] = None,
-        offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
+        parallelize: bool | None = False,
+        max_memory_per_gpu: int | str | None = None,
+        max_cpu_memory: int | str | None = None,
+        offload_folder: str | os.PathLike | None = "./offload",
         # PEFT, delta weights and quantization options
-        peft: Optional[str] = None,
-        delta: Optional[str] = None,
-        autogptq: Optional[Union[bool, str]] = False,
-        gptqmodel: Optional[bool] = False,
-        gguf_file: Optional[str] = None,
+        peft: str | None = None,
+        delta: str | None = None,
+        autogptq: bool | str | None = False,
+        gptqmodel: bool | None = False,
+        gguf_file: str | None = None,
         # end token for thinking, either the string or int token id.
         # splits to get response after this token (if provided).
-        think_end_token: Union[str, int, None] = None,
+        think_end_token: str | int | None = None,
         enable_thinking: bool | None = None,
-        chat_template_args: Optional[dict[str, Any]] = None,
+        chat_template_args: dict[str, Any] | None = None,
         **kwargs,
     ) -> None:
         super().__init__()
@@ -280,9 +280,10 @@ class HFLM(TemplateLM):
                 self.batch_size_per_gpu = int(batch_size)
         if isinstance(pretrained, str):
-            if gpus >= 1 or str(self.device) == "mps":
+            if (gpus >= 1 or str(self.device) == "mps") and not (
+                parallelize or autogptq or hasattr(self, "accelerator")
+            ):
                 # TODO: can remove this whole snippet except in the mps case, perhaps?
-                if not (parallelize or autogptq or hasattr(self, "accelerator")):
                 # place model onto device requested manually,
                 # if not using HF Accelerate or device_map
                 # or any other option that preloads model onto device
@@ -336,12 +337,12 @@ class HFLM(TemplateLM):
     def _get_accelerate_args(
         self,
-        parallelize: Optional[bool] = None,
-        device_map: Optional[str] = "auto",
-        max_memory_per_gpu: Optional[Union[int, str]] = None,
-        max_cpu_memory: Optional[Union[int, str]] = None,
-        offload_folder: Optional[str] = "./offload",
-        gpus: Optional[int] = None,
+        parallelize: bool | None = None,
+        device_map: str | None = "auto",
+        max_memory_per_gpu: int | str | None = None,
+        max_cpu_memory: int | str | None = None,
+        offload_folder: str | None = "./offload",
+        gpus: int | None = None,
     ) -> dict:
         """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
         num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
@@ -379,13 +380,8 @@ class HFLM(TemplateLM):
                 }
             else:  # Estimating the possible memory requirements
                 max_memory_all_gpus = get_max_memory()
-                if "cpu" in max_memory_all_gpus:
-                    del max_memory_all_gpus["cpu"]
-                if not hasattr(self, "accelerator"):
-                    max_memory_per_gpu_map = {
-                        k: v for k, v in max_memory_all_gpus.items()
-                    }
-                else:
+                max_memory_all_gpus.pop("cpu", None)
+                if hasattr(self, "accelerator"):
                     # use only 1 / num_processes of the GPUs if we are running under accelerate launch
                     max_memory_per_gpu_map = {
                         k: v
@@ -393,6 +389,9 @@ class HFLM(TemplateLM):
                         if k % num_local_processes
                         == (self.accelerator.process_index % num_local_processes)
                     }
+                else:
+                    max_memory_per_gpu_map = max_memory_all_gpus
             args["max_memory"] = max_memory_per_gpu_map
             args["device_map"] = "auto" if device_map is None else device_map
             eval_logger.info(
@@ -436,12 +435,12 @@ class HFLM(TemplateLM):
         return self._model
     @property
-    def eot_token_id(self):
+    def eot_token_id(self) -> int:
         # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
         return self.tokenizer.eos_token_id
     @property
-    def prefix_token_id(self):
+    def prefix_token_id(self) -> int:
         # it is used as prefix for loglikelihood
         if self.custom_prefix_token_id is not None:
             return self.custom_prefix_token_id
@@ -450,7 +449,7 @@ class HFLM(TemplateLM):
         return self.tokenizer.eos_token_id
     @property
-    def max_length(self):
+    def max_length(self) -> int:
         if self._max_length:  # if max length manually set, return it
             return self._max_length
         seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
@@ -458,7 +457,7 @@ class HFLM(TemplateLM):
             if hasattr(self.model.config, attr):
                 return getattr(self.model.config, attr)
         if hasattr(self.tokenizer, "model_max_length"):
-            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
+            if self.tokenizer.model_max_length == TOKENIZER_INFINITY:
                 return self._DEFAULT_MAX_LENGTH
             return self.tokenizer.model_max_length
         return self._DEFAULT_MAX_LENGTH
@@ -489,12 +488,12 @@ class HFLM(TemplateLM):
     def _get_backend(
         self,
-        config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
+        config: transformers.PretrainedConfig | transformers.AutoConfig,
         backend: Literal["default", "causal", "seq2seq"] = "default",
-        trust_remote_code: Optional[bool] = False,
+        trust_remote_code: bool | None = False,
     ) -> None:
-        """
-        Helper method during initialization.
+        """Helper method during initialization.
         Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used.
         sets `self.AUTO_MODEL_CLASS` appropriately if not already set.
@@ -506,9 +505,7 @@ class HFLM(TemplateLM):
         if backend != "default":
             # if we've settled on non-default backend, use that manually
-            if backend == "causal":
-                self.backend = backend
-            elif backend == "seq2seq":
+            if backend in ["causal", "seq2seq"]:
                 self.backend = backend
                 eval_logger.info(
                     f"Overrode HF model backend type, and using type '{self.backend}'"
@@ -516,7 +513,7 @@ class HFLM(TemplateLM):
         else:
             # determine and use the default HF backend for this model, based on its config + metadata.
             if (
-                getattr(config, "model_type")
+                getattr(config, "model_type", None)
                 in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
             ):
                 # first check if model type is listed under seq2seq models, since some
@@ -525,7 +522,7 @@ class HFLM(TemplateLM):
                 self.backend = "seq2seq"
                 eval_logger.debug(f"Using model type '{self.backend}'")
             elif (
-                getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+                getattr(config, "model_type", None) in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
             ):
                 self.backend = "causal"
                 eval_logger.debug(f"Using model type '{self.backend}'")
@@ -554,10 +551,10 @@ class HFLM(TemplateLM):
         pretrained: str,
         revision: str = "main",
         trust_remote_code: bool = False,
-        gguf_file: Optional[str] = None,
+        gguf_file: str | None = None,
         subfolder: str = "",
     ) -> None:
-        """Return the model config for HuggingFace models"""
+        """Return the model config for HuggingFace models."""
         self._config = transformers.AutoConfig.from_pretrained(
             pretrained,
             revision=revision,
@@ -569,29 +566,28 @@ class HFLM(TemplateLM):
     def _create_model(
         self,
         pretrained: str,
-        revision: Optional[str] = "main",
-        dtype: Optional[Union[str, torch.dtype]] = "auto",
-        trust_remote_code: Optional[bool] = False,
+        revision: str | None = "main",
+        dtype: str | torch.dtype | None = "auto",
+        trust_remote_code: bool | None = False,
         # arguments used for splitting a model across GPUs naively.
         # only used if `parallelize=True`.
         # (accelerate naive PP (device_map) options)
-        parallelize: Optional[bool] = False,
-        gpus: Optional[int] = None,
-        max_memory_per_gpu: Optional[Union[int, str]] = None,
-        max_cpu_memory: Optional[Union[int, str]] = None,
-        offload_folder: Optional[str] = "./offload",
+        parallelize: bool | None = False,
+        gpus: int | None = None,
+        max_memory_per_gpu: int | str | None = None,
+        max_cpu_memory: int | str | None = None,
+        offload_folder: str | None = "./offload",
         # PEFT, delta weights and quantization options
-        peft: Optional[str] = None,
-        delta: Optional[str] = None,
-        autogptq: Optional[Union[bool, str]] = False,
-        gptqmodel: Optional[bool] = False,
-        gguf_file: Optional[str] = None,
-        quantization_config: Optional["AutoQuantizationConfig"] = None,
+        peft: str | None = None,
+        delta: str | None = None,
+        autogptq: bool | str | None = False,
+        gptqmodel: bool | None = False,
+        gguf_file: str | None = None,
+        quantization_config: AutoQuantizationConfig | None = None,
         subfolder: str = "",
         **kwargs,
     ) -> None:
-        """
-        Initializes an HF or HF-compatible PreTrainedModel from scratch
+        """Initializes an HF or HF-compatible PreTrainedModel from scratch
         inside HFLM, using the kwargs passed into self.__init__().
         Also handles functionality such as AutoGPTQ usage and PEFT wrapping.
@@ -602,12 +598,12 @@ class HFLM(TemplateLM):
         please consider subclassing HFLM and overriding this and other methods as needed.
         """
-        model_kwargs = kwargs if kwargs else {}
+        model_kwargs = kwargs or {}
         model_kwargs.update(
             self._get_accelerate_args(
                 parallelize=parallelize,
-                device_map=kwargs.get("device_map", None),
+                device_map=kwargs.get("device_map"),
                 max_memory_per_gpu=max_memory_per_gpu,
                 max_cpu_memory=max_cpu_memory,
                 offload_folder=offload_folder,
@@ -616,16 +612,12 @@ class HFLM(TemplateLM):
             )
         if not autogptq and not gptqmodel:
-            if model_kwargs.get("load_in_4bit", None):
-                assert transformers.__version__ >= "4.30.0", (
+            if model_kwargs.get("load_in_4bit"):
+                assert vparse(transformers.__version__) >= vparse("4.30.0"), (
                     "load_in_4bit requires transformers >= 4.30.0"
                 )
-            if transformers.__version__ >= "4.30.0":
-                if model_kwargs.get("load_in_4bit", None):
-                    if model_kwargs.get("bnb_4bit_compute_dtype", None):
-                        model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
-                            model_kwargs["bnb_4bit_compute_dtype"]
-                        )
+                if compute_dtype := model_kwargs.get("bnb_4bit_compute_dtype"):
+                    model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(compute_dtype)
             self._model = self.AUTO_MODEL_CLASS.from_pretrained(
                 pretrained,
@@ -650,7 +642,7 @@ class HFLM(TemplateLM):
                 raise type(exception)(
                     "Tried to load auto_gptq, but auto-gptq is not installed ",
                     "please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
-                )
+                ) from exception
             self._model = AutoGPTQForCausalLM.from_quantized(
                 pretrained,
@@ -669,7 +661,7 @@ class HFLM(TemplateLM):
                 raise type(exception)(
                     "Tried to load gptqmodel, but gptqmodel is not installed ",
                     "please install gptqmodel via `pip install gptqmodel --no-build-isolation` or `pip install lm-eval[gptqmodel] --no-build-isolation`",
-                )
+                ) from exception
             self._model = GPTQModel.from_quantized(
                 pretrained, trust_remote_code=trust_remote_code, **model_kwargs
@@ -684,8 +676,9 @@ class HFLM(TemplateLM):
             from peft import PeftModel
             from peft import __version__ as PEFT_VERSION
-            if model_kwargs.get("load_in_4bit", None):
-                if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
+            if model_kwargs.get("load_in_4bit") and vparse(PEFT_VERSION) < vparse(
+                "0.4.0"
+            ):
                 raise AssertionError("load_in_4bit requires peft >= 0.4.0")
             if self._model.config.vocab_size != len(self.tokenizer):
                 # resize model for LoRAs with added tokens
@@ -711,36 +704,32 @@ class HFLM(TemplateLM):
             for name, param in self._model.state_dict().items():
                 try:
                     param.data += _model_delta.state_dict()[name]
-                except KeyError:
-                    raise KeyError(f"Delta model is missing weights for layer: {name}")
+                except KeyError as e:
+                    raise KeyError(
+                        f"Delta model is missing weights for layer: {name}"
+                    ) from e
                 except Exception as e:
                     raise RuntimeError(
                         f"Failed to add delta weights to layer {name}. Error: {e}"
-                    )
+                    ) from e
             del _model_delta
-        return None
     def _create_tokenizer(
         self,
-        pretrained: Union[str, transformers.PreTrainedModel],
-        tokenizer: Optional[
-            Union[
-                str,
-                transformers.PreTrainedTokenizer,
-                transformers.PreTrainedTokenizerFast,
-            ]
-        ],
-        revision: Optional[str] = "main",
-        trust_remote_code: Optional[bool] = False,
-        use_fast_tokenizer: Optional[bool] = True,
-        gguf_file: Optional[str] = None,
-        add_bos_token: Optional[bool] = False,
-        subfolder: Optional[str] = "",
+        pretrained: str | transformers.PreTrainedModel,
+        tokenizer: str
+        | transformers.PreTrainedTokenizer
+        | transformers.PreTrainedTokenizerFast
+        | None,
+        revision: str | None = "main",
+        trust_remote_code: bool | None = False,
+        use_fast_tokenizer: bool | None = True,
+        gguf_file: str | None = None,
+        add_bos_token: bool | None = False,
+        subfolder: str | None = "",
     ) -> None:
-        """
-        Helper method during initialization.
+        """Helper method during initialization.
         Create a tokenizer object corresponding to the correct
         tokenizer for value of `pretrained`, or use the pre-initialized tokenizer passed.
@@ -769,8 +758,12 @@ class HFLM(TemplateLM):
                 )
             else:
                 assert isinstance(
-                    tokenizer, transformers.PreTrainedTokenizer
-                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
+                    tokenizer,
+                    (
+                        transformers.PreTrainedTokenizer,
+                        transformers.PreTrainedTokenizerFast,
+                    ),
+                )
                 self.tokenizer = tokenizer
         else:
             # Get tokenizer based on 'pretrained'
@@ -782,9 +775,8 @@ class HFLM(TemplateLM):
             self.tokenizer = transformers.AutoTokenizer.from_pretrained(
                 model_name, **kwargs
             )
-        return None
-    def _detect_batch_size(self, requests=None, pos: int = 0):
+    def _detect_batch_size(self, requests: Sequence | None = None, pos: int = 0):
         if requests:
             _, context_enc, continuation_enc = requests[pos]
             max_length = len(
@@ -799,7 +791,7 @@ class HFLM(TemplateLM):
         # if OOM, then halves batch_size and tries again
         @find_executable_batch_size(starting_batch_size=self.max_batch_size)
-        def forward_batch(batch_size):
+        def forward_batch(batch_size: int):
             if self.backend == "seq2seq":
                 length = max(max_context_enc, max_cont_enc)
                 batched_conts = torch.ones(
@@ -846,8 +838,11 @@ class HFLM(TemplateLM):
         return batch_size
     def tok_encode(
-        self, string: str, left_truncate_len=None, add_special_tokens=None
-    ) -> List[int]:
+        self,
+        string: str,
+        left_truncate_len: int | None = None,
+        add_special_tokens: bool | None = None,
+    ) -> list[int]:
         """ """
         # default for None - empty dict, use predefined tokenizer param
         # used for all models except for CausalLM or predefined value
@@ -873,11 +868,11 @@ class HFLM(TemplateLM):
     def tok_batch_encode(
         self,
-        strings: List[str],
+        strings: list[str],
         padding_side: str = "left",
-        left_truncate_len: int = None,
+        left_truncate_len: int | None = None,
         truncation: bool = False,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
         old_padding_side = self.tokenizer.padding_side
         self.tokenizer.padding_side = padding_side
@@ -896,7 +891,7 @@ class HFLM(TemplateLM):
         if left_truncate_len:
             original_lengths = encoding["input_ids"].size(1)
             if original_lengths > left_truncate_len:
-                eval_logger.warn(
+                eval_logger.warning(
                     f"Left truncation applied. Original sequence length was {original_lengths}, "
                     f"truncating to last {left_truncate_len} tokens. Some content will be lost.",
                 )
@@ -908,11 +903,17 @@ class HFLM(TemplateLM):
         return encoding["input_ids"], encoding["attention_mask"]
-    def tok_decode(self, tokens, skip_special_tokens=True):
+    def tok_decode(self, tokens: Iterator[list[str]], skip_special_tokens: bool = True):
         return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
-    def _model_call(self, inps, attn_mask=None, labels=None):
+    def _model_call(
+        self,
+        inps: torch.Tensor,
+        attn_mask: torch.Tensor | None = None,
+        labels: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         """
         :param inps: torch.Tensor
             A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape
             [batch, sequence_ctx]. the size of sequence may vary from call to call
@@ -926,32 +927,40 @@ class HFLM(TemplateLM):
             A torch tensor of shape [batch, sequence, vocab] with the
             logits returned from the model's decoder
         """
-        with torch.no_grad():
-            with torch.autocast(
+        with (
+            torch.no_grad(),
+            torch.autocast(
                 device_type=self.device.type,
                 dtype=self.mixed_precision_dtype,
                 enabled=self.mixed_precision_dtype is not None,
+            ),
         ):
             if attn_mask is not None or labels is not None:
                 assert attn_mask is not None and labels is not None
-                assert self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM
+                assert transformers.AutoModelForSeq2SeqLM == self.AUTO_MODEL_CLASS
                 return self.model(
                     input_ids=inps, attention_mask=attn_mask, labels=labels
                 ).logits
-            else:
             assert self.AUTO_MODEL_CLASS in (
                 transformers.AutoModelForCausalLM,
                 transformers.AutoModelForVision2Seq,
             )
             return self.model(inps).logits
-    def _model_generate(self, context, max_length, stop, **generation_kwargs):
+    def _model_generate(
+        self,
+        context,
+        max_length: int,
+        stop: list[str],
+        **generation_kwargs: dict[str, Any],
+    ) -> torch.Tensor:
         # temperature = 0.0 if not set
         # if do_sample is false and temp==0.0:
         # remove temperature, as do_sample=False takes care of this
         # and we don't want a warning from HF
         generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
-        do_sample = generation_kwargs.get("do_sample", None)
+        do_sample = generation_kwargs.get("do_sample")
         # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
         if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
@@ -978,7 +987,10 @@ class HFLM(TemplateLM):
         )
     def _select_cont_toks(
-        self, logits: torch.Tensor, contlen: int = None, inplen: int = None
+        self,
+        logits: torch.Tensor,
+        contlen: int | None = None,
+        inplen: int | None = None,
     ) -> torch.Tensor:
         if self.backend == "causal":
             assert contlen and inplen, (
@@ -998,8 +1010,8 @@ class HFLM(TemplateLM):
         return logits
     def loglikelihood_rolling(
-        self, requests: List[Instance], disable_tqdm: bool = False
-    ) -> List[float]:
+        self, requests: list[Instance], disable_tqdm: bool = False
+    ) -> list[float]:
         adaptive_batch_size = None
         if self.batch_size == "auto":
             # using rolling window with maximum context
@@ -1018,7 +1030,7 @@ class HFLM(TemplateLM):
                 disable=(disable_tqdm or (self.rank != 0)),
             )
         ):
-            rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
+            rolling_token_windows: list[tuple[list[int], list[int]]] = list(
                 map(
                     utils.make_disjoint_window,
                     utils.get_rolling_token_windows(
@@ -1102,15 +1114,15 @@ class HFLM(TemplateLM):
     def _loglikelihood_tokens(
         self,
-        requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
+        requests: list[tuple[tuple[str, str], list[int], list[int]]],
         disable_tqdm: bool = False,
-        override_bs: int = None,
-    ) -> List[Tuple[float, bool]]:
+        override_bs: int | None = None,
+    ) -> list[tuple[float, bool]]:
         # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
         res = []
-        def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]):
-            """Defines the key for the sorted method"""
+        def _collate(req: tuple[tuple[str, str], list[int], list[int]]):
+            """Defines the key for the sorted method."""
            # the negative sign on len(toks) sorts descending - this has a few advantages:
            # - time estimates will always be over not underestimates, which is more useful for planning
            # - to know the size of a batch when going through the list, you know the first one is always the batch
@@ -1121,8 +1133,8 @@ class HFLM(TemplateLM):
             toks = req[1] + req[2]
             return -len(toks), tuple(toks)
-        def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
-            """Defines the key to group and lookup one-token continuations"""
+        def _lookup_one_token_cont(req: tuple[tuple[str, str], list[int], list[int]]):
+            """Defines the key to group and lookup one-token continuations."""
            # Use with group_by="contexts" (optional)"
            # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
            # speeds up some multiple-choice tasks proportionally to the number of choices.
@@ -1295,7 +1307,7 @@ class HFLM(TemplateLM):
            # original args. Otherwise, expands the logits batch dimension and yields each
            # batch along with matching continuation tokens and prompt strings.
            # logits -> [1, seq, vocab]
-            for request_str, cont_toks, logits in re_ord.get_cache(
+            for request_str, cont_toks, logits in re_ord.get_cache(  # noqa
                 req_str=request_str,
                 cxt_toks=ctx_tokens,
                 cont_toks=cont_toks,
@@ -1336,11 +1348,11 @@ class HFLM(TemplateLM):
         return re_ord.get_original(res)
     def generate_until(
-        self, requests: List[Instance], disable_tqdm: bool = False
-    ) -> List[str]:
+        self, requests: list[Instance], disable_tqdm: bool = False
+    ) -> list[str]:
         res = []
-        def _collate(req: Tuple[str, dict]):
+        def _collate(req: tuple[str, dict]):
             """Defines the key for the sorted method"""
            # the negative sign on len(toks) sorts descending - this has a few advantages:
            # - time estimates will always be over not underestimates, which is more useful for planning
@@ -1400,10 +1412,10 @@ class HFLM(TemplateLM):
                # add EOS token to stop sequences
                until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
            else:
-                raise ValueError(
+                raise TypeError(
                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                )
-            if "max_gen_toks" in kwargs.keys():
+            if "max_gen_toks" in kwargs:
                max_gen_toks = kwargs.pop("max_gen_toks")
            else:
                max_gen_toks = self.max_gen_toks
@@ -1481,11 +1493,9 @@ class HFLM(TemplateLM):
         return res
     def apply_chat_template(
-        self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
+        self, chat_history: list[dict[str, str]], add_generation_prompt: bool = True
     ) -> str:
-        """
-        Method to apply a chat template to a list of chat history between user and model.
-        """
+        """Method to apply a chat template to a list of chat history between user and model."""
         try:
             chat_templated = self.tokenizer.apply_chat_template(
                 chat_history,
@@ -1510,9 +1520,7 @@ class HFLM(TemplateLM):
         return chat_templated
     def get_model_info(self) -> dict:
-        """
-        Method to get Hugging Face model information for experiment reproducibility.
-        """
+        """Method to get Hugging Face model information for experiment reproducibility."""
         def get_model_num_params(model) -> int:
             if hasattr(model, "num_parameters"):
...
 tag:
   - multiple_choice
 task: hellaswag
-dataset_path: hellaswag
+dataset_path: Rowan/hellaswag
 dataset_name: null
 output_type: multiple_choice
 training_split: train
...
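
The hellaswag task now points at the explicit Hub repository id instead of the legacy canonical name. A quick way to sanity-check the new path locally, assuming network access and an installed `datasets`:

    from datasets import load_dataset

    # Loads directly from the Hugging Face Hub; no loading script is involved.
    ds = load_dataset("Rowan/hellaswag", split="validation")
    print(ds[0]["ctx"])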
@@ -21,7 +21,7 @@ license = { "text" = "MIT" }
 dependencies = [
     "accelerate>=0.26.0",
     "evaluate",
-    "datasets>=2.16.0",
+    "datasets>=2.16.0,<4.0",
     "evaluate>=0.4.0",
     "jsonlines",
     "numexpr",
...
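
To check whether an existing environment already satisfies the new constraint, the specifier can be evaluated directly with `packaging` (available wherever `transformers` is installed); a small sketch:

    import datasets
    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    # Same range as the pyproject pin above.
    spec = SpecifierSet(">=2.16.0,<4.0")
    installed = Version(datasets.__version__)
    print(f"datasets {installed}:", "within pin" if installed in spec else "outside pin")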