"backend/apps/vscode:/vscode.git/clone" did not exist on "1d6bbdf917249fc464932faeefe8aa1af71c863b"
Unverified Commit 4b796978 authored by Younes Belkada's avatar Younes Belkada Committed by GitHub
Browse files

🚨🚨🚨 [`Refactor`] Move third-party related utility files into `integrations/` folder 🚨🚨🚨 (#25599)



* move deepspeed to `lib_integrations.deepspeed`

* more refactor

* oops

* fix slow tests

* Fix docs

* fix docs

* addess feedback

* address feedback

* final modifs for PEFT

* fixup

* ok now

* trigger CI

* trigger CI again

* Update docs/source/en/main_classes/deepspeed.md
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>

* import from `integrations`

* address feedback

* revert removal of `deepspeed` module

* revert removal of `deepspeed` module

* fix conflicts

* ooops

* oops

* add deprecation warning

* place it on the top

* put `FutureWarning`

* fix conflicts with not_doctested.txt

* add back `bitsandbytes` module with a depr warning

* fix

* fix

* fixup

* oops

* fix doctests

---------
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 4d9e45f3
......@@ -2065,20 +2065,20 @@ In this case you usually need to raise the value of `initial_scale_power`. Setti
## Non-Trainer Deepspeed Integration
The [`~deepspeed.HfDeepSpeedConfig`] is used to integrate Deepspeed into the 🤗 Transformers core
The [`~integrations.HfDeepSpeedConfig`] is used to integrate Deepspeed into the 🤗 Transformers core
functionality, when [`Trainer`] is not used. The only thing that it does is handling Deepspeed ZeRO-3 param gathering and automatically splitting the model onto multiple gpus during `from_pretrained` call. Everything else you have to do by yourself.
When using [`Trainer`] everything is automatically taken care of.
When not using [`Trainer`], to efficiently deploy DeepSpeed ZeRO-3, you must instantiate the
[`~deepspeed.HfDeepSpeedConfig`] object before instantiating the model and keep that object alive.
[`~integrations.HfDeepSpeedConfig`] object before instantiating the model and keep that object alive.
If you're using Deepspeed ZeRO-1 or ZeRO-2 you don't need to use `HfDeepSpeedConfig` at all.
For example for a pretrained model:
```python
from transformers.deepspeed import HfDeepSpeedConfig
from transformers.integrations import HfDeepSpeedConfig
from transformers import AutoModel
import deepspeed
......@@ -2092,7 +2092,7 @@ engine = deepspeed.initialize(model=model, config_params=ds_config, ...)
or for non-pretrained model:
```python
from transformers.deepspeed import HfDeepSpeedConfig
from transformers.integrations import HfDeepSpeedConfig
from transformers import AutoModel, AutoConfig
import deepspeed
......@@ -2108,7 +2108,7 @@ Please note that if you're not using the [`Trainer`] integration, you're complet
## HfDeepSpeedConfig
[[autodoc]] deepspeed.HfDeepSpeedConfig
[[autodoc]] integrations.HfDeepSpeedConfig
- all
### Custom DeepSpeed ZeRO Inference
......@@ -2161,7 +2161,7 @@ Make sure to:
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
from transformers.deepspeed import HfDeepSpeedConfig
from transformers.integrations import HfDeepSpeedConfig
import deepspeed
import os
import torch
......
......@@ -32,7 +32,7 @@ from copy import deepcopy # noqa
from parameterized import parameterized # noqa
from transformers import TrainingArguments, is_torch_available # noqa
from transformers.deepspeed import is_deepspeed_available # noqa
from transformers.integrations.deepspeed import is_deepspeed_available # noqa
from transformers.file_utils import WEIGHTS_NAME # noqa
from transformers.testing_utils import ( # noqa
CaptureLogger,
......
......@@ -94,6 +94,7 @@ _import_structure = {
"data.metrics": [],
"data.processors": [],
"debug_utils": [],
"deepspeed": [],
"dependency_versions_check": [],
"dependency_versions_table": [],
"dynamic_module_utils": [],
......@@ -115,8 +116,6 @@ _import_structure = {
"is_tensorboard_available",
"is_wandb_available",
],
"lib_integrations": [],
"lib_integrations.peft": [],
"modelcard": ["ModelCard"],
"modeling_tf_pytorch_utils": [
"convert_tf_weight_name_to_pt_weight_name",
......@@ -745,7 +744,6 @@ _import_structure = {
"is_vision_available",
"logging",
],
"utils.bitsandbytes": [],
"utils.quantization_config": ["BitsAndBytesConfig", "GPTQConfig"],
}
......@@ -1002,7 +1000,6 @@ else:
"TextDataset",
"TextDatasetForNextSentencePrediction",
]
_import_structure["deepspeed"] = []
_import_structure["generation"].extend(
[
"AlternatingCodebooksLogitsProcessor",
......
......@@ -12,378 +12,29 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Integration with Deepspeed
"""
import importlib.util
import weakref
from functools import partialmethod
from .dependency_versions_check import dep_version_check
from .utils import is_accelerate_available, is_torch_available, logging
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
def is_deepspeed_available():
return importlib.util.find_spec("deepspeed") is not None
if is_accelerate_available() and is_deepspeed_available():
from accelerate.utils.deepspeed import HfDeepSpeedConfig as DeepSpeedConfig
else:
# Inherits from a dummy `object` if accelerate is not available, so that python succeeds to import this file.
# Deepspeed glue code will never inherit this dummy object as it checks if accelerate is available.
from builtins import object as DeepSpeedConfig
class HfDeepSpeedConfig(DeepSpeedConfig):
"""
This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.
A `weakref` of this object is stored in the module's globals to be able to access the config from areas where
things like the Trainer object is not available (e.g. `from_pretrained` and `_get_resized_embeddings`). Therefore
it's important that this object remains alive while the program is still running.
[`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to sync the configuration
with values of [`TrainingArguments`] by replacing special placeholder values: `"auto"`. Without this special logic
the DeepSpeed configuration is not modified in any way.
Args:
config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict.
"""
def __init__(self, config_file_or_dict):
# set global weakref object
set_hf_deepspeed_config(self)
dep_version_check("accelerate")
dep_version_check("deepspeed")
super().__init__(config_file_or_dict)
class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
"""
The `HfTrainerDeepSpeedConfig` object is meant to be created during `TrainingArguments` object creation and has the
same lifespan as the latter.
"""
def __init__(self, config_file_or_dict):
super().__init__(config_file_or_dict)
self._dtype = None
self.mismatches = []
def dtype(self):
if self._dtype is None:
raise ValueError("trainer_config_process() wasn't called yet to tell dtype")
return self._dtype
def is_auto(self, ds_key_long):
val = self.get_value(ds_key_long)
if val is None:
return False
else:
return val == "auto"
def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True):
"""
A utility method that massages the config file and can optionally verify that the values match.
1. Replace "auto" values with `TrainingArguments` value.
2. If it wasn't "auto" and `must_match` is true, then check that DS config matches Trainer
config values and if mismatched add the entry to `self.mismatched` - will assert during
`trainer_config_finalize` for one or more mismatches.
"""
config, ds_key = self.find_config_node(ds_key_long)
if config is None:
return
if config.get(ds_key) == "auto":
config[ds_key] = hf_val
return
if not must_match:
return
ds_val = config.get(ds_key)
if ds_val is not None and ds_val != hf_val:
self.mismatches.append(f"- ds {ds_key_long}={ds_val} vs hf {hf_key}={hf_val}")
fill_only = partialmethod(fill_match, must_match=False)
def trainer_config_process(self, args):
"""
Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object
creation.
"""
# DeepSpeed does:
# train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps
train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps
self.fill_match(
"train_micro_batch_size_per_gpu", args.per_device_train_batch_size, "per_device_train_batch_size"
)
self.fill_match("gradient_accumulation_steps", args.gradient_accumulation_steps, "gradient_accumulation_steps")
self.fill_match("train_batch_size", train_batch_size, "train_batch_size (calculated)")
self.fill_match("gradient_clipping", args.max_grad_norm, "max_grad_norm")
self.fill_match("optimizer.params.lr", args.learning_rate, "learning_rate")
self.fill_match("optimizer.params.betas", [args.adam_beta1, args.adam_beta2], "adam_beta1+adam_beta2")
self.fill_match("optimizer.params.eps", args.adam_epsilon, "adam_epsilon")
self.fill_match("optimizer.params.weight_decay", args.weight_decay, "weight_decay")
self.fill_only("scheduler.params.warmup_min_lr", 0) # not a trainer arg
self.fill_match("scheduler.params.warmup_max_lr", args.learning_rate, "learning_rate")
# total_num_steps - will get set in trainer_config_finalize
# fp16
if args.fp16 or args.fp16_full_eval:
fp16_backend = "apex" if args.fp16_backend == "apex" else "amp"
else:
fp16_backend = None
if args.save_on_each_node:
# deepspeed uses shared storage by default. Let's override this setting if save_on_each_node == True
self.config["checkpoint"] = self.config.get("checkpoint", {})
self.config["checkpoint"]["use_node_local_storage"] = args.save_on_each_node
# amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set
# any here unless the user did the work
self.fill_match(
"fp16.enabled",
((args.fp16 or args.fp16_full_eval) and fp16_backend == "amp"),
"fp16|fp16_full_eval+fp16_backend(amp)",
)
# apex: delegates amp work to apex (which needs to be available), but it cannot be used with any
# ZeRO features
self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)")
self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level")
self.fill_match("bf16.enabled", (args.bf16 or args.bf16_full_eval), "bf16|bf16_full_eval")
# deepspeed's default mode is fp16 unless there is a config that says differently
if self.is_true("bf16.enabled"):
self._dtype = torch.bfloat16
elif self.is_false("fp16.enabled"):
self._dtype = torch.float32
else:
self._dtype = torch.float16
def trainer_config_finalize(self, args, model, num_training_steps):
"""
This stage is run after we have the model and know num_training_steps.
Now we can complete the configuration process.
"""
# zero
# deal with config keys that use `auto` value and rely on model's hidden_size
hidden_size_based_keys = [
"zero_optimization.reduce_bucket_size",
"zero_optimization.stage3_prefetch_bucket_size",
"zero_optimization.stage3_param_persistence_threshold",
]
hidden_size_auto_keys = [x for x in hidden_size_based_keys if self.is_auto(x)]
Integration with Deepspeed - kept for backward compatiblity, if you plan to make any edit, make sure to modify the file
in `integrations/deepspeed` instead.
if len(hidden_size_auto_keys) > 0:
if hasattr(model.config, "hidden_size"):
hidden_size = model.config.hidden_size
elif hasattr(model.config, "hidden_sizes"):
# if there are many hidden sizes pick the largest one
hidden_size = max(model.config.hidden_sizes)
else:
raise ValueError(
"The model's config file has neither `hidden_size` nor `hidden_sizes` entry, "
"therefore it's not possible to automatically fill out the following `auto` entries "
f"in the DeepSpeed config file: {hidden_size_auto_keys}. You can fix that by replacing "
"`auto` values for these keys with an integer value of your choice."
)
self.fill_only("zero_optimization.reduce_bucket_size", hidden_size * hidden_size)
if self.is_zero3():
# automatically assign the optimal config values based on model config
self.fill_only("zero_optimization.stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size)
self.fill_only("zero_optimization.stage3_param_persistence_threshold", 10 * hidden_size)
# scheduler
self.fill_match("scheduler.params.total_num_steps", num_training_steps, "num_training_steps (calculated)")
self.fill_match("scheduler.params.warmup_num_steps", args.get_warmup_steps(num_training_steps), "warmup_steps")
if len(self.mismatches) > 0:
mismatches = "\n".join(self.mismatches)
raise ValueError(
"Please correct the following DeepSpeed config values that mismatch TrainingArguments"
f" values:\n{mismatches}\nThe easiest method is to set these DeepSpeed config values to 'auto'."
)
# keep the config object global to be able to access it anywhere during TrainingArguments life-cycle
_hf_deepspeed_config_weak_ref = None
def set_hf_deepspeed_config(hf_deepspeed_config_obj):
# this is a special weakref global object to allow us to get to Deepspeed config from APIs
# that don't have an easy way to get to the Deepspeed config outside of the Trainer domain.
global _hf_deepspeed_config_weak_ref
# will go away automatically when HfDeepSpeedConfig is destroyed (when TrainingArguments is destroyed)
_hf_deepspeed_config_weak_ref = weakref.ref(hf_deepspeed_config_obj)
def unset_hf_deepspeed_config():
# useful for unit tests to ensure the global state doesn't leak - call from `tearDown` method
global _hf_deepspeed_config_weak_ref
_hf_deepspeed_config_weak_ref = None
def is_deepspeed_zero3_enabled():
if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None:
return _hf_deepspeed_config_weak_ref().is_zero3()
else:
return False
def deepspeed_config():
if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None:
return _hf_deepspeed_config_weak_ref().config
else:
return None
def deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps, model_parameters):
"""
A convenience wrapper that deals with optimizer and lr scheduler configuration.
"""
from accelerate.utils import DummyOptim, DummyScheduler
config = hf_deepspeed_config.config
# Optimizer + Scheduler
# Currently supported combos:
# 1. DS scheduler + DS optimizer: Yes
# 2. HF scheduler + HF optimizer: Yes
# 3. DS scheduler + HF optimizer: Yes
# 4. HF scheduler + DS optimizer: No
#
# Unless Offload is enabled in which case it's:
# 1. DS scheduler + DS optimizer: Yes
# 2. HF scheduler + HF optimizer: Mostly*
# 3. DS scheduler + HF optimizer: Mostly*
# 4. HF scheduler + DS optimizer: No
#
# Mostly*: All non-native DeepSpeed optimizers that have both CPU and GPU implementation should work (except LAMB)
optimizer = None
if "optimizer" in config:
if args.adafactor:
raise ValueError(
"--adafactor was passed, but also found `optimizer` configured in the DeepSpeed config. "
"Only one optimizer can be configured."
)
optimizer = DummyOptim(params=model_parameters)
else:
if hf_deepspeed_config.is_offload():
logger.info(
"Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the"
" custom optimizer has both CPU and GPU implementation (except LAMB)"
)
# ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
# But trainer uses AdamW by default.
optimizer = trainer.create_optimizer()
# To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer`
config["zero_allow_untested_optimizer"] = True
lr_scheduler = None
if "scheduler" in config:
lr_scheduler = DummyScheduler(optimizer)
else:
if isinstance(optimizer, DummyOptim):
raise ValueError(
"Found `optimizer` configured in the DeepSpeed config, but no `scheduler`. "
"Please configure a scheduler in the DeepSpeed config."
)
lr_scheduler = trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer)
return optimizer, lr_scheduler
def deepspeed_init(trainer, num_training_steps, inference=False):
"""
Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args.
If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made.
Args:
trainer: Trainer object
num_training_steps: per single gpu
resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load
inference: launch in inference mode (no optimizer and no lr scheduler)
Returns: optimizer, lr_scheduler
We may use `deepspeed_init` more than once during the life of Trainer, when we do - it's a temp hack based on:
https://github.com/microsoft/DeepSpeed/issues/1394#issuecomment-937405374 until Deepspeed fixes a bug where it
can't resume from a checkpoint after it did some stepping https://github.com/microsoft/DeepSpeed/issues/1612
"""
from deepspeed.utils import logger as ds_logger
model = trainer.model
args = trainer.args
hf_deepspeed_config = trainer.accelerator.state.deepspeed_plugin.hf_ds_config
# resume config update - some bits like `model` and `num_training_steps` only become available during train
hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps)
# set the Deepspeed log level consistent with the Trainer
ds_logger.setLevel(args.get_process_log_level())
if inference:
# only Z3 makes sense for the inference
if not hf_deepspeed_config.is_zero3():
raise ValueError("ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config")
# in case the training config is re-used for inference
hf_deepspeed_config.del_config_sub_tree("optimizer")
hf_deepspeed_config.del_config_sub_tree("lr_scheduler")
optimizer, lr_scheduler = None, None
model_parameters = None
else:
trainer.optimizer = None # important for when deepspeed_init is used as re-init
model_parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer, lr_scheduler = deepspeed_optim_sched(
trainer, hf_deepspeed_config, args, num_training_steps, model_parameters
)
# keep for quick debug:
# from pprint import pprint; pprint(config)
return optimizer, lr_scheduler
def deepspeed_load_checkpoint(deepspeed_engine, checkpoint_path):
# it's possible that the user is trying to resume from model_path, which doesn't necessarily
# contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's
# a resume from a checkpoint and not just a local pretrained weight. So we check here if the
# path contains what looks like a deepspeed checkpoint
import glob
deepspeed_checkpoint_dirs = sorted(glob.glob(f"{checkpoint_path}/global_step*"))
if len(deepspeed_checkpoint_dirs) > 0:
logger.info(f"Attempting to resume from {checkpoint_path}")
# this magically updates self.optimizer and self.lr_scheduler
load_path, _ = deepspeed_engine.load_checkpoint(
checkpoint_path, load_optimizer_states=True, load_lr_scheduler_states=True
)
if load_path is None:
raise ValueError(f"[deepspeed] failed to resume from checkpoint {checkpoint_path}")
else:
raise ValueError(f"Can't find a valid checkpoint at {checkpoint_path}")
Check: https://github.com/huggingface/transformers/pull/25599
"""
import warnings
warnings.warn(
"transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations",
FutureWarning,
)
# Backward compatibility imports, to make sure all those objects can be found in integrations/deepspeed
from .integrations.deepspeed import ( # noqa
HfDeepSpeedConfig,
HfTrainerDeepSpeedConfig,
deepspeed_config,
deepspeed_init,
deepspeed_load_checkpoint,
deepspeed_optim_sched,
is_deepspeed_available,
is_deepspeed_zero3_enabled,
set_hf_deepspeed_config,
unset_hf_deepspeed_config,
)
......@@ -24,7 +24,7 @@ import torch
import torch.distributed as dist
from torch import nn
from ..deepspeed import is_deepspeed_zero3_enabled
from ..integrations.deepspeed import is_deepspeed_zero3_enabled
from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput
from ..models.auto import (
MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
......
......@@ -11,4 +11,61 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .bitsandbytes import (
get_keys_to_not_convert,
replace_8bit_linear,
replace_with_bnb_linear,
set_module_8bit_tensor_to_device,
set_module_quantized_tensor_to_device,
)
from .deepspeed import (
HfDeepSpeedConfig,
HfTrainerDeepSpeedConfig,
deepspeed_config,
deepspeed_init,
deepspeed_load_checkpoint,
deepspeed_optim_sched,
is_deepspeed_available,
is_deepspeed_zero3_enabled,
set_hf_deepspeed_config,
unset_hf_deepspeed_config,
)
from .integration_utils import (
INTEGRATION_TO_CALLBACK,
AzureMLCallback,
ClearMLCallback,
CodeCarbonCallback,
CometCallback,
DagsHubCallback,
FlyteCallback,
MLflowCallback,
NeptuneCallback,
NeptuneMissingConfiguration,
TensorBoardCallback,
WandbCallback,
get_available_reporting_integrations,
get_reporting_integration_callbacks,
hp_params,
is_azureml_available,
is_clearml_available,
is_codecarbon_available,
is_comet_available,
is_dagshub_available,
is_fairscale_available,
is_flyte_deck_standard_available,
is_flytekit_available,
is_mlflow_available,
is_neptune_available,
is_optuna_available,
is_ray_available,
is_ray_tune_available,
is_sigopt_available,
is_tensorboard_available,
is_wandb_available,
rewrite_logs,
run_hp_search_optuna,
run_hp_search_ray,
run_hp_search_sigopt,
run_hp_search_wandb,
)
from .peft import PeftAdapterMixin
import importlib.metadata
import warnings
from copy import deepcopy
from packaging import version
from ..utils import is_accelerate_available, is_bitsandbytes_available, logging
if is_bitsandbytes_available():
import bitsandbytes as bnb
import torch
import torch.nn as nn
from ..pytorch_utils import Conv1D
if is_accelerate_available():
from accelerate import init_empty_weights
from accelerate.utils import find_tied_parameters
logger = logging.get_logger(__name__)
def set_module_quantized_tensor_to_device(module, tensor_name, device, value=None, fp16_statistics=None):
"""
A helper function to set a given tensor (parameter of buffer) of a module on a specific device (note that doing
`param.to(device)` creates a new tensor not linked to the parameter, which is why we need this function). The
function is adapted from `set_module_tensor_to_device` function from accelerate that is adapted to support the
class `Int8Params` from `bitsandbytes`.
Args:
module (`torch.nn.Module`):
The module in which the tensor we want to move lives.
tensor_name (`str`):
The full name of the parameter/buffer.
device (`int`, `str` or `torch.device`):
The device on which to set the tensor.
value (`torch.Tensor`, *optional*):
The value of the tensor (useful when going from the meta device to any other device).
fp16_statistics (`torch.HalfTensor`, *optional*):
The list of fp16 statistics to set on the module, used for serialization.
"""
# Recurse if needed
if "." in tensor_name:
splits = tensor_name.split(".")
for split in splits[:-1]:
new_module = getattr(module, split)
if new_module is None:
raise ValueError(f"{module} has no attribute {split}.")
module = new_module
tensor_name = splits[-1]
if tensor_name not in module._parameters and tensor_name not in module._buffers:
raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.")
is_buffer = tensor_name in module._buffers
old_value = getattr(module, tensor_name)
if old_value.device == torch.device("meta") and device not in ["meta", torch.device("meta")] and value is None:
raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {device}.")
is_4bit = False
is_8bit = False
if is_buffer or not is_bitsandbytes_available():
is_8bit = False
is_4bit = False
else:
is_4bit = hasattr(bnb.nn, "Params4bit") and isinstance(module._parameters[tensor_name], bnb.nn.Params4bit)
is_8bit = isinstance(module._parameters[tensor_name], bnb.nn.Int8Params)
if is_8bit or is_4bit:
param = module._parameters[tensor_name]
if param.device.type != "cuda":
if value is None:
new_value = old_value.to(device)
elif isinstance(value, torch.Tensor):
new_value = value.to("cpu")
if value.dtype == torch.int8:
is_8bit_serializable = version.parse(importlib.metadata.version("bitsandbytes")) > version.parse(
"0.37.2"
)
if not is_8bit_serializable:
raise ValueError(
"Detected int8 weights but the version of bitsandbytes is not compatible with int8 serialization. "
"Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`."
)
else:
new_value = torch.tensor(value, device="cpu")
# Support models using `Conv1D` in place of `nn.Linear` (e.g. gpt2) by transposing the weight matrix prior to quantization.
# Since weights are saved in the correct "orientation", we skip transposing when loading.
if issubclass(module.source_cls, Conv1D) and fp16_statistics is None:
new_value = new_value.T
kwargs = old_value.__dict__
if is_8bit:
new_value = bnb.nn.Int8Params(new_value, requires_grad=False, **kwargs).to(device)
elif is_4bit:
new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(device)
module._parameters[tensor_name] = new_value
if fp16_statistics is not None:
setattr(module.weight, "SCB", fp16_statistics.to(device))
else:
if value is None:
new_value = old_value.to(device)
elif isinstance(value, torch.Tensor):
new_value = value.to(device)
else:
new_value = torch.tensor(value, device=device)
if is_buffer:
module._buffers[tensor_name] = new_value
else:
new_value = nn.Parameter(new_value, requires_grad=old_value.requires_grad)
module._parameters[tensor_name] = new_value
def _replace_with_bnb_linear(
model, modules_to_not_convert=None, current_key_name=None, quantization_config=None, has_been_replaced=False
):
"""
Private method that wraps the recursion for module replacement.
Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
"""
for name, module in model.named_children():
if current_key_name is None:
current_key_name = []
current_key_name.append(name)
if (isinstance(module, nn.Linear) or isinstance(module, Conv1D)) and name not in modules_to_not_convert:
# Check if the current key is not in the `modules_to_not_convert`
if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
with init_empty_weights():
if isinstance(module, Conv1D):
in_features, out_features = module.weight.shape
else:
in_features = module.in_features
out_features = module.out_features
if quantization_config.quantization_method() == "llm_int8":
model._modules[name] = bnb.nn.Linear8bitLt(
in_features,
out_features,
module.bias is not None,
has_fp16_weights=quantization_config.llm_int8_has_fp16_weight,
threshold=quantization_config.llm_int8_threshold,
)
has_been_replaced = True
else:
if (
quantization_config.llm_int8_skip_modules is not None
and name in quantization_config.llm_int8_skip_modules
):
pass
else:
model._modules[name] = bnb.nn.Linear4bit(
in_features,
out_features,
module.bias is not None,
quantization_config.bnb_4bit_compute_dtype,
compress_statistics=quantization_config.bnb_4bit_use_double_quant,
quant_type=quantization_config.bnb_4bit_quant_type,
)
has_been_replaced = True
# Store the module class in case we need to transpose the weight later
model._modules[name].source_cls = type(module)
# Force requires grad to False to avoid unexpected errors
model._modules[name].requires_grad_(False)
if len(list(module.children())) > 0:
_, has_been_replaced = _replace_with_bnb_linear(
module,
modules_to_not_convert,
current_key_name,
quantization_config,
has_been_replaced=has_been_replaced,
)
# Remove the last key for recursion
current_key_name.pop(-1)
return model, has_been_replaced
def replace_with_bnb_linear(model, modules_to_not_convert=None, current_key_name=None, quantization_config=None):
"""
A helper function to replace all `torch.nn.Linear` modules by `bnb.nn.Linear8bit` modules from the `bitsandbytes`
library. This will enable running your models using mixed int8 precision as described by the paper `LLM.int8():
8-bit Matrix Multiplication for Transformers at Scale`. Make sure `bitsandbytes` compiled with the correct CUDA
version of your hardware is installed before running this function. `pip install -i https://test.pypi.org/simple/
bitsandbytes`
The function will be run recursively and replace all `torch.nn.Linear` modules except for the `lm_head` that should
be kept as a `torch.nn.Linear` module. The replacement is done under `init_empty_weights` context manager so no
CPU/GPU memory is required to run this function. Int8 mixed-precision matrix decomposition works by separating a
matrix multiplication into two streams: (1) and systematic feature outlier stream matrix multiplied in fp16
(0.01%), (2) a regular stream of int8 matrix multiplication (99.9%). With this method, int8 inference with no
predictive degradation is possible for very large models (>=176B parameters).
Parameters:
model (`torch.nn.Module`):
Input model or `torch.nn.Module` as the function is run recursively.
modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`):
Names of the modules to not convert in `Linear8bitLt`. In practice we keep the `lm_head` in full precision
for numerical stability reasons.
current_key_name (`List[`str`]`, *optional*):
An array to track the current key of the recursion. This is used to check whether the current key (part of
it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or
`disk`).
"""
modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert
model, has_been_replaced = _replace_with_bnb_linear(
model, modules_to_not_convert, current_key_name, quantization_config
)
if not has_been_replaced:
logger.warning(
"You are loading your model in 8bit or 4bit but no linear modules were found in your model."
" Please double check your model architecture, or submit an issue on github if you think this is"
" a bug."
)
return model
# For backward compatibility
def replace_8bit_linear(*args, **kwargs):
warnings.warn(
"`replace_8bit_linear` will be deprecated in a future version, please use `replace_with_bnb_linear` instead",
FutureWarning,
)
return replace_with_bnb_linear(*args, **kwargs)
# For backward compatiblity
def set_module_8bit_tensor_to_device(*args, **kwargs):
warnings.warn(
"`set_module_8bit_tensor_to_device` will be deprecated in a future version, please use `set_module_quantized_tensor_to_device` instead",
FutureWarning,
)
return set_module_quantized_tensor_to_device(*args, **kwargs)
def get_keys_to_not_convert(model):
r"""
An utility function to get the key of the module to keep in full precision if any For example for CausalLM modules
we may want to keep the lm_head in full precision for numerical stability reasons. For other architectures, we want
to keep the tied weights of the model. The function will return a list of the keys of the modules to not convert in
int8.
Parameters:
model (`torch.nn.Module`):
Input model
"""
# Create a copy of the model and tie the weights, then
# check if it contains tied weights
tied_model = deepcopy(model) # this has 0 cost since it is done inside `init_empty_weights` context manager`
tied_model.tie_weights()
tied_params = find_tied_parameters(tied_model)
# For compatibility with Accelerate < 0.18
if isinstance(tied_params, dict):
tied_keys = sum(list(tied_params.values()), []) + list(tied_params.keys())
else:
tied_keys = sum(tied_params, [])
has_tied_params = len(tied_keys) > 0
# If there is not tied weights, we want to keep the lm_head(output_embedding) in full precision
if not has_tied_params:
output_emb = model.get_output_embeddings()
if output_emb is not None:
list_last_module = [name for name, module in model.named_modules() if id(module) == id(output_emb)]
return list_last_module
# otherwise, no tied weights, no output embedding defined, simply keep the last module in full precision
list_modules = list(model.named_parameters())
list_last_module = [list_modules[-1][0]]
# add last module together with tied weights
intersection = set(list_last_module) - set(tied_keys)
list_untouched = list(set(tied_keys)) + list(intersection)
# remove ".weight" from the keys
names_to_remove = [".weight", ".bias"]
filtered_module_names = []
for name in list_untouched:
for name_to_remove in names_to_remove:
if name_to_remove in name:
name = name.replace(name_to_remove, "")
filtered_module_names.append(name)
return filtered_module_names
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Integration with Deepspeed
"""
import importlib.util
import weakref
from functools import partialmethod
from ..dependency_versions_check import dep_version_check
from ..utils import is_accelerate_available, is_torch_available, logging
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
def is_deepspeed_available():
return importlib.util.find_spec("deepspeed") is not None
if is_accelerate_available() and is_deepspeed_available():
from accelerate.utils.deepspeed import HfDeepSpeedConfig as DeepSpeedConfig
else:
# Inherits from a dummy `object` if accelerate is not available, so that python succeeds to import this file.
# Deepspeed glue code will never inherit this dummy object as it checks if accelerate is available.
from builtins import object as DeepSpeedConfig
class HfDeepSpeedConfig(DeepSpeedConfig):
"""
This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.
A `weakref` of this object is stored in the module's globals to be able to access the config from areas where
things like the Trainer object is not available (e.g. `from_pretrained` and `_get_resized_embeddings`). Therefore
it's important that this object remains alive while the program is still running.
[`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to sync the configuration
with values of [`TrainingArguments`] by replacing special placeholder values: `"auto"`. Without this special logic
the DeepSpeed configuration is not modified in any way.
Args:
config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict.
"""
def __init__(self, config_file_or_dict):
# set global weakref object
set_hf_deepspeed_config(self)
dep_version_check("accelerate")
dep_version_check("deepspeed")
super().__init__(config_file_or_dict)
class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
"""
The `HfTrainerDeepSpeedConfig` object is meant to be created during `TrainingArguments` object creation and has the
same lifespan as the latter.
"""
def __init__(self, config_file_or_dict):
super().__init__(config_file_or_dict)
self._dtype = None
self.mismatches = []
def dtype(self):
if self._dtype is None:
raise ValueError("trainer_config_process() wasn't called yet to tell dtype")
return self._dtype
def is_auto(self, ds_key_long):
val = self.get_value(ds_key_long)
if val is None:
return False
else:
return val == "auto"
def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True):
"""
A utility method that massages the config file and can optionally verify that the values match.
1. Replace "auto" values with `TrainingArguments` value.
2. If it wasn't "auto" and `must_match` is true, then check that DS config matches Trainer
config values and if mismatched add the entry to `self.mismatched` - will assert during
`trainer_config_finalize` for one or more mismatches.
"""
config, ds_key = self.find_config_node(ds_key_long)
if config is None:
return
if config.get(ds_key) == "auto":
config[ds_key] = hf_val
return
if not must_match:
return
ds_val = config.get(ds_key)
if ds_val is not None and ds_val != hf_val:
self.mismatches.append(f"- ds {ds_key_long}={ds_val} vs hf {hf_key}={hf_val}")
fill_only = partialmethod(fill_match, must_match=False)
def trainer_config_process(self, args):
"""
Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object
creation.
"""
# DeepSpeed does:
# train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps
train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps
self.fill_match(
"train_micro_batch_size_per_gpu", args.per_device_train_batch_size, "per_device_train_batch_size"
)
self.fill_match("gradient_accumulation_steps", args.gradient_accumulation_steps, "gradient_accumulation_steps")
self.fill_match("train_batch_size", train_batch_size, "train_batch_size (calculated)")
self.fill_match("gradient_clipping", args.max_grad_norm, "max_grad_norm")
self.fill_match("optimizer.params.lr", args.learning_rate, "learning_rate")
self.fill_match("optimizer.params.betas", [args.adam_beta1, args.adam_beta2], "adam_beta1+adam_beta2")
self.fill_match("optimizer.params.eps", args.adam_epsilon, "adam_epsilon")
self.fill_match("optimizer.params.weight_decay", args.weight_decay, "weight_decay")
self.fill_only("scheduler.params.warmup_min_lr", 0) # not a trainer arg
self.fill_match("scheduler.params.warmup_max_lr", args.learning_rate, "learning_rate")
# total_num_steps - will get set in trainer_config_finalize
# fp16
if args.fp16 or args.fp16_full_eval:
fp16_backend = "apex" if args.fp16_backend == "apex" else "amp"
else:
fp16_backend = None
if args.save_on_each_node:
# deepspeed uses shared storage by default. Let's override this setting if save_on_each_node == True
self.config["checkpoint"] = self.config.get("checkpoint", {})
self.config["checkpoint"]["use_node_local_storage"] = args.save_on_each_node
# amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set
# any here unless the user did the work
self.fill_match(
"fp16.enabled",
((args.fp16 or args.fp16_full_eval) and fp16_backend == "amp"),
"fp16|fp16_full_eval+fp16_backend(amp)",
)
# apex: delegates amp work to apex (which needs to be available), but it cannot be used with any
# ZeRO features
self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)")
self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level")
self.fill_match("bf16.enabled", (args.bf16 or args.bf16_full_eval), "bf16|bf16_full_eval")
# deepspeed's default mode is fp16 unless there is a config that says differently
if self.is_true("bf16.enabled"):
self._dtype = torch.bfloat16
elif self.is_false("fp16.enabled"):
self._dtype = torch.float32
else:
self._dtype = torch.float16
def trainer_config_finalize(self, args, model, num_training_steps):
"""
This stage is run after we have the model and know num_training_steps.
Now we can complete the configuration process.
"""
# zero
# deal with config keys that use `auto` value and rely on model's hidden_size
hidden_size_based_keys = [
"zero_optimization.reduce_bucket_size",
"zero_optimization.stage3_prefetch_bucket_size",
"zero_optimization.stage3_param_persistence_threshold",
]
hidden_size_auto_keys = [x for x in hidden_size_based_keys if self.is_auto(x)]
if len(hidden_size_auto_keys) > 0:
if hasattr(model.config, "hidden_size"):
hidden_size = model.config.hidden_size
elif hasattr(model.config, "hidden_sizes"):
# if there are many hidden sizes pick the largest one
hidden_size = max(model.config.hidden_sizes)
else:
raise ValueError(
"The model's config file has neither `hidden_size` nor `hidden_sizes` entry, "
"therefore it's not possible to automatically fill out the following `auto` entries "
f"in the DeepSpeed config file: {hidden_size_auto_keys}. You can fix that by replacing "
"`auto` values for these keys with an integer value of your choice."
)
self.fill_only("zero_optimization.reduce_bucket_size", hidden_size * hidden_size)
if self.is_zero3():
# automatically assign the optimal config values based on model config
self.fill_only("zero_optimization.stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size)
self.fill_only("zero_optimization.stage3_param_persistence_threshold", 10 * hidden_size)
# scheduler
self.fill_match("scheduler.params.total_num_steps", num_training_steps, "num_training_steps (calculated)")
self.fill_match("scheduler.params.warmup_num_steps", args.get_warmup_steps(num_training_steps), "warmup_steps")
if len(self.mismatches) > 0:
mismatches = "\n".join(self.mismatches)
raise ValueError(
"Please correct the following DeepSpeed config values that mismatch TrainingArguments"
f" values:\n{mismatches}\nThe easiest method is to set these DeepSpeed config values to 'auto'."
)
# keep the config object global to be able to access it anywhere during TrainingArguments life-cycle
_hf_deepspeed_config_weak_ref = None
def set_hf_deepspeed_config(hf_deepspeed_config_obj):
# this is a special weakref global object to allow us to get to Deepspeed config from APIs
# that don't have an easy way to get to the Deepspeed config outside of the Trainer domain.
global _hf_deepspeed_config_weak_ref
# will go away automatically when HfDeepSpeedConfig is destroyed (when TrainingArguments is destroyed)
_hf_deepspeed_config_weak_ref = weakref.ref(hf_deepspeed_config_obj)
def unset_hf_deepspeed_config():
# useful for unit tests to ensure the global state doesn't leak - call from `tearDown` method
global _hf_deepspeed_config_weak_ref
_hf_deepspeed_config_weak_ref = None
def is_deepspeed_zero3_enabled():
if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None:
return _hf_deepspeed_config_weak_ref().is_zero3()
else:
return False
def deepspeed_config():
if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None:
return _hf_deepspeed_config_weak_ref().config
else:
return None
def deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps, model_parameters):
"""
A convenience wrapper that deals with optimizer and lr scheduler configuration.
"""
from accelerate.utils import DummyOptim, DummyScheduler
config = hf_deepspeed_config.config
# Optimizer + Scheduler
# Currently supported combos:
# 1. DS scheduler + DS optimizer: Yes
# 2. HF scheduler + HF optimizer: Yes
# 3. DS scheduler + HF optimizer: Yes
# 4. HF scheduler + DS optimizer: No
#
# Unless Offload is enabled in which case it's:
# 1. DS scheduler + DS optimizer: Yes
# 2. HF scheduler + HF optimizer: Mostly*
# 3. DS scheduler + HF optimizer: Mostly*
# 4. HF scheduler + DS optimizer: No
#
# Mostly*: All non-native DeepSpeed optimizers that have both CPU and GPU implementation should work (except LAMB)
optimizer = None
if "optimizer" in config:
if args.adafactor:
raise ValueError(
"--adafactor was passed, but also found `optimizer` configured in the DeepSpeed config. "
"Only one optimizer can be configured."
)
optimizer = DummyOptim(params=model_parameters)
else:
if hf_deepspeed_config.is_offload():
logger.info(
"Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the"
" custom optimizer has both CPU and GPU implementation (except LAMB)"
)
# ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
# But trainer uses AdamW by default.
optimizer = trainer.create_optimizer()
# To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer`
config["zero_allow_untested_optimizer"] = True
lr_scheduler = None
if "scheduler" in config:
lr_scheduler = DummyScheduler(optimizer)
else:
if isinstance(optimizer, DummyOptim):
raise ValueError(
"Found `optimizer` configured in the DeepSpeed config, but no `scheduler`. "
"Please configure a scheduler in the DeepSpeed config."
)
lr_scheduler = trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer)
return optimizer, lr_scheduler
def deepspeed_init(trainer, num_training_steps, inference=False):
"""
Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args.
If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made.
Args:
trainer: Trainer object
num_training_steps: per single gpu
resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load
inference: launch in inference mode (no optimizer and no lr scheduler)
Returns: optimizer, lr_scheduler
We may use `deepspeed_init` more than once during the life of Trainer, when we do - it's a temp hack based on:
https://github.com/microsoft/DeepSpeed/issues/1394#issuecomment-937405374 until Deepspeed fixes a bug where it
can't resume from a checkpoint after it did some stepping https://github.com/microsoft/DeepSpeed/issues/1612
"""
from deepspeed.utils import logger as ds_logger
model = trainer.model
args = trainer.args
hf_deepspeed_config = trainer.accelerator.state.deepspeed_plugin.hf_ds_config
# resume config update - some bits like `model` and `num_training_steps` only become available during train
hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps)
# set the Deepspeed log level consistent with the Trainer
ds_logger.setLevel(args.get_process_log_level())
if inference:
# only Z3 makes sense for the inference
if not hf_deepspeed_config.is_zero3():
raise ValueError("ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config")
# in case the training config is re-used for inference
hf_deepspeed_config.del_config_sub_tree("optimizer")
hf_deepspeed_config.del_config_sub_tree("lr_scheduler")
optimizer, lr_scheduler = None, None
model_parameters = None
else:
trainer.optimizer = None # important for when deepspeed_init is used as re-init
model_parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer, lr_scheduler = deepspeed_optim_sched(
trainer, hf_deepspeed_config, args, num_training_steps, model_parameters
)
# keep for quick debug:
# from pprint import pprint; pprint(config)
return optimizer, lr_scheduler
def deepspeed_load_checkpoint(deepspeed_engine, checkpoint_path):
# it's possible that the user is trying to resume from model_path, which doesn't necessarily
# contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's
# a resume from a checkpoint and not just a local pretrained weight. So we check here if the
# path contains what looks like a deepspeed checkpoint
import glob
deepspeed_checkpoint_dirs = sorted(glob.glob(f"{checkpoint_path}/global_step*"))
if len(deepspeed_checkpoint_dirs) > 0:
logger.info(f"Attempting to resume from {checkpoint_path}")
# this magically updates self.optimizer and self.lr_scheduler
load_path, _ = deepspeed_engine.load_checkpoint(
checkpoint_path, load_optimizer_states=True, load_lr_scheduler_states=True
)
if load_path is None:
raise ValueError(f"[deepspeed] failed to resume from checkpoint {checkpoint_path}")
else:
raise ValueError(f"Can't find a valid checkpoint at {checkpoint_path}")
......@@ -30,8 +30,8 @@ from typing import TYPE_CHECKING, Dict, Optional
import numpy as np
from . import __version__ as version
from .utils import flatten_dict, is_datasets_available, is_pandas_available, is_torch_available, logging
from .. import __version__ as version
from ..utils import flatten_dict, is_datasets_available, is_pandas_available, is_torch_available, logging
logger = logging.get_logger(__name__)
......@@ -68,10 +68,10 @@ if TYPE_CHECKING and _has_neptune:
except importlib.metadata.PackageNotFoundError:
_has_neptune = False
from .trainer_callback import ProgressCallback, TrainerCallback # noqa: E402
from .trainer_utils import PREFIX_CHECKPOINT_DIR, BestRun, IntervalStrategy # noqa: E402
from .training_args import ParallelMode # noqa: E402
from .utils import ENV_VARS_TRUE_VALUES, is_torch_tpu_available # noqa: E402
from ..trainer_callback import ProgressCallback, TrainerCallback # noqa: E402
from ..trainer_utils import PREFIX_CHECKPOINT_DIR, BestRun, IntervalStrategy # noqa: E402
from ..training_args import ParallelMode # noqa: E402
from ..utils import ENV_VARS_TRUE_VALUES, is_torch_tpu_available # noqa: E402
# Integration functions:
......
......@@ -14,7 +14,7 @@
import inspect
from typing import Optional
from ...utils import (
from ..utils import (
check_peft_version,
find_adapter_config_file,
is_accelerate_available,
......
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .peft_mixin import PeftAdapterMixin
......@@ -35,10 +35,9 @@ from torch.nn import CrossEntropyLoss
from .activations import get_activation
from .configuration_utils import PretrainedConfig
from .deepspeed import deepspeed_config, is_deepspeed_zero3_enabled
from .dynamic_module_utils import custom_object_save
from .generation import GenerationConfig, GenerationMixin
from .lib_integrations import PeftAdapterMixin
from .integrations import PeftAdapterMixin, deepspeed_config, is_deepspeed_zero3_enabled
from .pytorch_utils import ( # noqa: F401
Conv1D,
apply_chunking_to_forward,
......@@ -663,7 +662,7 @@ def _load_state_dict_into_meta_model(
# they won't get loaded.
if is_quantized:
from .utils.bitsandbytes import set_module_quantized_tensor_to_device
from .integrations import set_module_quantized_tensor_to_device
error_msgs = []
......@@ -2960,7 +2959,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
keep_in_fp32_modules = []
if load_in_8bit or load_in_4bit:
from .utils.bitsandbytes import get_keys_to_not_convert, replace_with_bnb_linear
from .integrations import get_keys_to_not_convert, replace_with_bnb_linear
llm_int8_skip_modules = quantization_config.llm_int8_skip_modules
load_in_8bit_fp32_cpu_offload = quantization_config.llm_int8_enable_fp32_cpu_offload
......@@ -3278,7 +3277,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
):
is_safetensors = False
if is_quantized:
from .utils.bitsandbytes import set_module_quantized_tensor_to_device
from .integrations import set_module_quantized_tensor_to_device
if device_map is not None and "disk" in device_map.values():
archive_file = (
......
......@@ -25,7 +25,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
BaseModelOutput,
CausalLMOutput,
......
......@@ -23,8 +23,8 @@ import torch.utils.checkpoint
from torch import nn
from ....activations import ACT2FN
from ....deepspeed import is_deepspeed_zero3_enabled
from ....file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ....integrations.deepspeed import is_deepspeed_zero3_enabled
from ....modeling_outputs import BaseModelOutput, CausalLMOutput
from ....modeling_utils import (
PreTrainedModel,
......
......@@ -29,7 +29,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import get_activation
from ...configuration_utils import PretrainedConfig
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
BaseModelOutput,
MaskedLMOutput,
......
......@@ -23,7 +23,7 @@ import torch
import torch.nn as nn
from torch.nn import LayerNorm
from ...deepspeed import is_deepspeed_available
from ...integrations.deepspeed import is_deepspeed_available
from ...modeling_outputs import ModelOutput
from ...utils import (
ContextManagers,
......
......@@ -35,7 +35,7 @@ from torch import Tensor, nn
from torch.nn import CrossEntropyLoss, LayerNorm
from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
......
......@@ -24,7 +24,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
......
......@@ -23,7 +23,7 @@ from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
......
......@@ -24,7 +24,7 @@ from torch.nn import CrossEntropyLoss
from torch.utils.checkpoint import checkpoint
from ...activations import ACT2FN
from ...deepspeed import is_deepspeed_zero3_enabled
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
MoEModelOutput,
MoEModelOutputWithPastAndCrossAttentions,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment