"docs/source/en/model_doc/marian.md" did not exist on "b9a768b3ffa80c4c19d024f9f42d5917e7d8109e"
Unverified commit 21a77242, authored by Sourab Mangrulkar and committed by GitHub.

Migrate HFDeepSpeedConfig from trfrs to accelerate (#17623)



* Migrate HFDeepSpeedConfig from trfrs to accelerate

* add `accelerate` to testing dep

* addressing comments

* addressing comments

Using `_shared_state` and avoiding object creation. This is necessary because `notebook_launcher` in `launchers.py` checks `len(AcceleratorState._shared_state) > 0` to decide whether to raise an error.
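A rough sketch of the check being described (assuming accelerate's internal borg-style `AcceleratorState`, whose `_shared_state` class dict is shared by all instances; not part of this diff):

```python
from accelerate.state import AcceleratorState

# Reading the class-level dict directly avoids constructing an AcceleratorState,
# which would itself populate _shared_state and trip checks like the one in
# notebook_launcher.
if len(AcceleratorState._shared_state) > 0:
    raise ValueError("An AcceleratorState has already been initialized.")
```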

* resolving comments

1. Use the simple API from accelerate to manage the DeepSpeed config integration
2. Update the related documentation

* reverting changes and addressing comments

* docstring correction

* addressing nits

* addressing nits

* addressing nits 3

* bumping up the accelerate version to 0.10.0

* resolving import

* update setup.py to include deepspeed dependencies

* Update dependency_versions_table.py

* fixing imports

* reverting changes to CI dependencies for "run_tests_pipelines_tf*" tests

These changes didn't help with resolving the failures and I believe this needs to be addressed in another PR.

* removing `accelerate` as hard dependency

Resolves issues related to CI Tests

* adding `accelerate` as dependency for building docs

resolves failure in Build PR Documentation test

* adding `accelerate` as dependency in "dev" to resolve doc build issue

* resolving comments

1. adding `accelerate` to extras["all"]
2. Also check that accelerate is available before importing HfDeepSpeedConfig from it
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* resolving comments
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
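For context, a minimal sketch of how `HfDeepSpeedConfig` is typically used outside the `Trainer` (per the DeepSpeed integration docs; assumes `accelerate>=0.10.0` and `deepspeed` are installed, and the config dict is illustrative):

```python
from transformers import AutoModel
from transformers.deepspeed import HfDeepSpeedConfig

# Illustrative ZeRO-3 config; a real one also needs optimizer, batch size, etc.
ds_config = {"zero_optimization": {"stage": 3}, "train_micro_batch_size_per_gpu": 1}

# Keep a live reference *before* instantiating the model, so that
# is_deepspeed_zero3_enabled() is True while from_pretrained() runs.
dschf = HfDeepSpeedConfig(ds_config)
model = AutoModel.from_pretrained("gpt2")
```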
--- a/setup.py
+++ b/setup.py
@@ -97,7 +97,7 @@ if stale_egg_info.exists():
 # 2. once modified, run: `make deps_table_update` to update src/transformers/dependency_versions_table.py
 _deps = [
     "Pillow",
-    "accelerate>=0.9.0",
+    "accelerate>=0.10.0",
     "black~=22.0,>=22.3",
     "codecarbon==1.2.0",
     "cookiecutter==1.7.3",
@@ -242,6 +242,7 @@ extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx")
 extras["tf-cpu"] = deps_list("tensorflow-cpu", "onnxconverter-common", "tf2onnx")
 extras["torch"] = deps_list("torch")
+extras["accelerate"] = deps_list("accelerate")
 
 if os.name == "nt":  # windows
     extras["retrieval"] = deps_list("datasets")  # faiss is not supported on windows
@@ -257,7 +258,7 @@ extras["onnx"] = deps_list("onnxconverter-common", "tf2onnx") + extras["onnxruntime"]
 extras["modelcreation"] = deps_list("cookiecutter")
 
 extras["sagemaker"] = deps_list("sagemaker")
-extras["deepspeed"] = deps_list("deepspeed")
+extras["deepspeed"] = deps_list("deepspeed") + extras["accelerate"]
 extras["fairscale"] = deps_list("fairscale")
 extras["optuna"] = deps_list("optuna")
 extras["ray"] = deps_list("ray[tune]")
@@ -295,7 +296,7 @@ extras["testing"] = (
         "hf-doc-builder",
         "protobuf",  # Can be removed once we can unpin protobuf
         "sacremoses",
-        "rjieba"
+        "rjieba",
     )
     + extras["retrieval"]
     + extras["modelcreation"]
@@ -316,6 +317,7 @@ extras["all"] = (
     + extras["integrations"]
     + extras["timm"]
     + extras["codecarbon"]
+    + extras["accelerate"]
 )
 
 # Might need to add doc-builder and some specific deps in the future
@@ -325,8 +327,8 @@ extras["docs_specific"] = ["hf-doc-builder"]
 extras["docs"] = extras["all"] + extras["docs_specific"]
 
 extras["dev-torch"] = (
-    extras['testing']
-    + extras['torch']
+    extras["testing"]
+    + extras["torch"]
     + extras["sentencepiece"]
     + extras["tokenizers"]
     + extras["torch-speech"]
@@ -342,8 +344,8 @@ extras["dev-torch"] = (
     + extras["onnxruntime"]
 )
 extras["dev-tensorflow"] = (
-    extras['testing']
-    + extras['tf']
+    extras["testing"]
+    + extras["tf"]
     + extras["sentencepiece"]
     + extras["tokenizers"]
     + extras["vision"]
--- a/src/transformers/deepspeed.py
+++ b/src/transformers/deepspeed.py
@@ -16,14 +16,12 @@ Integration with Deepspeed
"""
import importlib.util
import io
import json
import weakref
from copy import deepcopy
from functools import partialmethod
from .dependency_versions_check import dep_version_check
from .utils import is_torch_available, logging
from .utils import is_accelerate_available, is_torch_available, logging
if is_torch_available():
@@ -36,7 +34,15 @@ def is_deepspeed_available():
     return importlib.util.find_spec("deepspeed") is not None
 
 
-class HfDeepSpeedConfig:
+if is_accelerate_available() and is_deepspeed_available():
+    from accelerate.utils.deepspeed import HfDeepSpeedConfig as DeepSpeedConfig
+else:
+    # Inherits from a dummy `object` if accelerate is not available, so that python succeeds to import this file.
+    # Deepspeed glue code will never inherit this dummy object as it checks if accelerate is available.
+    from builtins import object as DeepSpeedConfig
+
+
+class HfDeepSpeedConfig(DeepSpeedConfig):
     """
     This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.
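The `if/else` import above is a conditional-base-class pattern; here is a standalone sketch of the same idea with hypothetical names (the module imports cleanly either way, and the fallback branch runs whenever the optional dependency is absent):

```python
import importlib.util

# Pick the real base class when the optional dependency is installed,
# otherwise fall back to `object` so this module still imports cleanly.
if importlib.util.find_spec("some_optional_lib") is not None:  # hypothetical dep
    from some_optional_lib import RealBase as _Base  # hypothetical import
else:
    _Base = object

class GlueConfig(_Base):  # only meaningfully usable when the dep is installed
    pass
```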
@@ -56,108 +62,9 @@ class HfDeepSpeedConfig:
 
     def __init__(self, config_file_or_dict):
         # set global weakref object
         set_hf_deepspeed_config(self)
-
+        dep_version_check("accelerate")
         dep_version_check("deepspeed")
-
-        if isinstance(config_file_or_dict, dict):
-            # Don't modify user's data should they want to reuse it (e.g. in tests), because once we
-            # modified it, it will not be accepted here again, since `auto` values would have been overridden
-            config = deepcopy(config_file_or_dict)
-        elif isinstance(config_file_or_dict, str):
-            with io.open(config_file_or_dict, "r", encoding="utf-8") as f:
-                config = json.load(f)
-        else:
-            raise ValueError("expecting either a path to a DeepSpeed config file or a pre-populated dict")
-        self.config = config
-
-        # zero stage - this is done as early as possible, before model is created, to allow
-        # ``is_deepspeed_zero3_enabled`` query and getting to the early deepspeed config object
-        # during ``zero.Init()`` which needs to know the dtype, and some other hparams.
-        self._stage = self.get_value("zero_optimization.stage", -1)
-
-        # offload
-        self._offload = False
-        if self.is_zero2() or self.is_zero3():
-            offload_devices_valid = set(["cpu", "nvme"])
-            offload_devices = set(
-                [
-                    self.get_value("zero_optimization.offload_optimizer.device"),
-                    self.get_value("zero_optimization.offload_param.device"),
-                ]
-            )
-            if len(offload_devices & offload_devices_valid) > 0:
-                self._offload = True
-
-    def find_config_node(self, ds_key_long):
-        config = self.config
-
-        # find the config node of interest if it exists
-        nodes = ds_key_long.split(".")
-        ds_key = nodes.pop()
-        for node in nodes:
-            config = config.get(node)
-            if config is None:
-                return None, ds_key
-
-        return config, ds_key
-
-    def get_value(self, ds_key_long, default=None):
-        """
-        Returns the set value or `default` if no value is set
-        """
-        config, ds_key = self.find_config_node(ds_key_long)
-        if config is None:
-            return default
-        return config.get(ds_key, default)
-
-    def del_config_sub_tree(self, ds_key_long, must_exist=False):
-        """
-        Deletes a sub-section of the config file if it's found.
-
-        Unless `must_exist` is `True` the section doesn't have to exist.
-        """
-        config = self.config
-
-        # find the config node of interest if it exists
-        nodes = ds_key_long.split(".")
-        for node in nodes:
-            parent_config = config
-            config = config.get(node)
-            if config is None:
-                if must_exist:
-                    raise ValueError(f"Can't find {ds_key_long} entry in the config: {self.config}")
-                else:
-                    return
-
-        # if found remove it
-        if parent_config is not None:
-            parent_config.pop(node)
-
-    def is_true(self, ds_key_long):
-        """
-        Returns `True`/``False` only if the value is set, always `False` otherwise. So use this method to ask the very
-        specific question of whether the value is set to `True` (and it's not set to `False`` or isn't set).
-        """
-        value = self.get_value(ds_key_long)
-        return False if value is None else bool(value)
-
-    def is_false(self, ds_key_long):
-        """
-        Returns `True`/``False` only if the value is set, always `False` otherwise. So use this method to ask the very
-        specific question of whether the value is set to `False` (and it's not set to `True`` or isn't set).
-        """
-        value = self.get_value(ds_key_long)
-        return False if value is None else not bool(value)
-
-    def is_zero2(self):
-        return self._stage == 2
-
-    def is_zero3(self):
-        return self._stage == 3
-
-    def is_offload(self):
-        return self._offload
+        super().__init__(config_file_or_dict)
 
 
 class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
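The removed helpers (`get_value`, `is_zero2`/`is_zero3`, `is_offload`, `del_config_sub_tree`, ...) live on in accelerate's base class, so the subclass keeps the same query API. A minimal sketch, assuming `accelerate>=0.10.0` and `deepspeed` are installed and using an illustrative config:

```python
from transformers.deepspeed import HfDeepSpeedConfig

# Illustrative ZeRO-3 config with CPU parameter offload.
ds_config = {"zero_optimization": {"stage": 3, "offload_param": {"device": "cpu"}}}

hf_ds_cfg = HfDeepSpeedConfig(ds_config)
assert hf_ds_cfg.get_value("zero_optimization.stage") == 3
assert hf_ds_cfg.is_zero3()
assert hf_ds_cfg.is_offload()
```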
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -3,7 +3,7 @@
 # 2. run `make deps_table_update``
 deps = {
     "Pillow": "Pillow",
-    "accelerate": "accelerate>=0.9.0",
+    "accelerate": "accelerate>=0.10.0",
     "black": "black~=22.0,>=22.3",
     "codecarbon": "codecarbon==1.2.0",
     "cookiecutter": "cookiecutter==1.7.3",
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -35,6 +35,7 @@ from .utils import (
     ExplicitEnum,
     cached_property,
     get_full_repo_name,
+    is_accelerate_available,
     is_sagemaker_dp_enabled,
     is_sagemaker_mp_enabled,
     is_torch_available,
@@ -1163,6 +1164,8 @@ class TrainingArguments:
         if self.deepspeed:
             # - must be run very last in arg parsing, since it will use a lot of these settings.
             # - must be run before the model is created.
+            if not is_accelerate_available():
+                raise ValueError("--deepspeed requires Accelerate to be installed: `pip install accelerate`.")
             from transformers.deepspeed import HfTrainerDeepSpeedConfig
 
             # will be used later by the Trainer
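A hypothetical snippet showing where the new guard fires: building `TrainingArguments` with a `deepspeed` config now raises early when accelerate is missing (`ds_config.json` is a placeholder path):

```python
from transformers import TrainingArguments

# __post_init__ parses the deepspeed arg last; without accelerate installed
# it now raises ValueError instead of failing later on an import.
args = TrainingArguments(output_dir="out", deepspeed="ds_config.json")
```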