Unverified Commit 9c875839 authored by statelesshz, committed by GitHub

add ascend npu accelerator support (#24879)

* Add Ascend NPU accelerator support

* fix style warning
parent f14c7f99
......@@ -721,6 +721,7 @@ _import_structure = {
"is_tokenizers_available",
"is_torch_available",
"is_torch_neuroncore_available",
"is_torch_npu_available",
"is_torch_tpu_available",
"is_torchvision_available",
"is_vision_available",
......@@ -4643,6 +4644,7 @@ if TYPE_CHECKING:
is_tokenizers_available,
is_torch_available,
is_torch_neuroncore_available,
is_torch_npu_available,
is_torch_tpu_available,
is_torchvision_available,
is_vision_available,
......
......@@ -91,6 +91,7 @@ from .utils import (
is_torch_bf16_cpu_available,
is_torch_bf16_gpu_available,
is_torch_neuroncore_available,
is_torch_npu_available,
is_torch_tensorrt_fx_available,
is_torch_tf32_available,
is_torch_tpu_available,
......@@ -587,6 +588,26 @@ def require_torch_neuroncore(test_case):
)
def require_torch_npu(test_case):
"""
Decorator marking a test that requires NPU (in PyTorch).
"""
return unittest.skipUnless(is_torch_npu_available(), "test requires PyTorch NPU")(test_case)
def require_torch_multi_npu(test_case):
"""
Decorator marking a test that requires a multi-NPU setup (in PyTorch). These tests are skipped on a machine without
multiple NPUs.
To run *only* the multi_npu tests, assuming all test names contain multi_npu: $ pytest -sv ./tests -k "multi_npu"
"""
if not is_torch_npu_available():
return unittest.skip("test requires PyTorch NPU")(test_case)
return unittest.skipUnless(torch.npu.device_count() > 1, "test requires multiple NPUs")(test_case)
if is_torch_available():
# Set env var CUDA_VISIBLE_DEVICES="" to force cpu-mode
import torch
......
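For orientation, a minimal sketch of how the two new decorators could be applied in a test module; the class and test names below are hypothetical and not part of this commit:

import unittest

import torch

from transformers.testing_utils import require_torch_multi_npu, require_torch_npu


class NpuDecoratorExampleTest(unittest.TestCase):
    @require_torch_npu
    def test_single_npu(self):
        # Runs only when torch_npu is installed and an NPU is usable.
        x = torch.ones(2, 2).to("npu")
        self.assertEqual(x.device.type, "npu")

    @require_torch_multi_npu
    def test_multi_npu(self):
        # Skipped unless more than one NPU is visible.
        self.assertGreater(torch.npu.device_count(), 1)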
......@@ -36,6 +36,7 @@ from .utils import (
is_torch_available,
is_torch_cuda_available,
is_torch_mps_available,
is_torch_npu_available,
is_torch_tpu_available,
requires_backends,
)
......@@ -94,6 +95,8 @@ def set_seed(seed: int):
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# ^^ safe to call this function even if cuda is not available
if is_torch_npu_available():
torch.npu.manual_seed_all(seed)
if is_tf_available():
tf.random.set_seed(seed)
......
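A quick usage sketch of the updated helper (the seed value is arbitrary): a single call now also seeds the NPU generators when torch_npu is present, alongside the existing CPU/CUDA seeding.

from transformers import set_seed

# Seeds python's `random`, numpy, torch (CPU and CUDA), and, with this change,
# all visible NPU devices when torch_npu is available.
set_seed(42)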
......@@ -47,6 +47,7 @@ from .utils import (
is_torch_bf16_cpu_available,
is_torch_bf16_gpu_available,
is_torch_neuroncore_available,
is_torch_npu_available,
is_torch_tf32_available,
is_torch_tpu_available,
logging,
......@@ -1368,12 +1369,13 @@ class TrainingArguments:
self.framework == "pt"
and is_torch_available()
and (self.device.type != "cuda")
and (self.device.type != "npu")
and (get_xla_device_type(self.device) != "GPU")
and (self.fp16 or self.fp16_full_eval)
):
raise ValueError(
"FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation"
" (`--fp16_full_eval`) can only be used on CUDA devices."
" (`--fp16_full_eval`) can only be used on CUDA or NPU devices."
)
if (
......@@ -1769,6 +1771,10 @@ class TrainingArguments:
elif self.use_cpu:
device = torch.device("cpu")
self._n_gpu = 0
elif is_torch_npu_available():
device = torch.device("npu:0")
torch.npu.set_device(device)
self._n_gpu = 1
else:
# if n_gpu is > 1 we'll use nn.DataParallel.
# If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
......
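As a rough illustration of the device-resolution change (the output directory is a placeholder, and the usual training dependencies are assumed to be installed): on a host where torch_npu is installed and an NPU is visible, TrainingArguments should now resolve to npu:0 with a single device counted, unless CPU is forced.

from transformers import TrainingArguments

args = TrainingArguments(output_dir="/tmp/npu_run")  # placeholder output dir
print(args.device)  # expected: npu:0 on an Ascend NPU host
print(args.n_gpu)   # expected: 1, per the branch added above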
......@@ -164,6 +164,7 @@ from .import_utils import (
is_torch_fx_proxy,
is_torch_mps_available,
is_torch_neuroncore_available,
is_torch_npu_available,
is_torch_tensorrt_fx_available,
is_torch_tf32_available,
is_torch_tpu_available,
......
......@@ -397,6 +397,25 @@ def is_torch_neuroncore_available(check_device=True):
return False
@lru_cache()
def is_torch_npu_available(check_device=False):
"Checks if `torch_npu` is installed and potentially if a NPU is in the environment"
if not _torch_available or importlib.util.find_spec("torch_npu") is None:
return False
import torch
import torch_npu # noqa: F401
if check_device:
try:
# Will raise a RuntimeError if no NPU is found
_ = torch.npu.device_count()
return torch.npu.is_available()
except RuntimeError:
return False
return hasattr(torch, "npu") and torch.npu.is_available()
def is_torchdynamo_available():
if not is_torch_available():
return False
......
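A minimal guard pattern built on the new availability check (tensor shape and variable names are illustrative only):

import torch

from transformers.utils import is_torch_npu_available

# Fall back to CPU when torch_npu or an Ascend NPU is not present.
device = torch.device("npu:0") if is_torch_npu_available() else torch.device("cpu")
x = torch.randn(4, 4).to(device)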
......@@ -21,6 +21,7 @@ from transformers.testing_utils import (
get_torch_dist_unique_port,
require_torch_multi_gpu,
require_torch_neuroncore,
require_torch_npu,
)
from transformers.training_args import ParallelMode
from transformers.utils import logging
......@@ -77,6 +78,20 @@ class TestTrainerDistributedNeuronCore(TestCasePlus):
# successful return here == success - any errors would have caused an error in the sub-call
class TestTrainerDistributedNPU(TestCasePlus):
@require_torch_npu
def test_trainer(self):
distributed_args = f"""--nproc_per_node=2
--master_port={get_torch_dist_unique_port()}
{self.test_file_dir}/test_trainer_distributed.py
""".split()
output_dir = self.get_auto_remove_tmp_dir()
args = f"--output_dir {output_dir}".split()
cmd = ["torchrun"] + distributed_args + args
execute_subprocess_async(cmd, env=self.get_env())
# successful return here == success - any errors would have caused an error in the sub-call
class TestTrainerDistributed(TestCasePlus):
@require_torch_multi_gpu
def test_trainer(self):
......