Unverified Commit 9c875839 authored by statelesshz, committed by GitHub

add ascend npu accelerator support (#24879)

* Add Ascend NPU accelerator support

* fix style warning
parent f14c7f99
......@@ -721,6 +721,7 @@ _import_structure = {
"is_tokenizers_available",
"is_torch_available",
"is_torch_neuroncore_available",
"is_torch_npu_available",
"is_torch_tpu_available",
"is_torchvision_available",
"is_vision_available",
......@@ -4643,6 +4644,7 @@ if TYPE_CHECKING:
is_tokenizers_available,
is_torch_available,
is_torch_neuroncore_available,
is_torch_npu_available,
is_torch_tpu_available,
is_torchvision_available,
is_vision_available,
......
......@@ -91,6 +91,7 @@ from .utils import (
is_torch_bf16_cpu_available,
is_torch_bf16_gpu_available,
is_torch_neuroncore_available,
is_torch_npu_available,
is_torch_tensorrt_fx_available,
is_torch_tf32_available,
is_torch_tpu_available,
......@@ -587,6 +588,26 @@ def require_torch_neuroncore(test_case):
)
def require_torch_npu(test_case):
"""
Decorator marking a test that requires NPU (in PyTorch).
"""
return unittest.skipUnless(is_torch_npu_available(), "test requires PyTorch NPU")(test_case)
def require_torch_multi_npu(test_case):
"""
Decorator marking a test that requires a multi-NPU setup (in PyTorch). These tests are skipped on a machine without
multiple NPUs.
To run *only* the multi_npu tests, assuming all test names contain multi_npu: $ pytest -sv ./tests -k "multi_npu"
"""
if not is_torch_npu_available():
return unittest.skip("test requires PyTorch NPU")(test_case)
return unittest.skipUnless(torch.npu.device_count() > 1, "test requires multiple NPUs")(test_case)
if is_torch_available():
# Set env var CUDA_VISIBLE_DEVICES="" to force cpu-mode
import torch
......
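For orientation, a minimal sketch of how the two new decorators could be applied in a test module; the class and test names below are hypothetical and not part of this commit:

import unittest

import torch

from transformers.testing_utils import require_torch_multi_npu, require_torch_npu


class NpuDecoratorExampleTest(unittest.TestCase):
    @require_torch_npu
    def test_single_npu(self):
        # Runs only when torch_npu is installed and an NPU is usable.
        x = torch.ones(2, 2).to("npu")
        self.assertEqual(x.device.type, "npu")

    @require_torch_multi_npu
    def test_multi_npu(self):
        # Skipped unless more than one NPU is visible.
        self.assertGreater(torch.npu.device_count(), 1)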
......@@ -36,6 +36,7 @@ from .utils import (
is_torch_available,
is_torch_cuda_available,
is_torch_mps_available,
is_torch_npu_available,
is_torch_tpu_available,
requires_backends,
)
......@@ -94,6 +95,8 @@ def set_seed(seed: int):
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# ^^ safe to call this function even if cuda is not available
if is_torch_npu_available():
torch.npu.manual_seed_all(seed)
if is_tf_available():
tf.random.set_seed(seed)
......
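A quick usage sketch of the updated helper (the seed value is arbitrary): a single call now also seeds the NPU generators when torch_npu is present, alongside the existing CPU/CUDA seeding.

from transformers import set_seed

# Seeds python's `random`, numpy, torch (CPU and CUDA), and, with this change,
# all visible NPU devices when torch_npu is available.
set_seed(42)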
......@@ -47,6 +47,7 @@ from .utils import (
is_torch_bf16_cpu_available,
is_torch_bf16_gpu_available,
is_torch_neuroncore_available,
is_torch_npu_available,
is_torch_tf32_available,
is_torch_tpu_available,
logging,
......@@ -1368,12 +1369,13 @@ class TrainingArguments:
self.framework == "pt"
and is_torch_available()
and (self.device.type != "cuda")
and (self.device.type != "npu")
and (get_xla_device_type(self.device) != "GPU")
and (self.fp16 or self.fp16_full_eval)
):
raise ValueError(
"FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation"
" (`--fp16_full_eval`) can only be used on CUDA devices."
" (`--fp16_full_eval`) can only be used on CUDA or NPU devices."
)
if (
......@@ -1769,6 +1771,10 @@ class TrainingArguments:
elif self.use_cpu:
device = torch.device("cpu")
self._n_gpu = 0
elif is_torch_npu_available():
device = torch.device("npu:0")
torch.npu.set_device(device)
self._n_gpu = 1
else:
# if n_gpu is > 1 we'll use nn.DataParallel.
# If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
......
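As a rough illustration of the device-resolution change (the output directory is a placeholder, and the usual training dependencies are assumed to be installed): on a host where torch_npu is installed and an NPU is visible, TrainingArguments should now resolve to npu:0 with a single device counted, unless CPU is forced.

from transformers import TrainingArguments

args = TrainingArguments(output_dir="/tmp/npu_run")  # placeholder output dir
print(args.device)  # expected: npu:0 on an Ascend NPU host
print(args.n_gpu)   # expected: 1, per the branch added above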
......@@ -164,6 +164,7 @@ from .import_utils import (
is_torch_fx_proxy,
is_torch_mps_available,
is_torch_neuroncore_available,
is_torch_npu_available,
is_torch_tensorrt_fx_available,
is_torch_tf32_available,
is_torch_tpu_available,
......
......@@ -397,6 +397,25 @@ def is_torch_neuroncore_available(check_device=True):
return False
@lru_cache()
def is_torch_npu_available(check_device=False):
"Checks if `torch_npu` is installed and potentially if a NPU is in the environment"
if not _torch_available or importlib.util.find_spec("torch_npu") is None:
return False
import torch
import torch_npu # noqa: F401
if check_device:
try:
# Will raise a RuntimeError if no NPU is found
_ = torch.npu.device_count()
return torch.npu.is_available()
except RuntimeError:
return False
return hasattr(torch, "npu") and torch.npu.is_available()
def is_torchdynamo_available():
if not is_torch_available():
return False
......
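A minimal guard pattern built on the new availability check (tensor shape and variable names are illustrative only):

import torch

from transformers.utils import is_torch_npu_available

# Fall back to CPU when torch_npu or an Ascend NPU is not present.
device = torch.device("npu:0") if is_torch_npu_available() else torch.device("cpu")
x = torch.randn(4, 4).to(device)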
......@@ -21,6 +21,7 @@ from transformers.testing_utils import (
get_torch_dist_unique_port,
require_torch_multi_gpu,
require_torch_neuroncore,
require_torch_npu,
)
from transformers.training_args import ParallelMode
from transformers.utils import logging
......@@ -77,6 +78,20 @@ class TestTrainerDistributedNeuronCore(TestCasePlus):
# successful return here == success - any errors would have caused an error in the sub-call
class TestTrainerDistributedNPU(TestCasePlus):
@require_torch_npu
def test_trainer(self):
distributed_args = f"""--nproc_per_node=2
--master_port={get_torch_dist_unique_port()}
{self.test_file_dir}/test_trainer_distributed.py
""".split()
output_dir = self.get_auto_remove_tmp_dir()
args = f"--output_dir {output_dir}".split()
cmd = ["torchrun"] + distributed_args + args
execute_subprocess_async(cmd, env=self.get_env())
# successful return here == success - any errors would have caused an error in the sub-call
class TestTrainerDistributed(TestCasePlus):
@require_torch_multi_gpu
def test_trainer(self):
......