Unverified Commit 946bac79 authored by statelesshz, committed by GitHub

add bf16 mixed precision support for NPU (#26163)


Co-authored-by: statelesshz <jihuazhong1@huawei.com>
parent 153755ee
@@ -211,7 +211,7 @@ class TrainingArguments:
        eval_accumulation_steps (`int`, *optional*):
            Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. If
-           left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster but
+           left unset, the whole predictions are accumulated on GPU/NPU/TPU before being moved to the CPU (faster but
            requires more memory).
        eval_delay (`float`, *optional*):
            Number of epochs or steps to wait for before the first evaluation can be performed, depending on the
@@ -318,7 +318,7 @@ class TrainingArguments:
            installation](https://github.com/intel/intel-extension-for-pytorch).
        bf16 (`bool`, *optional*, defaults to `False`):
            Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher
-           NVIDIA architecture or using CPU (use_cpu). This is an experimental API and it may change.
+           NVIDIA architecture or using CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change.
        fp16 (`bool`, *optional*, defaults to `False`):
            Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training.
        fp16_opt_level (`str`, *optional*, defaults to 'O1'):
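For illustration, a minimal sketch of how the documented option might be used once this change lands; the output directory is a placeholder, and bf16 on an Ascend NPU assumes torch>=1.11 with `torch_npu` installed.

```python
from transformers import TrainingArguments

# Hypothetical usage sketch: request bf16 mixed precision on an Ascend NPU.
# Assumes torch>=1.11 and the `torch_npu` package are installed; "./out" is a placeholder.
args = TrainingArguments(
    output_dir="./out",
    bf16=True,  # bf16 mixed precision, now also accepted on Ascend NPU
)
```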
@@ -344,7 +344,7 @@ class TrainingArguments:
        local_rank (`int`, *optional*, defaults to -1):
            Rank of the process during distributed training.
        ddp_backend (`str`, *optional*):
-           The backend to use for distributed training. Must be one of `"nccl"`, `"mpi"`, `"ccl"`, `"gloo"`.
+           The backend to use for distributed training. Must be one of `"nccl"`, `"mpi"`, `"ccl"`, `"gloo"`, `"hccl"`.
        tpu_num_cores (`int`, *optional*):
            When training on TPU, the number of TPU cores (automatically passed by launcher script).
        dataloader_drop_last (`bool`, *optional*, defaults to `False`):
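A hedged sketch of selecting the newly allowed `"hccl"` choice (HCCL is the collective-communication library for Ascend NPUs); all other values below are placeholders.

```python
from transformers import TrainingArguments

# Illustrative sketch: choose HCCL, the Ascend NPU collective-communication backend,
# for distributed training. Other arguments are placeholders.
args = TrainingArguments(
    output_dir="./out",
    ddp_backend="hccl",  # newly allowed choice alongside "nccl", "mpi", "ccl", "gloo"
)
```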
@@ -855,7 +855,7 @@ class TrainingArguments:
        metadata={
            "help": (
                "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA"
-               " architecture or using CPU (use_cpu). This is an experimental API and it may change."
+               " architecture or using CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change."
            )
        },
    )
@@ -906,7 +906,7 @@ class TrainingArguments:
        default=None,
        metadata={
            "help": "The backend to be used for distributed training",
-           "choices": ["nccl", "gloo", "mpi", "ccl"],
+           "choices": ["nccl", "gloo", "mpi", "ccl", "hccl"],
        },
    )
    tpu_num_cores: Optional[int] = field(
@@ -1376,6 +1376,15 @@ class TrainingArguments:
                    raise ValueError(
                        "Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0"
                    )
+               elif is_torch_npu_available():
+                   # npu
+                   from .pytorch_utils import is_torch_greater_or_equal_than_1_11
+
+                   if not is_torch_greater_or_equal_than_1_11:
+                       raise ValueError(
+                           "Your setup doesn't support bf16/npu. You need torch>=1.11, using Ascend NPU with "
+                           "`torch_npu` installed"
+                       )
                elif not is_torch_xpu_available():
                    # xpu
                    from .pytorch_utils import is_torch_greater_or_equal_than_1_12
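A minimal standalone sketch of the same precondition the new branch enforces, written with public imports rather than the package-internal helpers the diff uses; it assumes `is_torch_npu_available` is exported from `transformers.utils`.

```python
from packaging import version

import torch
from transformers.utils import is_torch_npu_available

# Mirror the guard above: bf16 on an Ascend NPU requires torch >= 1.11 with `torch_npu` installed.
if is_torch_npu_available() and version.parse(torch.__version__) < version.parse("1.11"):
    raise ValueError("Your setup doesn't support bf16/npu. You need torch>=1.11 with `torch_npu` installed")
```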
@@ -1439,6 +1448,7 @@ class TrainingArguments:
            self.framework == "pt"
            and is_torch_available()
            and (self.device.type != "cuda")
+           and (self.device.type != "npu")
            and (self.device.type != "xpu")
            and (get_xla_device_type(self.device) != "GPU")
            and (get_xla_device_type(self.device) != "TPU")
@@ -1447,7 +1457,7 @@ class TrainingArguments:
        ):
            raise ValueError(
                "BF16 Mixed precision training with AMP (`--bf16`) and BF16 half precision evaluation"
-               " (`--bf16_full_eval`) can only be used on CUDA, XPU (with IPEX) or CPU/TPU/NeuronCore devices."
+               " (`--bf16_full_eval`) can only be used on CUDA, XPU (with IPEX), NPU or CPU/TPU/NeuronCore devices."
            )
        if self.torchdynamo is not None:
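Finally, an illustrative sketch of the flag combination the relaxed check above now permits when the training device is an NPU; the values are placeholders.

```python
from transformers import TrainingArguments

# Illustrative: bf16 training plus bf16 full evaluation, no longer rejected when device.type == "npu".
args = TrainingArguments(
    output_dir="./out",
    bf16=True,
    bf16_full_eval=True,  # run evaluation entirely in bf16
)
```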