f'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.'
f'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.'
assertmetric_dtypenotin[np.float64,np.double],"Currently floating point metric values are not supported. Please change your metric into integer values (and potentially multiply a larger coefficient to keep the precision)."
assertmetric_dtypenotin[
np.float64,np.double
],"Currently floating point metric values are not supported. Please change your metric into integer values (and potentially multiply a larger coefficient to keep the precision)."
assertoccurrence<=1,f"Parameter with name: {name} occurs multiple times in optimizer.param_groups. Make sure it only appears once to prevent undefined behaviour."
...
...
@@ -1204,9 +1101,7 @@ class DeepSpeedEngine(Module):
),'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.'
ifself.global_rank==0:
logger.warning(
"**** You are using ZeRO with an untested optimizer, proceed with caution *****"
)
logger.warning("**** You are using ZeRO with an untested optimizer, proceed with caution *****")
msg=f'You are using ZeRO-Offload with a client provided optimizer ({type(basic_optimizer)}) which in most cases will yield poor performance. Please either use deepspeed.ops.adam.DeepSpeedCPUAdam or set an optimizer in your ds-config (https://www.deepspeed.ai/docs/config-json/#optimizer-parameters). If you really want to use a custom optimizer w. ZeRO-Offload and understand the performance impacts you can also set <"zero_force_ds_cpu_optimizer": false> in your configuration file.'
@@ -1314,32 +1200,24 @@ class DeepSpeedEngine(Module):
"'max_grad_norm' is not supported as an optimizer parameter, please switch to using the deepspeed parameter 'gradient_clipping' see: https://www.deepspeed.ai/docs/config-json/#gradient-clipping for more details"
@@ -65,6 +74,7 @@ class LossScaler(LossScalerBase):
Args:
scale (float, optional, default=1.0): The loss scale.
"""
def__init__(self,scale=1):
super(LossScaler,self).__init__(scale)
...
...
@@ -102,6 +112,7 @@ class DynamicLossScaler(LossScalerBase):
scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``.
scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale.
"""
def__init__(self,
init_scale=2**32,
scale_factor=2.,
...
...
@@ -109,7 +120,8 @@ class DynamicLossScaler(LossScalerBase):
@@ -39,14 +41,14 @@ class OnebitAdam(torch.optim.Optimizer):
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
def__init__(self,
params,
deepspeed=None,
lr=1e-3,
freeze_step=100000,
bias_correction=True,
betas=(0.9,
0.999),
betas=(0.9,0.999),
eps=1e-8,
eps_inside_sqrt=False,
weight_decay=0.,
...
...
@@ -89,11 +91,12 @@ class OnebitAdam(torch.optim.Optimizer):
ifself.comm_backend_name=='nccl':
TORCH_MAJOR=int(torch.__version__.split('.')[0])
TORCH_MINOR=int(torch.__version__.split('.')[1])
assertTORCH_MAJOR>=1andTORCH_MINOR>=8,"Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend"
assert(
(TORCH_MAJOR==1andTORCH_MINOR>=8)orTORCH_MAJOR>=2
),"Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend"
assertdist.is_initialized()==True,"Please initialize the torch distributed backend."
@@ -54,14 +56,14 @@ class OnebitLamb(torch.optim.Optimizer):
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
def__init__(self,
params,
deepspeed=None,
lr=1e-3,
freeze_step=100000,
bias_correction=True,
betas=(0.9,
0.999),
betas=(0.9,0.999),
eps=1e-8,
eps_inside_sqrt=False,
weight_decay=0.,
...
...
@@ -111,11 +113,12 @@ class OnebitLamb(torch.optim.Optimizer):
ifself.comm_backend_name=='nccl':
TORCH_MAJOR=int(torch.__version__.split('.')[0])
TORCH_MINOR=int(torch.__version__.split('.')[1])
assertTORCH_MAJOR>=1andTORCH_MINOR>=8,"Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend"
assert(
(TORCH_MAJOR==1andTORCH_MINOR>=8)orTORCH_MAJOR>=2
),"Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend"
assertdist.is_initialized()==True,"Please initialize the torch distributed backend."