# fp16 master weights/gradients only work with ZeRO-Offload + DeepSpeedCPUAdam,
# which keeps optimizer states in fp32 while master params/grads stay fp16.
# NOTE(review): the chunk contained two duplicated copies of this assertion
# (diff residue); the newer message, parameterized on self.zero_stage_string,
# is kept. Mangled whitespace ("assertself...andtype(...)in[...]") restored.
assert self.cpu_offload and type(self.optimizer) in [DeepSpeedCPUAdam], \
    f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32."\
    f"Currently only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}." \
    f"Either disable fp16_master_weights_and_gradients or enable {self.zero_stage_string} Offload with DeepSpeedCPUAdam."
if self.reduce_scatter:
    # NOTE(review): `valid_reduce_scatter_dtypes` is referenced here but defined
    # outside this chunk — confirm it exists earlier in the file. An obsolete
    # duplicate of this check (diff residue, removed) hard-coded
    # (torch.float16, torch.bfloat16).
    assert self.communication_data_type in valid_reduce_scatter_dtypes, \
        f"{self.zero_stage_string} supports {valid_reduce_scatter_dtypes} communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'"
    # Bug fix: the two messages below used {self.zero_stage_string} placeholders
    # inside plain (non-f) strings, so the stage name was never interpolated.
    assert self.gradient_predivide_factor == 1.0, \
        f"gradient_predivide_factor != 1.0 is not yet supported with {self.zero_stage_string} with reduce scatter enabled"
    assert self.postscale_gradients, \
        f"pre-scale gradients is not yet supported with {self.zero_stage_string} with reduce scatter enabled"
# param flattened by groups; each entry will hold one param group's bit16 params.
self.bit16_groups = []
...
...
@@ -272,7 +263,9 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer):
# Align NCCL all-gather send buffers to a 4-byte boundary; an unaligned
# allgather_bucket_size would break the aligned all-gather path.
assert allgather_bucket_size % self.nccl_start_alignment_factor == 0, \
    f"allgather_bucket_size must be a multiple of nccl_start_alignment_factor, {self.nccl_start_alignment_factor} "
"ZeRO Stage 1 has not been thoroughly tested with MoE. This configuration is still experimental."
)
"ZeRO Stage 1 has not been thoroughly tested with MoE. This configuration is still experimental.")
# MoE support requires reduce-scatter and at least one param group tagged as MoE.
# NOTE(review): L24 was a duplicated residue of the second assert's message
# (diff artifact) and is folded into the single assert below; mangled
# whitespace ("assertself", "forgroupin") restored. Messages unchanged.
assert self.reduce_scatter, "Reduce Scatter in ZeRO Stage 2 must be set to True for MoE. Other code paths are not tested with MoE"
assert any([self.is_moe_group(group) for group in self.optimizer.param_groups]), \
    "The model has moe layers, but None of the param groups are marked as MoE. Create a param group with 'moe' key set to True before creating optimizer"
auto_mpi_discovery: Optional (bool). If distributed environment variables are not set, attempt to discover them from MPI.
distributed_port: Optional (int). torch distributed backend port
verbose: Optional (bool). verbose logging
timeout: Optional (timedelta). Timeout for operations executed against the process group. Default value equals 30 minutes.
init_method: Optional (string). Torch distributed, URL specifying how to initialize the process group. Default is "env://" if no init_method or store is specified.