@@ -135,7 +135,11 @@ class DistributedDataParallel(Module):
delay_allreduce (bool, default=False): Delay all communication to the end of the backward pass. This disables overlapping communication with computation.
allreduce_trigger_params (list, optional, default=None): If supplied, should contain a list of parameters drawn from the model. Allreduces will be kicked off whenever one of these parameters receives its gradient (as opposed to when a bucket of size message_size is full). At the end of backward(), a cleanup allreduce to catch any remaining gradients will also be performed automatically. If allreduce_trigger_params is supplied, the message_size argument will be ignored.
allreduce_always_fp32 (bool, default=False): Convert any FP16 gradients to FP32 before allreducing. This can improve stability for widely scaled-out runs.
-gradient_average_split_factor (float, default=1.0): Perform the averaging of gradients over processes partially before and partially after the allreduce. Before allreduce: grads.mul_(1.0/gradient_average_split_factor). After allreduce: grads.mul_(gradient_average_split_factor/world_size). This can reduce the stress on the dynamic range of FP16 allreduces for widely scaled-out runs.
+gradient_average (bool, default=True): Option to toggle whether or not DDP averages the allreduced gradients over processes. For proper scaling, the default value of True is recommended.
+gradient_predivide_factor (float, default=1.0): Allows performing the averaging of gradients over processes partially before and partially after the allreduce. Before allreduce: grads.mul_(1.0/gradient_predivide_factor). After allreduce: grads.mul_(gradient_predivide_factor/world_size). This can reduce the stress on the dynamic range of FP16 allreduces for widely scaled-out runs.
+
+.. warning::
+    If gradient_average=False, the second step (``grads.mul_(gradient_predivide_factor/world_size)``) will be omitted.
"""
"""
...
@@ -147,7 +151,9 @@ class DistributedDataParallel(Module):
allreduce_trigger_params=None,
retain_allreduce_buffers=False,
allreduce_always_fp32=False,
-gradient_average_split_factor=1.0):
+gradient_average=True,
+gradient_predivide_factor=1.0,
+gradient_average_split_factor=None):
super(DistributedDataParallel, self).__init__()
# Backward/forward compatibility around
...
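
For reference, a minimal usage sketch with the new constructor keywords; the placeholder module and the factor value 8.0 are arbitrary, and the default process group is assumed to be initialized already (e.g. via torch.distributed.init_process_group):

    import torch.nn as nn
    from apex.parallel import DistributedDataParallel as DDP

    # Placeholder model; any nn.Module works. Assumes CUDA and the default
    # process group are already set up for this process.
    model = nn.Linear(1024, 1024).cuda()
    # gradient_average and gradient_predivide_factor are the kwargs added by
    # this change; gradient_average_split_factor remains only as a deprecated alias.
    model = DDP(model, gradient_average=True, gradient_predivide_factor=8.0)
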
@@ -164,11 +170,16 @@ class DistributedDataParallel(Module):
if shared_param is not None:
    raise ValueError("shared_param is no longer supported as an option. It was misleadingly named from the start. It turns out overlapping communication with computation should work fine with shared parameters. If you still wish to delay communication to the end of the backward pass, use delay_allreduce=True|False instead.")
+if gradient_average_split_factor is not None:
+    print("Warning: gradient_average_split_factor has been renamed to gradient_predivide_factor. For now, gradient_average_split_factor will also work, but please update to gradient_predivide_factor instead.")