Unverified Commit 0f8c7f98 authored by HELSON, committed by GitHub

Fixed docstring in colossalai (#171)

parent e2089c5c
@@ -37,7 +37,11 @@ class CheckpointModule(nn.Module):
def divide(numerator, denominator):
    """Only allow exact division

    :param numerator: Numerator of the division
    :param denominator: Denominator of the division
    """
    assert numerator % denominator == 0, \
        '{} is not divisible by {}'.format(numerator, denominator)
    return numerator // denominator
...
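For context, a hedged sketch of how a helper like this is typically used when sharding a tensor dimension across parallel ranks; the sizes and names below are illustrative, not from the source.

    def divide(numerator, denominator):
        assert numerator % denominator == 0, \
            '{} is not divisible by {}'.format(numerator, denominator)
        return numerator // denominator

    hidden_size = 768    # assumed example value
    world_size = 4       # assumed example value
    # An inexact split would silently corrupt shapes downstream,
    # so the assertion fails fast instead.
    per_rank = divide(hidden_size, world_size)   # 192
    divide(10, 3)   # raises AssertionError: 10 is not divisible by 3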
@@ -170,7 +170,7 @@ class VanillaPatchEmbedding(nn.Module):
@LAYERS.register_module
class VanillaClassifier(nn.Module):
    """Classifier for ViT

    :param in_features: size of each input sample
    :type in_features: int
...
@@ -11,9 +11,9 @@ from colossalai.registry import LAYERS
class LambdaWrapper(nn.Module):
    """Wrap a function to nn.Module, which takes a config of layers and can fully access them

    :param func: User-defined function
    :type func: Callable
    :param layers_cfg: Config of layers, defaults to None
    :type layers_cfg: dict, optional
    """
...
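To make the wrapper's role concrete, a minimal sketch of the pattern the docstring describes; the internals shown (in particular passing the module itself to the wrapped function so it can reach layers_cfg) are an assumption for illustration, not the library's actual code.

    import torch
    import torch.nn as nn

    class LambdaWrapperSketch(nn.Module):
        """Hypothetical stand-in: turns a plain callable into an nn.Module."""
        def __init__(self, func, layers_cfg=None):
            super().__init__()
            self.func = func
            self.layers_cfg = layers_cfg   # the wrapped func may read this

        def forward(self, *args):
            # Pass the module in so the function can access layers_cfg (assumed).
            return self.func(self, *args)

    # Usage: a reshape as a module, usable inside nn.Sequential.
    flatten = LambdaWrapperSketch(lambda module, x: torch.flatten(x, 1))
    print(flatten(torch.randn(2, 3, 4)).shape)   # torch.Size([2, 12])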
@@ -11,6 +11,9 @@ class CrossEntropyLoss2D(_Loss):
    Cross entropy loss for 2D parallelism

    :param reduction: whether to average the loss, defaults to True
    :param args: Args for loss function
    :param kwargs: Kwargs for loss function
    :type reduction: bool, optional
    """
    def __init__(self, reduction=True, *args, **kwargs):
@@ -21,6 +24,11 @@ class CrossEntropyLoss2D(_Loss):
        self.loss_kwargs = kwargs

    def forward(self, logits, targets):
        """Calculate loss between logits and targets

        :param logits: Output logits of model
        :param targets: True targets from data
        """
        loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
        if self.reduction_mean:
            loss = loss.mean()
...
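The forward logic here (and in the 2.5D and 3D variants below) is the standard per-sample/mean pattern; a single-device sketch, with the parallel reduction across process groups omitted:

    import torch
    import torch.nn.functional as F

    logits = torch.randn(8, 10)            # (batch, num_classes)
    targets = torch.randint(0, 10, (8,))   # true class indices
    # reduction='none' keeps one loss value per sample, as in forward above...
    loss = F.cross_entropy(logits, targets, reduction='none')   # shape (8,)
    # ...and reduction=True then averages, as when reduction_mean is set.
    loss = loss.mean()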
@@ -11,6 +11,9 @@ class CrossEntropyLoss2p5D(_Loss):
    Cross entropy loss for 2.5D parallelism

    :param reduction: whether to average the loss, defaults to True
    :param args: Args for loss function
    :param kwargs: Kwargs for loss function
    :type reduction: bool, optional
    """
    def __init__(self, reduction=True, *args, **kwargs):
@@ -21,6 +24,11 @@ class CrossEntropyLoss2p5D(_Loss):
        self.loss_kwargs = kwargs

    def forward(self, logits, targets):
        """Calculate loss between logits and targets

        :param logits: Output logits of model
        :param targets: True targets from data
        """
        loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
        if self.reduction_mean:
            loss = loss.mean()
...
@@ -14,6 +14,9 @@ class CrossEntropyLoss3D(_Loss):
    :type depth: int
    :param reduction: whether to average the loss, defaults to True
    :type reduction: bool, optional
    :param args: Args for loss function
    :param kwargs: Kwargs for loss function
    """
    def __init__(self, reduction=True, *args, **kwargs):
        super().__init__()
@@ -24,6 +27,11 @@ class CrossEntropyLoss3D(_Loss):
        self.loss_kwargs = kwargs

    def forward(self, logits, targets):
        """Calculate loss between logits and targets

        :param logits: Output logits of model
        :param targets: True targets from data
        """
        loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
        if self.reduction_mean:
            loss = loss.mean()
...
@@ -7,6 +7,12 @@ from colossalai.global_variables import moe_env
@LOSSES.register_module
class MoeCrossEntropyLoss(_Loss):
    """torch.nn.CrossEntropyLoss added with auxiliary loss.

    :param aux_weight: Weight of auxiliary loss in total loss
    :param args: Args in CrossEntropyLoss
    :param kwargs: Kwargs in CrossEntropyLoss
    :type aux_weight: float, optional
    """
    def __init__(self, aux_weight: float = 0.01, *args, **kwargs):
        super().__init__()
@@ -22,6 +28,14 @@ class MoeCrossEntropyLoss(_Loss):
@LOSSES.register_module
class MoeLoss(_Loss):
    """A wrapper class for any loss module to add with auxiliary loss.

    :param aux_weight: Weight of auxiliary loss in total loss
    :param loss_fn: Loss function
    :param args: Args in loss function
    :param kwargs: Kwargs in loss function
    :type aux_weight: float
    :type loss_fn: Callable
    """
    def __init__(self, aux_weight: float, loss_fn, *args, **kwargs):
        super().__init__()
...
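A hedged sketch of the aux-loss pattern these wrappers implement; `aux_loss` stands in for the load-balancing term the source reads from moe_env (a hypothetical stand-in here, not the library's API):

    import torch
    import torch.nn.functional as F

    def moe_loss_sketch(logits, targets, aux_loss, loss_fn=F.cross_entropy,
                        aux_weight=0.01):
        # total = main task loss + weighted auxiliary (load-balancing) loss
        return loss_fn(logits, targets) + aux_weight * aux_loss

    logits, targets = torch.randn(4, 10), torch.randint(0, 10, (4,))
    total = moe_loss_sketch(logits, targets, aux_loss=torch.tensor(0.5))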
@@ -38,7 +38,7 @@ class CosineAnnealingLR(_CosineAnnealingLR):
    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: Number of total training steps
    :type total_steps: int
    :param eta_min: Minimum learning rate, defaults to 0
    :type eta_min: int, optional
@@ -56,9 +56,9 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: Number of total training steps
    :type total_steps: int
    :param warmup_steps: Number of warmup steps, defaults to 0
    :type warmup_steps: int, optional
    :param eta_min: Minimum learning rate, defaults to 0
    :type eta_min: int, optional
@@ -78,9 +78,9 @@ class FlatAnnealingLR(DelayerScheduler):
    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: Number of total training steps
    :type total_steps: int
    :param pct_start: Percent of steps before starting learning rate decay
    :type pct_start: float
    :param last_epoch: The index of last epoch, defaults to -1
    :type last_epoch: int, optional
@@ -99,15 +99,16 @@ class FlatAnnealingLR(DelayerScheduler):
@LR_SCHEDULERS.register_module
class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
    """Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
    applied, and then the learning rate will be a fixed value before starting decay.

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: Number of total training steps
    :type total_steps: int
    :param warmup_steps: Number of warmup steps, defaults to 0
    :type warmup_steps: int, optional
    :param pct_start: Percent of steps before starting learning rate decay
    :type pct_start: float
    :param eta_min: Minimum learning rate, defaults to 0
    :type eta_min: int, optional
...
@@ -18,9 +18,9 @@ class DelayerScheduler(_LRScheduler):
    :param optimizer: Wrapped optimizer.
    :type optimizer: torch.optim.Optimizer
    :param delay_epochs: Number of epochs to keep the initial lr until starting to apply the scheduler
    :type delay_epochs: int
    :param after_scheduler: After target_epoch, use this scheduler (e.g. ReduceLROnPlateau)
    :type after_scheduler: torch.optim.lr_scheduler
    :param last_epoch: The index of last epoch, defaults to -1
    :type last_epoch: int, optional
@@ -61,9 +61,9 @@ class WarmupScheduler(_LRScheduler):
    :param optimizer: Wrapped optimizer.
    :type optimizer: torch.optim.Optimizer
    :param warmup_epochs: Number of epochs to linearly warm up the lr until starting to apply the scheduler
    :type warmup_epochs: int
    :param after_scheduler: After target_epoch, use this scheduler (e.g. ReduceLROnPlateau)
    :type after_scheduler: torch.optim.lr_scheduler
    :param last_epoch: The index of last epoch, defaults to -1
    :type last_epoch: int, optional
@@ -101,11 +101,11 @@ class WarmupDelayerScheduler(_LRScheduler):
    :param optimizer: Wrapped optimizer.
    :type optimizer: torch.optim.Optimizer
    :param warmup_epochs: Number of epochs to linearly warm up the lr until starting to apply the scheduler
    :type warmup_epochs: int
    :param delay_epochs: Number of epochs to keep the initial lr until starting to apply the scheduler
    :type delay_epochs: int
    :param after_scheduler: After target_epoch, use this scheduler (e.g. ReduceLROnPlateau)
    :type after_scheduler: torch.optim.lr_scheduler
    :param last_epoch: The index of last epoch, defaults to -1
    :type last_epoch: int, optional
...
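These three classes compose a prefix phase (hold or warm up) with an after_scheduler; a minimal pure-Python sketch of the resulting lr curves, assuming an illustrative base lr of 0.1:

    import math

    def warmup_then_cosine(epoch, base_lr=0.1, warmup_epochs=5,
                           total_epochs=50, eta_min=0.0):
        if epoch < warmup_epochs:
            # WarmupScheduler phase: lr climbs linearly up to base_lr.
            return base_lr * (epoch + 1) / warmup_epochs
        # Then the after_scheduler (cosine annealing here) takes over.
        t = (epoch - warmup_epochs) / (total_epochs - warmup_epochs)
        return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * t))

    # DelayerScheduler would instead hold base_lr flat for delay_epochs before
    # handing over; WarmupDelayerScheduler chains both prefix phases.
    lrs = [warmup_then_cosine(e) for e in range(50)]   # peaks at 0.1 after epoch 4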
@@ -9,9 +9,9 @@ class LinearWarmupLR(_LRScheduler):
    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: Number of total training steps
    :type total_steps: int
    :param warmup_steps: Number of warmup steps, defaults to 0
    :type warmup_steps: int, optional
    :param last_epoch: The index of last epoch, defaults to -1
    :type last_epoch: int, optional
...
@@ -12,15 +12,16 @@ class MultiStepLR(_MultiStepLR):
    number of epoch reaches one of the milestones. Notice that such decay can
    happen simultaneously with other changes to the learning rate from outside
    this scheduler. When last_epoch=-1, sets initial lr as lr.

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: Number of total training steps
    :type total_steps: int
    :param milestones: List of epoch indices. Must be increasing, defaults to None
    :type milestones: List[int], optional
    :param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
    :type gamma: float, optional
    :param num_steps_per_epoch: Number of steps per epoch, defaults to -1
    :type num_steps_per_epoch: int, optional
    :param last_epoch: The index of last epoch, defaults to -1
    :type last_epoch: int, optional
@@ -33,17 +34,18 @@ class MultiStepLR(_MultiStepLR):
@LR_SCHEDULERS.register_module
class MultiStepWarmupLR(WarmupScheduler):
    """Multi-step learning rate scheduler with warmup.

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: Number of total training steps
    :type total_steps: int
    :param warmup_steps: Number of warmup steps, defaults to 0
    :type warmup_steps: int, optional
    :param milestones: List of epoch indices. Must be increasing, defaults to None
    :type milestones: List[int], optional
    :param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
    :type gamma: float, optional
    :param num_steps_per_epoch: Number of steps per epoch, defaults to -1
    :type num_steps_per_epoch: int, optional
    :param last_epoch: The index of last epoch, defaults to -1
    :type last_epoch: int, optional
...
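A quick illustration of the milestones/gamma semantics, using the stock PyTorch MultiStepLR these classes build on (milestone values are illustrative):

    import torch
    from torch.optim.lr_scheduler import MultiStepLR

    opt = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=0.1)
    sched = MultiStepLR(opt, milestones=[30, 80], gamma=0.1)
    for epoch in range(100):
        opt.step()     # training step elided
        sched.step()
        # lr is 0.1 for epochs 0-29, 0.01 for 30-79, 0.001 from epoch 80 on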
@@ -17,19 +17,20 @@ class OneCycleLR(_OneCycleLR):
    This scheduler is not chainable.
    Note also that the total number of steps in the cycle can be determined in one
    of two ways (listed in order of precedence):

    * A value for total_steps is explicitly provided.
    * A number of epochs (epochs) and a number of steps per epoch (steps_per_epoch) are provided.
      In this case, the number of total steps is inferred by total_steps = epochs * steps_per_epoch

    You must either provide a value for total_steps or provide a value for both
    epochs and steps_per_epoch.
    The default behaviour of this scheduler follows the fastai implementation of 1cycle, which
    claims that "unpublished work has shown even better results by using only two phases". To
    mimic the behaviour of the original paper instead, set ``three_phase=True``.

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: Number of total training steps
    :type total_steps: int
    :param pct_start: The percentage of the cycle (in number of steps) spent increasing the learning rate, defaults to 0.3
    :type pct_start: float, optional
@@ -64,6 +65,7 @@ class OneCycleLR(_OneCycleLR):
    number of *batches* computed, not the total number of epochs computed.
    When last_epoch=-1, the schedule is started from the beginning, defaults to -1
    :type last_epoch: int, optional

    .. _Super-Convergence\: Very Fast Training of Neural Networks Using Large Learning Rates:
        https://arxiv.org/abs/1708.07120
    """
...
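A small check of the precedence rule above, using the stock PyTorch OneCycleLR this class wraps: when total_steps is omitted, it is inferred as epochs * steps_per_epoch.

    import torch
    from torch.optim.lr_scheduler import OneCycleLR

    opt = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=0.1)
    sched = OneCycleLR(opt, max_lr=0.1, epochs=10, steps_per_epoch=100)
    assert sched.total_steps == 10 * 100   # inferred as 1000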
@@ -7,13 +7,14 @@ from .delayed import WarmupScheduler
@LR_SCHEDULERS.register_module
class PolynomialLR(_LRScheduler):
    """Polynomial learning rate scheduler.

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: Number of total training steps
    :type total_steps: int
    :param end_lr: Minimum learning rate, defaults to 0.0001
    :type end_lr: float, optional
    :param power: The power of polynomial, defaults to 1.0
    :type power: float, optional
    :param last_epoch: The index of last epoch, defaults to -1
    :type last_epoch: int, optional
@@ -42,15 +43,16 @@ class PolynomialLR(_LRScheduler):
@LR_SCHEDULERS.register_module
class PolynomialWarmupLR(WarmupScheduler):
    """Polynomial learning rate scheduler with warmup.

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: Number of total training steps
    :type total_steps: int
    :param warmup_steps: Number of warmup steps, defaults to 0
    :type warmup_steps: int, optional
    :param end_lr: Minimum learning rate, defaults to 0.0001
    :type end_lr: float, optional
    :param power: The power of polynomial, defaults to 1.0
    :type power: float, optional
    :param last_epoch: The index of last epoch, defaults to -1
    :type last_epoch: int, optional
...
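A hedged sketch of the decay curve a polynomial scheduler of this shape typically follows (the exact formula in the source may differ slightly): the lr falls from base_lr to the end_lr floor along (1 - t/T)**power.

    def polynomial_lr(step, base_lr=0.1, end_lr=0.0001,
                      total_steps=1000, power=1.0):
        step = min(step, total_steps)
        return (base_lr - end_lr) * (1 - step / total_steps) ** power + end_lr

    polynomial_lr(0)      # 0.1
    polynomial_lr(500)    # ~0.05: power=1.0 reduces to linear decay
    polynomial_lr(1000)   # 0.0001, the end_lr floor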
@@ -10,16 +10,15 @@ from colossalai.registry import LR_SCHEDULERS
class LambdaLR(_LambdaLR):
    """Sets the learning rate of each parameter group to the initial lr
    times a given function. When last_epoch=-1, sets initial lr as lr.

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: Number of total training steps
    :type total_steps: int
    :param lr_lambda: A function which computes a multiplicative
        factor given an integer parameter epoch, or a list of such
        functions, one for each group in optimizer.param_groups, defaults to None
    :type lr_lambda: function or list, optional
    :param last_epoch: The index of last epoch, defaults to -1
    :type last_epoch: int, optional
    """
@@ -32,16 +31,15 @@ class LambdaLR(_LambdaLR):
class MultiplicativeLR(_MultiplicativeLR):
    """Multiply the learning rate of each parameter group by the factor given
    in the specified function. When last_epoch=-1, sets initial lr as lr

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: Number of total training steps
    :type total_steps: int
    :param lr_lambda: A function which computes a multiplicative
        factor given an integer parameter epoch, or a list of such
        functions, one for each group in optimizer.param_groups, defaults to None
    :type lr_lambda: function or list, optional
    :param last_epoch: The index of last epoch, defaults to -1
    :type last_epoch: int, optional
    """
@@ -56,16 +54,15 @@ class StepLR(_StepLR):
    step_size epochs. Notice that such decay can happen simultaneously with
    other changes to the learning rate from outside this scheduler. When
    last_epoch=-1, sets initial lr as lr

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: Number of total training steps
    :type total_steps: int
    :param step_size: Period of learning rate decay, defaults to 1
    :type step_size: int, optional
    :param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
    :type gamma: float, optional
    :param last_epoch: The index of last epoch, defaults to -1
    :type last_epoch: int, optional
    """
@@ -79,14 +76,13 @@ class StepLR(_StepLR):
class ExponentialLR(_ExponentialLR):
    """Decays the learning rate of each parameter group by gamma every epoch.
    When last_epoch=-1, sets initial lr as lr

    :param optimizer: Wrapped optimizer
    :type optimizer: torch.optim.Optimizer
    :param total_steps: Number of total training steps
    :type total_steps: int
    :param gamma: Multiplicative factor of learning rate decay, defaults to 1.0
    :type gamma: float, optional
    :param last_epoch: The index of last epoch, defaults to -1
    :type last_epoch: int, optional
    """
...
@@ -6,16 +6,17 @@ from ._utils import calc_acc
class Accuracy2D(nn.Module):
    """Accuracy for 2D parallelism
    """
    def __init__(self):
        super().__init__()

    def forward(self, logits, targets):
        """Calculate the accuracy of predicted labels.

        :param logits: Predicted labels
        :param targets: True labels from data
        """
        with torch.no_grad():
            correct = calc_acc(logits, targets)
            correct = reduce_by_batch_2d.apply(correct)
...
@@ -6,16 +6,17 @@ from ._utils import calc_acc
class Accuracy2p5D(nn.Module):
    """Accuracy for 2.5D parallelism
    """
    def __init__(self):
        super().__init__()

    def forward(self, logits, targets):
        """Calculate the accuracy of predicted labels.

        :param logits: Predicted labels
        :param targets: True labels from data
        """
        with torch.no_grad():
            correct = calc_acc(logits, targets)
            correct = reduce_by_batch_2p5d.apply(correct)
...
@@ -8,11 +8,7 @@ from ._utils import calc_acc
class Accuracy3D(nn.Module):
    """Accuracy for 3D parallelism
    """
    def __init__(self):
        super().__init__()
@@ -20,6 +16,11 @@ class Accuracy3D(nn.Module):
        self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)

    def forward(self, logits, targets):
        """Calculate the accuracy of predicted labels.

        :param logits: Predicted labels
        :param targets: True labels from data
        """
        with torch.no_grad():
            correct = calc_acc(logits, targets)
            correct = reduce_by_batch_3d.apply(correct, self.input_parallel_mode, self.weight_parallel_mode)
...
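A single-device sketch of what a helper like calc_acc plausibly computes in the three Accuracy modules above: the count of argmax predictions matching the targets, which the parallel versions then sum across the batch-parallel group.

    import torch

    def calc_acc_sketch(logits, targets):
        preds = torch.argmax(logits, dim=-1)
        return (preds == targets).sum()

    logits = torch.randn(8, 10)
    targets = torch.randint(0, 10, (8,))
    correct = calc_acc_sketch(logits, targets)   # count of correct predictions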
@@ -30,8 +30,8 @@ class FusedLAMB(torch.optim.Optimizer):
    betas (Tuple[float, float], optional): coefficients used for computing
        running averages of gradient and its norm. (default: (0.9, 0.999))
    eps (float, optional): term added to the denominator to improve
        numerical stability. (default: 1e-6)
    weight_decay (float, optional): weight decay (L2 penalty) (default: 0.01)
    amsgrad (boolean, optional): whether to use the AMSGrad variant of this
        algorithm from the paper `On the Convergence of Adam and Beyond`_
        NOT SUPPORTED now! (default: False)
...
@@ -20,7 +20,7 @@ class Lamb(Optimizer):
    betas (Tuple[float, float], optional): coefficients used for computing
        running averages of gradient and its square (default: (0.9, 0.999))
    eps (float, optional): term added to the denominator to improve
        numerical stability (default: 1e-6)
    weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
    adam (bool, optional): always use trust ratio = 1, which turns this into
        Adam. Useful for comparison purposes.
...
@@ -16,7 +16,7 @@ class Lars(Optimizer):
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        momentum (float, optional): momentum factor (default: 0)
        eeta (float, optional): LARS coefficient as used in the paper (default: 1e-3)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
...
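A hedged sketch of the layer-wise trust ratio the eeta coefficient controls, following the LARS paper the docstring cites; the real optimizer fuses this into its update step, so this is illustrative only.

    import torch

    def lars_local_lr(weight, grad, lr=1e-3, eeta=1e-3, weight_decay=0.0):
        w_norm, g_norm = weight.norm(), grad.norm()
        if w_norm > 0 and g_norm > 0:
            # Trust ratio: eeta * ||w|| / (||g|| + weight_decay * ||w||)
            return lr * eeta * w_norm / (g_norm + weight_decay * w_norm)
        return lr

    w, g = torch.randn(100), torch.randn(100)
    step_size = lars_local_lr(w, g)   # per-layer adapted learning rate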