Commit ec5086c4 authored by Liang Bowen, committed by アマデウス

Refactored docstrings to Google style

parent 53b1b6e3
......@@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
@LOSSES.register_module
class CrossEntropyLoss2p5D(_Loss):
"""
Cross entropy loss for 2.5D parallelism
r"""Cross entropy loss for 2.5D parallelism
Args:
reduction (bool, optional): whether to average the loss, defaults to True.
:param reduction: whether to average the loss, defaults to True
:param args: Args for loss function
:param kwargs: Kwargs for loss function
The ``args`` and ``kwargs`` should include parameters below:
::
:type reduction: bool, optional
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
label_smoothing (float, optional)
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
"""
def __init__(self, reduction=True, *args, **kwargs):
super().__init__()
......@@ -30,10 +38,11 @@ class CrossEntropyLoss2p5D(_Loss):
self.loss_kwargs = kwargs
def forward(self, logits, targets):
"""Calculate loss between logits and targets
"""Calculate loss between logits and targets.
:param logits: Output logits of model
:param targets: True targets from data
Args:
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
"""
targets = split_tensor_2p5d(targets)
loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
......@@ -115,19 +124,19 @@ class VocabParallelCrossEntropyLoss2p5D(_Loss):
"""
Vocab parallel cross entropy loss for 2.5D parallelism
:param reduction: whether to average the loss, defaults to True
:type reduction: bool, optional
Args:
reduction (bool, optional): whether to average the loss, defaults to True.
"""
def __init__(self, reduction=True):
super().__init__()
self.reduction_mean = reduction
def forward(self, logits, targets):
"""Calculate loss between logits and targets
"""Calculate loss between logits and targets.
:param logits: Output logits of model
:param targets: True targets from data
Args:
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
"""
targets = split_tensor_2p5d(targets)
loss = _VocabParallelCrossEntropy2p5D.apply(logits, targets)
......
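A minimal usage sketch for the 2.5D loss documented above (not part of this commit). It assumes a 2.5D tensor-parallel context has already been initialized, e.g. via colossalai.launch with a config such as tensor=dict(mode='2.5d', depth=1, size=4), and that the class is importable from colossalai.nn.loss; names and paths may differ by version.

import torch
from colossalai.nn.loss import CrossEntropyLoss2p5D  # import path assumed

# extra kwargs (weight, ignore_index, label_smoothing, ...) are forwarded to cross_entropy
criterion = CrossEntropyLoss2p5D(reduction=True, label_smoothing=0.1)
logits = torch.randn(8, 10, requires_grad=True)   # would come from a 2.5D-parallel model in practice
targets = torch.randint(0, 10, (8,))
loss = criterion(logits, targets)                 # targets are split across the 2.5D mesh inside forward()
loss.backward()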
......@@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
@LOSSES.register_module
class CrossEntropyLoss3D(_Loss):
"""
Cross entropy loss for 3D parallelism
r"""Cross entropy loss for 3D parallelism.
Args:
reduction (bool, optional): whether to average the loss, defaults to True.
The ``args`` and ``kwargs`` should include parameters below:
::
:param reduction: whether to average the loss, defaults to True
:param args: Args for loss function
:param kwargs: Kwargs for loss function
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
label_smoothing (float, optional)
:type reduction: bool, optional
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
"""
def __init__(self, reduction=True, *args, **kwargs):
......@@ -32,10 +40,11 @@ class CrossEntropyLoss3D(_Loss):
self.loss_kwargs = kwargs
def forward(self, logits, targets):
"""Calculate loss between logits and targets
"""Calculate loss between logits and targets.
:param logits: Output logits of model
:param targets: True targets from data
Args:
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
"""
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
......@@ -109,12 +118,10 @@ class _VocabParallelCrossEntropy3D(torch.autograd.Function):
@LOSSES.register_module
class VocabParallelCrossEntropyLoss3D(_Loss):
"""
Vocab parallel cross entropy loss for 2D parallelism
:param reduction: whether to average the loss, defaults to True
"""Vocab parallel cross entropy loss for 2D parallelism.
:type reduction: bool, optional
Args:
reduction (bool, optional): whether to average the loss, defaults to True.
"""
def __init__(self, reduction=True):
......@@ -125,10 +132,11 @@ class VocabParallelCrossEntropyLoss3D(_Loss):
self.reduction_mean = reduction
def forward(self, logits, targets):
"""Calculate loss between logits and targets
"""Calculate loss between logits and targets.
:param logits: Output logits of model
:param targets: True targets from data
Args:
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
"""
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
......
......@@ -6,13 +6,25 @@ from colossalai.context.moe_context import MOE_CONTEXT
@LOSSES.register_module
class MoeCrossEntropyLoss(_Loss):
"""torch.nn.CrossEntropyLoss added with auxiliary loss.
r"""torch.nn.CrossEntropyLoss added with auxiliary loss.
:param aux_weight: Weight of auxiliary loss in total loss
:param args: Args in CrossEntropyLoss
:param kwargs: Kwargs in CrossEntropyLoss
Args:
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
aux_weight (float, optional): Weight of auxiliary loss in total loss. Defaults to 0.01.
:type aux_weight: float, optional
The ``args`` and ``kwargs`` should include parameters below:
::
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
reduction (str, optional)
label_smoothing (float, optional)
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
"""
def __init__(self, aux_weight: float = 0.01, *args, **kwargs):
......@@ -21,6 +33,16 @@ class MoeCrossEntropyLoss(_Loss):
self.aux_weight = aux_weight
def forward(self, *args):
"""
The ``args`` should at least include parameters below:
::
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
"""
main_loss = self.loss(*args)
aux_loss = MOE_CONTEXT.get_loss()
return main_loss + self.aux_weight * aux_loss
......@@ -30,13 +52,11 @@ class MoeCrossEntropyLoss(_Loss):
class MoeLoss(_Loss):
"""A wrapper class for any loss module to add with auxiliary loss.
:param aux_weight: Weight of auxiliary loss in total loss
:param loss_fn: Loss function
:param args: Args in loss function
:param kwargs: Kwargs in loss function
:type aux_weight: float
:type loss_fn: Callable
Args:
aux_weight (float): Weight of auxiliary loss in total loss.
loss_fn (``Callable``): Loss function.
args (list): Args in loss function.
kwargs (dict): Kwargs in loss function.
"""
def __init__(self, aux_weight: float, loss_fn, *args, **kwargs):
......@@ -45,6 +65,16 @@ class MoeLoss(_Loss):
self.aux_weight = aux_weight
def forward(self, *args, **kwargs):
"""
The ``args`` and ``kwargs`` should at least include parameters below:
::
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
Note:
The ``args`` and ``kwargs`` may include different parameters varying with different loss function.
"""
main_loss = self.loss_fn(*args, **kwargs)
aux_loss = MOE_CONTEXT.get_loss()
return main_loss + self.aux_weight * aux_loss
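A hedged sketch of the MoeLoss wrapper described above: the extra kwargs are assumed to be forwarded to the constructor of loss_fn, and a real run additionally needs an MoE model so that MOE_CONTEXT holds a meaningful auxiliary loss. The import path is an assumption.

import torch
import torch.nn as nn
from colossalai.nn.loss import MoeLoss  # import path assumed; adjust to your version

criterion = MoeLoss(aux_weight=0.01, loss_fn=nn.CrossEntropyLoss, label_smoothing=0.1)
logits = torch.randn(4, 10, requires_grad=True)   # would come from an MoE model in practice
targets = torch.randint(0, 10, (4,))
loss = criterion(logits, targets)                 # main loss + 0.01 * auxiliary load-balancing loss
loss.backward()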
......@@ -36,14 +36,12 @@ class CosineAnnealingLR(_CosineAnnealingLR):
.. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
https://arxiv.org/abs/1608.03983
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param eta_min: Minimum learning rate, defaults to 0
:type eta_min: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
eta_min (int, optional): Minimum learning rate, defaults to 0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule starts from the beginning and the initial lr is set to lr.
"""
def __init__(self, optimizer, total_steps: int, eta_min: int = 0, last_epoch: int = -1, **kwargs):
......@@ -54,16 +52,13 @@ class CosineAnnealingLR(_CosineAnnealingLR):
class CosineAnnealingWarmupLR(WarmupScheduler):
"""Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param warmup_steps: Number of warmup steps, defaults to 0
:type warmup_steps: int, optional
:param eta_min: Minimum learning rate, defaults to 0
:type eta_min: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
eta_min (int, optional): Minimum learning rate, defaults to 0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule starts from the beginning and the initial lr is set to lr.
"""
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, eta_min: float = 0., last_epoch: int = -1):
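A small sketch of the warmup + cosine schedule documented above, stepping once per training iteration; the colossalai import path is assumed from the repository layout.

import torch
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR  # path assumed

model = torch.nn.Linear(16, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = CosineAnnealingWarmupLR(optimizer, total_steps=1000, warmup_steps=100, eta_min=1e-5)

for step in range(1000):
    # forward/backward omitted for brevity
    optimizer.step()
    scheduler.step()  # linear warmup for 100 steps, cosine decay afterwards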
......@@ -76,14 +71,12 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
class FlatAnnealingLR(DelayerScheduler):
"""Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param pct_start: Percent of steps before starting learning rate decay
:type pct_start: float
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
pct_start (float, optional): Percent of steps before starting learning rate decay, defaults to 0.72.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule starts from the beginning and the initial lr is set to lr.
"""
def __init__(self, optimizer, total_steps: int, pct_start: float = 0.72, last_epoch: int = -1, **kwargs):
......@@ -102,18 +95,14 @@ class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
"""Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
applied, and then the learning rate will be a fixed value before starting decay.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param warmup_steps: Number of warmup steps, defaults to 0
:type warmup_steps: int, optional
:param pct_start: Percent of steps before starting learning rate decay
:type pct_start: float
:param eta_min: Minimum learning rate, defaults to 0
:type eta_min: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
pct_start (float, optional): Percent of steps before starting learning rate decay, defaults to 0.72.
eta_min (int, optional): Minimum learning rate, defaults to 0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule starts from the beginning and the initial lr is set to lr.
"""
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, pct_start: float = 0.72, eta_min: int = 0,
......
......@@ -14,16 +14,15 @@ class _enable_get_lr_call:
class DelayerScheduler(_LRScheduler):
""" Starts with a flat lr schedule until it reaches N epochs the applies a scheduler
:param optimizer: Wrapped optimizer.
:type optimizer: torch.optim.Optimizer
:param delay_epochs: Number of epochs to keep the initial lr until starting aplying the scheduler
:type delay_epochs: int
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau)
:type after_scheduler: torch.optim.lr_scheduler
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
"""Starts with a flat lr schedule until it reaches N epochs then applies
the specific scheduler (For example: ReduceLROnPlateau)
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
delay_epochs (int): Number of epochs to keep the initial lr until starting applying the scheduler.
after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule starts from the beginning and the initial lr is set to lr.
"""
def __init__(self, optimizer, delay_epochs, after_scheduler, last_epoch=-1):
......@@ -57,16 +56,15 @@ class DelayerScheduler(_LRScheduler):
class WarmupScheduler(_LRScheduler):
""" Starts with a linear warmup lr schedule until it reaches N epochs the applies a scheduler
:param optimizer: Wrapped optimizer.
:type optimizer: torch.optim.Optimizer
:param warmup_epochs: Number of epochs to linearly warmup lr until starting aplying the scheduler
:type warmup_epochs: int
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau)
:type after_scheduler: torch.optim.lr_scheduler
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
"""Starts with a linear warmup lr schedule until it reaches N epochs then applies
the specific scheduler (For example: ReduceLROnPlateau).
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
warmup_epochs (int): Number of epochs to linearly warmup lr until starting applying the scheduler.
after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule starts from the beginning and the initial lr is set to lr.
"""
def __init__(self, optimizer, warmup_epochs, after_scheduler, last_epoch=-1):
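A hedged sketch of how WarmupScheduler composes with another scheduler: linear warmup for warmup_epochs, after which after_scheduler takes over. The module path follows the `from .delayed import WarmupScheduler` imports seen in this diff but is still an assumption.

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR
from colossalai.nn.lr_scheduler.delayed import WarmupScheduler  # path assumed

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
after = CosineAnnealingLR(optimizer, T_max=90)
scheduler = WarmupScheduler(optimizer, warmup_epochs=10, after_scheduler=after)

for epoch in range(100):
    optimizer.step()   # training loop body omitted
    scheduler.step()   # lr rises linearly for 10 epochs, then `after` drives the decay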
......@@ -97,18 +95,16 @@ class WarmupScheduler(_LRScheduler):
class WarmupDelayerScheduler(_LRScheduler):
""" Starts with a linear warmup lr schedule until it reaches N epochs and a flat lr schedule until it reaches M epochs the applies a scheduler
:param optimizer: Wrapped optimizer.
:type optimizer: torch.optim.Optimizer
:param warmup_epochs: Number of epochs to linearly warmup lr until starting aplying the scheduler
:type warmup_epochs: int
:param delay_epochs: Number of epochs to keep the initial lr until starting aplying the scheduler
:type delay_epochs: int
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau)
:type after_scheduler: torch.optim.lr_scheduler
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
"""Starts with a linear warmup lr schedule until it reaches N epochs and a flat lr schedule
until it reaches M epochs then applies the specific scheduler (For example: ReduceLROnPlateau).
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
warmup_epochs (int): Number of epochs to linearly warmup lr until starting applying the scheduler.
delay_epochs (int): Number of epochs to keep the initial lr until starting applying the scheduler.
after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule starts from the beginning and the initial lr is set to lr.
"""
def __init__(self, optimizer, warmup_epochs, delay_epochs, after_scheduler, last_epoch=-1):
......
......@@ -5,16 +5,14 @@ from colossalai.registry import LR_SCHEDULERS
@LR_SCHEDULERS.register_module
class LinearWarmupLR(_LRScheduler):
"""Linearly warmup learning rate and then linearly decay
"""Linearly warmup learning rate and then linearly decay.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param warmup_steps: Number of warmup steps, defaults to 0
:type warmup_steps: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule starts from the beginning and the initial lr is set to lr.
"""
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, last_epoch: int = -1, **kwargs):
......
......@@ -13,18 +13,13 @@ class MultiStepLR(_MultiStepLR):
happen simultaneously with other changes to the learning rate from outside
this scheduler. When last_epoch=-1, sets initial lr as lr.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param milestones: List of epoch indices. Must be increasing, defaults to None
:type milestones: List[int], optional
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
:type gamma: float, optional
:param num_steps_per_epoch: Number of steps per epoch, defaults to -1
:type num_steps_per_epoch: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
milestones (List[int], optional): List of epoch indices. Must be increasing, defaults to None.
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule starts from the beginning and the initial lr is set to lr.
"""
def __init__(self, optimizer, total_steps: int, milestones: List[int] = None, gamma: float = 0.1, last_epoch: int = -1, **kwargs):
......@@ -33,22 +28,17 @@ class MultiStepLR(_MultiStepLR):
@LR_SCHEDULERS.register_module
class MultiStepWarmupLR(WarmupScheduler):
"""Multi-step laerning rate scheduler with warmup.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param warmup_steps: Number of warmup steps, defaults to 0
:type warmup_steps: int, optional
:param milestones: List of epoch indices. Must be increasing, defaults to None
:type milestones: List[int], optional
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
:type gamma: float, optional
:param num_steps_per_epoch: Number of steps per epoch, defaults to -1
:type num_steps_per_epoch: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
"""Multistep learning rate scheduler with warmup.
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
milestones (List[int], optional): List of epoch indices. Must be increasing, defaults to None.
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
num_steps_per_epoch (int, optional): Number of steps per epoch, defaults to -1.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule starts from the beginning and the initial lr is set to lr.
"""
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, milestones: List[int] = None,
......
......@@ -28,43 +28,41 @@ class OneCycleLR(_OneCycleLR):
claims that "unpublished work has shown even better results by using only two phases". To
mimic the behaviour of the original paper instead, set ``three_phase=True``.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param pct_start: The percentage of the cycle (in number of steps) spent increasing the learning rate, defaults to 0.3
:type pct_start: float, optional
:param anneal_strategy: {'cos', 'linear'}
Specifies the annealing strategy: "cos" for cosine annealing, "linear" for
linear annealing, defaults to 'cos'
:type anneal_strategy: str, optional
:param cycle_momentum: If ``True``, momentum is cycled inversely
to learning rate between 'base_momentum' and 'max_momentum', defaults to True
:type cycle_momentum: bool, optional
:param base_momentum: Lower momentum boundaries in the cycle
for each parameter group. Note that momentum is cycled inversely
to learning rate; at the peak of a cycle, momentum is
'base_momentum' and learning rate is 'max_lr', defaults to 0.85
:type base_momentum: float, optional
:param max_momentum: Upper momentum boundaries in the cycle
for each parameter group. Functionally,
it defines the cycle amplitude (max_momentum - base_momentum).
Note that momentum is cycled inversely
to learning rate; at the start of a cycle, momentum is 'max_momentum'
and learning rate is 'base_lr', defaults to 0.95
:type max_momentum: float, optional
:param div_factor: Determines the initial learning rate via
initial_lr = max_lr/div_factor, defaults to 25.0
:type div_factor: float, optional
:param final_div_factor: Determines the minimum learning rate via
min_lr = initial_lr/final_div_factor, defaults to 10000.0
:type final_div_factor: float, optional
:param last_epoch: The index of the last batch. This parameter is used when
resuming a training job. Since `step()` should be invoked after each
batch instead of after each epoch, this number represents the total
number of *batches* computed, not the total number of epochs computed.
When last_epoch=-1, the schedule is started from the beginning, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
pct_start (float, optional):
The percentage of the cycle (in number of steps) spent increasing the learning rate, defaults to 0.3.
anneal_strategy (str, optional): {'cos', 'linear'}, Specifies the annealing strategy:
"cos" for cosine annealing, "linear" for linear annealing, defaults to 'cos'.
cycle_momentum (bool, optional): If ``True``, momentum is cycled inversely
to learning rate between 'base_momentum' and 'max_momentum', defaults to True.
base_momentum (float, optional): Lower momentum boundaries in the cycle for each parameter group.
Note that momentum is cycled inversely to learning rate; at the peak of a cycle, momentum is
'base_momentum' and learning rate is 'max_lr', defaults to 0.85.
max_momentum (float, optional): Upper momentum boundaries in the cycle for each parameter group.
Functionally, it defines the cycle amplitude (max_momentum - base_momentum).
Note that momentum is cycled inversely to learning rate; at the start of a cycle, momentum is 'max_momentum'
and learning rate is 'base_lr', defaults to 0.95.
div_factor (float, optional): Determines the initial learning rate via
initial_lr = max_lr/div_factor, defaults to 25.0.
final_div_factor (float, optional): Determines the minimum learning rate via
min_lr = initial_lr/final_div_factor, defaults to 10000.0.
last_epoch (int, optional): The index of the last batch. This parameter is used when resuming a training job.
Since `step()` should be invoked after each batch instead of after each epoch, this number represents
the total number of *batches* computed, not the total number of epochs computed.
When last_epoch=-1, the schedule is started from the beginning, defaults to -1.
The ``kwargs`` for initializing torch.optim.lr_scheduler.OneCycleLR should include parameters below:
::
epochs (int, optional, default=None)
steps_per_epoch (int, optional, default=None)
three_phase (bool, optional, default=False)
verbose (bool, optional, default=False)
More details about kwargs could be found in
`OneCycleLR <https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.OneCycleLR.html#torch.optim.lr_scheduler.OneCycleLR>`_.
.. _Super-Convergence\: Very Fast Training of Neural Networks Using Large Learning Rates:
https://arxiv.org/abs/1708.07120
......
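A sketch of passing the extra kwargs listed above straight through to torch.optim.lr_scheduler.OneCycleLR; the colossalai import path is an assumption.

import torch
from colossalai.nn.lr_scheduler import OneCycleLR  # path assumed; wraps torch's OneCycleLR

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scheduler = OneCycleLR(optimizer, total_steps=500, pct_start=0.3,
                       anneal_strategy='cos', three_phase=True)  # three_phase is forwarded to torch

for step in range(500):
    optimizer.step()
    scheduler.step()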
......@@ -8,16 +8,13 @@ from .delayed import WarmupScheduler
class PolynomialLR(_LRScheduler):
"""Polynomial learning rate scheduler.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param end_lr: Minimum learning rate, defaults to 0.0001
:type end_lr: float, optional
:param power: The power of polynomial, defaults to 1.0
:type power: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
end_lr (float, optional): Minimum learning rate, defaults to 0.0001.
power (float, optional): The power of polynomial, defaults to 1.0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule starts from the beginning and the initial lr is set to lr.
"""
def __init__(self, optimizer, total_steps: int, end_lr: float = 0.0001, power: float = 1.0, last_epoch: int = -1,
......@@ -44,18 +41,14 @@ class PolynomialLR(_LRScheduler):
class PolynomialWarmupLR(WarmupScheduler):
"""Polynomial learning rate scheduler with warmup.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param warmup_steps: Number of warmup steps, defaults to 0
:type warmup_steps: int, optional
:param end_lr: Minimum learning rate, defaults to 0.0001
:type end_lr: float, optional
:param power: The power of polynomial, defaults to 1.0
:type power: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
end_lr (float, optional): Minimum learning rate, defaults to 0.0001.
power (float, optional): The power of polynomial, defaults to 1.0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule starts from the beginning and the initial lr is set to lr.
"""
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, end_lr: float = 0.0001, power: float = 1.0,
......
......@@ -11,16 +11,13 @@ class LambdaLR(_LambdaLR):
"""Sets the learning rate of each parameter group to the initial lr
times a given function. When last_epoch=-1, sets initial lr as lr.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param lr_lambda: A function which computes a multiplicative
factor given an integer parameter epoch, or a list of such
functions, one for each group in optimizer.param_groups, defaults to None
:type lr_lambda: function or list, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
lr_lambda (Union[``function``, ``list[function]``]): A function which computes a multiplicative
factor given an integer parameter epoch, or a list of such functions,
one for each group in optimizer.param_groups, defaults to None.
last_epoch (int, optional): The index of last epoch, defaults to -1.
"""
def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None:
......@@ -30,18 +27,15 @@ class LambdaLR(_LambdaLR):
@LR_SCHEDULERS.register_module
class MultiplicativeLR(_MultiplicativeLR):
"""Multiply the learning rate of each parameter group by the factor given
in the specified function. When last_epoch=-1, sets initial lr as lr
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param lr_lambda: A function which computes a multiplicative
factor given an integer parameter epoch, or a list of such
functions, one for each group in optimizer.param_groups, defaults to None
:type lr_lambda: function or list, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
in the specified function. When last_epoch=-1, sets initial lr as lr.
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
lr_lambda (Union[``function``, ``list[function]``]): A function which computes a multiplicative
factor given an integer parameter epoch, or a list of such functions,
one for each group in optimizer.param_groups, defaults to None.
last_epoch (int, optional): The index of last epoch, defaults to -1.
"""
def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None:
......@@ -53,18 +47,14 @@ class StepLR(_StepLR):
"""Decays the learning rate of each parameter group by gamma every
step_size epochs. Notice that such decay can happen simultaneously with
other changes to the learning rate from outside this scheduler. When
last_epoch=-1, sets initial lr as lr
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param step_size: Period of learning rate decay, defaults to 1
:type step_size: int, optional
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
:type gamma: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
last_epoch=-1, sets initial lr as lr.
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
step_size (int, optional): Period of learning rate decay, defaults to 1.
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
last_epoch (int, optional): The index of last epoch, defaults to -1.
"""
def __init__(self, optimizer, total_steps, step_size: int = 1, gamma: float = 0.1, last_epoch: int = -1) -> None:
......@@ -77,14 +67,11 @@ class ExponentialLR(_ExponentialLR):
"""Decays the learning rate of each parameter group by gamma every epoch.
When last_epoch=-1, sets initial lr as lr
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param gamma: Multiplicative factor of learning rate decay, defaults to 1.0
:type gamma: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Wrapped optimizer.
total_steps (int): Number of total training steps.
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 1.0.
last_epoch (int, optional): The index of last epoch, defaults to -1.
"""
def __init__(self, optimizer, total_steps, gamma: float = 1.0,
......
......@@ -14,8 +14,12 @@ class Accuracy2D(nn.Module):
def forward(self, logits, targets):
"""Calculate the accuracy of predicted labels.
:param logits: Predicted labels
:param targets: True labels from data
Args:
logits (:class:`torch.tensor`): Predicted labels.
targets (:class:`torch.tensor`): True labels from data.
Returns:
float: the accuracy of prediction.
"""
with torch.no_grad():
targets = split_tensor_2d(targets)
......
......@@ -14,8 +14,12 @@ class Accuracy2p5D(nn.Module):
def forward(self, logits, targets):
"""Calculate the accuracy of predicted labels.
:param logits: Predicted labels
:param targets: True labels from data
Args:
logits (:class:`torch.tensor`): Predicted labels.
targets (:class:`torch.tensor`): True labels from data.
Returns:
float: the accuracy of prediction.
"""
with torch.no_grad():
targets = split_tensor_2p5d(targets)
......
......@@ -18,8 +18,12 @@ class Accuracy3D(nn.Module):
def forward(self, logits, targets):
"""Calculate the accuracy of predicted labels.
:param logits: Predicted labels
:param targets: True labels from data
Args:
logits (:class:`torch.tensor`): Predicted labels.
targets (:class:`torch.tensor`): True labels from data.
Returns:
float: the accuracy of prediction.
"""
with torch.no_grad():
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
......
......@@ -9,11 +9,10 @@ class Registry:
"""This is a registry class used to register classes and modules so that a universal
object builder can be enabled.
:param name: The name of the registry
:type name: str
:param third_party_library: List of third party libraries which are used in the
initialization of the register module
:type third_party_library: list, optional
Args:
name (str): The name of the registry.
third_party_library (list, optional):
List of third party libraries which are used in the initialization of the register module.
"""
def __init__(self, name: str, third_party_library: List[ModuleType] = None):
......@@ -28,12 +27,12 @@ class Registry:
def register_module(self, module_class):
"""Registers a module represented in `module_class`.
:param module_class: The module to be registered
:type module_class: class
:raises AssertionError: Raises an AssertionError if the module has already been
registered before
:return: The module to be registered, so as to use it normally if via importing
:rtype: class
Args:
module_class (class): The module to be registered.
Returns:
class: The module to be registered, so as to use it normally if via importing.
Raises:
AssertionError: Raises an AssertionError if the module has already been registered before.
"""
module_name = module_class.__name__
assert module_name not in self._registry
......@@ -46,12 +45,13 @@ class Registry:
"""Retrieves a module with name `module_name` and returns the module if it has
already been registered before.
:param module_name: The name of the module to be retrieved
:type module_name: str
:raises NameError: Raises a NameError if the module to be retrieved has neither been
registered directly nor as third party modules before
:return: The retrieved module or None
:rtype: :class:`object`
Args:
module_name (str): The name of the module to be retrieved.
Returns:
:class:`object`: The retrieved module or None.
Raises:
NameError: Raises a NameError if the module to be retrieved has neither been
registered directly nor as third party modules before.
"""
if module_name in self._registry:
return self._registry[module_name]
......@@ -65,11 +65,11 @@ class Registry:
"""Searches for a module with name `module_name` and returns a boolean value indicating
whether the module has been registered directly or as third party modules before.
:param module_name: The name of the module to be searched for
:type module_name: str
:return: A boolean value indicating whether the module has been registered directly or
as third party modules before
:rtype: bool
Args:
module_name (str): The name of the module to be searched for.
Returns:
bool: A boolean value indicating whether the module has been registered directly or
as third party modules before.
"""
found_flag = module_name in self._registry
......
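A minimal sketch of the Registry API described above. register_module appears in this diff; the get_module and has method names, and the standalone MODELS registry, are assumptions for illustration.

from colossalai.registry import Registry

MODELS = Registry('models')  # hypothetical registry created only for this sketch

@MODELS.register_module
class TinyNet:
    pass

assert MODELS.has('TinyNet')              # module was registered under its class name
tiny_cls = MODELS.get_module('TinyNet')   # retrieve the registered class
net = tiny_cls()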
......@@ -17,18 +17,46 @@ from colossalai.trainer.hooks import BaseHook
class Trainer:
"""This a class tending for easy deployments of users' training and evaluation instead of
r"""This is a class tending for easy deployments of users' training and evaluation instead of
writing their own scripts. It is similar with ``ignite.engine`` and ``keras.engine``, but is
called `Trainer`.
:param engine: Engine responsible for the process function
:type engine: :class:`Engine`
:param schedule: Schedule responsible for forward and backward steps
:type schedule: :class:`BaseSchedule`, optional
:param timer: Timer used to monitor the whole training
:type timer: :class:`MultiTimer`, optional
:param logger: Logger used to record the whole training
:type logger: :class:`colossalai.logging.DistributedLogger`, optional
Args:
engine (:class:`Engine`): Engine responsible for the process function.
schedule (:class:`BaseSchedule`, optional): Schedule responsible for forward and backward steps.
timer (:class:`MultiTimer`, optional): Timer used to monitor the whole training.
logger (:class:`colossalai.logging.DistributedLogger`, optional): Logger used to record the whole training log.
Note:
When `schedule` is None, the ``NonPipelineSchedule`` will be used. If you would like to use pipeline parallelism,
you should choose ``PipelineSchedule`` or ``InterleavedPipelineSchedule`` for the `schedule`.
Examples:
>>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
>>> model = ...
>>> criterion = ...
>>> optimizer = ...
>>> train_dataloader = ...
>>> # Initialize your engine, train_dataloader, test_dataloader, lr_scheduler
>>> engine, train_dataloader, _, _ = colossalai.initialize(model, optimizer, criterion)
>>> # Beginning training progress
>>> timer = ...
>>> logger = ...
>>> trainer = Trainer(engine=engine, logger=logger, schedule=schedule, timer=timer)
>>> # add hooks you would like to use here.
>>> hook_list = []
>>> trainer.fit(
>>> train_dataloader=train_dataloader,
>>> epochs=gpc.config.NUM_EPOCHS,
>>> test_interval=1,
>>> hooks=hook_list,
>>> display_progress=True,
>>> return_output_label=False
>>> )
More examples and details could be found in
`Training with engine and trainer <https://www.colossalai.org/docs/basics/engine_trainer>`_
and `ColossalAI-Examples <https://github.com/hpcaitech/ColossalAI-Examples/tree/main>`_.
"""
def __init__(
self,
......@@ -108,20 +136,19 @@ class Trainer:
def _set_current_step(self, epoch: int):
"""Sets current step number.
:param epoch: Step number to be set
:type epoch: int
Args:
epoch (int): Step number to be set.
"""
self._cur_step = epoch * self._steps_per_epoch
def _call_timer(self, action: str, item: str, *args, **kwargs) -> None:
"""Call timer funciton with a given timer name.
:param action: Function to be called on timer
:type action: str
:param item: Name of the timer
:type item: str
:param args: args used for action function
:param kwargs: kwargs used for action function
Args:
action (str): Function to be called on timer.
item (str): Name of the timer.
args (list): args used for action function.
kwargs (dict): kwargs used for action function.
"""
if self._timer is not None:
......@@ -134,10 +161,9 @@ class Trainer:
def _call_hooks(self, func, output=None):
"""Calls specific hooks in the current time point.
:param func: A string represents the time point
:param output: Output of the model after running a iteration or None in any other time points
:type func: str
:type output: optional
Args:
func (str): A string that represents the time point.
output (Any, optional): Output of the model after running an iteration, or None at any other time point.
"""
# Only after iter hook will receive output
for hook in self.hooks:
......@@ -273,25 +299,17 @@ class Trainer:
display_progress: bool = False,
return_output_label: bool = True,
):
"""Trains the model to fit training data.
:param train_dataloader: DataLoader in training
:param epochs: Maximum number of epoches
:param max_steps: Maximum number of running iterations
:param test_dataloader: DataLoader in testing
:param test_interval: Interval of testing
:param hooks: A list of hooks used in training
:param display_progress: If True, the training progress will be printed
:param return_output_label: If True, the output of model and the label will be returned
:type train_dataloader: DataLoader
:type epochs: int
:type max_steps: int, optional
:type test_dataloader: DataLoader, optional
:type test_interval: int, optional
:type hooks: list, optional
:type display_progress: bool, optional
:type return_output_label: bool, optional
r"""Trains the model to fit training data.
Args:
train_dataloader (:class:`torch.utils.data.DataLoader`): DataLoader for training.
epochs (int): Maximum number of epochs.
max_steps (int, optional): Maximum number of running iterations.
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): DataLoader for validation.
test_interval (int, optional): Interval of validation.
hooks (list[`BaseHook <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/trainer/hooks>`_],
optional): A list of hooks used in training.
display_progress (bool, optional): If True, a progress bar will be displayed.
return_output_label (bool, optional): If True, the output of model and the label will be returned.
"""
# set epochs and steps, consider gradient accumulation
......@@ -374,15 +392,12 @@ class Trainer:
):
"""Evaluates the model with testing data.
:param test_dataloader: DataLoader in testing
:param hooks: A list of hooks used in evaluation
:param display_progress: If True, the evaluation progress will be printed
:param return_output_label: If True, the output of model and the label will be returned
:type test_dataloader: DataLoader
:type hooks: list, optional
:type display_progress: bool, optional
:type return_output_label: bool
Args:
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
hooks (list, optional): A list of hooks used in evaluation. Defaults to None.
display_progress (bool, optional): If True, the evaluation progress will be printed. Defaults to False.
return_output_label (bool, optional): If True, the output of model and the label
will be returned. Defaults to True.
"""
# set display
display_progress = self._should_display_progress(display_progress)
......@@ -418,10 +433,11 @@ class Trainer:
def predict(self, data: Union[Tensor, List[Tensor]]):
"""Uses trained model to make a prediction for a tensor or a tensor list.
:param data: Data as the input
:type data: Union[Tensor, List[Tensor]
:return: The output of model as the prediction
:rtype: Tensor
Args:
data (Union[:class:`torch.tensor`, List[:class:`torch.tensor`]]): Data as the input.
Returns:
:class:`torch.tensor`: The output of the model as the prediction.
"""
# predict without labels
if isinstance(data, (list, tuple)):
......
......@@ -40,14 +40,11 @@ class BaseHook(ABC):
def after_train_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor):
"""Actions after running a training iteration.
:param trainer: Trainer which is using this hook
:type trainer: :class:`Trainer`
:param output: Output of the model
:type output: torch.Tensor
:param label: Labels of the input data
:type label: torch.Tensor
:param loss: Loss between the output and input data
:type loss: torch.Tensor
Args:
trainer (:class:`Trainer`): Trainer which is using this hook.
output (:class:`torch.Tensor`): Output of the model.
label (:class:`torch.Tensor`): Labels of the input data.
loss (:class:`torch.Tensor`): Loss between the output and input data.
"""
pass
......@@ -89,24 +86,21 @@ class BaseHook(ABC):
def after_test_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor):
"""Actions after running a testing iteration.
:param trainer: Trainer which is using this hook
:type trainer: :class:`Trainer`
:param output: Output of the model
:type output: Tensor
:param label: Labels of the input data
:type label: Tensor
:param loss: Loss between the output and input data
:type loss: Tensor
Args:
trainer (:class:`Trainer`): Trainer which is using this hook.
output (:class:`torch.Tensor`): Output of the model.
label (:class:`torch.Tensor`): Labels of the input data.
loss (:class:`torch.Tensor`): Loss between the output and input data.
"""
pass
def init_runner_states(self, trainer, key, val):
"""Initializes trainer's state.
:param trainer: Trainer which is using this hook
:type trainer: :class:`Trainer`
:param key: Key of reseting state
:param val: Value of reseting state
Args:
trainer (:class:`Trainer`): Trainer which is using this hook.
key: Key of the state to be reset.
val: Value of the state to be reset.
"""
if key not in trainer.states:
trainer.states[key] = val
......@@ -16,14 +16,13 @@ from ._lr_scheduler_hook import LRSchedulerHook
class SaveCheckpointHook(BaseHook):
"""Saves the model by interval in training process.
:param interval: Saving interval, defaults to 1
:type interval: int, optional
:param checkpoint_dir: Directory of saving checkpoint, defaults to None
:type checkpoint_dir: str, optional
:param suffix: Saving suffix of the file, defaults to ''
:type suffix: str, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
Args:
interval (int, optional): Saving interval, defaults to 1.
checkpoint_dir (str, optional): Directory of saving checkpoint, defaults to None.
suffix (str, optional): Saving suffix of the file, defaults to ''.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def __init__(self,
......@@ -71,18 +70,17 @@ class SaveCheckpointHook(BaseHook):
class LoadCheckpointHook(BaseHook):
"""Loads the model before training process.
:param checkpoint_dir: Directory of saving checkpoint, defaults to None
:type checkpoint_dir: str, optional
:param epoch: Epoch number to be set, defaults to -1
:type epoch: str, optional
:param finetune: Whether allows to load a part of the model, defaults to False
:type finetune: bool, optional
:param strict: Whether loads a model that has the same shape of parameters, defaults to False
:type strict: bool, optional
:param suffix: Suffic, defaults to ''
:type suffix: str, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
:type priority: int, optional
Args:
checkpoint_dir (str, optional): Directory of saving checkpoint, defaults to None.
epoch (int, optional): Epoch number of the checkpoint to be loaded, defaults to -1.
Epoch equals to -1 means choosing the latest checkpoint.
finetune (bool, optional): Whether allows to load a part of the model, defaults to False.
strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict` of the checkpoint
match the names of parameters and buffers in model, defaults to False.
suffix (str, optional): Suffix of checkpoint file path, defaults to ''.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 0. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def __init__(self,
......
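A hedged sketch of wiring both checkpoint hooks into a hook list for trainer.fit; trainer and train_dataloader are assumed to come from a setup like the Trainer example earlier in this commit, and './ckpt' is a placeholder directory.

from colossalai.trainer import hooks

hook_list = [
    hooks.LoadCheckpointHook(checkpoint_dir='./ckpt', epoch=-1, strict=False),
    hooks.SaveCheckpointHook(interval=1, checkpoint_dir='./ckpt', suffix=''),
]
trainer.fit(train_dataloader=train_dataloader,  # trainer/train_dataloader from the earlier setup
            epochs=10,
            test_interval=1,
            hooks=hook_list,
            display_progress=True)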
......@@ -25,13 +25,14 @@ def _format_number(val, prec=5):
class LogByEpochHook(BaseHook):
"""Hook to log by epoch
:param logger: Logger for the log
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
:type priority: int, optional
"""Hook to log by epoch.
Args:
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
interval (int, optional): Interval of printing log information, defaults to 1.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 1. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def __init__(self,
......@@ -48,10 +49,12 @@ class LogByEpochHook(BaseHook):
@HOOKS.register_module
class LogMetricByStepHook(BaseHook):
"""Hook to log metric by step
"""Hook to log metric by step.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
Args:
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def __init__(self, priority: int = 10):
......@@ -74,11 +77,12 @@ class LogMetricByStepHook(BaseHook):
class LogMetricByEpochHook(LogByEpochHook):
"""Specialized hook to record the metric to log.
:param logger: Logger for the log
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
Args:
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
interval (int, optional): Interval of printing log information, defaults to 1.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def __init__(self,
......@@ -116,14 +120,14 @@ class LogMetricByEpochHook(LogByEpochHook):
class TensorboardHook(BaseHook):
"""Specialized hook to record the metric to Tensorboard.
:param log_dir: Directory of log
:type log_dir: str
:param ranks: Ranks of processors
:type ranks: typing.List
:param parallel_mode: Parallel mode, defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode`, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
Args:
log_dir (str): Directory of log.
ranks (list): Ranks of processors.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode used in trainer,
defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def __init__(self,
......@@ -200,18 +204,15 @@ class TensorboardHook(BaseHook):
class LogTimingByEpochHook(LogByEpochHook):
"""Specialized hook to write timing record to log.
:param timer: Timer for the hook
:type timer: :class:`colossalai.utils.MultiTimer`
:param logger: Logger for the log
:type logger: :class:`colossalai.logging.DistributedLogger`
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
:param log_eval: Whether writes in evaluation, defaults to True
:type log_eval: bool, optional
:param ignore_num_train_steps: Number of training steps to ignore, defaults to 0
:type ignore_num_train_steps: int, optional
Args:
timer (:class:`colossalai.utils.MultiTimer`): Timer for the hook.
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
interval (int, optional): Interval of printing log information, defaults to 1.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
log_eval (bool, optional): Whether writes in evaluation, defaults to True.
ignore_num_train_steps (int, optional): Number of training steps to ignore, defaults to 0.
"""
def __init__(self,
......@@ -270,14 +271,13 @@ class LogTimingByEpochHook(LogByEpochHook):
class LogMemoryByEpochHook(LogByEpochHook):
"""Specialized Hook to write memory usage record to log.
:param logger: Logger for the log
:type logger: colossalai.logging.DistributedLogger
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
:param log_eval: Whether writes in evaluation, defaults to True
:type log_eval: bool, optional
Args:
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
interval (int, optional): Interval of printing log information, defaults to 1.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
log_eval (bool, optional): Whether writes in evaluation, defaults to True.
"""
def __init__(self,
......
......@@ -6,15 +6,17 @@ from ._metric_hook import LearningRateMetric, MetricHook
@HOOKS.register_module
class LRSchedulerHook(MetricHook):
"""Build LR scheduler
r"""Build LR scheduler for trainer.
:param lr_scheduler: LR scheduler
:param by_epoch: If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch
:type by_epoch: bool
:param store_lr_in_state: If `True`, store the learning rate in each state, defaults to `True`
:type store_lr_in_state: bool, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
:type priority: int, optional
Args:
lr_scheduler (:class:`colossalai.nn.lr_scheduler`): The specific LR scheduler
from ``colossalai.nn.lr_scheduler``; more details about ``lr_scheduler`` could be found in
`lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_.
by_epoch (bool): If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch.
store_lr_in_state (bool, optional): If `True`, store the learning rate in each state, defaults to `True`.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 1. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def __init__(
self,
......
......@@ -17,13 +17,13 @@ from ._base_hook import BaseHook
class Metric(ABC):
"""A basic class of metric collectors. It collects a specific
metric during training or evaluation and it's always used with
metric during training or evaluation and would always be used with
:class:`MetricHook` to help it update its states and show the
metric. So please use the corresponding hook class to make the metric
collector work.
:param epoch_only: Whether the metric only read for the full epoch
:type epoch_only: bool
Args:
epoch_only (bool): Whether the metric is only read for the full epoch.
"""
def __init__(self, epoch_only: bool):
......@@ -80,8 +80,8 @@ class Metric(ABC):
class LossMetric(Metric):
"""A metric collector for loss.
:param epoch_only: Whether the metric only read for the full epoch
:type epoch_only: bool
Args:
epoch_only (bool): Whether the metric is only read for the full epoch.
"""
def __init__(self, epoch_only):
......@@ -101,7 +101,8 @@ class LossMetric(Metric):
"""Updates :attr:`last_step_loss` and :attr:`accum_loss` with current loss.
It expects the output has loss.
:param loss: Current loss of the output
Args:
loss (:class:`torch.tensor`): Current loss of the output.
"""
# expect output to be logits, label and loss
loss_ = loss.detach()
......@@ -132,10 +133,9 @@ class LossMetric(Metric):
class LearningRateMetric(Metric):
"""A metric collector for learning rate.
:param epoch_only: Whether the metric only read for the full epoch
:type epoch_only: bool
:param initial_lr: Initial learning rate, defaults to 0.0
:type initial_lr: float, optional
Args:
epoch_only (bool): Whether the metric is only read for the full epoch.
initial_lr (float, optional): Initial learning rate, defaults to 0.0.
"""
def __init__(self, epoch_only: bool, initial_lr: float = 0.):
......@@ -163,10 +163,9 @@ class AccuracyMetric(Metric):
"""A metric collector for accuracy. It only works for classification
tasks.
:param epoch_only: Whether the metric only read for the full epoch
:type epoch_only: bool
:param accuracy_func: Accuracy function for the classification task
:type accuracy_func: :class:`typing.Callable`
Args:
epoch_only (bool): Whether the metric is only read for the full epoch.
accuracy_func (:class:`typing.Callable`): Accuracy function for the classification task.
"""
def __init__(self, epoch_only: bool, accuracy_func: Callable):
......@@ -187,9 +186,10 @@ class AccuracyMetric(Metric):
"""Updates last step accuracy and accumulated accuracy with current logits
and labels. It expects the output has logits and labels.
:param logits: The logits output of the model
:param targets: Real labels of the dataset
:param batch_size: Batch size of the task
Args:
logits (:class:`torch.tensor`): The logits output of the model.
targets (:class:`torch.tensor`): Real labels of the dataset.
batch_size (int): Batch size of the task.
"""
if isinstance(logits, (list, tuple)):
logits = logits[0]
......@@ -224,8 +224,10 @@ class MetricHook(BaseHook):
update their states. Others are used to display and
record the metric.
:param priority: Priority in the printing, hooks with small priority will be printed in front
:type priority: int
Args:
priority (int): Priority in the printing, hooks with small priority will be printed in front,
defaults to 1. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def __init__(
......@@ -244,8 +246,10 @@ class MetricHook(BaseHook):
class LossHook(MetricHook):
"""Specialized hook class for :class:`Loss`.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
:type priority: int, optional
Args:
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 0. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def __init__(self, priority: int = 0):
......@@ -283,10 +287,11 @@ class LossHook(MetricHook):
class AccuracyHook(MetricHook):
"""Specialized hook class for :class:`Accuracy`.
:param accuracy_func: Priority in the printing, hooks with small priority will be printed in front
:type accuracy_func: typing.Callable
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
:type priority: int, optional
Args:
accuracy_func (:class:`typing.Callable`): Accuracy function for the classification task.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 0. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def __init__(self, accuracy_func: Callable, priority: int = 0):
......@@ -314,8 +319,8 @@ class AccuracyHook(MetricHook):
class ThroughputMetric(Metric):
"""Metric for :class:`Throughput`.
:param epoch_only: epoch only
:type epoch_only: bool
Args:
epoch_only (bool): Whether the metric is only read for the full epoch.
ignored_steps (int, optional): The number of initial training steps to ignore.
"""
def __init__(self, epoch_only: bool, ignored_steps: int = 0):
super().__init__(epoch_only=epoch_only)
......@@ -360,10 +365,13 @@ class ThroughputMetric(Metric):
@HOOKS.register_module
class ThroughputHook(MetricHook):
"""Specialized hook class for :class:`Throughput`.
"""Specialized hook class for :class:`Throughput`. Hook to measure execution throughput (samples/sec).
:param priority: priority of throughput hook, defaults to 10
:type priority: int, optional
Args:
ignored_steps (int, optional): The number of initial training steps to ignore.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def __init__(self, ignored_steps: int = 0, priority: int = 10):
super().__init__(priority)
......
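A sketch of a typical metric-hook list combining the hooks documented above; the Accuracy import path is an assumption, and the resulting list would be passed to trainer.fit(..., hooks=hook_list) as in the Trainer example.

from colossalai.logging import get_dist_logger
from colossalai.nn.metric import Accuracy   # import path assumed
from colossalai.trainer import hooks

logger = get_dist_logger()
hook_list = [
    hooks.LossHook(),
    hooks.AccuracyHook(accuracy_func=Accuracy()),
    hooks.ThroughputHook(ignored_steps=5),
    hooks.LogMetricByEpochHook(logger=logger, interval=1),
]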