Unverified Commit 0f8c7f98 authored by HELSON, committed by GitHub

Fixed docstring in colossalai (#171)

parent e2089c5c
......@@ -37,7 +37,11 @@ class CheckpointModule(nn.Module):
def divide(numerator, denominator):
""" only allow exact division """
"""Only allow exact division
:param numerator: Numerator of the division
:param denominator: Denominator of the division
"""
assert numerator % denominator == 0, \
'{} is not divisible by {}'.format(numerator, denominator)
return numerator // denominator
......
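
Since this hunk only reworks the docstring, the behaviour of `divide` is easiest to confirm with a quick usage sketch; the function below is copied from the diff so the example runs standalone.

```python
# Standalone copy of the divide() helper shown above, plus a usage example.
def divide(numerator, denominator):
    """Only allow exact division."""
    assert numerator % denominator == 0, \
        '{} is not divisible by {}'.format(numerator, denominator)
    return numerator // denominator

print(divide(12, 4))   # -> 3
# divide(10, 4) raises AssertionError: 10 is not divisible by 4
```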
......@@ -170,7 +170,7 @@ class VanillaPatchEmbedding(nn.Module):
@LAYERS.register_module
class VanillaClassifier(nn.Module):
"""
Classifier
Classifier for ViT
:param in_features: size of each input sample
:type in_features: int
......
......@@ -11,9 +11,9 @@ from colossalai.registry import LAYERS
class LambdaWrapper(nn.Module):
"""Wrap a function to nn.Module, which takes a config of layers and can fully access them
:param func: user customed function
:param func: User-defined function
:type func: Callable
:param layers_cfg: config of layers, defaults to None
:param layers_cfg: Config of layers, defaults to None
:type layers_cfg: dict, optional
"""
......
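
For readers unfamiliar with the wrapper documented above, here is a minimal standalone sketch of the idea: a user function plus an optional layer config turned into an `nn.Module`. The exact forward signature of colossalai's `LambdaWrapper` is not visible in this diff, so passing the wrapper itself to the function is an assumption made only for illustration.

```python
import torch
import torch.nn as nn

class TinyLambdaWrapper(nn.Module):
    """Illustrative stand-in for LambdaWrapper: stores a user-defined function
    and an optional layer config, and calls the function in forward.
    Passing the wrapper as the first argument (so the function can access the
    configured layers) is an assumption, not the confirmed colossalai API."""
    def __init__(self, func, layers_cfg=None):
        super().__init__()
        self.func = func
        self.layers_cfg = layers_cfg

    def forward(self, *args, **kwargs):
        return self.func(self, *args, **kwargs)

# Usage: wrap a plain function so it can sit inside an nn.Sequential.
flatten = TinyLambdaWrapper(lambda module, x: torch.flatten(x, 1))
print(flatten(torch.randn(2, 3, 4)).shape)  # torch.Size([2, 12])
```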
......@@ -11,6 +11,9 @@ class CrossEntropyLoss2D(_Loss):
Cross entropy loss for 2D parallelism
:param reduction: whether to average the loss, defaults to True
:param args: Args for loss function
:param kwargs: Kwargs for loss function
:type reduction: bool, optional
"""
def __init__(self, reduction=True, *args, **kwargs):
......@@ -21,6 +24,11 @@ class CrossEntropyLoss2D(_Loss):
self.loss_kwargs = kwargs
def forward(self, logits, targets):
"""Calculate loss between logits and targets
:param logits: Output logits of model
:param targets: True targets from data
"""
loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
if self.reduction_mean:
loss = loss.mean()
......
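
The `forward` shown above is a thin wrapper: a parallel cross entropy with `reduction='none'`, followed by an optional mean. The sketch below reproduces that reduction logic with plain `torch.nn.functional.cross_entropy` so it runs without a tensor-parallel context; the distributed reduction over the 2D mesh is what the real class adds. The 2.5D and 3D variants in the hunks below follow the same pattern.

```python
import torch
import torch.nn.functional as F

def toy_ce_forward(logits, targets, reduction_mean=True):
    # Mirrors the forward above: per-sample losses first, then an optional mean.
    loss = F.cross_entropy(logits, targets, reduction='none')
    if reduction_mean:
        loss = loss.mean()
    return loss

logits = torch.randn(8, 10)            # batch of 8, 10 classes
targets = torch.randint(0, 10, (8,))
print(toy_ce_forward(logits, targets))                              # scalar mean loss
print(toy_ce_forward(logits, targets, reduction_mean=False).shape)  # torch.Size([8])
```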
......@@ -11,6 +11,9 @@ class CrossEntropyLoss2p5D(_Loss):
Cross entropy loss for 2.5D parallelism
:param reduction: whether to average the loss, defaults to True
:param args: Args for loss function
:param kwargs: Kwargs for loss function
:type reduction: bool, optional
"""
def __init__(self, reduction=True, *args, **kwargs):
......@@ -21,6 +24,11 @@ class CrossEntropyLoss2p5D(_Loss):
self.loss_kwargs = kwargs
def forward(self, logits, targets):
"""Calculate loss between logits and targets
:param logits: Output logits of model
:param targets: True targets from data
"""
loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
if self.reduction_mean:
loss = loss.mean()
......
......@@ -14,6 +14,9 @@ class CrossEntropyLoss3D(_Loss):
:type depth: int
:param reduction: whether to average the loss, defaults to True
:type reduction: bool, optional
:param args: Args for loss function
:param kwargs: Kwargs for loss function
"""
def __init__(self, reduction=True, *args, **kwargs):
super().__init__()
......@@ -24,6 +27,11 @@ class CrossEntropyLoss3D(_Loss):
self.loss_kwargs = kwargs
def forward(self, logits, targets):
"""Calculate loss between logits and targets
:param logits: Output logits of model
:param targets: True targets from data
"""
loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
if self.reduction_mean:
loss = loss.mean()
......
......@@ -7,6 +7,12 @@ from colossalai.global_variables import moe_env
@LOSSES.register_module
class MoeCrossEntropyLoss(_Loss):
"""torch.nn.CrossEntropyLoss added with auxiliary loss.
:param aux_weight: Weight of auxiliary loss in total loss
:param args: Args in CrossEntropyLoss
:param kwargs: Kwargs in CrossEntropyLoss
:type aux_weight: float, optional
"""
def __init__(self, aux_weight: float = 0.01, *args, **kwargs):
super().__init__()
......@@ -22,6 +28,14 @@ class MoeCrossEntropyLoss(_Loss):
@LOSSES.register_module
class MoeLoss(_Loss):
"""A wrapper class for any loss module to add with auxiliary loss.
:param aux_weight: Weight of auxiliary loss in total loss
:param loss_fn: Loss function
:param args: Args in loss function
:param kwargs: Kwargs in loss function
:type aux_weight: float
:type loss_fn: Callable
"""
def __init__(self, aux_weight: float, loss_fn, *args, **kwargs):
super().__init__()
......
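
Both MoE losses above combine a main loss with an auxiliary (load-balancing) loss weighted by `aux_weight`. A self-contained sketch of that combination follows; the real classes read the auxiliary term from colossalai's `moe_env`, whereas here it is passed in explicitly so the example runs on its own.

```python
import torch
import torch.nn as nn

class ToyMoeLoss(nn.Module):
    """Illustrative version of MoeLoss: main loss plus aux_weight * auxiliary loss.
    The auxiliary term is an explicit argument here instead of being read from
    moe_env, purely to keep the sketch self-contained."""
    def __init__(self, aux_weight: float, loss_fn, *args, **kwargs):
        super().__init__()
        self.aux_weight = aux_weight
        self.loss_fn = loss_fn(*args, **kwargs)

    def forward(self, logits, targets, aux_loss):
        main_loss = self.loss_fn(logits, targets)
        return main_loss + self.aux_weight * aux_loss

criterion = ToyMoeLoss(aux_weight=0.01, loss_fn=nn.CrossEntropyLoss)
logits = torch.randn(4, 5)
targets = torch.randint(0, 5, (4,))
aux = torch.tensor(0.2)   # stand-in for the router's load-balancing loss
print(criterion(logits, targets, aux))
```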
......@@ -38,7 +38,7 @@ class CosineAnnealingLR(_CosineAnnealingLR):
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: number of total training steps
:param total_steps: Number of total training steps
:type total_steps: int
:param eta_min: Minimum learning rate, defaults to 0
:type eta_min: int, optional
......@@ -56,9 +56,9 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: number of total training steps
:param total_steps: Number of total training steps
:type total_steps: int
:param warmup_steps: number of warmup steps, defaults to 0
:param warmup_steps: Number of warmup steps, defaults to 0
:type warmup_steps: int, optional
:param eta_min: Minimum learning rate, defaults to 0
:type eta_min: int, optional
......@@ -78,9 +78,9 @@ class FlatAnnealingLR(DelayerScheduler):
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: number of total training steps
:param total_steps: Number of total training steps
:type total_steps: int
:param pct_start: percent of steps before starting learning rate decay
:param pct_start: Percent of steps before starting learning rate decay
:type pct_start: float
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
......@@ -99,15 +99,16 @@ class FlatAnnealingLR(DelayerScheduler):
@LR_SCHEDULERS.register_module
class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
"""Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied, and then the learning rate will be a fixed value before starting decay.
"""Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
applied, and then the learning rate will be a fixed value before starting decay.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: number of total training steps
:param total_steps: Number of total training steps
:type total_steps: int
:param warmup_steps: number of warmup steps, defaults to 0
:param warmup_steps: Number of warmup steps, defaults to 0
:type warmup_steps: int, optional
:param pct_start: percent of steps before starting learning rate decay
:param pct_start: Percent of steps before starting learning rate decay
:type pct_start: float
:param eta_min: Minimum learning rate, defaults to 0
:type eta_min: int, optional
......
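
The schedulers in this hunk share the shape described by their docstrings: an optional linear warmup up to the base learning rate, then cosine annealing down to `eta_min` over `total_steps`. A plain-function sketch of that shape is given below; it is only an illustration, since the diff does not show the exact interpolation used by the colossalai classes.

```python
import math

def cosine_warmup_lr(step, base_lr, total_steps, warmup_steps=0, eta_min=0.0):
    """Illustrative cosine annealing with linear warmup."""
    if warmup_steps > 0 and step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps          # linear warmup
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * progress))

for s in (0, 50, 100, 500, 999):
    print(s, round(cosine_warmup_lr(s, base_lr=0.1, total_steps=1000, warmup_steps=100), 4))
```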
......@@ -18,9 +18,9 @@ class DelayerScheduler(_LRScheduler):
:param optimizer: Wrapped optimizer.
:type optimizer: torch.optim.Optimizer
:param delay_epochs: number of epochs to keep the initial lr until starting aplying the scheduler
:param delay_epochs: Number of epochs to keep the initial lr before starting to apply the scheduler
:type delay_epochs: int
:param after_scheduler: after target_epoch, use this scheduler(eg. ReduceLROnPlateau)
:param after_scheduler: After target_epoch, use this scheduler (e.g. ReduceLROnPlateau)
:type after_scheduler: torch.optim.lr_scheduler
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
......@@ -61,9 +61,9 @@ class WarmupScheduler(_LRScheduler):
:param optimizer: Wrapped optimizer.
:type optimizer: torch.optim.Optimizer
:param warmup_epochs: number of epochs to linearly warmup lr until starting aplying the scheduler
:param warmup_epochs: Number of epochs to linearly warm up the lr before starting to apply the scheduler
:type warmup_epochs: int
:param after_scheduler: after target_epoch, use this scheduler(eg. ReduceLROnPlateau)
:param after_scheduler: After target_epoch, use this scheduler (e.g. ReduceLROnPlateau)
:type after_scheduler: torch.optim.lr_scheduler
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
......@@ -101,11 +101,11 @@ class WarmupDelayerScheduler(_LRScheduler):
:param optimizer: Wrapped optimizer.
:type optimizer: torch.optim.Optimizer
:param warmup_epochs: number of epochs to linearly warmup lr until starting aplying the scheduler
:param warmup_epochs: Number of epochs to linearly warm up the lr before starting to apply the scheduler
:type warmup_epochs: int
:param delay_epochs: number of epochs to keep the initial lr until starting aplying the scheduler
:param delay_epochs: Number of epochs to keep the initial lr before starting to apply the scheduler
:type delay_epochs: int
:param after_scheduler: after target_epoch, use this scheduler(eg. ReduceLROnPlateau)
:param after_scheduler: After target_epoch, use this scheduler (e.g. ReduceLROnPlateau)
:type after_scheduler: torch.optim.lr_scheduler
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
......
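
The three wrappers in this file share one idea: hold or warm up the learning rate for a fixed number of epochs, then hand control to `after_scheduler`. A compact sketch of that hand-off logic follows; it illustrates the behaviour the docstrings describe, not the colossalai implementation itself.

```python
def delayed_warmup_lr(epoch, base_lr, warmup_epochs, delay_epochs, after_lr_fn):
    """Illustrative warmup + delay + after_scheduler hand-off:
    linear warmup for `warmup_epochs`, hold base_lr for `delay_epochs`,
    then delegate to `after_lr_fn`, re-indexed from zero."""
    if epoch < warmup_epochs:
        return base_lr * (epoch + 1) / warmup_epochs
    if epoch < warmup_epochs + delay_epochs:
        return base_lr
    return after_lr_fn(epoch - warmup_epochs - delay_epochs)

# Example: after warmup and delay, decay by 10x every 5 epochs (a StepLR-like rule).
step_decay = lambda e: 0.1 * (0.1 ** (e // 5))
for e in (0, 2, 4, 8, 13, 14):
    print(e, round(delayed_warmup_lr(e, 0.1, warmup_epochs=3, delay_epochs=5,
                                     after_lr_fn=step_decay), 5))
```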
......@@ -9,9 +9,9 @@ class LinearWarmupLR(_LRScheduler):
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: number of total training steps
:param total_steps: Number of total training steps
:type total_steps: int
:param warmup_steps: number of warmup steps, defaults to 0
:param warmup_steps: Number of warmup steps, defaults to 0
:type warmup_steps: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
......
......@@ -12,15 +12,16 @@ class MultiStepLR(_MultiStepLR):
number of epochs reaches one of the milestones. Notice that such decay can
happen simultaneously with other changes to the learning rate from outside
this scheduler. When last_epoch=-1, sets initial lr as lr.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: number of total training steps
:param total_steps: Number of total training steps
:type total_steps: int
:param milestones: List of epoch indices. Must be increasing, defaults to None
:type milestones: List[int], optional
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
:type gamma: float, optional
:param num_steps_per_epoch: number of steps per epoch, defaults to -1
:param num_steps_per_epoch: Number of steps per epoch, defaults to -1
:type num_steps_per_epoch: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
......@@ -33,17 +34,18 @@ class MultiStepLR(_MultiStepLR):
@LR_SCHEDULERS.register_module
class MultiStepWarmupLR(WarmupScheduler):
"""Multi-step laerning rate scheduler with warmup.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: number of total training steps
:param total_steps: Number of total training steps
:type total_steps: int
:param warmup_steps: number of warmup steps, defaults to 0
:param warmup_steps: Number of warmup steps, defaults to 0
:type warmup_steps: int, optional
:param milestones: List of epoch indices. Must be increasing, defaults to None
:type milestones: List[int], optional
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
:type gamma: float, optional
:param num_steps_per_epoch: number of steps per epoch, defaults to -1
:param num_steps_per_epoch: Number of steps per epoch, defaults to -1
:type num_steps_per_epoch: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
......
......@@ -17,19 +17,20 @@ class OneCycleLR(_OneCycleLR):
This scheduler is not chainable.
Note also that the total number of steps in the cycle can be determined in one
of two ways (listed in order of precedence):
#. A value for total_steps is explicitly provided.
#. A number of epochs (epochs) and a number of steps per epoch
(steps_per_epoch) are provided.
In this case, the number of total steps is inferred by
total_steps = epochs * steps_per_epoch
* A value for total_steps is explicitly provided.
* A number of epochs (epochs) and a number of steps per epoch (steps_per_epoch) are provided.
In this case, the number of total steps is inferred by total_steps = epochs * steps_per_epoch
You must either provide a value for total_steps or provide a value for both
epochs and steps_per_epoch.
The default behaviour of this scheduler follows the fastai implementation of 1cycle, which
claims that "unpublished work has shown even better results by using only two phases". To
mimic the behaviour of the original paper instead, set ``three_phase=True``.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: number of total training steps
:param total_steps: Number of total training steps
:type total_steps: int
:param pct_start: The percentage of the cycle (in number of steps) spent increasing the learning rate, defaults to 0.3
:type pct_start: float, optional
......@@ -64,6 +65,7 @@ class OneCycleLR(_OneCycleLR):
number of *batches* computed, not the total number of epochs computed.
When last_epoch=-1, the schedule is started from the beginning, defaults to -1
:type last_epoch: int, optional
.. _Super-Convergence\: Very Fast Training of Neural Networks Using Large Learning Rates:
https://arxiv.org/abs/1708.07120
"""
......
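
Because the class above extends PyTorch's `OneCycleLR`, the `total_steps = epochs * steps_per_epoch` relationship in the docstring can be demonstrated with the stock torch scheduler; the colossalai wrapper's exact constructor is not shown in this diff, so the torch API is used here.

```python
import torch
from torch.optim.lr_scheduler import OneCycleLR

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

epochs, steps_per_epoch = 3, 10                      # total_steps = epochs * steps_per_epoch
scheduler = OneCycleLR(optimizer, max_lr=0.1,
                       total_steps=epochs * steps_per_epoch,
                       pct_start=0.3)                # 30% of the cycle spent increasing the LR

for _ in range(epochs * steps_per_epoch):
    optimizer.step()                                 # the actual training step would go here
    scheduler.step()
print(round(optimizer.param_groups[0]['lr'], 8))     # LR has been annealed to near its minimum
```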
......@@ -7,13 +7,14 @@ from .delayed import WarmupScheduler
@LR_SCHEDULERS.register_module
class PolynomialLR(_LRScheduler):
"""Polynomial learning rate scheduler.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: number of total training steps
:param total_steps: Number of total training steps
:type total_steps: int
:param end_lr: Minimum learning rate, defaults to 0.0001
:type end_lr: float, optional
:param power: the power of polynomial, defaults to 1.0
:param power: The power of polynomial, defaults to 1.0
:type power: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
......@@ -42,15 +43,16 @@ class PolynomialLR(_LRScheduler):
@LR_SCHEDULERS.register_module
class PolynomialWarmupLR(WarmupScheduler):
"""Polynomial learning rate scheduler with warmup.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: number of total training steps
:param total_steps: Number of total training steps
:type total_steps: int
:param warmup_steps: number of warmup steps, defaults to 0
:param warmup_steps: Number of warmup steps, defaults to 0
:type warmup_steps: int, optional
:param end_lr: Minimum learning rate, defaults to 0.0001
:type end_lr: float, optional
:param power: the power of polynomial, defaults to 1.0
:param power: The power of polynomial, defaults to 1.0
:type power: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
......
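
The diff above only touches the docstring, so as a reminder of what a polynomial schedule with `end_lr` and `power` typically computes, here is the commonly used form of the decay (assumed, since the formula itself is not shown in the hunk):

```python
def polynomial_lr(step, base_lr, total_steps, end_lr=0.0001, power=1.0):
    """Illustrative polynomial decay from base_lr down to end_lr.
    With power=1.0 this reduces to a straight linear decay."""
    progress = min(step, total_steps) / total_steps
    return (base_lr - end_lr) * (1 - progress) ** power + end_lr

for s in (0, 250, 500, 750, 1000):
    print(s, round(polynomial_lr(s, base_lr=0.1, total_steps=1000, power=2.0), 5))
```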
......@@ -10,16 +10,15 @@ from colossalai.registry import LR_SCHEDULERS
class LambdaLR(_LambdaLR):
"""Sets the learning rate of each parameter group to the initial lr
times a given function. When last_epoch=-1, sets initial lr as lr.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: number of total training steps
:param total_steps: Number of total training steps
:type total_steps: int
:param lr_lambda: A function which computes a multiplicative
factor given an integer parameter epoch, or a list of such
functions, one for each group in optimizer.param_groups, defaults to None
:type lr_lambda: function or list, optional
:param num_steps_per_epoch: number of steps per epoch, defaults to -1
:type num_steps_per_epoch: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
"""
......@@ -32,16 +31,15 @@ class LambdaLR(_LambdaLR):
class MultiplicativeLR(_MultiplicativeLR):
"""Multiply the learning rate of each parameter group by the factor given
in the specified function. When last_epoch=-1, sets initial lr as lr
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: number of total training steps
:param total_steps: Number of total training steps
:type total_steps: int
:param lr_lambda: A function which computes a multiplicative
factor given an integer parameter epoch, or a list of such
functions, one for each group in optimizer.param_groups, defaults to None
:type lr_lambda: function or list, optional
:param num_steps_per_epoch: number of steps per epoch, defaults to -1
:type num_steps_per_epoch: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
"""
......@@ -56,16 +54,15 @@ class StepLR(_StepLR):
step_size epochs. Notice that such decay can happen simultaneously with
other changes to the learning rate from outside this scheduler. When
last_epoch=-1, sets initial lr as lr
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: number of total training steps
:param total_steps: Number of total training steps
:type total_steps: int
:param step_size: Period of learning rate decay, defaults to 1
:type step_size: int, optional
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
:type gamma: float, optional
:param num_steps_per_epoch: number of steps per epoch, defaults to -1
:type num_steps_per_epoch: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
"""
......@@ -79,14 +76,13 @@ class StepLR(_StepLR):
class ExponentialLR(_ExponentialLR):
"""Decays the learning rate of each parameter group by gamma every epoch.
When last_epoch=-1, sets initial lr as lr
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: number of total training steps
:param total_steps: Number of total training steps
:type total_steps: int
:param gamma: Multiplicative factor of learning rate decay, defaults to 1.0
:type gamma: float, optional
:param num_steps_per_epoch: number of steps per epoch, defaults to -1
:type num_steps_per_epoch: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
"""
......
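
The four classes in this hunk wrap the corresponding PyTorch schedulers, so the `lr_lambda` and decay semantics documented above can be checked directly with the stock torch versions:

```python
import torch
from torch.optim.lr_scheduler import LambdaLR, StepLR

model = torch.nn.Linear(4, 2)

# LambdaLR: multiply the initial lr by a factor computed from the epoch index.
opt = torch.optim.SGD(model.parameters(), lr=0.1)
sched = LambdaLR(opt, lr_lambda=lambda epoch: 0.95 ** epoch)
for _ in range(3):
    opt.step()
    sched.step()
print(round(opt.param_groups[0]['lr'], 6))   # 0.1 * 0.95**3

# StepLR: decay by gamma every step_size epochs.
opt2 = torch.optim.SGD(model.parameters(), lr=0.1)
sched2 = StepLR(opt2, step_size=1, gamma=0.1)
opt2.step()
sched2.step()
print(round(opt2.param_groups[0]['lr'], 6))  # 0.01 after one decay period
```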
......@@ -6,16 +6,17 @@ from ._utils import calc_acc
class Accuracy2D(nn.Module):
"""
Accuracy for 2D parallelism
:param logits: predicted labels
:param targets: true labels
"""Accuracy for 2D parallelism
"""
def __init__(self):
super().__init__()
def forward(self, logits, targets):
"""Calculate the accuracy of predicted labels.
:param logits: Predicted labels
:param targets: True labels from data
"""
with torch.no_grad():
correct = calc_acc(logits, targets)
correct = reduce_by_batch_2d.apply(correct)
......
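
The `forward` above counts correct predictions (`calc_acc`) and then sums that count across the batch-parallel group. Without a parallel context the per-device part reduces to a standard argmax comparison, sketched below; the same logic applies to the 2.5D and 3D accuracy modules in the following hunks.

```python
import torch

def toy_accuracy(logits, targets):
    """Single-device stand-in for the accuracy computed above:
    the number of correct argmax predictions in the batch."""
    with torch.no_grad():
        correct = (logits.argmax(dim=-1) == targets).sum()
    return correct

logits = torch.tensor([[2.0, 0.1], [0.3, 1.5], [0.9, 0.2]])
targets = torch.tensor([0, 1, 1])
print(toy_accuracy(logits, targets).item())  # 2 correct out of 3
```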
......@@ -6,16 +6,17 @@ from ._utils import calc_acc
class Accuracy2p5D(nn.Module):
"""
Accuracy for 2p5D parallelism
:param logits: predicted labels
:param targets: true labels
"""Accuracy for 2p5D parallelism
"""
def __init__(self):
super().__init__()
def forward(self, logits, targets):
"""Calculate the accuracy of predicted labels.
:param logits: Predicted labels
:param targets: True labels from data
"""
with torch.no_grad():
correct = calc_acc(logits, targets)
correct = reduce_by_batch_2p5d.apply(correct)
......
......@@ -8,11 +8,7 @@ from ._utils import calc_acc
class Accuracy3D(nn.Module):
"""
Accuracy for 3D parallelism
:param logits: predicted labels
:param targets: true labels
"""Accuracy for 3D parallelism
"""
def __init__(self):
super().__init__()
......@@ -20,6 +16,11 @@ class Accuracy3D(nn.Module):
self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
def forward(self, logits, targets):
"""Calculate the accuracy of predicted labels.
:param logits: Predicted labels
:param targets: True labels from data
"""
with torch.no_grad():
correct = calc_acc(logits, targets)
correct = reduce_by_batch_3d.apply(correct, self.input_parallel_mode, self.weight_parallel_mode)
......
......@@ -30,8 +30,8 @@ class FusedLAMB(torch.optim.Optimizer):
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its norm. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
numerical stability. (default: 1e-6)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0.01)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
NOT SUPPORTED now! (default: False)
......
......@@ -20,7 +20,7 @@ class Lamb(Optimizer):
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability (default: 1e-8)
numerical stability (default: 1e-6)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
adam (bool, optional): always use trust ratio = 1, which turns this into
Adam. Useful for comparison purposes.
......
......@@ -16,7 +16,7 @@ class Lars(Optimizer):
Args:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
lr (float, optional): learning rate
lr (float, optional): learning rate (default: 1e-3)
momentum (float, optional): momentum factor (default: 0)
eeta (float, optional): LARS coefficient as used in the paper (default: 1e-3)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
......