Refactored docstring to google style

ec5086c4 · Liang Bowen · アマデウス · 53b1b6e3 · ec5086c4 · ec5086c4
Commit ec5086c4 authored Mar 25, 2022 by Liang Bowen Committed by アマデウス Mar 29, 2022
11 changed files
--- a/colossalai/utils/activation_checkpoint.py
+++ b/colossalai/utils/activation_checkpoint.py
@@ -114,10 +114,13 @@ class CheckpointFunction(torch.autograd.Function):


 def checkpoint(function, activation_offload ,*args):
-    """Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint
+    """Checkpoint the computation while preserving the RNG states, modified from PyTorch torch.utils.checkpoint.

-    :param function: Describe the forward pass function. It should know how to handle the input tuples.
-    :param args: Tuple containing the parameters of the function
-    :return: Output of running function with provided args
+    Args:
+        function: Describe the forward pass function. It should know how to handle the input tuples.
+        args (tuple): Tuple containing the parameters of the function.
+
+    Returns:
+        Output of running function with provided args.
    """
    return CheckpointFunction.apply(function, activation_offload, *args)
--- a/colossalai/utils/checkpointing.py
+++ b/colossalai/utils/checkpointing.py
@@ -50,17 +50,17 @@ def _get_standard_checkpoint_filename(epoch: int, suffix: str = ''):


 def get_checkpoint_path(checkpoint_dir: str, epoch: int, suffix: str = ''):
-    """This is a function to generate the checkpoint path from the (checkpoint_dir, epoch, suffix, gpu_parallel_rank) tuple.
+    """This is a function to generate the checkpoint path from the tuple
+    (checkpoint_dir, epoch, suffix, gpu_parallel_rank).
    This is useful during generation and recuperation of the checkpoint.

-    :param checkpoint_dir: Set up a directory for saving checkpoints
-    :type checkpoint_dir: str
-    :param epoch: Epoch number (indicate how many epochs have you trained this model)
-    :type epoch: int
-    :param suffix: Additional notation to specify the model or checkpoint, defaults to ''
-    :type suffix: str, optional
-    :return: Checkpoint path to be generated
-    :rtype: path
+    Args:
+        checkpoint_dir (str): Set up a directory for saving checkpoints.
+        epoch (int): Epoch number (indicates how many epochs you have trained this model).
+        suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''
+
+    Returns:
+        str: The checkpoint path to be generated.
    """
    ckpt_filename = _get_standard_checkpoint_filename(epoch, suffix)
    return os.path.join(checkpoint_dir, ckpt_filename)
@@ -74,12 +74,13 @@ def _ensure_directory_exists(filename: str):


 def get_latest_checkpoint_pattern(suffix: str = ''):
-    """Generate Regular expression of latest checkpoint's pattern
+    """Generate Regular expression of the latest checkpoint's pattern.
+
+    Args:
+        suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''.

-    :param suffix: Additional notation to specify the model or checkpoint, defaults to ''
-    :type suffix: str, optional
-    :return: Checkpoint pattern
-    :rtype: regular expression
+    Returns:
+        str: The regular expression of checkpoint pattern.
    """
    ranks_name = _get_ranks_name()
    pattern = r'epoch(\d+)-{}{}\.pt'.format(ranks_name, suffix)
@@ -88,16 +89,19 @@ def get_latest_checkpoint_pattern(suffix: str = ''):


 def get_latest_checkpoint_path(checkpoint_dir: str, suffix: str = ''):
-    """This is a function to retrieve the latest checkpoint path from the (checkpoint_dir, suffix, gpu_parallel_rank) tuple.
+    """This is a function to retrieve the latest checkpoint path from the tuple
+    (checkpoint_dir, suffix, gpu_parallel_rank).
    This is useful during recuperation of the checkpoint, especially when you do not know the epoch number.

-    :param checkpoint_dir: Directory for saving checkpoints
-    :type checkpoint_dir: str
-    :param suffix: Additional notation to specify the model or checkpoint, defaults to ''
-    :type suffix: str, optional
-    :raises FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given
-    :return: The latest checkpoint path to be retrieved
-    :rtype: path
+    Args:
+        checkpoint_dir (str): Directory for saving checkpoints
+        suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''
+
+    Returns:
+        str: The latest retrieved checkpoint path.
+
+    Raises:
+        FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given.
    """
    CKPT_NAME_PAT = get_latest_checkpoint_pattern(suffix=suffix)

@@ -126,22 +130,19 @@ def save_checkpoint(checkpoint_path: str,
                    optimizer: torch.optim.Optimizer,
                    lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
                    **kwargs):
-    """Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as model,
-     optimizer, lr_scheduler and etc. into a checkpoint dictionary.
-
-    This method can be used for both colosalai nn.BaseModel and normal pytorch nn.Module.
-
-
-    :param checkpoint_path: Set up a directory for saving checkpoints
-    :type checkpoint_path: str
-    :param epoch: Epoch number (indicate how many epochs have you trained this model)
-    :type epoch: int
-    :param model: Model to be registered
-    :type model: torch.nn.Module
-    :param optimizer: Optimizer to be registered
-    :type optimizer: torch.optim.Optimizer
-    :param lr_scheduler: lr_scheduler to be registered, defaults to None
-    :type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
+    """Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as
+    model, optimizer, lr_scheduler etc. into a checkpoint dictionary.
+
+    This method can be used for both :class:`colossalai.nn.BaseModel` and normal :class:`torch.nn.Module`.
+
+    Args:
+        checkpoint_path (str): Set up a directory for saving checkpoints.
+        epoch (int): Epoch number (indicates how many epochs you have trained this model).
+        model (:class:`torch.nn.Module`): Model to be registered.
+        optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to be registered.
+        lr_scheduler (Union[:class:`torch.optim.lr_scheduler`,
+            :class:`colossalai.nn.lr_scheduler`], optional): lr_scheduler to be registered, defaults to None.
+        kwargs (dict): additional parameters to be saved.
    """
    # for compatibility with normal pytorch nn.Module
    if hasattr(model, 'state_dict_for_save_checkpoint'):
@@ -165,31 +166,31 @@ def load_checkpoint(checkpoint_path: str,
                    finetune: bool = False,
                    strict: bool = True) -> Tuple:
    """Loads the checkpoint file.
+
    If finetune is False, then we intend to continue/resume the training process from the checkpoint given.
    So we copy parameters and buffers from state_dict into these modules(model, optimizer,lr_scheduler)
    and its descendants.
-    If finetune is True, then only the weights and buffers of model should be reload.
-    If strict is True, then the keys of state_dict must exactly match the keys returned by this module’s
-     state_dict() function.
-
-    :param checkpoint_path: The exact and matched checkpoint_path directory to retrieve appropriate state_dict
-    :type checkpoint_path: str
-    :param model: Model to reload parameters and buffers
-    :type model: torch.nn.Module
-    :param optimizer: Optimizer to recuperate
-    :type optimizer: torch.optim.Optimizer
-    :param lr_scheduler: lr_scheduler to recuperate, defaults to None
-    :type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
-    :param finetune: Whether to finetune the model with new dataset or continue the pre-training, defaults to False
-    :type finetune: bool, optional
-    :param strict: Whether to strictly enforce that the keys in
-        :attr:`state_dict` of the checkpoint match the names of
-        parameters and buffers in model., defaults to True
-    :type strict: bool, optional
-    :raises ValueError: Raise error if the model/optimizer cannot successfully be recuperated
-    :return: (the epoch number of the checkpoint retrieved, the checkpoint retrieved)
-    :rtype: Tuple

+    If finetune is True, then only the weights and buffers of model should be reloaded.
+    If strict is True, then the keys of state_dict must exactly match the keys returned
+    by this module’s state_dict() function.
+
+    Args:
+        checkpoint_path (str): The exact and matched checkpoint_path directory to retrieve appropriate state_dict.
+        model (:class:`torch.nn.Module`): Model to reload parameters and buffers.
+        optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to recuperate.
+        lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`, optional):
+            lr_scheduler to recuperate, defaults to None.
+        finetune (bool, optional): Whether to finetune the model with new dataset or
+            continue the pre-training, defaults to False.
+        strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict`
+            of the checkpoint match the names of parameters and buffers in model, defaults to True.
+
+    Returns:
+        Tuple(int, ``checkpoint``): The tuple (the epoch number of the checkpoint retrieved, the checkpoint retrieved).
+
+    Raises:
+        ValueError: Raise error if the model/optimizer cannot successfully be recuperated
    """
    # Load the checkpoint.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

--- a/colossalai/utils/common.py
+++ b/colossalai/utils/common.py
@@ -27,10 +27,10 @@ from .multi_tensor_apply import multi_tensor_applier
 def print_rank_0(msg: str, logger=None):
    """Print messages and save logs(optional). This is executed only if you are the rank-0 gpu.

-    :param msg: A string message to output
-    :type msg: str
-    :param logger: Python logger object, defaults to None
-    :type logger: optional
+    Args:
+        msg (str): A string message to output.
+        logger (:class:`colossalai.logging.DistributedLogger`, optional):
+            The logger to record the message, defaults to None.
    """
    if gpc.get_global_rank() == 0:
        if logger is None:
@@ -53,12 +53,15 @@ def free_port():


 def sync_model_param(model, parallel_mode):
-    """Make sure data parameters are consistent during Data Parallel Mode
+    r"""Make sure data parameters are consistent during Data Parallel Mode.

-    :param model: A pyTorch nn.model on whose parameters you check the consistency
-    :param parallel_mode: Parallel mode to be checked
-    :type model: torch.nn.Module
-    :type parallel_mode:  colossalai.context.ParallelMode
+    Args:
+        model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel mode to be checked.
+
+    Note:
+        The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
    """
    if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1:
        for param in model.parameters():
@@ -146,18 +149,19 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
    """Clips gradient norm of an iterable of parameters whose gradients are in fp32.

    This is adapted from :func:`torch.nn.utils.clip_grad.clip_grad_norm_` and
-    added functionality to handle model parallel parameters. Note that
+    added functionality to handle model parallel parameters.
+
+    Note:
        the gradients are modified in place.

-    :param parameters: An iterable of Tensors or a single Tensor that will have gradients normalized
-    :type parameters: (Iterable[Tensor] or Tensor)
-    :param max_norm: Max norm of the gradients
-    :type max_norm: float or int
-    :param norm_type: Type of the used p-norm. Can be ``'inf'`` for infinity norm.
-    :type norm_type: float or int 
+    Args:
+        parameters (Iterable[:class:`torch.Tensor`] or :class:`torch.Tensor`):
+            An iterable of Tensors or a single Tensor that will have gradients normalized.
+        max_norm (Union[float, int]): Max norm of the gradients.
+        norm_type (Union[float, int, 'inf']): Type of the used p-norm. Can be ``'inf'`` for infinity norm.

-    :return: Total norm of the parameters (viewed as a single vector).
-    :rtype: float
+    Returns:
+        float: Total norm of the parameters.
    """

    if isinstance(parameters, torch.Tensor):

--- a/colossalai/utils/data_sampler/data_parallel_sampler.py
+++ b/colossalai/utils/data_sampler/data_parallel_sampler.py
@@ -19,18 +19,15 @@ T_co = TypeVar('T_co', covariant=True)

 @DATA_SAMPLERS.register_module
 class DataParallelSampler(Sampler):
-    """A data sampler for distributed data parallelism
-
-    :param dataset: A Dataset instance
-    :type dataset: torch.utils.data.Dataset
-    :param shuffle: Whether to shuffle data, defaults to False
-    :type shuffle: bool, optional
-    :param seed: The random seed, defaults to 0
-    :type seed: int, optional
-    :param drop_last: Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch
-        size. If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller,
-        defaults to False
-    :type drop_last: bool, optional
+    """A data sampler for distributed data parallelism.
+
+    Args:
+        dataset (:class:`torch.utils.data.Dataset`): The Dataset for sampling.
+        shuffle (bool, optional): Whether to shuffle data, defaults to False.
+        seed (int, optional): The random seed used for sampling, defaults to 0.
+        drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
+            is not divisible by the batch size. If False and the size of dataset is not divisible by
+            the batch size, then the last batch will be smaller, defaults to False.
    """

    def __init__(self,
@@ -104,8 +101,8 @@ class DataParallelSampler(Sampler):
        use a different random ordering for each epoch. Otherwise, the next iteration of this
        sampler will yield the same ordering.

-        :param epoch: Epoch number.
-        :type epoch: int
+        Args:
+            epoch (int): Epoch number.
        """
        self.epoch = epoch

@@ -118,29 +115,27 @@ def get_dataloader(dataset,
                   pin_memory=False,
                   num_workers=0,
                   **kwargs):
-    """Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)
-
-    .. note:: When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
-        on the 1st stage and label on the last stage
-
-    :param dataset: A :class:`torch.utils.data.Dataset` object
-    :param shuffle: Whether to shuffle the dataset
-    :param seed: Random worker seed, defaults to 1024
-    :param add_sampler: Add DistributedDataParallelSampelr to the dataset
-    :param drop_last: Drop the last incomplete batch of data
-    :param pin_memory: Whether to pin memory address in CPU memory
-    :param num_workers: Number of worker threads for this dataloader
-
-    :type dataset: :class:`torch.utils.data.Dataset`
-    :type shuffle: bool, optional. Default is False
-    :type seed: int, optional. Default is 1024
-    :type add_sampler: bool, optional. Default is True
-    :type drop_last: bool, optional. Default is False
-    :type pin_memory: bool, optional. Default is False
-    :type num_workers: int, optional. Default is 0
-
-    :return: A object of :class:`torch.utils.data.DataLoader`
-    :rtype: :class:`torch.utils.data.DataLoader`
+    r"""Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)
+
+    Note:
+        When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
+        on the 1st stage and label on the last stage.
+
+    Args:
+        dataset (:class:`torch.utils.data.Dataset`): The dataset to be loaded.
+        shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
+        seed (int, optional): Random worker seed for sampling, defaults to 1024.
+        add_sampler (bool, optional): Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True.
+        drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
+            is not divisible by the batch size. If False and the size of dataset is not divisible by
+            the batch size, then the last batch will be smaller, defaults to False.
+        pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
+        num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
+        kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
+                `DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.
+
+    Returns:
+        :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
    """
    _kwargs = kwargs.copy()


--- a/colossalai/utils/gradient_accumulation/__init__.py
+++ b/colossalai/utils/gradient_accumulation/__init__.py
@@ -13,19 +13,25 @@ def accumulate_gradient(model: nn.Module,
                        accumulate_size: int,
                        gradient_handlers: List[BaseGradientHandler] = None,
                        lr_scheduler: _LRScheduler = None):
-    """
-    :param model: your model object
-    :type model: :class:`torch.nn.Module`
-    :param optimizer: your optimizer object
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :param dataloader: your dataloader object
-    :type dataloader: Iterable
-    :param accumulate_size: the number of steps to accumulate gradients
-    :type accumulate_size: int
-    :param gradient_handlers: list of gradient handler objects. Default is None
-    :type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
-    :param lr_scheduler: your lr scheduler object. Default is None
-    :type lr_scheduler: `torch.optim.lr_scheduler._LRScheduler`
+    r"""Turning model, optimizer, dataloader into corresponding object for gradient accumulation.
+
+    Args:
+        model (:class:`torch.nn.Module`): your model object for gradient accumulation.
+        optimizer (:class:`torch.optim.Optimizer`): your optimizer object for gradient accumulation.
+        dataloader (:class:`torch.utils.data.DataLoader` or iterable objects):
+            your dataloader object, would be called like iter(dataloader)
+        accumulate_size (int): the number of steps to accumulate gradients
+        gradient_handlers (List[:class:`colossalai.engine.BaseGradientHandler`]):
+            list of gradient handler objects. Default is None.
+        lr_scheduler (`torch.optim.lr_scheduler` or `colossalai.nn.lr_scheduler`):
+            your ``lr_scheduler`` object for gradient accumulation. Defaults to None.
+
+    More details about `gradient_handlers` could be found in
+    `Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.
+
+    More details about `lr_scheduler` could be found in
+    `lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_ and
+    `how to adjust learning rate <https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate>`_.
    """
    optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
    dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)

--- a/colossalai/utils/gradient_accumulation/_gradient_accumulation.py
+++ b/colossalai/utils/gradient_accumulation/_gradient_accumulation.py
@@ -15,15 +15,13 @@ from colossalai.engine import BaseGradientHandler

 class GradAccumOptimizer(ColossalaiOptimizer):
    """A wrapper for the optimizer to enable gradient accumulation by skipping the steps 
-    before accumulation size is reached
-
-    :param optim: Your optimizer object
-    :type optim: :class:`torch.optim.Optimizer`
-    :param accumulate_size: The number of steps to accumulate gradients
-    :type accumulate_size: int
-    :param model: Your model object to check if it is DDP for special handling of no_sync() context
-    :type model: :class:`torch.nn.Module`
+    before accumulation size is reached.

+    Args:
+        optim (:class:`torch.optim.Optimizer`): Your optimizer object for gradient accumulation.
+        accumulate_size (int): The number of steps to accumulate gradients.
+        model (:class:`torch.nn.Module`):
+            Your model object to check if it is DistributedDataParallel for special handling of no_sync() context.
    """

    def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
@@ -76,18 +74,18 @@ class GradAccumOptimizer(ColossalaiOptimizer):


 class GradAccumDataloader:
-    """A wrapper for dataloder to enable gradient accumulation by dropping the last incomplete steps.
+    """A wrapper for dataloader to enable gradient accumulation by dropping the last incomplete steps.

-    For example, if a dataloader has 10 batches of data and accumulate size is 4. The model paramters will 
-    be update only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
+    Note:
+        The dataloader would drop the last incomplete steps for gradient accumulation.
+        For example, if a dataloader has 10 batches of data and accumulate size is 4. The model parameters will
+        be updated only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
        Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
        (e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.

-    :param dataloader: Your dataloader object
-    :type dataloader: Iterable
-    :param accumulate_size: The number of steps to accumulate gradients
-    :type accumulate_size: int
-
+    Args:
+        dataloader (``Iterable``): Your dataloader object for gradient accumulation.
+        accumulate_size (int): The number of steps to accumulate gradients.
    """

    def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
@@ -125,13 +123,12 @@ class GradAccumDataloader:

 class GradAccumLrSchedulerByStep(_LRScheduler):
    """A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps 
-    before accumulation size is reached
-
-    :param lr_scheduler: Your lr scheduler object
-    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`    
-    :param accumulate_size: The number of steps to accumulate gradients
-    :type accumulate_size: int
+    before accumulation size is reached.

+    Args:
+        lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`):
+            Your ``lr_scheduler`` object for gradient accumulation.
+        accumulate_size (int): The number of steps to accumulate gradients.
    """

    def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
@@ -171,13 +168,16 @@ class GradAccumLrSchedulerByStep(_LRScheduler):


 class GradAccumGradientHandler:
-    """A wrapper for the gradient handler to enable gradient accumulation by skipping the steps 
-    before accumulation size is reached
+    r"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
+    before accumulation size is reached.
+
+    Args:
+        grad_handler (:class:`colossalai.engine.BaseGradientHandler`):
+            Your ``gradient_handler`` object for gradient accumulation, would be called when achieving `accumulate_size`.
+        accumulate_size (int): The number of steps to accumulate gradients.

-    :param grad_handler: Your gradient handler object
-    :type grad_handler: :class:`colossalai.engine.BaseGradientHandler`    
-    :param accumulate_size: The number of steps to accumulate gradients
-    :type accumulate_size: int
+    More details about ``gradient_handlers`` could be found in
+    `Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.

    """


--- a/colossalai/utils/memory_utils/memory_monitor.py
+++ b/colossalai/utils/memory_utils/memory_monitor.py
@@ -14,12 +14,13 @@ from typing import Optional


 def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
-    """
-    Get the free memory info of device.
-    :param device: a torch device instance or None
-    :type device: Optional[torch.device]
-    :return: current memory usage, sized by Byte
-    :rtype: int
+    """Get the memory usage info of device.
+
+    Args:
+        device (Optional[``torch.device``]): a torch device instance or None. Defaults to None.
+
+    Returns:
+        int: current memory usage, sized by Byte.
    """
    if device:
        assert device.type == 'cuda'
@@ -34,7 +35,7 @@ def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:


 def bytes_to_GB(val, decimal=2):
-    """A byte-to-Gigabyte converter, defaultly using binary notation.
+    """A byte-to-Gigabyte converter, using binary notation by default.

    :param val: X bytes to convert
    :return: X' GB
@@ -43,7 +44,7 @@ def bytes_to_GB(val, decimal=2):


 def bytes_to_MB(val, decimal=2):
-    """A byte-to-Megabyte converter, defaultly using binary notation.
+    """A byte-to-Megabyte converter, using binary notation by default.

    :param val: X bytes to convert
    :return: X' MB
@@ -54,13 +55,13 @@ def bytes_to_MB(val, decimal=2):
 def report_memory_usage(message, logger=None, report_cpu=False):
    """Calculate and print RAM usage (in GB)

-    :param message: A prefix message to add in the log
-    :type message: str
-    :param logger: An instance of :class:`colossalai.logging.DistributedLogger`
-    :type logger: :class:`colossalai.logging.DistributedLogger`, optional
-    :param report_cpu: Whether to report CPU memory
-    :type report_cpu: bool, optional
-    :raises EnvironmentError: Raise error if no distributed environment has been initialized
+    Args:
+        message (str): A prefix message to add in the log.
+        logger (:class:`colossalai.logging.DistributedLogger`, optional): The logger used to record memory information.
+        report_cpu (bool, optional): Whether to report CPU memory.
+
+    Raises:
+        EnvironmentError: Raise error if no distributed environment has been initialized.
    """
    if not gpc.is_initialized(ParallelMode.GLOBAL):
        raise EnvironmentError("No distributed environment is initialized")

--- a/colossalai/utils/moe.py
+++ b/colossalai/utils/moe.py
@@ -12,8 +12,8 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]
    size of every parameter. Since the parameters in data parallelism is replicated
    in each GPU, we set their ep_size to 1.

-    :param model: A pyTorch nn.model from which we get dict
-    :type model: torch.nn.Module
+    Args:
+        model (:class:`torch.nn.Module`): A pyTorch `nn.Module` from which we get dict.
    """
    epsize_param_dict = dict()
    for param in model.parameters():
@@ -29,10 +29,10 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]


 def sync_moe_model_param(model: nn.Module):
-    """Make sure model parameters are consistent in MoE parallel context
+    """Make sure model parameters are consistent in MoE parallel context.

-    :param model: A pyTorch nn.model on whose parameters you check the consistency
-    :type model: torch.nn.Module
+    Args:
+        model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
    """
    if is_using_ddp():


--- a/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
+++ b/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
@@ -3,10 +3,10 @@

 class MultiTensorApply(object):
    """
-    Apply an operation to a list of tensors efficiently
+    Apply an operation to a list of tensors efficiently.

-    :param chunk_size: Size of a chunk
-    :type chunk_size: int
+    Args:
+        chunk_size (int): Size of a chunk.
    """

    available = False

--- a/colossalai/utils/tensor_detector/tensor_detector.py
+++ b/colossalai/utils/tensor_detector/tensor_detector.py
@@ -9,6 +9,7 @@ from collections import defaultdict
 LINE_WIDTH = 108
 LINE = '-' * LINE_WIDTH + '\n'

+
 class TensorDetector():
    def __init__(self,
                 show_info: bool = True,
@@ -16,17 +17,14 @@ class TensorDetector():
                 include_cpu: bool = False,
                 module: Optional[nn.Module] = None
                 ):
-        """This class is an detector to detect tensor on different devices.
-
-        :param show_info: whether to print the info on screen, default True
-        :type show_info: bool
-        :param log: the file name to save the log
-        :type log: str
-        :param include_cpu: whether to detect tensor on cpu, default False
-        :type include_cpu: bool
-        :param module: when sending an `nn.Module` it, the detector can name the tensors detected better
-        :type module: Optional[nn.Module]
-
+        """This class is a detector to detect tensor on different devices.
+
+        Args:
+            show_info (bool, optional): whether to print the info on screen, default True.
+            log (str, optional): the file name to save the log. Defaults to None.
+            include_cpu (bool, optional): whether to detect tensor on cpu, default False.
+            module (Optional[:class:`nn.Module`]): when sending an ``nn.Module`` object,
+                the detector can name the tensors detected better.
        """
        self.show_info = show_info
        self.log = log
@@ -49,7 +47,6 @@ class TensorDetector():
                self.tensor_info[id(param)].append(param.dtype)
                self.tensor_info[id(param)].append(self.get_tensor_mem(param))

-
    def get_tensor_mem(self, tensor):
        # calculate the memory occupied by a tensor
        memory_size = tensor.element_size() * tensor.storage().size()
@@ -58,7 +55,6 @@ class TensorDetector():
            memory_size += grad_memory_size
        return self.mem_format(memory_size)

-
    def mem_format(self, real_memory_size):
        # format the tensor memory into a reasonal magnitude
        if real_memory_size >= 2 ** 30:
@@ -69,7 +65,6 @@ class TensorDetector():
            return str(real_memory_size / (2 ** 10)) + ' KB'
        return str(real_memory_size) + ' B' 

-
    def collect_tensors_state(self):
        for obj in gc.get_objects():
            if torch.is_tensor(obj):
@@ -116,7 +111,6 @@ class TensorDetector():
                if obj.device not in self.devices:
                    self.devices.append(obj.device)

-    
    def print_tensors_state(self):
        template_format = '{:3s}{:<30s}{:>10s}{:>20s}{:>10s}{:>20s}{:>15s}'
        self.info += LINE
@@ -174,7 +168,6 @@ class TensorDetector():
            with open(self.log + '.log', 'a') as f:
                f.write(self.info)
    
-    
    def detect(self, include_cpu = False):
        self.include_cpu = include_cpu
        self.collect_tensors_state()

--- a/colossalai/utils/timer.py
+++ b/colossalai/utils/timer.py
@@ -25,7 +25,7 @@ class Timer:
        return time.time()

    def start(self):
-        """Fisrtly synchronize cuda, reset the clock and then start the timer.
+        """Firstly synchronize cuda, reset the clock and then start the timer.
        """
        self._elapsed = 0
        synchronize()
@@ -40,10 +40,11 @@ class Timer:
    def stop(self, keep_in_history: bool = False):
        """Stop the timer and record the start-stop time interval.

-        :param keep_in_history: Whether does it record into history each start-stop interval, defaults to False
-        :type keep_in_history: bool, optional
-        :return: Start-stop interval
-        :rtype: int
+        Args:
+            keep_in_history (bool, optional): Whether to record each start-stop
+                interval into history, defaults to False.
+        Returns:
+            int: Start-stop interval.
        """
        synchronize()
        end_time = time.time()
@@ -57,26 +58,27 @@ class Timer:
    def get_history_mean(self):
        """Mean of all history start-stop time intervals.

-        :return: Mean of time intervals
-        :rtype: int
+        Returns:
+            float: Mean of time intervals.
        """
        return sum(self._history) / len(self._history)

    def get_history_sum(self):
        """Add up all the start-stop time intervals.

-        :return: Sum of time intervals
-        :rtype: int
+        Returns:
+            int: Sum of time intervals.
        """
        return sum(self._history)

    def get_elapsed_time(self):
        """Return the last start-stop time interval.

-        .. note:: Use it only when timer is not in progress
+        Returns:
+            int: The last time interval.

-        :return: The last time interval
-        :rtype: int
+        Note:
+            Use it only when the timer is not in progress.
        """
        assert not self._started, 'Timer is still in progress'
        return self._elapsed
@@ -90,10 +92,10 @@ class Timer:


 class MultiTimer:
-    """An object contains multiple timers
+    """An object that contains multiple timers.

-    :param on: Whether the timer is enabled. Default is True
-    :type on: bool, optional
+    Args:
+        on (bool, optional): Whether the timer is enabled. Default is True.
    """

    def __init__(self, on: bool = True):
@@ -101,10 +103,10 @@ class MultiTimer:
        self._timers = dict()

    def start(self, name: str):
-        """Start namely one of the timers
+        """Start the timer with the given name.

-        :param name: Timer's key
-        :type name: str
+        Args:
+            name (str): Timer's key.
        """
        if self._on:
            if name not in self._timers:
@@ -114,10 +116,9 @@ class MultiTimer:
    def stop(self, name: str, keep_in_history: bool):
        """Stop namely one of the timers.

-        :param name: Timer's key
-        :type name: str
-        :param keep_in_history: Whether does it record into history each start-stop interval
-        :type keep_in_history: bool
+        Args:
+            name (str): Timer's key.
+            keep_in_history (bool): Whether to record each start-stop interval into history.
        """
        if self._on:
            return self._timers[name].stop(keep_in_history)
@@ -127,17 +128,19 @@ class MultiTimer:
    def get_timer(self, name):
        """Get timer by its name (from multitimer)

-        :param name: Timer's key
-        :return: Timer with the name you give correctly
-        :rtype: Timer
+        Args:
+            name (str): Timer's key.
+        Returns:
+            :class:`colossalai.utils.Timer`: The timer stored under the given name.
        """
        return self._timers[name]

    def reset(self, name=None):
        """Reset timers.

-        :param name: If name is designated, the named timer will be reset and others will not, defaults to None
-        :type name: optional
+        Args:
+            name (str, optional): If name is designated, the named timer will be reset
+                and others will not, defaults to None.
        """
        if self._on:
            if name is not None: