update examples and sphnix docs for the new api (#63)

35813ed3 · Frank Lee · GitHub · 7d371105 · 35813ed3 · 35813ed3
Unverified Commit 35813ed3 authored Dec 13, 2021 by Frank Lee Committed by GitHub Dec 13, 2021
20 changed files
--- a/README.md
+++ b/README.md
@@ -14,10 +14,12 @@ Blog: [Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Trai
 pip install colossalai
 ```

-### Install From Source
+### Install From Source (Recommended)
+
+> We **recommend** installing from source, as Colossal-AI is updated frequently in its early versions. The documentation will be in line with the main branch of the repository. Feel free to raise an issue if you encounter any problems. :)

 ```shell
-git clone git@github.com:hpcaitech/ColossalAI.git
+git clone https://github.com/hpcaitech/ColossalAI.git
 cd ColossalAI
 # install dependency
 pip install -r requirements/requirements.txt
@@ -64,8 +66,8 @@ model = ...
 # sampler by default
 train_dataset = ... 
 train_dataloader = get_dataloader(dataset=dataset,
-                            shuffle=True,
-                            )
+                                shuffle=True,
+                                )


 # build your 

--- a/colossalai/amp/__init__.py
+++ b/colossalai/amp/__init__.py
@@ -16,6 +16,22 @@ def convert_to_amp(model: nn.Module,
                   criterion: _Loss,
                   mode: AMP_TYPE,
                   amp_config: Config = None):
+    """A helper function to wrap training components with AMP modules of the given mode
+
+    :param model: your model object
+    :type model: :class:`torch.nn.Module`
+    :param optimizer: your optimizer object
+    :type optimizer: :class:`torch.optim.Optimizer`
+    :param criterion: your loss function object
+    :type criterion: :class:`torch.nn.modules.loss._Loss`
+    :param mode: amp mode
+    :type mode: :class:`colossalai.amp.AMP_TYPE`
+    :param amp_config: configuration for different amp modes
+    :type amp_config: :class:`colossalai.context.Config` or dict
+
+    :return: (model, optimizer, criterion)
+    :rtype: Tuple
+    """
    assert isinstance(mode, AMP_TYPE), \
        f'expected the argument mode be AMP_TYPE, but got {type(mode)}'


--- a/colossalai/amp/apex_amp/__init__.py
+++ b/colossalai/amp/apex_amp/__init__.py
@@ -7,6 +7,18 @@ import apex.amp as apex_amp
 def convert_to_apex_amp(model: nn.Module,
                        optimizer: Optimizer,
                        amp_config):
+    """A helper function to wrap training components with Apex AMP modules
+
+    :param model: your model object
+    :type model: :class:`torch.nn.Module`
+    :param optimizer: your optimizer object
+    :type optimizer: :class:`torch.optim.Optimizer`
+    :param amp_config: configuration for nvidia apex
+    :type amp_config: :class:`colossalai.context.Config` or dict
+
+    :return: (model, optimizer)
+    :rtype: Tuple
+    """
    model, optimizer = apex_amp.initialize(model, optimizer, **amp_config)
    optimizer = ApexAMPOptimizer(optimizer)
    return model, optimizer

--- a/colossalai/amp/apex_amp/apex_amp.py
+++ b/colossalai/amp/apex_amp/apex_amp.py
@@ -13,11 +13,24 @@ from colossalai.utils import clip_grad_norm_fp32


 class ApexAMPOptimizer(ColossalaiOptimizer):
+    ''' A wrapper class for APEX optimizer and it implements apex-specific backward and clip_grad_norm
+    methods
+    '''

    def backward(self, loss: Tensor):
+        """
+        :param loss: loss computed by a loss function
+        :type loss: torch.Tensor
+        """
        with apex_amp.scale_loss(loss, self.optim) as scaled_loss:
            scaled_loss.backward()

    def clip_grad_norm(self, model: nn.Module, max_norm: float):
+        """
+        :param model: your model object
+        :type model: torch.nn.Module
+        :param max_norm: the max norm value for gradient clipping
+        :type max_norm: float
+        """
        if max_norm > 0:
            clip_grad_norm_fp32(apex_amp.master_params(self.optim), max_norm)
--- a/colossalai/amp/naive_amp/__init__.py
+++ b/colossalai/amp/naive_amp/__init__.py
@@ -8,6 +8,18 @@ from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel
 def convert_to_naive_amp(model: nn.Module,
                         optimizer: Optimizer,
                         amp_config):
+    """A helper function to wrap training components with naive AMP modules
+
+    :param model: your model object
+    :type model: :class:`torch.nn.Module`
+    :param optimizer: your optimizer object
+    :type optimizer: :class:`torch.optim.Optimizer`
+    :param amp_config: configuration for naive mode amp
+    :type amp_config: :class:`colossalai.context.Config` or dict
+
+    :return: (model, optimizer)
+    :rtype: Tuple
+    """
    if is_no_pp_or_last_stage():
        model = NaiveAMPModel(model, output_to_fp32=True)
    else:

--- a/colossalai/amp/naive_amp/_fp16_optimizer.py
+++ b/colossalai/amp/naive_amp/_fp16_optimizer.py
@@ -146,26 +146,22 @@ class DynamicGradScaler:
 class FP16Optimizer(Optimizer):
    """Float16 optimizer for fp16 and bf16 data types.

-    Arguments:
-        optimizer: base optimizer such as Adam or SGD
-        clip_grad: clip gradeints with this global L2 norm. Note
-            that clipping is ignored if clip_grad == 0
-        log_num_zeros_in_grad: return number of zeros in the gradients.
-        params_have_main_grad: flag indicating if parameters have
-            a `main_grad` field. If this is set, we are assuming
-            that the model parameters are store in the `main_grad`
-            field instead of the typical `grad` field. This happens
-            for the DDP cases where there is a contihuous buffer
-            holding the gradients. For example for bfloat16, we want
-            to do gradient accumulation and all-reduces in float32
-            and as a result we store those gradients in the main_grad.
-            Note that main grad is not necessarily in float32.
-        bf16: if true, the model is running in bfloat16.
-        grad_scaler: used for scaling gradients. Note that this can be
-            None. This case happens when `bf16 = True` and we don't
-            use any loss scale. Note that for `bf16 = True`, we can have
-            a constnat gradient scaler. Also for `bf16 = False`, we
-            always require a grad scaler.
+    :param optimizer: base optimizer such as Adam or SGD
+    :type optimizer: torch.optim.Optimizer
+    :param clip_grad: clip gradients with this global L2 norm. Note that clipping is ignored if clip_grad == 0
+    :type clip_grad: float
+    :param log_num_zeros_in_grad: return number of zeros in the gradients.
+    :type log_num_zeros_in_grad: bool
+    :param initial_scale: initial scale of gradient scaler
+    :type initial_scale: int
+    :param growth_factor: the growth rate of loss scale
+    :type growth_factor: int
+    :param backoff_factor: the decrease rate of loss scale
+    :type backoff_factor: float
+    :param hysterisis: delay shift in dynamic loss scaling
+    :type hysterisis: int
+    :param max_scale: maximum loss scale allowed
+    :type max_scale: int
    """

    def __init__(self,

--- a/colossalai/amp/naive_amp/naive_amp.py
+++ b/colossalai/amp/naive_amp/naive_amp.py
@@ -13,12 +13,21 @@ from ._fp16_optimizer import FP16Optimizer


 class NaiveAMPOptimizer(ColossalaiOptimizer):
+    """A wrapper class for optimizer to cast all parameters to fp16
+
+    :param optim: a normal optimizer like Adam or SGD
+    :type optim: torch.optim.Optimizer
+    """

    def __init__(self, optim: Optimizer, *args, **kwargs):
        optim = FP16Optimizer(optimizer=optim, *args, **kwargs)
        super().__init__(optim)

    def backward(self, loss: Tensor):
+        """backward with gradient scaler
+        :param loss: loss computed by a loss function
+        :type loss: torch.Tensor
+        """
        loss = self.optim.scale_loss(loss)
        loss.backward()

@@ -30,6 +39,9 @@ class NaiveAMPOptimizer(ColossalaiOptimizer):


 class NaiveAMPModel(nn.Module):
+    """A wrapper class for model to cast the model into fp16 and 
+    automatically cast the input and output
+    """

    def __init__(self,
                 model: nn.Module,

--- a/colossalai/amp/torch_amp/__init__.py
+++ b/colossalai/amp/torch_amp/__init__.py
@@ -9,6 +9,20 @@ def convert_to_torch_amp(model: nn.Module,
                         optimizer: Optimizer,
                         criterion: _Loss,
                         amp_config: Config):
+    """A helper function to wrap training components with Torch AMP modules
+
+    :param model: your model object
+    :type model: :class:`torch.nn.Module`
+    :param optimizer: your optimizer object
+    :type optimizer: :class:`torch.optim.Optimizer`
+    :param criterion: your loss function object
+    :type criterion: :class:`torch.nn.modules.loss._Loss`
+    :param amp_config: configuration for different amp modes
+    :type amp_config: :class:`colossalai.context.Config` or dict
+    
+    :return: (model, optimizer, criterion)
+    :rtype: Tuple
+    """
    model = TorchAMPModel(model)
    optimizer = TorchAMPOptimizer(optimizer, **amp_config)
    criterion = TorchAMPLoss(criterion)

--- a/colossalai/amp/torch_amp/torch_amp.py
+++ b/colossalai/amp/torch_amp/torch_amp.py
@@ -14,25 +14,45 @@ from colossalai.utils import clip_grad_norm_fp32


 class TorchAMPOptimizer(ColossalaiOptimizer):
+    """A wrapper class which integrate pytorch amp with an optimizer
+
+    :param optim: a normal optimizer like Adam or SGD
+    :type optim: torch.optim.Optimizer
+    """

    def __init__(self, optim: Optimizer, *args, **kwargs):
        super().__init__(optim)
        self.scaler = GradScaler(*args, **kwargs)

    def backward(self, loss: Tensor):
+        """backward with torch amp gradient scaler
+        :param loss: loss computed by a loss function
+        :type loss: torch.Tensor
+        """
        self.scaler.scale(loss).backward()

    def step(self):
+        """update the parameters of the model
+        """
        self.scaler.step(self.optim)
        self.scaler.update()

    def clip_grad_norm(self, model: nn.Module, max_norm: float):
+        """apply gradient clipping to the model parameters
+        :param model: your model object
+        :type model: torch.nn.Module
+        :param max_norm: max norm value for gradient clipping
+        :type max_norm: float
+        """
        if max_norm > 0.0:
            self.scaler.unscale_(self.optim)
            clip_grad_norm_fp32(model.parameters(), max_norm)


 class TorchAMPModel(nn.Module):
+    """A wrapper class for a model object which executes forward with values automatically
+    cast to fp16
+    """

    def __init__(self, model: nn.Module) -> None:
        super().__init__()
@@ -44,7 +64,10 @@ class TorchAMPModel(nn.Module):


 class TorchAMPLoss(nn.Module):
-
+    """A wrapper class for a criterion object which computes the loss in mixed-precision context
+    :param loss: a loss function object
+    :type loss: torch.nn.modules.loss._Loss
+    """
    def __init__(self, loss: _Loss):
        super().__init__()
        self.loss = loss

--- a/colossalai/builder/builder.py
+++ b/colossalai/builder/builder.py
@@ -16,8 +16,8 @@ def build_from_config(module, config: dict):
        of the return object
    :type config: dict
    :raises AssertionError: Raises an AssertionError if `module` is not a class
-    :return: An object of :class:`module`
-    :rtype: :class:`module`
+    :return: An object of interest
+    :rtype: Object
    """
    assert inspect.isclass(module), 'module must be a class'
    return module(**config)
@@ -62,8 +62,8 @@ def build_layer(config):
    :param config: A python dict or a :class:`colossalai.context.Config` object
        containing information used in the construction of the return object
    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`nn.Module`
-    :rtype: :class:`nn.Module`
+    :return: An object of :class:`torch.nn.Module`
+    :rtype: :class:`torch.nn.Module`
    """
    return build_from_registry(config, LAYERS)

@@ -75,8 +75,8 @@ def build_loss(config):
    :param config: A python dict or a :class:`colossalai.context.Config` object
        containing information used in the construction of the return object
    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torch.autograd.Function`
-    :rtype: :class:`torch.autograd.Function`
+    :return: An object of :class:`torch.nn.modules.loss._Loss`
+    :rtype: :class:`torch.nn.modules.loss._Loss`
    """
    return build_from_registry(config, LOSSES)

@@ -87,8 +87,8 @@ def build_model(config):
    :param config: A python dict or a :class:`colossalai.context.Config` object
        containing information used in the construction of the return object
    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`nn.Module`
-    :rtype: :class:`nn.Module`
+    :return: An object of :class:`torch.nn.Module`
+    :rtype: :class:`torch.nn.Module`
    """
    return build_from_registry(config, MODELS)

@@ -134,8 +134,8 @@ def build_gradient_handler(config, model, optimizer):
    :type model: :class:`nn.Module`
    :param optimizer: An optimizer object containing parameters for the gradient handler
    :type optimizer: :class:`torch.optim.Optimizer`
-    :return: An object of :class:`BaseGradientHandler`
-    :rtype: :class:`BaseGradientHandler`
+    :return: An object of :class:`colossalai.engine.BaseGradientHandler`
+    :rtype: :class:`colossalai.engine.BaseGradientHandler`
    """
    config_ = config.copy()
    config_['model'] = model
@@ -151,8 +151,8 @@ def build_hooks(config, trainer):
    :type config: dict or :class:`colossalai.context.Config`
    :param trainer: A :class:`Trainer` object containing parameters for the hook
    :type trainer: :class:`Trainer`
-    :return: An object of :class:`BaseHook`
-    :rtype: :class:`BaseHook`
+    :return: An object of :class:`colossalai.trainer.hooks.BaseHook`
+    :rtype: :class:`colossalai.trainer.hooks.BaseHook`
    """
    config_ = config.copy()
    config_['trainer'] = trainer
@@ -182,8 +182,8 @@ def build_data_sampler(config, dataset):
    :param dataset: An object of :class:`torch.utils.data.Dataset` containing information
        used in the construction of the return object
    :type dataset: :class:`torch.utils.data.Dataset`
-    :return: An object of :class:`colossalai.nn.data.sampler.BaseSampler`
-    :rtype: :class:`colossalai.nn.data.sampler.BaseSampler`
+    :return: An object of :class:`colossalai.utils.data_sampler.BaseSampler`
+    :rtype: :class:`colossalai.utils.data_sampler.BaseSampler`
    """
    config_ = config.copy()
    config_['dataset'] = dataset
@@ -200,10 +200,6 @@ def build_lr_scheduler(config, optimizer):
    :param optimizer: An optimizer object containing parameters for the learning rate
        scheduler
    :type optimizer: :class:`torch.optim.Optimizer`
-    :param total_steps: Number of total steps of the learning rate scheduler
-    :type total_steps: int
-    :param num_steps_per_epoch: number of steps per epoch of the learning rate scheduler
-    :type num_steps_per_epoch: int
    :return: An object of :class:`torch.optim.lr_scheduler`
    :rtype: :class:`torch.optim.lr_scheduler`
    """

--- a/colossalai/builder/pipeline.py
+++ b/colossalai/builder/pipeline.py
@@ -151,6 +151,28 @@ def _partition_balanced(weights, pipeline_parallel_size, num_chunks):


 class PipelineModelInitializer():
+    """An initializer to split the model into different stages for pipeline parallelism.
+
+    An example for the model config is shown below. The class VisionTransformerFromConfig should
+    inherit colossalai.nn.model.ModelFromConfig to allow this initializer to build model from a sequence
+    of layer configurations.
+
+    model_config = dict(
+        type='VisionTransformerFromConfig',
+        embedding_cfg=dict(...),
+        ...
+    )
+
+    :param config: configuration of the model
+    :type config: dict
+    :param num_chunks: the number of chunks you want to have on the current stage. This value should be 1
+                        in most cases unless you are using virtual pipeline parallelism.
+    :type num_chunks: int
+    :param verbose: whether to print the logs
+    :type verbose: bool
+
+    """
+
    def __init__(self, config, num_chunks, verbose=False):
        self.num_chunks = num_chunks
        self.ori_model = build_model(config)
@@ -161,6 +183,13 @@ class PipelineModelInitializer():
        self._logger.info(f"The total length of layers is {layer_length}", ranks=[0])

    def initialize(self, partition_method='parameter'):
+        """Initialize the model object from the config passed
+
+        :param partition_method: this parameter determines how you want to split your model layers into stages,
+                                you can set it as 'layer' or 'parameter'
+        :type partition_method: str
+        
+        """
        # Some space for initializing comunication groups
        self._interval = None
        self._partition_layers(method=partition_method)
@@ -183,7 +212,7 @@ class PipelineModelInitializer():
            # print_rank_0(param_counts)
            self.parts = _partition_balanced(param_counts, pipeline_parallel_size, self.num_chunks)
        else:
-            assert method == 'layer', "Method should be a pre-set string"
+            raise ValueError("Method should be a pre-set string in [layer, parameter]")

        # Display the partition
        if gpc.get_global_rank() == 0 and self.verbose:

--- a/colossalai/communication/collective.py
+++ b/colossalai/communication/collective.py
@@ -18,11 +18,11 @@ def all_gather(tensor: Tensor, dim: int,
    :param tensor: Tensor to be gathered
    :param dim: The dimension concatenating in
    :param parallel_mode: Parallel group mode used in this communication
-    :type tensor: Tensor
+    :type tensor: :class:`torch.Tensor`
    :type dim: int
-    :type parallel_mode: ParallelMode
+    :type parallel_mode: :class:`colossalai.context.ParallelMode`
    :return: The tensor generated by all-gather
-    :rtype: Tensor
+    :rtype: :class:`torch.Tensor`
    """
    depth = gpc.get_world_size(parallel_mode)
    temp = tensor.clone()
@@ -54,11 +54,11 @@ def reduce_scatter(tensor: Tensor, dim: int,
    :param tensor: Tensor to be reduced and scattered
    :param dim: The dimension scattering in
    :param parallel_mode: Parallel group mode used in this communication
-    :type tensor: Tensor
+    :type tensor: :class:`torch.Tensor`
    :type dim: int
-    :type parallel_mode: ParallelMode
+    :type parallel_mode: :class:`colossalai.context.ParallelMode`
    :return: The tensor generated by reduce-scatter
-    :rtype: Tensor
+    :rtype: :class:`torch.Tensor`
    """
    depth = gpc.get_world_size(parallel_mode)
    # temp = list(torch.chunk(tensor, depth, dim=dim))

--- a/colossalai/communication/p2p.py
+++ b/colossalai/communication/p2p.py
@@ -96,7 +96,7 @@ def recv_forward(input_tensor_shape, prev_rank=None):
    :type input_tensor_shape: torch.Size
    :type prev_rank: int, optional
    :return: The input tensor in forward step
-    :rtype: Tensor
+    :rtype: :class:`torch.Tensor`
    """
    if gpc.is_first_rank(ParallelMode.PIPELINE):
        input_tensor = None
@@ -115,7 +115,7 @@ def recv_backward(output_grad_shape, next_rank=None):
    :type output_grad_shape: torch.Size
    :type next_rank: int, optional
    :return: The grad of output tensor in forward step
-    :rtype: Tensor
+    :rtype: :class:`torch.Tensor`
    """
    if gpc.is_last_rank(ParallelMode.PIPELINE):
        output_tensor_grad = None
@@ -131,7 +131,7 @@ def send_forward(output_tensor, next_rank=None):

    :param output_tensor: Tensor to be sent
    :param next_rank: The rank of the recipient of the tensor
-    :type output_tensor: Tensor
+    :type output_tensor: :class:`torch.Tensor`
    :type next_rank: int, optional
    """
    if not gpc.is_last_rank(ParallelMode.PIPELINE):
@@ -144,7 +144,7 @@ def send_backward(input_tensor_grad, prev_rank=None):

    :param input_tensor_grad: Tensor to be sent
    :param prev_rank: The rank of the recipient of the tensor
-    :type input_tensor_grad: Tensor
+    :type input_tensor_grad: :class:`torch.Tensor`
    :type prev_rank: int, optional
    """
    if not gpc.is_first_rank(ParallelMode.PIPELINE):
@@ -162,10 +162,10 @@ def send_forward_recv_backward(output_tensor,

    :param output_tensor: Tensor to be sent
    :param output_grad_shape: The shape of the tensor to be recieved
-    :type output_tensor: Tensor
-    :type output_grad_shape: torch.Size
+    :type output_tensor: :class:`torch.Tensor`
+    :type output_grad_shape: :class:`torch.Size`
    :return: The grad of output tensor in forward step
-    :rtype: Tensor
+    :rtype: :class:`torch.Tensor`
    """
    if gpc.is_last_rank(ParallelMode.PIPELINE):
        output_tensor_grad = None
@@ -187,10 +187,10 @@ def send_backward_recv_forward(input_tensor_grad,

    :param input_tensor_grad: Tensor to be sent
    :param input_tensor_shape: The shape of the tensor to be recieved
-    :type input_tensor_grad: Tensor
-    :type input_tensor_shape: torch.Size
+    :type input_tensor_grad: :class:`torch.Tensor`
+    :type input_tensor_shape: :class:`torch.Size`
    :return: The input tensor in forward step
-    :rtype: Tensor
+    :rtype: :class:`torch.Tensor`
    """
    if gpc.is_first_rank(ParallelMode.PIPELINE):
        input_tensor = None
@@ -213,10 +213,10 @@ def send_forward_recv_forward(output_tensor,

    :param output_tensor: Tensor to be sent
    :param input_tensor_shape: The shape of the tensor to be recieved
-    :type output_tensor: Tensor
-    :type input_tensor_shape: torch.Size
+    :type output_tensor: :class:`torch.Tensor`
+    :type input_tensor_shape: :class:`torch.Size`
    :return: The input tensor in forward step
-    :rtype: Tensor
+    :rtype: :class:`torch.Tensor`
    """
    input_tensor, _ = _communicate(tensor_send_next=output_tensor,
                                   recv_prev=recv_prev,
@@ -237,10 +237,10 @@ def send_backward_recv_backward(input_tensor_grad,

    :param input_tensor_grad: Tensor to be sent
    :param output_grad_shape: The shape of the tensor to be recieved
-    :type input_tensor_grad: Tensor
-    :type output_grad_shape: torch.Size
+    :type input_tensor_grad: :class:`torch.Tensor`
+    :type output_grad_shape: :class:`torch.Size`
    :return: The grad of output tensor in forward step
-    :rtype: Tensor
+    :rtype: :class:`torch.Tensor`
    """
    _, output_tensor_grad = _communicate(tensor_send_prev=input_tensor_grad,
                                         recv_next=recv_next,
@@ -266,10 +266,10 @@ def send_forward_backward_recv_forward_backward(output_tensor,
    :param input_tensor_grad: Tensor sent to the previous
    :param input_tensor_shape: The shape of the tensor recieved from the previous
    :param output_grad_shape: The shape of the tensor recieved from the next
-    :type output_tensor: Tensor
-    :type input_tensor_grad: Tensor
-    :type input_tensor_shape: torch.Size
-    :type output_grad_shape: torch.Size
+    :type output_tensor: :class:`torch.Tensor`
+    :type input_tensor_grad: :class:`torch.Tensor`
+    :type input_tensor_shape: :class:`torch.Size`
+    :type output_grad_shape: :class:`torch.Size`
    :return: (the input tensor in forward step, the grad of output tensor in forward step)
    :rtype: (Tensor, Tensor)
    """

--- a/colossalai/communication/ring.py
+++ b/colossalai/communication/ring.py
@@ -14,10 +14,10 @@ def ring_forward(tensor_send_next: torch.Tensor, parallel_mode: ParallelMode):

    :param tensor_send_next: Tensor sent to next member
    :param parallel_mode: Parallel group mode used in this communication
-    :type tensor_send_next: Tensor
-    :type parallel_mode: ParallelMode
+    :type tensor_send_next: :class:`torch.Tensor`
+    :type parallel_mode: :class:`colossalai.context.ParallelMode`
    :return: The tensor recieved from the previous
-    :rtype: Tensor
+    :rtype: :class:`torch.Tensor`
    """
    buffer_shape = tensor_send_next.size()


--- a/colossalai/context/parallel_context.py
+++ b/colossalai/context/parallel_context.py
@@ -433,6 +433,9 @@ class ParallelContext:

    def set_device(self, device_ordinal: int = None):
        """Sets distributed processes to be bound to devices.
+
+        :param device_ordinal: the device id to be bound to
+        :type device_ordinal: int
        """
        global_rank = self.get_global_rank()
        if device_ordinal is None:
@@ -445,6 +448,9 @@ class ParallelContext:

    def set_seed(self, seed: int):
        """Sets seeds for all random libraries.
+
+        :param seed: seed for random states
+        :type seed: int
        """
        random.seed(seed)
        np.random.seed(seed)

--- a/colossalai/engine/_base_engine.py
+++ b/colossalai/engine/_base_engine.py
@@ -57,38 +57,61 @@ class Engine:

    @property
    def model(self):
+        """model attached to the engine"""
        return self._model

    @property
    def optimizer(self):
+        """optimizer attached to the engine"""
        return self._optimizer

    @property
    def criterion(self):
+        """criterion attached to the engine"""
        return self._criterion

-    @property
-    def schedule(self):
-        return self._schedule
-
    def zero_grad(self):
+        """set the gradient of parameters to zero
+        """
        self.optimizer.zero_grad()

    def step(self):
+        """execute parameter update
+        """
        self._all_reduce_gradients()
        self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm)
        self.optimizer.step()

    def backward(self, loss: Tensor):
+        """Start backward propagation given the loss value computed by a loss function
+        
+        :param loss: loss value computed by a loss function
+        :type loss: :class:`torch.Tensor`
+        """
        return self.optimizer.backward(loss)

    def backward_by_grad(self, tensor, grad):
+        """Start backward propagation given the gradient of the output tensor
+        
+        :param tensor: output tensor
+        :type tensor: :class:`torch.Tensor`
+        :param grad: gradient passed back to the output
+        :type grad: :class:`torch.Tensor`
+        """
        return self.optimizer.backward_by_grad(tensor, grad)

    def calc_loss(self, *args, **kwargs):
+        """compute the loss value
+        :return: the loss value
+        :rtype: :class:`torch.Tensor`
+        """
        return self.criterion(*args, **kwargs)

    def __call__(self, *args, **kwargs):
+        """run the forward step for the model
+        :return: output of the model
+        :rtype: Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`
+        """
        return self.model(*args, **kwargs)

    def _all_reduce_gradients(self):

--- a/colossalai/engine/schedule/_base_schedule.py
+++ b/colossalai/engine/schedule/_base_schedule.py
@@ -48,7 +48,7 @@ class BaseSchedule(ABC):
        already in the same GPU as where the model's.

        :return: (data, label)
-        :rtype: (Tensor, Tensor)
+        :rtype: (:class:`torch.Tensor`, :class:`torch.Tensor`)
        """
        if data_iter is None:
            raise RuntimeError('Dataloader is not defined.')

--- a/colossalai/engine/schedule/_non_pipeline_schedule.py
+++ b/colossalai/engine/schedule/_non_pipeline_schedule.py
@@ -38,7 +38,9 @@ class NonPipelineSchedule(BaseSchedule):
        :type data_iter: Iterator
        :type forward_only: bool, optional
        :type return_loss: bool, optional
+        
        :return: (output, label, loss)
+        :rtype: Tuple[:class:`torch.Tensor`]
        """
        assert forward_only or return_loss, \
            "The argument 'return_loss' has to be True when 'forward_only' is False, but got False."

--- a/colossalai/engine/schedule/_pipeline_schedule.py
+++ b/colossalai/engine/schedule/_pipeline_schedule.py
@@ -133,6 +133,16 @@ class PipelineSchedule(BaseSchedule):
        """Forward step for passed-in model. If it is the first stage, the input tensor 
        is obtained from data_iterator, otherwise the passed-in input_tensor is used.
        Returns output tensor. This is a helper function and can be ignored by users.
+
+        :param engine: your engine object
+        :type engine: colossalai.engine.Engine
+        :param input_tensor: input tensor for this pipeline stage
+        :type input_tensor: :class:`torch.Tensor`
+        :param return_tensors: a list of tensors to return
+        :type return_tensors: List[:class:`torch.Tensor`]
+        
+        :return: output or the loss value of the current pipeline stage
+        :rtype: :class:`torch.Tensor`
        """

        if input_tensor is None:
@@ -162,6 +172,18 @@ class PipelineSchedule(BaseSchedule):
        output_tensor_grad is None, otherwise it is the gradients with respect to stage's output tensor.
        Returns the gradients with respect to the input tensor (None if first stage).
        This is a helper function and can be ignored by users.
+
+        :param engine: your engine object
+        :type engine: colossalai.engine.Engine
+        :param input_tensor: input tensor for this pipeline stage
+        :type input_tensor: :class:`torch.Tensor`
+        :param output_tensor: output tensor for this pipeline stage
+        :type output_tensor: :class:`torch.Tensor`
+        :param output_tensor_grad: gradient of output tensor for this pipeline stage
+        :type output_tensor_grad: :class:`torch.Tensor`
+
+        :return: gradient of input tensor
+        :rtype: :class:`torch.Tensor`
        """

        # Retain the grad on the input_tensor.
@@ -189,7 +211,17 @@ class PipelineSchedule(BaseSchedule):
        """Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
        Returns a tuple with losses if the last stage, an empty tuple otherwise.

+        :param engine: your engine object
+        :type engine: colossalai.engine.Engine
+        :param data_iter: dataloader as the form of an iterator, obtained by calling iter(dataloader)
+        :type data_iter: Iterable
+        :param forward_only: whether run forward step only. Default is false. If true, no backward will be run.
+        :type forward_only: bool
+        :param return_loss: whether returns the loss value. Default is true.
+        :type return_loss: bool
+
        :return: (output, label, loss)
+        :rtype: Tuple[:class:`torch.Tensor`]
        """

        assert forward_only or return_loss, \

--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -82,6 +82,8 @@ def launch(config: Union[str, Path, Config, Dict],
    :param local_rank: rank for the process on the node and is used to set the default CUDA device,
    defaults to None. If local_rank = None, the default device ordinal will be calculated automatically
    :type local_rank: int, optional
+    :param verbose: whether to print logs
+    :type verbose: bool
    :raises Exception: raise exception when config type is wrong
    '''
    gpc.verbose = verbose
@@ -121,6 +123,20 @@ def launch_from_slurm(config: Union[str, Path, Config, Dict],
                      backend: str = 'nccl',
                      seed: int = 1024,
                      verbose: bool = True):
+    '''A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
+    set by SLURM
+
+    :param config: config file or config file path are both acceptable
+    :type config: Union[str, dict, Config]
+    :param host: the master address for distributed training
+    :type host: str
+    :param port: the master port for distributed training
+    :type port: str
+    :param backend: backend for torch.distributed
+    :type backend: str
+    :param verbose: whether to print logs
+    :type verbose: bool
+    '''
    rank = int(os.environ['SLURM_PROCID'])
    world_size = int(os.environ['SLURM_NPROCS'])
    launch(config=config,
@@ -139,6 +155,20 @@ def launch_from_openmpi(config: Union[str, Path, Config, Dict],
                        backend: str = 'nccl',
                        seed: int = 1024,
                        verbose: bool = True):
+    '''A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
+    set by OpenMPI
+
+    :param config: config file or config file path are both acceptable
+    :type config: Union[str, dict, Config]
+    :param host: the master address for distributed training
+    :type host: str
+    :param port: the master port for distributed training
+    :type port: str
+    :param backend: backend for torch.distributed
+    :type backend: str
+    :param verbose: whether to print logs
+    :type verbose: bool
+    '''
    rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
    local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
    world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
@@ -159,6 +189,20 @@ def launch_from_torch(config: Union[str, Path, Config, Dict],
                      backend: str = 'nccl',
                      seed: int = 1024,
                      verbose: bool = True):
+    '''A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size 
+    from the environment variables set by PyTorch
+
+    :param config: config file or config file path are both acceptable
+    :type config: Union[str, dict, Config]
+    :param host: the master address for distributed training
+    :type host: str
+    :param port: the master port for distributed training
+    :type port: str
+    :param backend: backend for torch.distributed
+    :type backend: str
+    :param verbose: whether to print logs
+    :type verbose: bool
+    '''
    rank = int(os.environ['RANK'])
    local_rank = int(os.environ['LOCAL_RANK'])
    world_size = int(os.environ['WORLD_SIZE'])
@@ -184,16 +228,20 @@ def initialize(model: Union[nn.Module, List[nn.Module]],
    ''' Core function to wrap the essential training components with our functionality based on the config which is loaded into gpc.config.

    :param model: your model instance
-    :type model: a single or a list of ``torch.nn.Module`` objects
+    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer instance
-    :type optimizer: a single or a list of ``torch.optim.optimizer.Optimizer`` objects
+    :type optimizer: :class:`torch.optim.optimizer.Optimizer`
    :param criterion: your criterion instance
-    :type criterion: a single or a list of ``torch.nn.modules.loss._Loss`` objects
-    :param train_dataloader: dataloaders for training data
-    :type train_dataloader: a single or a list of ``torch.utils.data.DataLoader`` objects, defaults to None
-    :param train_dataloader: dataloaders for testing data
-    :type train_dataloader: a single or a list of ``torch.utils.data.DataLoader`` objects, defaults to None
-    :return: (engine, criterion, train_dataloader, test_dataloader)
+    :type criterion: :class:`torch.nn.modules.loss._Loss`
+    :param train_dataloader: dataloader for training data
+    :type train_dataloader: :class:`torch.utils.data.DataLoader`
+    :param test_dataloader: dataloader for testing data
+    :type test_dataloader: :class:`torch.utils.data.DataLoader`
+    :param lr_scheduler: your lr scheduler instance
+    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
+    :param verbose: whether to print logs
+    :type verbose: bool
+    :return: (engine, train_dataloader, test_dataloader, lr_scheduler)
    :rtype: tuple
    '''
    # get logger