update examples and sphinx docs for the new api (#63)

35813ed3 · Frank Lee · GitHub · 7d371105 · 35813ed3 · 35813ed3
Unverified Commit 35813ed3 authored Dec 13, 2021 by Frank Lee Committed by GitHub Dec 13, 2021
20 changed files
--- a/colossalai/logging/__init__.py
+++ b/colossalai/logging/__init__.py
@@ -6,5 +6,11 @@ __all__ = ['get_dist_logger', 'DistributedLogger']
 def get_dist_logger(name='root'):
    """Get logger instance based on name. The DistributedLogger will create singleton instances,
    which means that only one logger instance is created per name.
+
+    :param name: name of the logger, name must be unique
+    :type name: str
+
+    :return: a distributed logger instance
+    :rtype: :class:`colossalai.logging.DistributedLogger`
    """
    return DistributedLogger.get_instance(name=name)
--- a/colossalai/nn/layer/non_parallel_layers/_vit.py
+++ b/colossalai/nn/layer/non_parallel_layers/_vit.py
@@ -47,9 +47,24 @@ class ViTBlock(nn.Module):
 @LAYERS.register_module
 class VanillaViTPatchEmbedding(nn.Module):
    """ 2D Image to Patch Embedding
+
+    :param img_size: image size
+    :type img_size: int
+    :param patch_size: size of a patch
+    :type patch_size: int
+    :param in_chans: input channels
+    :type in_chans: int
+    :param embed_dim: embedding dimension
+    :type embed_dim: int
+    :param norm_layer: layer norm class, defaults to None
+    :type norm_layer: Callable
+    :param flatten: whether to flatten the output
+    :type flatten: bool
+    :param drop: dropout rate
+    :type drop: float
    """

-    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True, drop=0.):
+    def __init__(self, img_size, patch_size, in_chans, embed_dim, norm_layer=None, flatten=True, drop=0.):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
@@ -84,12 +99,22 @@ class VanillaViTPatchEmbedding(nn.Module):
 @LAYERS.register_module
 class VanillaViTMLP(nn.Module):
    """ MLP as used in Vision Transformer, MLP-Mixer and related networks
+
+    :param in_features: input channels
+    :type in_features: int
+    :param hidden_features: channels of the output of the first dense layer
+    :type hidden_features: int
+    :param out_features: channels of the output of the second dense layer
+    :type out_features: int
+    :param act_layer: activation function
+    :type act_layer: Callable
+    :param drop: dropout rate
+    :type drop: float
+
    """

-    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+    def __init__(self, in_features, hidden_features, out_features, act_layer=nn.GELU, drop=0.):
        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
@@ -113,6 +138,11 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):
    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
    'survival rate' as the argument.

+    :param drop_prob: probability for dropout
+    :type drop_prob: float
+    :param training: whether it is training mode
+    :type training: bool
+
    """
    if drop_prob == 0. or not training:
        return x
@@ -129,6 +159,9 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):
 @LAYERS.register_module
 class VanillaViTDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+
+    :param drop_prob: probability for dropout
+    :type drop_prob: float
    """

    def __init__(self, drop_prob=0.):
@@ -145,7 +178,7 @@ class VanillaViTAttention(nn.Module):

    :param dim: dimension of input tensor
    :type dim: int
-    :param num_heads: number of attention heads, defaults to 8
+    :param num_heads: number of attention heads
    :type num_heads: int
    :param qkv_bias: enable bias for qkv if True, defaults to False
    :type qkv_bias: bool, optional
@@ -155,7 +188,7 @@ class VanillaViTAttention(nn.Module):
    :type proj_drop: float, optional
    """

-    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
+    def __init__(self, dim, num_heads, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads

--- a/colossalai/utils/common.py
+++ b/colossalai/utils/common.py
@@ -109,15 +109,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
    added functionality to handle model parallel parameters. Note that
    the gradients are modified in place.

-    Arguments:
-        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
-            single Tensor that will have gradients normalized
-        max_norm (float or int): max norm of the gradients
-        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
-            infinity norm.
-
-    Returns:
-        Total norm of the parameters (viewed as a single vector).
+    :param parameters: an iterable of Tensors or a single Tensor that will have gradients normalized
+    :type parameters: (Iterable[Tensor] or Tensor)
+    :param max_norm: max norm of the gradients
+    :type max_norm: float or int
+    :param norm_type: type of the used p-norm. Can be ``'inf'`` for infinity norm.
+    :type norm_type: float or int 
+
+    :return: Total norm of the parameters (viewed as a single vector).
+    :rtype: float
    """

    if isinstance(parameters, torch.Tensor):

--- a/colossalai/utils/data_sampler/data_parallel_sampler.py
+++ b/colossalai/utils/data_sampler/data_parallel_sampler.py
@@ -123,12 +123,23 @@ def get_dataloader(dataset,
        stage and label on the last stage

    :param dataset: a :class:utils.data.dataset dataset
+    :param shuffle: whether to shuffle the dataset
    :param seed: random worker seed, defaults to 1024
-    :type seed: int, optional
-    :param add_sampler_if_possible: [description], defaults to False
-    :type add_sampler_if_possible: bool, optional
-    :return: a :class:utils.data.dataset dataloader
-    :rtype: torch.utils.data.dataset
+    :param add_sampler: add DistributedDataParallelSampler to the dataset
+    :param drop_last: drop the last incomplete batch of data
+    :param pin_memory: whether to pin memory address in CPU memory
+    :param num_workers: number of worker threads for this dataloader
+
+    :type dataset: :class:`torch.utils.data.Dataset`
+    :type shuffle: bool, optional. Default is False
+    :type seed: int, optional. Default is 1024
+    :type add_sampler: bool, optional. Default is True
+    :type drop_last: bool, optional. Default is False
+    :type pin_memory: bool, optional. Default is False
+    :type num_workers: int, optional. Default is 0
+
+    :return: an object of :class:`torch.utils.data.DataLoader`
+    :rtype: :class:`torch.utils.data.DataLoader`
    '''
    _kwargs = kwargs.copy()


--- a/colossalai/utils/gradient_accumulation/__init__.py
+++ b/colossalai/utils/gradient_accumulation/__init__.py
@@ -13,6 +13,20 @@ def accumulate_gradient(model: nn.Module,
                        accumulate_size: int,
                        gradient_handlers: List[BaseGradientHandler] = None,
                        lr_scheduler: _LRScheduler = None):
+    """
+    :param model: your model object
+    :type model: :class:`torch.nn.Module`
+    :param optimizer: your optimizer object
+    :type optimizer: :class:`torch.optim.Optimizer`
+    :param dataloader: your dataloader object
+    :type dataloader: Iterable
+    :param accumulate_size: the number of steps to accumulate gradients
+    :type accumulate_size: int
+    :param gradient_handlers: list of gradient handler objects. Default is None
+    :type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
+    :param lr_scheduler: your lr scheduler object. Default is None
+    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
+    """
    optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
    dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)


--- a/colossalai/utils/gradient_accumulation/_gradient_accumulation.py
+++ b/colossalai/utils/gradient_accumulation/_gradient_accumulation.py
@@ -14,6 +14,17 @@ from colossalai.engine import BaseGradientHandler


 class GradAccumOptimizer(ColossalaiOptimizer):
+    """A wrapper for the optimizer to enable gradient accumulation by skipping the steps 
+    before accumulation size is reached
+
+    :param optim: your optimizer object
+    :type optim: :class:`torch.optim.Optimizer`
+    :param accumulate_size: the number of steps to accumulate gradients
+    :type accumulate_size: int
+    :param model: your model object to check if it is DDP for special handling of no_sync() context
+    :type model: :class:`torch.nn.Module`
+
+    """

    def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
        super().__init__(optim)
@@ -64,6 +75,19 @@ class GradAccumOptimizer(ColossalaiOptimizer):


 class GradAccumDataloader():
+    """A wrapper for dataloader to enable gradient accumulation by dropping the last incomplete steps.
+
+    For example, if a dataloader has 10 batches of data and accumulate size is 4. The model parameters will
+    be updated only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
+    Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader, 
+    (e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.
+    
+    :param dataloader: your dataloader object
+    :type dataloader: Iterable
+    :param accumulate_size: the number of steps to accumulate gradients
+    :type accumulate_size: int
+
+    """

    def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
        self.dataloader = dataloader
@@ -99,6 +123,15 @@ class GradAccumDataloader():


 class GradAccumLrSchedulerByStep(_LRScheduler):
+    """A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps 
+    before accumulation size is reached
+
+    :param lr_scheduler: your lr scheduler object
+    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`    
+    :param accumulate_size: the number of steps to accumulate gradients
+    :type accumulate_size: int
+
+    """

    def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
        self.lr_scheduler = lr_scheduler
@@ -137,6 +170,15 @@ class GradAccumLrSchedulerByStep(_LRScheduler):


 class GradAccumGradientHandler():
+    """A wrapper for the gradient handler to enable gradient accumulation by skipping the steps 
+    before accumulation size is reached
+
+    :param grad_handler: your gradient handler object
+    :type grad_handler: :class:`colossalai.engine.BaseGradientHandler`    
+    :param accumulate_size: the number of steps to accumulate gradients
+    :type accumulate_size: int
+
+    """

    def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None:
        assert isinstance(grad_handler, BaseGradientHandler), \

--- a/colossalai/utils/memory.py
+++ b/colossalai/utils/memory.py
@@ -34,6 +34,10 @@ def report_memory_usage(message, logger=None, report_cpu=False):

    :param message: a prefix message to add in the log
    :type message: str
+    :param logger: an instance of :class:`colossalai.logging.DistributedLogger`
+    :type logger: :class:`colossalai.logging.DistributedLogger`
+    :param report_cpu: whether to report CPU memory
+    :type report_cpu: bool
    :raises EnvironmentError: raise error if no distributed environment has been initialized
    '''
    if not gpc.is_initialized(ParallelMode.GLOBAL):

--- a/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
+++ b/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
@@ -2,6 +2,13 @@


 class MultiTensorApply(object):
+    """
+    Apply an operation to a list of tensors efficiently
+
+    :param chunk_size: size of a chunk
+    :type chunk_size: int
+    """
+
    available = False
    warned = False


--- a/colossalai/utils/timer.py
+++ b/colossalai/utils/timer.py
@@ -74,6 +74,9 @@ class Timer:

 class MultiTimer:
    '''An object contains multiple timers
+
+    :param on: whether the timer is enabled. Default is True
+    :type on: bool
    '''

    def __init__(self, on: bool = True):

--- a/colossalai/zero/__init__.py
+++ b/colossalai/zero/__init__.py
@@ -14,6 +14,21 @@ def convert_to_zero(model: nn.Module,
                    optimizer: Optimizer,
                    level: int,
                    zero_config):
+    """
+    A helper function to integrate the model and optimizer with ZeRO optimizer and off-loading
+
+    :param model: your model object
+    :type model: :class:`torch.nn.Module`
+    :param optimizer: your optimizer object
+    :type optimizer: :class:`torch.optim.Optimizer`
+    :param level: optimizer level, can be 2 or 3
+    :type level: int
+    :param zero_config: configuration for zero
+    :type zero_config: dict
+
+    :return: (model, optimizer)
+    :rtype: Tuple
+    """
    assert level == 2 or level == 3, 'Only ZERO Optimizer Level 2 and 3 are provided'
    if level == 2:
        if is_no_pp_or_last_stage():

--- a/configs/resnet/resnet50.py
+++ b/configs/resnet/resnet50.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-import os
-
-IMG_SIZE = 224
-BATCH_SIZE = 256
-NUM_EPOCHS = 100
-
-model = dict(
-    type='VanillaResNet',
-    block_type='ResNetBottleneck',
-    layers=[3, 4, 6, 3],
-    num_cls=10
-)
-
-train_data = dict(
-    dataset=dict(
-        type='CIFAR10Dataset',
-        root=os.environ['DATA'],
-        transform_pipeline=[
-            dict(type='Resize', size=IMG_SIZE),
-            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
-            dict(type='RandomHorizontalFlip'),
-            dict(type='ToTensor'),
-            dict(type='Normalize',
-                 mean=[0.4914, 0.4822, 0.4465],
-                 std=[0.2023, 0.1994, 0.2010]),
-        ]
-    ),
-    dataloader=dict(
-        batch_size=BATCH_SIZE,
-        pin_memory=True,
-        shuffle=True,
-    )
-)
-
-test_data = dict(
-    dataset=dict(
-        type='CIFAR10Dataset',
-        root=os.environ['DATA'],
-        train=False,
-        transform_pipeline=[
-            dict(type='Resize', size=IMG_SIZE),
-            dict(type='ToTensor'),
-            dict(type='Normalize',
-                 mean=[0.4914, 0.4822, 0.4465],
-                 std=[0.2023, 0.1994, 0.2010]
-                 ),
-        ]
-    ),
-    dataloader=dict(
-        batch_size=BATCH_SIZE,
-        pin_memory=True,
-    )
-)
-
-parallelization = dict(
-    pipeline=1,
-    tensor=dict(size=1, mode=None),
-)
-
-optimizer = dict(
-    type='Adam',
-    lr=0.01
-)
-
-loss = dict(
-    type='CrossEntropyLoss'
-)
-
-from colossalai.engine import AMP_TYPE
-
-fp16 = dict(
-    mode=AMP_TYPE.APEX,
-    opt_level='O2',
-)
--- a/configs/sample_config.py
+++ b/configs/sample_config.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-NUM_EPOCH = int
-
-model = dict()
-train_data = dict()
-test_data = dict()
-optimizer = dict()
-loss = dict()
-
-fp16 = dict()
-zero = dict()
-
-gradient_handler = []
-parallel = dict()
-hooks = []
-
-cudnn_benchmark = True
-cudnn_deterministic = False
-
-logging = dict()
--- a/configs/vit/vit_2d.py
+++ b/configs/vit/vit_2d.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-import os
-from pathlib import Path
-
-BATCH_SIZE = 512
-IMG_SIZE = 32
-PATCH_SIZE = 4
-DIM = 512
-NUM_ATTENTION_HEADS = 2
-SUMMA_DIM = 2
-NUM_CLASSES = 10
-DEPTH = 6
-NUM_EPOCHS = 60
-
-train_data = dict(
-    dataset=dict(
-        type='CIFAR10Dataset',
-        root=Path(os.environ['DATA']),
-        transform_pipeline=[
-            dict(type='Resize', size=IMG_SIZE),
-            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
-            dict(type='RandomHorizontalFlip'),
-            dict(type='ToTensor'),
-            dict(type='Normalize',
-                 mean=[0.4914, 0.4822, 0.4465],
-                 std=[0.2023, 0.1994, 0.2010]),
-        ]
-    ),
-    dataloader=dict(
-        batch_size=BATCH_SIZE,
-        drop_last=True,
-        pin_memory=True,
-        shuffle=True,
-    )
-)
-
-test_data = dict(
-    dataset=dict(
-        type='CIFAR10Dataset',
-        root=Path(os.environ['DATA']),
-        train=False,
-        transform_pipeline=[
-            dict(type='Resize', size=IMG_SIZE),
-            dict(type='ToTensor'),
-            dict(type='Normalize',
-                 mean=[0.4914, 0.4822, 0.4465],
-                 std=[0.2023, 0.1994, 0.2010]
-                 ),
-        ]
-    ),
-    dataloader=dict(
-        batch_size=BATCH_SIZE,
-        pin_memory=True,
-    )
-)
-
-optimizer = dict(
-    type='Adam',
-    lr=0.001,
-    weight_decay=0
-)
-
-loss = dict(
-    type='CrossEntropyLoss2D',
-)
-
-model = dict(
-    type='VisionTransformerFromConfig',
-    tensor_splitting_cfg=dict(
-        type='ViTInputSplitter2D',
-    ),
-    embedding_cfg=dict(
-        type='ViTPatchEmbedding2D',
-        img_size=IMG_SIZE,
-        patch_size=PATCH_SIZE,
-        embed_dim=DIM,
-    ),
-    token_fusion_cfg=dict(
-        type='ViTTokenFuser2D',
-        img_size=IMG_SIZE,
-        patch_size=PATCH_SIZE,
-        embed_dim=DIM,
-        drop_rate=0.1
-    ),
-    norm_cfg=dict(
-        type='LayerNorm2D',
-        normalized_shape=DIM,
-        eps=1e-6,
-    ),
-    block_cfg=dict(
-        type='ViTBlock',
-        attention_cfg=dict(
-            type='ViTSelfAttention2D',
-            hidden_size=DIM,
-            num_attention_heads=NUM_ATTENTION_HEADS,
-            attention_dropout_prob=0.,
-            hidden_dropout_prob=0.1,
-            checkpoint=True
-        ),
-        droppath_cfg=dict(
-            type='VanillaViTDropPath',
-        ),
-        mlp_cfg=dict(
-            type='ViTMLP2D',
-            in_features=DIM,
-            dropout_prob=0.1,
-            mlp_ratio=4,
-            checkpoint=True
-        ),
-        norm_cfg=dict(
-            type='LayerNorm2D',
-            normalized_shape=DIM,
-            eps=1e-6,
-        ),
-    ),
-    head_cfg=dict(
-        type='ViTHead2D',
-        hidden_size=DIM,
-        num_classes=NUM_CLASSES,
-    ),
-    embed_dim=DIM,
-    depth=DEPTH,
-    drop_path_rate=0.,
-)
-
-hooks = [
-    dict(type='LogMetricByEpochHook'),
-    dict(type='Accuracy2DHook'),
-    dict(type='LossHook'),
-    dict(
-        type='LRSchedulerHook',
-        by_epoch=True,
-        lr_scheduler_cfg=dict(
-            type='LinearWarmupLR',
-            warmup_steps=5
-        )
-    ),
-    # dict(type='TensorboardHook', log_dir='./tb_logs'),
-    # dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
-    # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
-]
-
-parallel = dict(
-    pipeline=dict(size=1),
-    tensor=dict(size=4, mode='2d'),
-)
-
-# for fp16 training
-# from colossalai.engine import AMP_TYPE
-# fp16 = dict(
-#     mode=AMP_TYPE.PARALLEL,
-#     initial_scale=2 ** 8
-# )
-
-# only needed when pipeline parallel is used
-# schedule = dict(
-#     num_microbatches=8
-# )
-
-
-logging = dict(
-    root_path='./logs'
-)
--- a/configs/vit/vit_3d.py
+++ b/configs/vit/vit_3d.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-import os
-from pathlib import Path
-
-from colossalai.context import ParallelMode
-from colossalai.engine import AMP_TYPE
-
-try:
-    import model_zoo
-except:
-    print('You need to set model_zoo to your PYTHONPATH to use the models in the collection')
-
-BATCH_SIZE = 512
-IMG_SIZE = 32
-NUM_EPOCHS = 60
-
-train_data = dict(
-    dataset=dict(
-        type='CIFAR10Dataset',
-        root=Path(os.environ['DATA']),
-        transform_pipeline=[
-            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
-            dict(type='RandomHorizontalFlip'),
-            dict(type='ToTensor'),
-            dict(type='Normalize',
-                 mean=[0.4914, 0.4822, 0.4465],
-                 std=[0.2023, 0.1994, 0.2010]),
-        ]
-    ),
-    dataloader=dict(
-        batch_size=BATCH_SIZE,
-        pin_memory=True,
-        num_workers=2,
-        shuffle=True,
-    )
-)
-
-test_data = dict(
-    dataset=dict(
-        type='CIFAR10Dataset',
-        root=Path(os.environ['DATA']),
-        train=False,
-        transform_pipeline=[
-            dict(type='ToTensor'),
-            dict(type='Normalize',
-                 mean=[0.4914, 0.4822, 0.4465],
-                 std=[0.2023, 0.1994, 0.2010]
-                 ),
-        ]
-    ),
-    dataloader=dict(
-        batch_size=BATCH_SIZE,
-        pin_memory=True,
-        num_workers=2,
-    )
-)
-
-optimizer = dict(
-    type='Adam',
-    lr=0.001
-)
-
-loss = dict(
-    type='CrossEntropyLoss3D',
-    input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT,
-    weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT,
-)
-
-model = dict(
-    type='vit_tiny_3d_patch4_32',
-    drop_rate=0.1,
-)
-
-hooks = [
-    dict(type='LogMetricByEpochHook'),
-    dict(type='LogTimingByEpochHook'),
-    dict(type='LogMemoryByEpochHook'),
-    dict(
-        type='Accuracy3DHook',
-        input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT,
-        weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT,
-    ),
-    dict(type='LossHook'),
-    dict(type='TensorboardHook', log_dir='./tfb_logs'),
-    dict(
-        type='LRSchedulerHook',
-        by_epoch=True,
-        lr_scheduler_cfg=dict(
-            type='LinearWarmupLR',
-            warmup_steps=5
-        )
-    ),
-    # dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
-    # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
-]
-
-parallel = dict(
-    pipeline=dict(size=1),
-    tensor=dict(size=8, mode='3d'),
-)
-
-fp16 = dict(
-    mode=AMP_TYPE.PARALLEL,
-    initial_scale=2 ** 8
-)
-
-logging = dict(
-    root_path='./logs'
-)
--- a/docs/amp.md
+++ b/docs/amp.md
@@ -77,10 +77,10 @@ fp16 = dict(
 )
 ```

-## Tensor Parallel AMP
+## Naive AMP

 We leveraged the Megatron-LM implementation to achieve mixed precision training while maintaining compatibility with complex tensor 
-and pipeline parallelism.
+and pipeline parallelism. This AMP mode will cast all operations into fp16.

 The following code block shows a config file for this mode.


--- a/docs/colossalai/colossalai.amp.apex_amp.rst
+++ b/docs/colossalai/colossalai.amp.apex_amp.rst
+colossalai.amp.apex\_amp
+==========================
+
+.. automodule:: colossalai.amp.apex_amp
+   :members:
--- a/docs/colossalai/colossalai.amp.naive_amp.rst
+++ b/docs/colossalai/colossalai.amp.naive_amp.rst
+colossalai.amp.naive\_amp
+==========================
+
+.. automodule:: colossalai.amp.naive_amp
+   :members:
--- a/docs/colossalai/colossalai.amp.rst
+++ b/docs/colossalai/colossalai.amp.rst
+colossalai.amp
+==================
+
+.. toctree::
+   :maxdepth: 2
+
+   colossalai.amp.torch_amp
+   colossalai.amp.apex_amp
+   colossalai.amp.naive_amp
+
+
+.. automodule:: colossalai.amp
+   :members:
--- a/docs/colossalai/colossalai.amp.torch_amp.rst
+++ b/docs/colossalai/colossalai.amp.torch_amp.rst
+colossalai.amp.torch\_amp
+==========================
+
+.. automodule:: colossalai.amp.torch_amp
+      :members:
--- a/docs/colossalai/colossalai.builder.rst
+++ b/docs/colossalai/colossalai.builder.rst
 colossalai.builder
 ==================

-.. automodule:: colossalai.builder
-   :members:
-
-
 .. toctree::
   :maxdepth: 2

   colossalai.builder.builder
   colossalai.builder.pipeline
+
+
+.. automodule:: colossalai.builder
+   :members: