Commit 404ecbdc authored by zbian

Migrated project

parent 2ebaefc5
# Config file
Here is an example config file of training ViT on cifar:
```python
# build train_dataset and train_dataloader from this dictionary
# It is not compulsory in the config file; instead, you can pass this dictionary as an argument to colossalai.initialize()
train_data = dict(
# dictionary for building Dataset
dataset=dict(
# the type CIFAR10Dataset has to be registered
type='CIFAR10Dataset',
root='/path/to/data',
# transform pipeline
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]
),
# dictionary for building Dataloader
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
# num_workers=1,
shuffle=True,
)
)
# build test_dataset and test_dataloader from this dictionary
test_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root='/path/to/data',
train=False,
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]
),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
# num_workers=1,
)
)
# compulsory
# build optimizer from this dictionary
optimizer = dict(
# Available types: 'ZeroRedundancyOptimizer_Level_1', 'ZeroRedundancyOptimizer_Level_2', 'ZeroRedundancyOptimizer_Level_3'
# 'Adam', 'Lamb', 'SGD', 'FusedLAMB', 'FusedAdam', 'FusedSGD', 'FP16Optimizer'
type='Adam',
lr=0.001,
weight_decay=0
)
# compulsory
# build loss function from this dictionary
loss = dict(
# Available types:
# 'CrossEntropyLoss2D', 'CrossEntropyLoss2p5D', 'CrossEntropyLoss3D'
type='CrossEntropyLoss2D',
)
# compulsory
# build model from this dictionary
model = dict(
# Available types: 'PretrainBERT', 'VanillaResNet', 'VisionTransformerFromConfig'
type='VisionTransformerFromConfig',
# each key-value pair below defines a layer of the model
# the input data passes through these layers recursively
tensor_splitting_cfg=dict(
type='ViTInputSplitter2D',
),
embedding_cfg=dict(
type='ViTPatchEmbedding2D',
img_size=IMG_SIZE,
patch_size=PATCH_SIZE,
embed_dim=DIM,
),
token_fusion_cfg=dict(
type='ViTTokenFuser2D',
img_size=IMG_SIZE,
patch_size=PATCH_SIZE,
embed_dim=DIM,
drop_rate=0.1
),
norm_cfg=dict(
type='LayerNorm2D',
normalized_shape=DIM,
eps=1e-6,
),
block_cfg=dict(
# ViTBlock is a submodule
type='ViTBlock',
attention_cfg=dict(
type='ViTSelfAttention2D',
hidden_size=DIM,
num_attention_heads=NUM_ATTENTION_HEADS,
attention_dropout_prob=0.,
hidden_dropout_prob=0.1,
checkpoint=True
),
droppath_cfg=dict(
type='VanillaViTDropPath',
),
mlp_cfg=dict(
type='ViTMLP2D',
in_features=DIM,
dropout_prob=0.1,
mlp_ratio=4,
checkpoint=True
),
norm_cfg=dict(
type='LayerNorm2D',
normalized_shape=DIM,
eps=1e-6,
),
),
head_cfg=dict(
type='ViTHead2D',
hidden_size=DIM,
num_classes=NUM_CLASSES,
),
embed_dim=DIM,
depth=DEPTH,
drop_path_rate=0.,
)
# hooks are built when initializing trainer
# possible hooks: 'BaseHook', 'MetricHook', 'LoadCheckpointHook',
# 'SaveCheckpointHook', 'LossHook', 'AccuracyHook', 'Accuracy2DHook',
# 'LogMetricByEpochHook', 'TensorboardHook', 'LogTimingByEpochHook', 'LogMemoryByEpochHook'
hooks = [
dict(type='LogMetricByEpochHook'),
dict(type='LogTimingByEpochHook'),
dict(type='LogMemoryByEpochHook'),
dict(type='Accuracy2DHook'),
dict(type='LossHook'),
# dict(type='TensorboardHook', log_dir='./tfb_logs'),
# dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
# dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]
# three keys: pipeline, tensor, data
# if data=dict(size=1), i.e. no data parallelism, there is no need to define the data key
parallel = dict(
pipeline=dict(size=1),
tensor=dict(size=4, mode='2d'),
)
# not compulsory
# mixed precision (fp16) settings; works with both pipeline and non-pipeline schedules
fp16 = dict(
mode=AMP_TYPE.PARALLEL,
initial_scale=2 ** 8
)
# not compulsory
# build learning rate scheduler
lr_scheduler = dict(
type='LinearWarmupLR',
warmup_epochs=5
)
schedule = dict(
num_microbatches=8
)
# training stopping criterion
# you can give num_steps or num_epochs
num_epochs = 60
# config logging path
logging = dict(
root_path='./logs'
)
```
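Every top-level variable in this file becomes an attribute of the global config once training is initialized. As a rough sketch (based on the training script shown in the quick demo, which reads `gpc.config.hooks` and `gpc.config.num_epochs`), these values can be accessed as follows:
```python
from colossalai.core import global_context as gpc

# after colossalai.initialize() has parsed this config file, every top-level
# variable defined above is available as an attribute of gpc.config
print(gpc.config.num_epochs)  # 60
print(gpc.config.hooks)       # the list of hook dictionaries defined above
```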
.. ColossalAI documentation master file, created by
sphinx-quickstart on Mon Oct 11 17:05:05 2021.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
ColossalAI documentation
======================================
.. toctree::
:maxdepth: 1
:caption: GETTING STARTED
installation.md
run_demo.md
.. toctree::
:maxdepth: 1
:caption: CUSTOMIZE YOUR TRAINING
parallelization.md
model.md
trainer_engine.md
amp.md
zero.md
add_your_parallel.md
config.md
.. toctree::
:maxdepth: 2
:caption: API REFERENCE
colossalai/colossalai
Indices and tables
==================
* :ref:`genindex`
# Setup
## Install with pip
```bash
pip install colossalai
```
## Install from source
```shell
git clone git@github.com:hpcaitech/ColossalAI.git
cd ColossalAI
# install dependency
pip install -r requirements/requirements.txt
# install colossalai
pip install .
```
Install and enable CUDA kernel fusion (compulsory when using fused optimizers):
```bash
pip install -v --no-cache-dir --global-option="--cuda_ext" .
```
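After installation, a quick import check (not part of the official instructions, just a sanity test) confirms that the package and a CUDA device are visible:
```python
# sanity check: the package imports cleanly and PyTorch can see a GPU
import torch
import colossalai

print(torch.cuda.is_available())
```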
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=.build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd
# Define your own parallel model
## Write a Simple 2D Parallel Model
Let's say we have a huge MLP model whose very large hidden size makes it difficult to fit into a single GPU. We can
distribute the model weights across GPUs in a 2D mesh while still writing the model in a familiar way.
```python
from colossalai.nn import Linear2D
import torch.nn as nn
class MLP_2D(nn.Module):
def __init__(self):
super().__init__()
self.linear_1 = Linear2D(in_features=1024, out_features=16384)
self.linear_2 = Linear2D(in_features=16384, out_features=1024)
def forward(self, x):
x = self.linear_1(x)
x = self.linear_2(x)
return x
```
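As a usage sketch (assuming the 2D tensor-parallel process groups have already been initialized, e.g. via `colossalai.initialize()` with `tensor=dict(size=4, mode='2d')` in the config), the model is used like any other `nn.Module`:
```python
import torch

# hypothetical usage; requires an initialized 2D tensor-parallel environment
model = MLP_2D()
x = torch.randn(16, 1024)   # (batch size, hidden size)
y = model(x)                # forward pass through the two distributed linear layers
print(y.shape)              # expected: torch.Size([16, 1024])
```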
## Use pre-defined model
Our Model Zoo supports *BERT*, *ViT* and *MLP-Mixer* models of different sizes.
# Parallelization
## Configure the Combination of Parallelization
We support multiple parallelization strategies in our library, namely hybrid parallelism combining data parallelism,
pipeline parallelism and tensor parallelism (1D, 2D, 2.5D, 3D). You can initialize the corresponding process group by
setting `parallel` in our config. The parallel configuration can be easily specified by a dictionary in the
configuration file, which must obey the following format. The data parallel size will be inferred automatically based
on your settings for pipeline parallelism and tensor parallelism.
```python
parallel = dict(
    pipeline=dict(size=int),
    tensor=dict(size=int, mode='1d' or '2d' or '2.5d' or '3d', kwargs=Any)
)
```
The name of the dictionary variable should be **parallel**. All the arguments, even **parallel** itself, are optional, and the data,
pipeline and tensor parallel sizes default to 1. The value of data, pipeline and tensor can be an int
representing the size of the corresponding parallel dimension, or a dictionary with a key called "size". The key "mode"
specifies the tensor parallelism mode.
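For example, the following two configurations are intended to be equivalent; an int is shorthand for a size-only dictionary (this is a sketch of the format described above, not an exhaustive list of options):
```python
# int shorthand
parallel = dict(pipeline=2, tensor=dict(size=4, mode='2d'))

# explicit dictionary form
parallel = dict(pipeline=dict(size=2), tensor=dict(size=4, mode='2d'))
```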
## Data Parallel
Data parallelism is the most common way to distribute your training task: the data is split into several shards and each
device trains on a single shard. The configuration for data parallelism is detected automatically and set for you; you do
not have to set it explicitly in your configuration. When the data parallel size is larger than 1, Colossal-AI automatically
adds a distributed data sampler to the dataloader to shard the dataset.
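For instance, since the data parallel size is inferred from the total number of processes and the other parallel sizes, a configuration like the one below running on 8 GPUs gives a data parallel size of 8 without any explicit `data` entry (a worked example under that assumption):
```python
# with 8 GPUs in total, the data parallel size is inferred as
# world_size / (pipeline size * tensor size) = 8 / (1 * 1) = 8
parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=1, mode=None),
)
```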
## Pipeline Parallel (experimental)
Pipeline parallelism splits the model into several partitions by layer. For example, let's assume we have a simple
model which consists of two linear layers. We have two GPUs, and we can allocate the first linear layer to the first GPU
and the second layer to the second GPU. This example of course wastes computing resources and is only meant to demonstrate
the idea of pipeline parallelism.
As PyTorch is based on a dynamic computation graph, the computation flow is not known until execution. To support pipeline
parallelism in PyTorch, you may need to add one more attribute to your model class which tells Colossal-AI the sequence
of execution. One example you can refer to is `colossalai.nn.VanillaResNet`.
```python
from typing import List, Optional

from colossalai.nn import BaseModel
import torch
class VanillaResNet(BaseModel):
def __init__(
self,
num_cls: int,
block_type: str,
layers: List[int],
norm_layer_type: str = 'BatchNorm2d',
in_channels: int = 3,
groups: int = 1,
width_per_group: int = 64,
zero_init_residual: bool = False,
replace_stride_with_dilation: Optional[List[bool]] = None,
dilations=(1, 1, 1, 1)
) -> None:
super().__init__()
... # some model params
self.layers_cfg = [
# conv1
dict(type='Conv2d',
in_channels=in_channels,
out_channels=self.inplanes,
kernel_size=7,
stride=2,
padding=3,
bias=False),
# bn1
dict(
type=norm_layer_type,
num_features=self.inplanes
),
# relu
dict(
type='ReLU',
inplace=True
),
# maxpool
dict(
type='MaxPool2d',
kernel_size=3,
stride=2,
padding=1
),
# layer 1
dict(
inplanes=self.inplanes,
planes=64,
blocks=self.blocks[0],
dilation=self.dilations[0],
**self.reslayer_common_cfg
),
# layer 2
dict(
inplanes=64 * self.block_expansion,
planes=128,
blocks=self.blocks[1],
stride=2,
dilate=replace_stride_with_dilation[0],
dilation=self.dilations[1],
**self.reslayer_common_cfg
),
# layer 3
dict(
inplanes=128 * self.block_expansion,
planes=256,
blocks=layers[2],
stride=2,
dilate=replace_stride_with_dilation[1],
dilation=self.dilations[2],
**self.reslayer_common_cfg
),
# layer 4
dict(
inplanes=256 * self.block_expansion,
planes=512,
blocks=layers[3], stride=2,
dilate=replace_stride_with_dilation[2],
dilation=self.dilations[3],
**self.reslayer_common_cfg
),
# avg pool
dict(
type='AdaptiveAvgPool2d',
output_size=(1, 1)
),
# flatten
dict(
type='LambdaWrapper',
func=lambda mod, x: torch.flatten(x, 1)
),
# linear
dict(
type='Linear',
in_features=512 * self.block_expansion,
out_features=num_cls
)
]
```
You can set the number of pipeline stages in your configuration file. When the pipeline size is larger than 1, Colossal-AI
will automatically create the pipeline schedule which defines the forward and backward steps. You can specify how many microbatches
to run in each step in the `schedule` configuration.
```python
parallel = dict(
pipeline=dict(size=1), # number of pipeline stages
tensor=dict(size=1, mode=None)
)
schedule = dict(
num_microbatches = 4 # set the number of microbatches per step
)
```
## 1D, 2D, 2.5D and 3D Parallel
To enable hybrid parallelism, we provide an array of tensor parallelism methods, along with the papers each method is
based on. These parallel modes need to work with the distributed layers provided by Colossal-AI.
- 1D: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
- 2D: [An Efficient 2D Method for Training Super-Large Deep Learning Models](https://arxiv.org/abs/2104.05343)
2D parallel relies on the SUMMA matrix multiplication algorithm and splits the input data,
model weights and layer outputs along two different dimensions. The tensor chunks are distributed over a 2D mesh of $P = N^2$
devices where N is the number of tensor chunks in a single dimension.
- 2.5D: [2.5-dimensional distributed model training](https://arxiv.org/abs/2105.14500)
Inspired by the 2.5D matrix multiplication algorithm, 2.5D parallelism introduces a novel tensor parallelism which further
parallelizes 2D tensor parallelism. An amount of $P = N^2 \times d$ processors are arranged into $d$ layers,
where each layer performs matrix multiplication operations independently with a dimension $N$.
- 3D: [Maximizing Parallelism in Distributed Training for Huge Neural Networks](https://arxiv.org/abs/2105.14450)
We also introduce a 3D tensor parallelism that parallelizes neural networks on a 3D processor cube. This method achieves
the optimal $O(P^{1/3})$ communication overhead on $P$ processors, while both computation and memory usage are evenly distributed
through optimized load balancing of parameters as well as activations.
```python
# 1D parallel
parallel = dict(
pipeline=dict(size=1), # number of pipeline stages
tensor=dict(size=4, mode='1d')
)
# 2D parallel
parallel = dict(
pipeline=dict(size=1), # number of pipeline stages
tensor=dict(size=4, mode='2d')
)
# 2.5D parallel
parallel = dict(
pipeline=dict(size=1), # number of pipeline stages
tensor=dict(size=8, mode='2.5d', depth=2)
)
# 3D parallel
parallel = dict(
pipeline=dict(size=1), # number of pipeline stages
tensor=dict(size=8, mode='3d')
)
```
## Sequence Parallel (experimental)
Sequence parallelism supports long-sequence modelling such as document-level text understanding and medical imaging.
This method is proposed in [Sequence Parallelism: Making 4D Parallelism Possible](https://arxiv.org/abs/2105.13120).
This feature is still in development and is only experimental for now.
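If you want to experiment with it, the configuration is expected to follow the same `parallel` format as the other tensor parallel modes; the mode string below is an assumption and may differ in your version:
```python
# sequence parallelism (experimental); the mode name 'sequence' is an assumption
parallel = dict(
    pipeline=dict(size=1),  # number of pipeline stages
    tensor=dict(size=4, mode='sequence')
)
```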
# Quick demo
ColossalAI is an integrated large-scale deep learning framework with efficient parallelization techniques. The framework
can accelerate model training on distributed systems with multiple GPUs by applying these techniques, and it can also
run on systems with only one GPU. Quick demos showing how to use ColossalAI are given below.
## Single GPU
ColossalAI can be used to train deep learning models on systems with only one GPU and achieve baseline
performances. [Here](https://colab.research.google.com/drive/1fJnqqFzPuzZ_kn1lwCpG2nh3l2ths0KE?usp=sharing#scrollTo=cQ_y7lBG09LS)
is an example showing how to train a LeNet model on the CIFAR10 dataset using ColossalAI.
## Multiple GPUs
ColossalAI can be used to train deep learning models on distributed systems with multiple GPUs and accelerate the
training process drastically by applying efficient parallelization techniques, which will be elaborated in
the [Parallelization](parallelization.md) section below. Run the code below on your distributed system with 4 GPUs,
where `HOST` is the IP address of your system. Note that we use
the [Slurm](https://slurm.schedmd.com/documentation.html) job scheduling system here.
```bash
HOST=xxx.xxx.xxx.xxx srun ./scripts/slurm_dist_train.sh ./example/train_vit_2d.py ./configs/vit/vit_2d.py
```
`./configs/vit/vit_2d.py` is a config file, which is introduced in the [Config file](config.md) section below. These
config files are used by ColossalAI to define all kinds of training arguments, such as the model, dataset and training
method (optimizer, lr_scheduler, epoch, etc.). Config files are highly customizable and can be modified so as to train
different models.
`./example/run_trainer.py` contains a standard training script and is presented below; it reads the config file and
carries out the training process.
```python
import colossalai
from colossalai.engine import Engine
from colossalai.trainer import Trainer
from colossalai.core import global_context as gpc
model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize()
engine = Engine(
model=model,
criterion=criterion,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
schedule=schedule
)
trainer = Trainer(engine=engine,
hooks_cfg=gpc.config.hooks,
verbose=True)
trainer.fit(
train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
max_epochs=gpc.config.num_epochs,
display_progress=True,
test_interval=5
)
```
Alternatively, the `model` variable can be substituted with a self-defined model or a pre-defined model in our Model
Zoo. The detailed substitution process is elaborated [here](model.md).
## Features
ColossalAI provides a collection of parallel training components for you. We aim to support you in developing
distributed deep learning models just like how you write single-GPU deep learning models. We provide friendly tools to
kickstart distributed training in a few lines.
- [Data Parallelism](parallelization.md)
- [Pipeline Parallelism](parallelization.md)
- [1D, 2D, 2.5D, 3D and sequence parallelism](parallelization.md)
- [Friendly trainer and engine](trainer_engine.md)
- [Extensible for new parallelism](add_your_parallel.md)
- [Mixed Precision Training](amp.md)
- [Zero Redundancy Optimizer (ZeRO)](zero.md)
# Build your engine & Customize your trainer
## Build your engine
To better understand the function of the `Engine` class, you should know the concept of the process function used in common engines. The process function usually controls the behavior over a batch of a dataset, and the `Engine` class just controls the process function. For example, a common process function looks like this:
```python
def process_function(dataloader, model, criterion, optim):
optim.zero_grad()
data, label = next(dataloader)
output = model(data)
loss = criterion(output, label)
loss.backward()
    optim.step()
```
In `ignite.engine` or `keras.engine`, the process function is always provided by users. However, it is hard for users to write their own functions for pipeline parallelism. Aiming at accessible hybrid parallelism for users, we provide the powerful `Engine` class. It enables pipeline parallelism and offers the 1F1B non-interleaved strategy. Also, you can use a pre-defined learning rate scheduler in your `Engine` to adjust the learning rate during training.
In order to build your engine, just set the model, criterion, optimizer, learning rate scheduler and schedule. Consider the following code as an example.
```python
import torch
import torch.nn as nn
import torchvision.models as models
import colossalai
from colossalai.engine import Engine
model = models.resnet18()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
lr_scheduler = colossalai.nn.lr_scheduler.CosineAnnealingLR(optimizer, 1000)
schedule = colossalai.engine.schedule.NoPipelineSchedule()
MyEngine = Engine(
model=model,
criterion=criterion,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
schedule=schedule
)
```
More information can be found in the API reference.
## Customize your trainer
### Overview
Before starting to learn how to customize a trainer that meets your needs, you should have a basic understanding of the function of `Trainer`. We recommend you read the *Get Started* section and *Build your engine* first.
The `Trainer` class is meant to let researchers and engineers use our framework more conveniently. Instead of writing their own scripts, they can simply construct a `Trainer` from their own `Engine` by calling `MyTrainer = Trainer(MyEngine)`, and then use the `fit` method to train or evaluate the model. To make our `Trainer` class more powerful, we add some useful features to it, such as monitoring and recording running states and metrics which indicate the model's performance, or saving checkpoints after a training epoch.
To accomplish that, specific actions must be added to the training or evaluation process. The `BaseHook` class allows you to add the desired actions at specific time points. We have already created practical hooks for those useful features. All you need to do is pick the hooks you want.
More detailed class descriptions can be found in API reference.
### Example
```python
hooks = [
dict(type='LogMetricByEpochHook'),
dict(type='LogTimingByEpochHook'),
dict(type='LogMemoryByEpochHook'),
dict(type='AccuracyHook'),
dict(type='LossHook'),
# dict(type='TensorboardHook', log_dir='./tfb_logs'),
# dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
# dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]
```
The hooks above record metrics, elapsed time and memory usage to the log every epoch. They also print loss and accuracy, letting users monitor the performance of the model.
### Hook
You can extend our `BaseHook` class. Hooks can be called at twelve time points. More detailed information can be found in the API reference.
Alternatively, extend `MetricHook` to write a metric collector. You should also apply the decorator `@HOOKS.register_module` to your own hook class, and import it in your main python script.
`after_train_iter()` receives the output of the engine for each iteration, which is a list containing the output, the label and the loss.
Note that you can define the priority to arrange the execution order of all hooks.
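As a minimal sketch (the import paths and the constructor signature below are assumptions; check the API reference for the exact interfaces in your version), a custom hook registered with the decorator could look like this:
```python
from colossalai.registry import HOOKS          # assumed import path for the registry
from colossalai.trainer.hooks import BaseHook  # assumed import path for the base class


@HOOKS.register_module
class PrintLossHook(BaseHook):
    """Prints the loss after every training iteration."""

    def __init__(self, trainer, priority=10):
        # priority controls the execution order among hooks
        super().__init__(trainer, priority)

    def after_train_iter(self, output, label, loss):
        # receives the per-iteration engine output: output, label and loss
        print(f'iteration loss = {loss.item():.4f}')
```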
### Metric
You can write your own metric by extending the `Metric` class. It is always used with `MetricHook`. If you write your own metric hooks, please set the priority carefully and make sure the hook is called before other hooks which may use the results of metrics.
We have already provided some metric hooks. We store metric objects in `runner.states['metrics']`. It is a dictionary, and you can use the name of a metric to access its object.
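For example, assuming a metric has been registered under the name `'Loss'` (the name is an assumption), it can be read back like this:
```python
# access a stored metric object by its name
loss_metric = runner.states['metrics']['Loss']
```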
# Zero Redundancy Optimizer and Zero Offload
The Zero Redundancy Optimizer (ZeRO) removes the memory redundancies across data-parallel processes by partitioning the three model states (optimizer states, gradients, and parameters) across data-parallel processes instead of replicating them. By doing this, it boosts memory efficiency compared to classic data-parallelism while retaining its computational granularity and communication efficiency.
1. **ZeRO Level 1**: The optimizer states (e.g., for [Adam optimizer](https://arxiv.org/abs/1412.6980), 32-bit weights, and the first, and second moment estimates) are partitioned across the processes, so that each process updates only its partition.
2. **ZeRO Level 2**: The reduced 32-bit gradients for updating the model weights are also partitioned such that each process retains only the gradients corresponding to its portion of the optimizer states.
3. **ZeRO Level 3**: The 16-bit model parameters are partitioned across the processes. ZeRO-3 will automatically collect and partition them during the forward and backward passes.
## Getting Started
Once you are training with ColossalAI, enabling ZeRO-3 offload is as simple as enabling it in your ColossalAI configuration! Below are a few examples of ZeRO-3 configurations.
### Example ZeRO-3 Configurations
Here we use ``Adam`` as the initial optimizer.
1. Use ZeRO to partition the optimizer states (level 1), gradients (level 2), and parameters (level 3).
```python
optimizer = dict(
type='Adam',
lr=0.001,
weight_decay=0
)
zero = dict(
type='ZeroRedundancyOptimizer_Level_3',
dynamic_loss_scale=True,
clip_grad=1.0
)
```
2. Additionally offload the optimizer states and computations to the CPU.
```python
zero = dict(
offload_optimizer_config=dict(
device='cpu',
pin_memory=True,
fast_init=True
),
...
)
```
3. Save even more memory by offloading parameters to the CPU memory.
```python
zero = dict(
offload_optimizer_config=dict(
device='cpu',
pin_memory=True,
fast_init=True
),
offload_param_config=dict(
device='cpu',
pin_memory=True,
        max_in_cpu=OFFLOAD_PARAM_MAX_IN_CPU
),
...
)
```
4. Save even MORE memory by offloading to NVMe (if available on your system):
```python
zero = dict(
offload_optimizer_config=dict(
device='nvme',
pin_memory=True,
fast_init=True,
nvme_path='/nvme_data'
),
offload_param_config=dict(
device='nvme',
pin_memory=True,
max_in_cpu=OFFLOAD_PARAM_MAX_IN_CPU,
nvme_path='/nvme_data'
),
...
)
```
Note that ``fp16`` is automatically enabled when using ZeRO.
### Training
Once you complete your configuration, just use `colossalai.initialize()` to initialize your training. All you need to do is write your configuration.
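In other words, after writing a config file that contains one of the `zero` dictionaries above (together with your model, data and optimizer settings), training starts the same way as in the quick demo; a rough sketch:
```python
import colossalai

# colossalai.initialize() parses the config (including the zero dict) and wraps
# the optimizer accordingly; the return values follow the quick demo script
model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize()
```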
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "colossal_cifar_demo.ipynb",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "uhrbvVEh2iJd"
},
"source": [
"# Train an image classifier\n"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vP7LvCpG23a2",
"outputId": "b37f7203-8a02-4736-c527-603f2bb34d7d"
},
"source": [
"!pip install ColossalAI deepspeed"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: ColossalAI in /usr/local/lib/python3.7/dist-packages (0.1)\n",
"Requirement already satisfied: deepspeed in /usr/local/lib/python3.7/dist-packages (0.5.4)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from deepspeed) (21.0)\n",
"Requirement already satisfied: triton in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.1.1)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from deepspeed) (4.62.3)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.19.5)\n",
"Requirement already satisfied: tensorboardX==1.8 in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.8)\n",
"Requirement already satisfied: ninja in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.10.2.2)\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.9.0+cu111)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.7/dist-packages (from deepspeed) (5.4.8)\n",
"Requirement already satisfied: protobuf>=3.2.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX==1.8->deepspeed) (3.17.3)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from tensorboardX==1.8->deepspeed) (1.15.0)\n",
"Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->deepspeed) (2.4.7)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch->deepspeed) (3.7.4.3)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from triton->deepspeed) (3.3.0)\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UVKEurtS4SFS",
"outputId": "99fb6050-5da7-4f27-b4eb-9b3ccf830efb"
},
"source": [
"import colossalai\n",
"from colossalai.engine import Engine, NoPipelineSchedule\n",
"from colossalai.trainer import Trainer\n",
"from colossalai.context import Config\n",
"import torch"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Please install apex to use FP16 Optimizer\n",
"Apex should be installed to use the FP16 optimizer\n",
"apex is required for mixed precision training\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PpFfhNBD7NSn"
},
"source": [
"First, we should initialize distributed environment. Though we just use single GPU in this example, we still need initialize distributed environment for compatibility. We just consider the simplest case here, so we just set the number of parallel processes to 1."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8yF7Lc-K7NAS",
"outputId": "01312349-a8b0-4de4-9103-7d1b48e6cc36"
},
"source": [
"parallel_cfg = Config(dict(parallel=dict(\n",
" data=dict(size=1),\n",
" pipeline=dict(size=1),\n",
" tensor=dict(size=1, mode=None),\n",
")))\n",
"colossalai.init_dist(config=parallel_cfg,\n",
" local_rank=0,\n",
" world_size=1,\n",
" host='127.0.0.1',\n",
" port=8888,\n",
" backend='nccl')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,596 INFO: Added key: store_based_barrier_key:1 to store for rank: 0\n",
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,598 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n",
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,602 INFO: Added key: store_based_barrier_key:2 to store for rank: 0\n",
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,605 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n",
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,608 INFO: Added key: store_based_barrier_key:3 to store for rank: 0\n",
"colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,610 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"process rank 0 is bound to device 0\n",
"initialized seed on rank 0, numpy: 1024, python random: 1024, ParallelMode.DATA: 1024, ParallelMode.TENSOR: 1124,the default parallel seed is ParallelMode.DATA.\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ppjmMxc_81TK"
},
"source": [
"Load and normalize the CIFAR10 training and test datasets using `colossalai.nn.data`. Note that we have wrapped `torchvision.transforms`, so that we can simply use the config dict to use them."
]
},
{
"cell_type": "code",
"metadata": {
"id": "ZyGhyD47-dUY",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "98bbf2d1-a1c4-4bb4-b6df-600777b1e8f5"
},
"source": [
"transform_cfg = [\n",
" dict(type='ToTensor'),\n",
" dict(type='Normalize',\n",
" mean=[0.4914, 0.4822, 0.4465],\n",
" std=[0.2023, 0.1994, 0.2010]),\n",
"]\n",
"\n",
"batch_size = 128\n",
"\n",
"trainset = colossalai.nn.data.CIFAR10Dataset(transform_cfg, root='./data', train=True)\n",
"trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)\n",
"\n",
"testset = colossalai.nn.data.CIFAR10Dataset(transform_cfg, root='./data', train=False)\n",
"testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Files already downloaded and verified\n",
"Files already downloaded and verified\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NvPbfLLR9NzC"
},
"source": [
"We just define a simple Convolutional Neural Network here."
]
},
{
"cell_type": "code",
"metadata": {
"id": "cQ_y7lBG09LS"
},
"source": [
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"\n",
"\n",
"class Net(nn.Module):\n",
" def __init__(self):\n",
" super().__init__()\n",
" self.conv1 = nn.Conv2d(3, 6, 5)\n",
" self.pool = nn.MaxPool2d(2, 2)\n",
" self.conv2 = nn.Conv2d(6, 16, 5)\n",
" self.fc1 = nn.Linear(16 * 5 * 5, 120)\n",
" self.fc2 = nn.Linear(120, 84)\n",
" self.fc3 = nn.Linear(84, 10)\n",
"\n",
" def forward(self, x):\n",
" x = self.pool(F.relu(self.conv1(x)))\n",
" x = self.pool(F.relu(self.conv2(x)))\n",
" x = torch.flatten(x, 1) # flatten all dimensions except batch\n",
" x = F.relu(self.fc1(x))\n",
" x = F.relu(self.fc2(x))\n",
" x = self.fc3(x)\n",
" return x\n",
"\n",
"\n",
"model = Net().cuda()"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "tgsszAmM9dYZ"
},
"source": [
"Define a Loss function and optimizer. And then we use them to initialize `Engine` and `Trainer`. We provide various training / evaluating hooks. In this case, we just use the simplest hooks which can compute and print loss and accuracy."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YtaDoCax1BCf",
"outputId": "b33b1641-03d8-4597-c8c2-1a4c1d61e9b0"
},
"source": [
"import torch.optim as optim\n",
"\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)\n",
"schedule = NoPipelineSchedule()\n",
"engine = Engine(\n",
" model=model,\n",
" criterion=criterion,\n",
" optimizer=optimizer,\n",
" lr_scheduler=None,\n",
" schedule=schedule\n",
" )\n",
"trainer = Trainer(engine=engine,\n",
" hooks_cfg=[dict(type='LossHook'), dict(type='LogMetricByEpochHook'), dict(type='AccuracyHook')],\n",
" verbose=True)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"colossalai - rank_0 - 2021-10-15 03:27:56,018 WARNING: No gradient handler is set up, please make sure you do not need to all-reduce the gradients after a training step.\n",
"colossalai - rank_0 - 2021-10-15 03:27:56,024 INFO: build LogMetricByEpochHook for train, priority = 1\n",
"colossalai - rank_0 - 2021-10-15 03:27:56,026 INFO: build LossHook for train, priority = 10\n",
"colossalai - rank_0 - 2021-10-15 03:27:56,029 INFO: build AccuracyHook for train, priority = 10\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_JR2TuvH99Ik"
},
"source": [
"Then we set training configs. We train our model for 10 epochs and it will be evaluated every 1 epoch. Set `display_progress` to `True` to display the training / evaluating progress bar."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "w-J3IP-J1sfx",
"outputId": "bdb76939-04f1-4124-ce5e-3af44c0d902c"
},
"source": [
"num_epochs = 10\n",
"test_interval = 1\n",
"trainer.fit(\n",
" train_dataloader=trainloader,\n",
" test_dataloader=testloader,\n",
" max_epochs=num_epochs,\n",
" display_progress=True,\n",
" test_interval=test_interval\n",
" )"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[Epoch 0 train]: 0%| | 0/391 [00:00<?, ?it/s]/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:718: UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at /pytorch/c10/core/TensorImpl.h:1156.)\n",
" return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)\n",
"[Epoch 0 train]: 100%|██████████| 391/391 [00:14<00:00, 26.82it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:11,088 INFO: Training - Epoch 1 - LogMetricByEpochHook: Loss = 2.29158\n",
"[Epoch 0 val]: 100%|██████████| 79/79 [00:02<00:00, 28.66it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:14,040 INFO: Testing - Epoch 1 - LogMetricByEpochHook: Loss = 2.26517, Accuracy = 0.14820\n",
"[Epoch 1 train]: 100%|██████████| 391/391 [00:14<00:00, 26.31it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:29,059 INFO: Training - Epoch 2 - LogMetricByEpochHook: Loss = 2.15763\n",
"[Epoch 1 val]: 100%|██████████| 79/79 [00:02<00:00, 28.50it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:32,007 INFO: Testing - Epoch 2 - LogMetricByEpochHook: Loss = 2.00450, Accuracy = 0.27850\n",
"[Epoch 2 train]: 100%|██████████| 391/391 [00:14<00:00, 26.08it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:47,167 INFO: Training - Epoch 3 - LogMetricByEpochHook: Loss = 1.85409\n",
"[Epoch 2 val]: 100%|██████████| 79/79 [00:02<00:00, 27.89it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:28:50,168 INFO: Testing - Epoch 3 - LogMetricByEpochHook: Loss = 1.73788, Accuracy = 0.35990\n",
"[Epoch 3 train]: 100%|██████████| 391/391 [00:14<00:00, 26.09it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:05,330 INFO: Training - Epoch 4 - LogMetricByEpochHook: Loss = 1.69363\n",
"[Epoch 3 val]: 100%|██████████| 79/79 [00:02<00:00, 28.43it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:08,290 INFO: Testing - Epoch 4 - LogMetricByEpochHook: Loss = 1.65005, Accuracy = 0.39350\n",
"[Epoch 4 train]: 100%|██████████| 391/391 [00:15<00:00, 25.97it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:23,530 INFO: Training - Epoch 5 - LogMetricByEpochHook: Loss = 1.61387\n",
"[Epoch 4 val]: 100%|██████████| 79/79 [00:02<00:00, 27.75it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:26,515 INFO: Testing - Epoch 5 - LogMetricByEpochHook: Loss = 1.57507, Accuracy = 0.42430\n",
"[Epoch 5 train]: 100%|██████████| 391/391 [00:15<00:00, 25.92it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:41,764 INFO: Training - Epoch 6 - LogMetricByEpochHook: Loss = 1.55712\n",
"[Epoch 5 val]: 100%|██████████| 79/79 [00:02<00:00, 27.51it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:44,778 INFO: Testing - Epoch 6 - LogMetricByEpochHook: Loss = 1.53242, Accuracy = 0.43700\n",
"[Epoch 6 train]: 100%|██████████| 391/391 [00:14<00:00, 26.13it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:29:59,927 INFO: Training - Epoch 7 - LogMetricByEpochHook: Loss = 1.51618\n",
"[Epoch 6 val]: 100%|██████████| 79/79 [00:02<00:00, 28.31it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:02,884 INFO: Testing - Epoch 7 - LogMetricByEpochHook: Loss = 1.49720, Accuracy = 0.45430\n",
"[Epoch 7 train]: 100%|██████████| 391/391 [00:14<00:00, 26.23it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:17,968 INFO: Training - Epoch 8 - LogMetricByEpochHook: Loss = 1.47857\n",
"[Epoch 7 val]: 100%|██████████| 79/79 [00:02<00:00, 27.97it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:20,967 INFO: Testing - Epoch 8 - LogMetricByEpochHook: Loss = 1.45808, Accuracy = 0.46320\n",
"[Epoch 8 train]: 100%|██████████| 391/391 [00:14<00:00, 26.11it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:36,129 INFO: Training - Epoch 9 - LogMetricByEpochHook: Loss = 1.44656\n",
"[Epoch 8 val]: 100%|██████████| 79/79 [00:02<00:00, 28.18it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:39,096 INFO: Testing - Epoch 9 - LogMetricByEpochHook: Loss = 1.44903, Accuracy = 0.46580\n",
"[Epoch 9 train]: 100%|██████████| 391/391 [00:15<00:00, 25.97it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:54,342 INFO: Training - Epoch 10 - LogMetricByEpochHook: Loss = 1.41120\n",
"[Epoch 9 val]: 100%|██████████| 79/79 [00:02<00:00, 28.05it/s]\n",
"colossalai - rank_0 - 2021-10-15 03:30:57,332 INFO: Testing - Epoch 10 - LogMetricByEpochHook: Loss = 1.41242, Accuracy = 0.48500\n"
]
}
]
}
]
}
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import colossalai
from colossalai.core import global_context as gpc
from colossalai.engine import Engine
from colossalai.logging import get_global_dist_logger
from colossalai.trainer import Trainer
def run_trainer():
model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize()
logger = get_global_dist_logger()
schedule.data_sync = False
engine = Engine(
model=model,
criterion=criterion,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
schedule=schedule
)
logger.info("engine is built", ranks=[0])
trainer = Trainer(engine=engine,
hooks_cfg=gpc.config.hooks,
verbose=True)
logger.info("trainer is built", ranks=[0])
logger.info("start training", ranks=[0])
trainer.fit(
train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
max_epochs=gpc.config.num_epochs,
display_progress=True,
test_interval=2
)
if __name__ == '__main__':
run_trainer()
from .vit import *
from .mlp_mixer import *
from .parallel_3d import *