Commit 7bc5a8e3 authored by zhuwenwen (parents e6748d82 0f785cb1)
from .accelerator import Accelerator
from .booster import Booster
from .plugin import Plugin
import torch
import torch.nn as nn
__all__ = ['Accelerator']
_supported_devices = [
'cpu',
'cuda',
# To be supported
# 'xpu',
# 'npu',
# 'tpu',
]
class Accelerator:
"""
Accelerator is an abstraction for the hardware device that is used to run the model.
Args:
device (str): The device to be used. Currently only 'cpu' and 'cuda' are supported.
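Example (an illustrative sketch, assuming a CUDA device is available):
>>> accelerator = Accelerator('cuda')
>>> accelerator.bind()
>>> model = accelerator.configure_model(model)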
"""
def __init__(self, device: str):
self.device = device
assert self.device in _supported_devices, f"Device {self.device} is not supported yet, supported devices include {_supported_devices}"
def bind(self):
"""
Set the default device for the current process.
"""
if self.device == 'cpu':
pass
elif self.device == 'cuda':
# TODO(FrankLeeeee): use global environment to check if it is a dist job
# if is_distributed:
# local_rank = EnvTable().get_local_rank()
# torch.cuda.set_device(torch.device(f'cuda:{local_rank}'))
torch.cuda.set_device(torch.device('cuda'))
else:
raise ValueError(f"Device {self.device} is not supported yet")
def configure_model(self, model: nn.Module) -> nn.Module:
"""
Move the model to the device.
Args:
model (nn.Module): The model to be moved.
"""
model = model.to(torch.device(self.device))
return model
import warnings
from contextlib import contextmanager
from typing import Callable, Iterator, List, Optional, Tuple, Union
import torch
import torch.nn as nn
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
from torch.utils.data import DataLoader
from colossalai.checkpoint_io import GeneralCheckpointIO
from .accelerator import Accelerator
from .mixed_precision import MixedPrecision, mixed_precision_factory
from .plugin import Plugin
__all__ = ['Booster']
class Booster:
"""
Booster is a high-level API for training neural networks. It provides a unified interface for
training with different precision, accelerator, and plugin.
Examples:
>>> colossalai.launch(...)
>>> plugin = GeminiPlugin(...)
>>> booster = Booster(mixed_precision='fp16', plugin=plugin)
>>>
>>> model = GPT2()
>>> optimizer = Adam(model.parameters())
>>> dataloader = DataLoader(dataset)
>>> lr_scheduler = LinearWarmupScheduler()
>>> criterion = GPTLMLoss()
>>>
>>> model, optimizer, criterion, dataloader, lr_scheduler = booster.boost(model, optimizer, criterion, dataloader, lr_scheduler)
>>>
>>> for epoch in range(max_epochs):
>>> for input_ids, attention_mask in dataloader:
>>> outputs = model(input_ids, attention_mask)
>>> loss = criterion(outputs.logits, input_ids)
>>> booster.backward(loss, optimizer)
>>> optimizer.step()
>>> lr_scheduler.step()
>>> optimizer.zero_grad()
Args:
device (str): The device to run the training. Default: 'cuda'.
mixed_precision (str or MixedPrecision): The mixed precision to run the training. Default: None.
If the argument is a string, it can be 'fp16', 'fp16_apex', 'bf16', or 'fp8'.
'fp16' uses PyTorch AMP, while 'fp16_apex' uses Nvidia Apex.
plugin (Plugin): The plugin to run the training. Default: None.
"""
def __init__(self,
device: str = 'cuda',
mixed_precision: Optional[Union[MixedPrecision, str]] = None,
plugin: Optional[Plugin] = None) -> None:
if plugin is not None:
assert isinstance(
plugin, Plugin), f'Expected the argument plugin to be an instance of Plugin, but got {type(plugin)}.'
self.plugin = plugin
# set accelerator
if self.plugin and self.plugin.control_device():
self.accelerator = None
warnings.warn('The plugin will control the accelerator, so the device argument will be ignored.')
else:
self.accelerator = Accelerator(device)
# set precision
if self.plugin and self.plugin.control_precision():
warnings.warn('The plugin will control the precision, so the mixed_precision argument will be ignored.')
self.mixed_precision = None
elif mixed_precision is None:
self.mixed_precision = None
else:
# validate and set precision
if isinstance(mixed_precision, str):
# the user will take the default arguments for amp training
self.mixed_precision = mixed_precision_factory(mixed_precision)
elif isinstance(mixed_precision, MixedPrecision):
# the user can customize the arguments by passing the precision object
self.mixed_precision = mixed_precision
else:
raise ValueError(
f'Expected the argument mixed_precision to be a string or an instance of MixedPrecision, but got {type(mixed_precision)}.'
)
if self.plugin is not None and self.plugin.control_checkpoint_io():
self.checkpoint_io = self.plugin.get_checkpoint_io()
else:
self.checkpoint_io = GeneralCheckpointIO()
def boost(
self,
model: nn.Module,
optimizer: Optimizer,
criterion: Callable = None,
dataloader: DataLoader = None,
lr_scheduler: LRScheduler = None,
) -> Tuple[Union[nn.Module, Optimizer, LRScheduler, DataLoader], ...]:
"""
Boost the model, optimizer, criterion, lr_scheduler, and dataloader.
Args:
model (nn.Module): The model to be boosted.
optimizer (Optimizer): The optimizer to be boosted.
criterion (Callable): The criterion to be boosted.
dataloader (DataLoader): The dataloader to be boosted.
lr_scheduler (LRScheduler): The lr_scheduler to be boosted.
"""
# TODO(FrankLeeeee): consider multi-model and multi-optimizer case
# TODO(FrankLeeeee): consider multi-dataloader case
# transform model for mixed precision
if self.plugin:
model, optimizer, criterion, dataloader, lr_scheduler = self.plugin.configure(
model, optimizer, criterion, dataloader, lr_scheduler)
if self.plugin is None or not self.plugin.control_device():
# transform model for accelerator
model = self.accelerator.configure_model(model)
if self.mixed_precision and (self.plugin is None or not self.plugin.control_precision()):
# transform model for mixed precision
# when mixed_precision is specified and the plugin is not given or does not control the precision
model, optimizer, criterion = self.mixed_precision.configure(model, optimizer, criterion)
return model, optimizer, criterion, dataloader, lr_scheduler
def backward(self, loss: torch.Tensor, optimizer: Optimizer) -> None:
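"""
Run a backward pass through the optimizer. The optimizer is expected to be the wrapped
optimizer returned by `boost`, which exposes a `backward` method.
"""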
# TODO: implement this method with plugin
optimizer.backward(loss)
def execute_pipeline(self,
data_iter: Iterator,
model: nn.Module,
criterion: Callable[[torch.Tensor], torch.Tensor],
optimizer: Optimizer,
return_loss: bool = True,
return_outputs: bool = False) -> Tuple[Optional[torch.Tensor], ...]:
# TODO: implement this method
# run pipeline forward backward pass
# return loss or outputs if needed
pass
def no_sync(self, model: nn.Module) -> contextmanager:
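"""
Return a context manager that disables gradient synchronization, e.g. for gradient
accumulation. Only available when the given plugin supports no_sync.
"""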
assert self.plugin is not None, 'no_sync is only enabled when a plugin is provided and the plugin supports no_sync.'
assert self.plugin.support_no_sync(), f'The plugin {self.plugin.__class__.__name__} does not support no_sync.'
return self.plugin.no_sync(model)
def load_model(self, model: nn.Module, checkpoint: str, strict: bool = True):
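"""
Load the model weights from the given checkpoint path via the configured checkpoint IO.
"""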
self.checkpoint_io.load_model(model, checkpoint, strict)
def save_model(self,
model: nn.Module,
checkpoint: str,
prefix: Optional[str] = None,
shard: bool = False,
size_per_shard: int = 1024):
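"""
Save the model to the given checkpoint path, optionally as a sharded checkpoint, via the
configured checkpoint IO.
"""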
self.checkpoint_io.save_model(model, checkpoint, shard=shard, variant=prefix, size_per_shard=size_per_shard)
def load_optimizer(self, optimizer: Optimizer, checkpoint: str):
self.checkpoint_io.load_optimizer(optimizer, checkpoint)
def save_optimizer(self, optimizer: Optimizer, checkpoint: str, shard: bool = False, size_per_shard: int = 1024):
self.checkpoint_io.save_optimizer(optimizer, checkpoint, shard=shard, size_per_shard=size_per_shard)
def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
self.checkpoint_io.save_lr_scheduler(lr_scheduler, checkpoint)
def load_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
self.checkpoint_io.load_lr_scheduler(lr_scheduler, checkpoint)
from .bf16 import BF16MixedPrecision
from .fp8 import FP8MixedPrecision
from .fp16_apex import FP16ApexMixedPrecision
from .fp16_torch import FP16TorchMixedPrecision
from .mixed_precision_base import MixedPrecision
__all__ = [
'MixedPrecision', 'mixed_precision_factory', 'FP16ApexMixedPrecision', 'FP16TorchMixedPrecision',
'BF16MixedPrecision', 'FP8MixedPrecision'
]
_mixed_precision_mapping = {
'fp16': FP16TorchMixedPrecision,
'fp16_apex': FP16ApexMixedPrecision,
'bf16': BF16MixedPrecision,
'fp8': FP8MixedPrecision
}
def mixed_precision_factory(mixed_precision_type: str) -> MixedPrecision:
"""
Factory method to create mixed precision object
Args:
mixed_precision_type (str): mixed precision type; one of 'fp16', 'fp16_apex', 'bf16', or 'fp8'.
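Example (illustrative):
>>> mixed_precision = mixed_precision_factory('fp16')
>>> isinstance(mixed_precision, FP16TorchMixedPrecision)
True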
"""
if mixed_precision_type in _mixed_precision_mapping:
return _mixed_precision_mapping[mixed_precision_type]()
else:
raise ValueError(
f'Mixed precision type {mixed_precision_type} is not supported, support types include {list(_mixed_precision_mapping.keys())}'
)
from .mixed_precision_base import MixedPrecision
class BF16MixedPrecision(MixedPrecision):
pass
from .mixed_precision_base import MixedPrecision
class FP16ApexMixedPrecision(MixedPrecision):
pass
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
import torch.nn as nn
from torch import Tensor
from torch.optim import Optimizer
from colossalai.interface import ModelWrapper, OptimizerWrapper
from .mixed_precision_base import MixedPrecision
__all__ = ['FP16TorchMixedPrecision', 'TorchAMPOptimizer', 'TorchAMPModule']
class TorchAMPOptimizer(OptimizerWrapper):
"""
Optimizer wrapper for mixed precision training in FP16 using PyTorch AMP.
Args:
optim (Optimizer): Optimizer to wrap.
init_scale (float): Initial scale factor. Default: 2**16.
growth_factor (float): Factor by which the scale is multiplied during
:meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be finite
this iteration. Default: 2.0.
backoff_factor (float): Factor by which the scale is multiplied during
:meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be infinite
this iteration. Default: 0.5.
growth_interval (int): Number of iterations between :meth:`torch.cuda.amp.GradScaler.step`
calls that may cause the scale to increase. Default: 2000.
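Example (an illustrative sketch; `model` and `loss` are assumed to be defined):
>>> optimizer = TorchAMPOptimizer(Adam(model.parameters()))
>>> optimizer.backward(loss)    # scales the loss before running backward
>>> optimizer.step()    # steps the optimizer via the grad scaler and updates the scale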
"""
def __init__(self,
optim: Optimizer,
init_scale: float = 2.**16,
growth_factor: float = 2.0,
backoff_factor: float = 0.5,
growth_interval: int = 2000) -> None:
super().__init__(optim)
self.scaler = torch.cuda.amp.GradScaler(init_scale=init_scale,
growth_factor=growth_factor,
backoff_factor=backoff_factor,
growth_interval=growth_interval)
def backward(self, loss: Tensor, *args, **kwargs) -> None:
scaled_loss = self.scale_loss(loss)
scaled_loss.backward(*args, **kwargs)
def step(self, *args, **kwargs) -> Optional[float]:
out = self.scaler.step(self.optim, *args, **kwargs)
self.scaler.update()
return out
def scale_loss(self, loss: Tensor) -> Tensor:
return self.scaler.scale(loss)
def unscale_grad(self) -> None:
self.scaler.unscale_(self.optim)
def clip_grad_by_value(self, clip_value: float, *args, **kwargs) -> None:
self.unscale_grad()
super().clip_grad_by_value(clip_value, *args, **kwargs)
def clip_grad_by_norm(self,
max_norm: Union[float, int],
norm_type: Union[float, int] = 2.0,
error_if_nonfinite: bool = False,
*args,
**kwargs) -> None:
self.unscale_grad()
super().clip_grad_by_norm(max_norm, norm_type, error_if_nonfinite, *args, **kwargs)
class TorchAMPModule(ModelWrapper):
"""
Module wrapper for mixed precision training in FP16 using PyTorch AMP.
Args:
module (nn.Module): Module to wrap.
"""
def __init__(self, module: nn.Module):
super().__init__(module)
def forward(self, *args, **kwargs):
with torch.cuda.amp.autocast():
return self.module(*args, **kwargs)
class FP16TorchMixedPrecision(MixedPrecision):
"""
Precision for mixed precision training in FP16 using PyTorch AMP.
Args:
init_scale (float): Initial scale factor. Default: 2**16.
growth_factor (float): Factor by which the scale is multiplied during
:meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be finite
this iteration. Default: 2.0.
backoff_factor (float): Factor by which the scale is multiplied during
:meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be infinite
this iteration. Default: 0.5.
growth_interval (int): Number of iterations between :meth:`torch.cuda.amp.GradScaler.step`
calls that may cause the scale to increase. Default: 2000.
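Example (an illustrative sketch):
>>> mixed_precision = FP16TorchMixedPrecision()
>>> model, optimizer, criterion = mixed_precision.configure(model, optimizer, criterion)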
"""
def __init__(self,
init_scale: float = 2.**16,
growth_factor: float = 2.0,
backoff_factor: float = 0.5,
growth_interval: int = 2000) -> None:
super().__init__()
self.torch_amp_kwargs = dict(init_scale=init_scale,
growth_factor=growth_factor,
backoff_factor=backoff_factor,
growth_interval=growth_interval)
def configure(self,
model: nn.Module,
optimizer: Optimizer,
criterion: Callable = None) -> Tuple[nn.Module, OptimizerWrapper, Callable]:
model = TorchAMPModule(model)
optimizer = TorchAMPOptimizer(optimizer, **self.torch_amp_kwargs)
if criterion is not None:
criterion = TorchAMPModule(criterion)
return model, optimizer, criterion
from .mixed_precision_base import MixedPrecision
class FP8MixedPrecision(MixedPrecision):
pass
from abc import ABC, abstractmethod
from typing import Callable, Tuple
import torch.nn as nn
from torch.optim import Optimizer
from colossalai.interface import OptimizerWrapper
class MixedPrecision(ABC):
"""
An abstract class for mixed precision training.
"""
@abstractmethod
def configure(self,
model: nn.Module,
optimizer: Optimizer,
criterion: Callable = None) -> Tuple[nn.Module, OptimizerWrapper, Callable]:
# TODO: implement this method
pass
from .gemini_plugin import GeminiPlugin
from .low_level_zero_plugin import LowLevelZeroPlugin
from .plugin_base import Plugin
from .torch_ddp_plugin import TorchDDPPlugin
__all__ = ['Plugin', 'TorchDDPPlugin', 'GeminiPlugin', 'LowLevelZeroPlugin']
import random
import warnings
from typing import Callable, List, Optional, Tuple, Union
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
from torch import Tensor
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from colossalai.checkpoint_io import CheckpointIO, GeneralCheckpointIO
from colossalai.checkpoint_io.utils import save_state_dict
from colossalai.cluster import DistCoordinator
from colossalai.interface import ModelWrapper, OptimizerWrapper
from colossalai.utils import get_current_device
from colossalai.zero import GeminiDDP, zero_model_wrapper, zero_optim_wrapper
from colossalai.zero.gemini.memory_tracer import MemStats
from .plugin_base import Plugin
__all__ = ['GeminiPlugin']
class GeminiCheckpointIO(GeneralCheckpointIO):
def __init__(self) -> None:
super().__init__()
self.coordinator = DistCoordinator()
def load_unsharded_model(self, model: GeminiDDP, checkpoint: str, strict: bool = True):
"""
Load model from checkpoint with automatic unwrapping.
"""
# the model should be unwrapped in self.load_model via ModelWrapper.unwrap
return super().load_unsharded_model(model, checkpoint, strict=strict)
def save_unsharded_model(self, model: GeminiDDP, checkpoint: str, gather_dtensor: bool, use_safetensors: bool):
"""
Save model to checkpoint but only on master process.
"""
# the model should be unwrapped in self.save_model via ModelWrapper.unwrap
# as there is communication when get state dict, this must be called on all processes
state_dict = model.state_dict(only_rank_0=True)
if self.coordinator.is_master():
save_state_dict(state_dict, checkpoint, use_safetensors)
def save_unsharded_optimizer(self, optimizer: Optimizer, checkpoint: str, gather_dtensor: bool):
"""
Save optimizer to checkpoint but only on master process.
"""
# TODO(ver217): optimizer state dict is sharded
super().save_unsharded_optimizer(optimizer, checkpoint, gather_dtensor)
def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
"""
Save lr scheduler to checkpoint but only on master process.
"""
if self.coordinator.is_master():
super().save_lr_scheduler(lr_scheduler, checkpoint)
class GeminiModel(ModelWrapper):
def __init__(self, module: nn.Module, gemini_config: dict, verbose: bool = False) -> None:
super().__init__(module)
self.module = zero_model_wrapper(module, zero_stage=3, gemini_config=gemini_config, verbose=verbose)
def unwrap(self):
# as save/load state dict is coupled with the GeminiDDP, we only return GeminiDDP model
return self.module
class GeminiOptimizer(OptimizerWrapper):
def __init__(self,
module: GeminiDDP,
optimizer: Optimizer,
zero_optim_config: dict,
optim_kwargs: dict,
verbose: bool = False) -> None:
optimizer = zero_optim_wrapper(module,
optimizer,
optim_config=zero_optim_config,
**optim_kwargs,
verbose=verbose)
super().__init__(optimizer)
def backward(self, loss: Tensor, *args, **kwargs):
self.optim.backward(loss)
def clip_grad_by_norm(self,
max_norm: Union[float, int],
norm_type: Union[float, int] = 2,
error_if_nonfinite: bool = False,
*args,
**kwargs) -> None:
warnings.warn('Gemini controls grad clipping by itself, so you should not use clip_grad_by_norm')
def clip_grad_by_value(self, clip_value: float, *args, **kwargs) -> None:
raise NotImplementedError('Gemini does not support clip_grad_by_value')
class GeminiPlugin(Plugin):
"""
Plugin for Gemini.
Example:
>>> from colossalai.booster import Booster
>>> from colossalai.booster.plugin import GeminiPlugin
>>>
>>> model, train_dataset, optimizer, criterion = ...
>>> plugin = GeminiPlugin()
>>> train_dataloader = plugin.prepare_train_dataloader(train_dataset, batch_size=8)
>>> booster = Booster(plugin=plugin)
>>> model, optimizer, criterion, train_dataloader, _ = booster.boost(model, optimizer, criterion, train_dataloader)
Args:
device (torch.device): device to place the model.
placement_policy (str, optional): "cpu", "cuda", "auto". Defaults to "cpu".
pin_memory (bool, optional): use pin memory on CPU. Defaults to False.
force_outputs_fp32 (bool, optional): force outputs are fp32. Defaults to False.
strict_ddp_mode (bool, optional): use strict ddp mode (only use dp without other parallelism). Defaults to False.
search_range_mb (int, optional): chunk size search range in megabytes (MB). Defaults to 32.
hidden_dim (int, optional): the hidden dimension of DNN.
Users can provide this argument to speed up searching.
If users do not know this argument before training, it is ok. We will use a default value 1024.
min_chunk_size_mb (float, optional): the minimum chunk size in megabytes (MB).
If the aggregate size of parameters is still smaller than the minimum chunk size,
all parameters will be compacted into one small chunk.
memstats (MemStats, optional): the memory statistics collected by a runtime memory tracer.
gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
which will be used when using hybrid CPU optimizer.
This argument is meaningless when `placement_policy` of `GeminiManager` is not "auto".
Defaults to 0.0.
initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32.
min_scale (float, optional): Min scale used by DynamicGradScaler. Defaults to 1.
growth_factor (float, optional): growth_factor used by DynamicGradScaler. Defaults to 2.
backoff_factor (float, optional): backoff_factor used by DynamicGradScaler. Defaults to 0.5.
growth_interval (int, optional): growth_interval used by DynamicGradScaler. Defaults to 1000.
hysteresis (int, optional): hysteresis used by DynamicGradScaler. Defaults to 2.
max_scale (float, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
max_norm (float, optional): max_norm used for `clip_grad_norm`. Note that you should not call
clip_grad_norm yourself when using ZeRO DDP; the ZeRO optimizer handles gradient clipping.
norm_type (float, optional): norm_type used for `clip_grad_norm`.
verbose (bool, optional): verbose mode. Debug info including chunk search result will be printed. Defaults to False.
"""
def __init__(
self,
device: Optional[torch.device] = None,
placement_policy: str = "cpu",
pin_memory: bool = False,
force_outputs_fp32: bool = False,
strict_ddp_mode: bool = False,
search_range_mb: int = 32,
hidden_dim: Optional[int] = None,
min_chunk_size_mb: float = 32,
memstats: Optional[MemStats] = None,
gpu_margin_mem_ratio: float = 0.0,
initial_scale: float = 2**32,
min_scale: float = 1,
growth_factor: float = 2,
backoff_factor: float = 0.5,
growth_interval: int = 1000,
hysteresis: int = 2,
max_scale: float = 2**32,
max_norm: float = 0.0,
norm_type: float = 2.0,
verbose: bool = False,
) -> None:
assert dist.is_initialized(
), 'torch.distributed is not initialized, please use colossalai.launch to create the distributed environment'
self.rank = dist.get_rank()
self.world_size = dist.get_world_size()
self.gemini_config = dict(
device=(device or get_current_device()),
placement_policy=placement_policy,
pin_memory=pin_memory,
force_outputs_fp32=force_outputs_fp32,
strict_ddp_mode=strict_ddp_mode,
search_range_mb=search_range_mb,
hidden_dim=hidden_dim,
min_chunk_size_mb=min_chunk_size_mb,
memstats=memstats,
)
self.zero_optim_config = dict(gpu_margin_mem_ratio=gpu_margin_mem_ratio,)
self.optim_kwargs = dict(initial_scale=initial_scale,
growth_factor=growth_factor,
backoff_factor=backoff_factor,
growth_interval=growth_interval,
hysteresis=hysteresis,
min_scale=min_scale,
max_scale=max_scale,
max_norm=max_norm,
norm_type=norm_type)
self.verbose = verbose
def support_no_sync(self) -> bool:
return False
def control_precision(self) -> bool:
return True
def supported_precisions(self) -> List[str]:
return ['fp16']
def control_device(self) -> bool:
return True
def supported_devices(self) -> List[str]:
return ['cuda']
def prepare_train_dataloader(self,
dataset,
batch_size,
shuffle=False,
seed=1024,
drop_last=False,
pin_memory=False,
num_workers=0,
**kwargs):
r"""
Prepare a dataloader for distributed training. The dataloader will be wrapped by
`torch.utils.data.DataLoader` and `torch.utils.data.DistributedSampler`.
Note:
1. Evaluation datasets should not be passed to this function.
Args:
dataset (`torch.utils.data.Dataset`): The dataset to be loaded.
batch_size (int): The number of samples per batch.
shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
seed (int, optional): Random worker seed for sampling, defaults to 1024.
drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
is not divisible by the batch size. If False and the size of dataset is not divisible by
the batch size, then the last batch will be smaller, defaults to False.
pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
`DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.
Returns:
:class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
"""
_kwargs = kwargs.copy()
sampler = DistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, shuffle=shuffle)
# Deterministic dataloader
def seed_worker(worker_id):
worker_seed = seed
np.random.seed(worker_seed)
torch.manual_seed(worker_seed)
random.seed(worker_seed)
return DataLoader(dataset,
batch_size=batch_size,
sampler=sampler,
worker_init_fn=seed_worker,
drop_last=drop_last,
pin_memory=pin_memory,
num_workers=num_workers,
**_kwargs)
def configure(
self,
model: nn.Module,
optimizer: Optimizer,
criterion: Callable = None,
dataloader: DataLoader = None,
lr_scheduler: LRScheduler = None,
) -> Tuple[Union[nn.Module, OptimizerWrapper, LRScheduler, DataLoader], ...]:
if not isinstance(model, ModelWrapper):
# convert model to sync bn
# FIXME(ver217): gemini does not support sync bn
# In torch/nn/modules/_functions.py, line 22, ``mean, invstd = torch.batch_norm_stats(input, eps)`` will get fp32 mean and invstd even though the input is fp16.
# This inconsistency of dtype will cause the error.
# We have two possible solutions:
# 1. keep batch norm always in fp32. This is hard for gemini, as it uses chunks.
# 2. patch sync bn or write a new one. This is relatively easy, but we need to test it.
# model = nn.SyncBatchNorm.convert_sync_batchnorm(model, None)
# wrap the model with Gemini
model = GeminiModel(model, self.gemini_config, self.verbose)
if not isinstance(optimizer, OptimizerWrapper):
optimizer = GeminiOptimizer(model.unwrap(), optimizer, self.zero_optim_config, self.optim_kwargs,
self.verbose)
return model, optimizer, criterion, dataloader, lr_scheduler
def control_checkpoint_io(self) -> bool:
return True
def get_checkpoint_io(self) -> CheckpointIO:
return GeminiCheckpointIO()
import random
import warnings
from typing import Callable, List, Optional, Tuple, Union
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
from torch import Tensor
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
from torch.utils._pytree import tree_map
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from colossalai.checkpoint_io import CheckpointIO
from colossalai.interface import ModelWrapper, OptimizerWrapper
from colossalai.utils import get_current_device
from colossalai.zero import zero_model_wrapper, zero_optim_wrapper
from .plugin_base import Plugin
from .torch_ddp_plugin import TorchDDPCheckpointIO
__all__ = ['LowLevelZeroPlugin']
def _convert_to_fp16(x):
if isinstance(x, torch.Tensor) and torch.is_floating_point(x):
return x.half()
return x
class LowLevelZeroCheckpointIO(TorchDDPCheckpointIO):
def save_unsharded_optimizer(self, optimizer: Optimizer, checkpoint: str, gather_dtensor: bool):
"""
Save optimizer to checkpoint but only on master process.
"""
# TODO(ver217): optimizer state dict is sharded
super().save_unsharded_optimizer(optimizer, checkpoint, gather_dtensor)
class LowLevelZeroModel(ModelWrapper):
def __init__(self, module: nn.Module, stage: int, precision: str) -> None:
super().__init__(module)
self.convert_inputs = (precision == 'fp16')
module = zero_model_wrapper(module, zero_stage=stage)
if precision == 'fp16':
module = module.half()
module = module.to(get_current_device())
self.module = module
def forward(self, *args, **kwargs):
if self.convert_inputs:
args = tree_map(_convert_to_fp16, args)
kwargs = tree_map(_convert_to_fp16, kwargs)
return super().forward(*args, **kwargs)
class LowLevelZeroOptimizer(OptimizerWrapper):
def __init__(self,
module: nn.Module,
optimizer: Optimizer,
zero_optim_config: dict,
optim_kwargs: dict,
verbose: bool = False) -> None:
optimizer = zero_optim_wrapper(module,
optimizer,
optim_config=zero_optim_config,
**optim_kwargs,
verbose=verbose)
super().__init__(optimizer)
def backward(self, loss: Tensor, *args, **kwargs):
self.optim.backward(loss)
def clip_grad_by_norm(self,
max_norm: Union[float, int],
norm_type: Union[float, int] = 2,
error_if_nonfinite: bool = False,
*args,
**kwargs) -> None:
warnings.warn('LowLevelZero controls grad clipping by itself, so you should not use clip_grad_by_norm')
def clip_grad_by_value(self, clip_value: float, *args, **kwargs) -> None:
raise NotImplementedError('LowLevelZero does not support clip_grad_by_value')
class LowLevelZeroPlugin(Plugin):
"""
Plugin for low level zero.
Example:
>>> from colossalai.booster import Booster
>>> from colossalai.booster.plugin import LowLevelZeroPlugin
>>>
>>> model, train_dataset, optimizer, criterion = ...
>>> plugin = LowLevelZeroPlugin()
>>> train_dataloader = plugin.prepare_train_dataloader(train_dataset, batch_size=8)
>>> booster = Booster(plugin=plugin)
>>> model, optimizer, criterion, train_dataloader, _ = booster.boost(model, optimizer, criterion, train_dataloader)
Args:
stage (int, optional): ZeRO stage. Defaults to 1.
precision (str, optional): precision. Support 'fp16' and 'fp32'. Defaults to 'fp16'.
initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32.
min_scale (float, optional): Min scale used by DynamicGradScaler. Defaults to 1.
growth_factor (float, optional): growth_factor used by DynamicGradScaler. Defaults to 2.
backoff_factor (float, optional): backoff_factor used by DynamicGradScaler. Defaults to 0.5.
growth_interval (int, optional): growth_interval used by DynamicGradScaler. Defaults to 1000.
hysteresis (int, optional): hysteresis used by DynamicGradScaler. Defaults to 2.
max_scale (float, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
max_norm (float, optional): max_norm used for `clip_grad_norm`. Note that you should not call
clip_grad_norm yourself when using ZeRO DDP; the ZeRO optimizer handles gradient clipping.
norm_type (float, optional): norm_type used for `clip_grad_norm`.
reduce_bucket_size_in_m (int, optional): gradient reduce bucket size in MB. Defaults to 12.
communication_dtype (torch.dtype, optional): communication dtype. If not specified, the dtype of param will be used. Defaults to None.
overlap_communication (bool, optional): whether to overlap communication and computation. Defaults to True.
cpu_offload (bool, optional): whether to offload grad, master weight and optimizer state to cpu. Defaults to False.
verbose (bool, optional): verbose mode. Debug info including grad overflow will be printed. Defaults to False.
"""
def __init__(
self,
stage: int = 1,
precision: str = 'fp16',
initial_scale: float = 2**32,
min_scale: float = 1,
growth_factor: float = 2,
backoff_factor: float = 0.5,
growth_interval: int = 1000,
hysteresis: int = 2,
max_scale: float = 2**32,
max_norm: float = 0.0,
norm_type: float = 2.0,
reduce_bucket_size_in_m: int = 12,
communication_dtype: Optional[torch.dtype] = None,
overlap_communication: bool = True,
cpu_offload: bool = False,
verbose: bool = False,
) -> None:
assert dist.is_initialized(
), 'torch.distributed is not initialized, please use colossalai.launch to create the distributed environment'
assert stage in (1, 2), 'LowLevelZeroPlugin only supports stage 1/2 training'
assert precision in ('fp16', 'fp32'), 'LowLevelZeroPlugin only supports fp16/fp32 training'
self.rank = dist.get_rank()
self.world_size = dist.get_world_size()
self.stage = stage
self.precision = precision
self.zero_optim_config = dict(reduce_bucket_size=reduce_bucket_size_in_m * 1024 * 1024,
communication_dtype=communication_dtype,
overlap_communication=overlap_communication,
cpu_offload=cpu_offload)
self.optim_kwargs = dict(initial_scale=initial_scale,
growth_factor=growth_factor,
backoff_factor=backoff_factor,
growth_interval=growth_interval,
hysteresis=hysteresis,
min_scale=min_scale,
max_scale=max_scale,
max_norm=max_norm,
norm_type=norm_type)
self.verbose = verbose
def support_no_sync(self) -> bool:
return False
def control_precision(self) -> bool:
return True
def supported_precisions(self) -> List[str]:
return ['fp16', 'fp32']
def control_device(self) -> bool:
return True
def supported_devices(self) -> List[str]:
return ['cuda']
def prepare_train_dataloader(self,
dataset,
batch_size,
shuffle=False,
seed=1024,
drop_last=False,
pin_memory=False,
num_workers=0,
**kwargs):
r"""
Prepare a dataloader for distributed training. The dataloader will be wrapped by
`torch.utils.data.DataLoader` and `torch.utils.data.DistributedSampler`.
Note:
1. Evaluation datasets should not be passed to this function.
Args:
dataset (`torch.utils.data.Dataset`): The dataset to be loaded.
batch_size (int): The number of samples per batch.
shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
seed (int, optional): Random worker seed for sampling, defaults to 1024.
drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
is not divisible by the batch size. If False and the size of dataset is not divisible by
the batch size, then the last batch will be smaller, defaults to False.
pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
`DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.
Returns:
:class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
"""
_kwargs = kwargs.copy()
sampler = DistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, shuffle=shuffle)
# Deterministic dataloader
def seed_worker(worker_id):
worker_seed = seed
np.random.seed(worker_seed)
torch.manual_seed(worker_seed)
random.seed(worker_seed)
return DataLoader(dataset,
batch_size=batch_size,
sampler=sampler,
worker_init_fn=seed_worker,
drop_last=drop_last,
pin_memory=pin_memory,
num_workers=num_workers,
**_kwargs)
def configure(
self,
model: nn.Module,
optimizer: Optimizer,
criterion: Callable = None,
dataloader: DataLoader = None,
lr_scheduler: LRScheduler = None,
) -> Tuple[Union[nn.Module, OptimizerWrapper, LRScheduler, DataLoader], ...]:
if not isinstance(model, ModelWrapper):
model = LowLevelZeroModel(model, self.stage, self.precision)
if not isinstance(optimizer, OptimizerWrapper):
optimizer = LowLevelZeroOptimizer(model.unwrap(), optimizer, self.zero_optim_config, self.optim_kwargs,
self.verbose)
return model, optimizer, criterion, dataloader, lr_scheduler
def control_checkpoint_io(self) -> bool:
return True
def get_checkpoint_io(self) -> CheckpointIO:
return LowLevelZeroCheckpointIO()
from abc import ABC, abstractmethod
from typing import Callable, List, Tuple, Union
import torch.nn as nn
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
from torch.utils.data import DataLoader
from colossalai.checkpoint_io import CheckpointIO
from colossalai.interface import OptimizerWrapper
__all__ = ['Plugin']
class Plugin(ABC):
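"""
Plugin is the base class of all booster plugins. A plugin declares the devices and precisions
it supports, whether it takes over device placement, mixed precision, and checkpoint IO from
the Booster, and how to wrap the model, optimizer, criterion, dataloader, and lr scheduler
in its `configure` method.
"""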
@abstractmethod
def supported_devices(self) -> List[str]:
pass
@abstractmethod
def supported_precisions(self) -> List[str]:
pass
@abstractmethod
def control_precision(self) -> bool:
pass
@abstractmethod
def control_device(self) -> bool:
pass
@abstractmethod
def support_no_sync(self) -> bool:
pass
@abstractmethod
def configure(
self,
model: nn.Module,
optimizer: Optimizer,
criterion: Callable = None,
dataloader: DataLoader = None,
lr_scheduler: LRScheduler = None,
) -> Tuple[Union[nn.Module, OptimizerWrapper, LRScheduler, DataLoader], ...]:
# implement this method
pass
@abstractmethod
def control_checkpoint_io(self) -> bool:
"""
Whether the plugin controls the checkpoint io
"""
pass
@abstractmethod
def get_checkpoint_io(self) -> CheckpointIO:
"""
Get checkpoint io object for this plugin, only invoked when control_checkpoint_io is True.
"""
pass
import random
from typing import Callable, List, Tuple, Union
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from colossalai.checkpoint_io import CheckpointIO, GeneralCheckpointIO
from colossalai.cluster import DistCoordinator
from colossalai.interface import ModelWrapper, OptimizerWrapper
from .plugin_base import Plugin
__all__ = ['TorchDDPPlugin']
class TorchDDPCheckpointIO(GeneralCheckpointIO):
def __init__(self) -> None:
super().__init__()
self.coordinator = DistCoordinator()
def load_unsharded_model(self, model: nn.Module, checkpoint: str, strict: bool = True):
"""
Load model from checkpoint with automatic unwrapping.
"""
# the model should be unwrapped in self.load_model via ModelWrapper.unwrap
return super().load_unsharded_model(model, checkpoint, strict=strict)
def save_unsharded_model(self, model: nn.Module, checkpoint: str, gather_dtensor: bool, use_safetensors: bool):
"""
Save model to checkpoint but only on master process.
"""
# the model should be unwrapped in self.save_model via ModelWrapper.unwrap
if self.coordinator.is_master():
super().save_unsharded_model(model, checkpoint, gather_dtensor, use_safetensors)
def save_unsharded_optimizer(self, optimizer: Optimizer, checkpoint: str, gather_dtensor: bool):
"""
Save optimizer to checkpoint but only on master process.
"""
if self.coordinator.is_master():
super().save_unsharded_optimizer(optimizer, checkpoint, gather_dtensor)
def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
"""
Save lr scheduler to checkpoint but only on master process.
"""
if self.coordinator.is_master():
super().save_lr_scheduler(lr_scheduler, checkpoint)
class TorchDDPModel(ModelWrapper):
def __init__(self, module: nn.Module, *args, **kwargs) -> None:
super().__init__(module)
self.module = DDP(module, *args, **kwargs)
def unwrap(self):
return self.module.module
class TorchDDPPlugin(Plugin):
"""
Plugin for PyTorch DDP.
Example:
>>> from colossalai.booster import Booster
>>> from colossalai.booster.plugin import TorchDDPPlugin
>>>
>>> model, train_dataset, optimizer, criterion = ...
>>> plugin = TorchDDPPlugin()
>>> train_dataloader = plugin.prepare_train_dataloader(train_dataset, batch_size=8)
>>> booster = Booster(plugin=plugin)
>>> model, optimizer, criterion, train_dataloader, _ = booster.boost(model, optimizer, criterion, train_dataloader)
Args:
broadcast_buffers (bool, optional): Whether to broadcast buffers in the beginning of training. Defaults to True.
bucket_cap_mb (int, optional): The bucket size in MB. Defaults to 25.
find_unused_parameters (bool, optional): Whether to find unused parameters. Defaults to False.
check_reduction (bool, optional): Whether to check reduction. Defaults to False.
gradient_as_bucket_view (bool, optional): Whether to use gradient as bucket view. Defaults to False.
static_graph (bool, optional): Whether to use static graph. Defaults to False.
"""
def __init__(self,
broadcast_buffers: bool = True,
bucket_cap_mb: int = 25,
find_unused_parameters: bool = False,
check_reduction: bool = False,
gradient_as_bucket_view: bool = False,
static_graph: bool = False) -> None:
assert dist.is_initialized(
), 'torch.distributed is not initialized, please use colossalai.launch to create the distributed environment'
self.rank = dist.get_rank()
self.world_size = dist.get_world_size()
self.ddp_kwargs = dict(broadcast_buffers=broadcast_buffers,
bucket_cap_mb=bucket_cap_mb,
find_unused_parameters=find_unused_parameters,
check_reduction=check_reduction,
gradient_as_bucket_view=gradient_as_bucket_view,
static_graph=static_graph)
def support_no_sync(self) -> bool:
return True
def control_precision(self) -> bool:
return False
def supported_precisions(self) -> List[str]:
return ['fp16', 'fp16_apex', 'bf16', 'fp8']
def control_device(self) -> bool:
return True
def supported_devices(self) -> List[str]:
return ['cuda']
def prepare_train_dataloader(self,
dataset,
batch_size,
shuffle=False,
seed=1024,
drop_last=False,
pin_memory=False,
num_workers=0,
**kwargs):
r"""
Prepare a dataloader for distributed training. The dataloader will be wrapped by
`torch.utils.data.DataLoader` and `torch.utils.data.DistributedSampler`.
Note:
1. Evaluation datasets should not be passed to this function.
Args:
dataset (`torch.utils.data.Dataset`): The dataset to be loaded.
batch_size (int): The number of samples per batch.
shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
seed (int, optional): Random worker seed for sampling, defaults to 1024.
drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
is not divisible by the batch size. If False and the size of dataset is not divisible by
the batch size, then the last batch will be smaller, defaults to False.
pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
`DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.
Returns:
:class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
"""
_kwargs = kwargs.copy()
sampler = DistributedSampler(dataset, num_replicas=self.world_size, rank=self.rank, shuffle=shuffle)
# Deterministic dataloader
def seed_worker(worker_id):
worker_seed = seed
np.random.seed(worker_seed)
torch.manual_seed(worker_seed)
random.seed(worker_seed)
return DataLoader(dataset,
batch_size=batch_size,
sampler=sampler,
worker_init_fn=seed_worker,
drop_last=drop_last,
pin_memory=pin_memory,
num_workers=num_workers,
**_kwargs)
def configure(
self,
model: nn.Module,
optimizer: Optimizer,
criterion: Callable = None,
dataloader: DataLoader = None,
lr_scheduler: LRScheduler = None,
) -> Tuple[Union[nn.Module, OptimizerWrapper, LRScheduler, DataLoader], ...]:
# cast model to cuda
model = model.cuda()
# convert model to sync bn
model = nn.SyncBatchNorm.convert_sync_batchnorm(model, None)
# wrap the model with PyTorch DDP
model = TorchDDPModel(model, **self.ddp_kwargs)
if not isinstance(optimizer, OptimizerWrapper):
optimizer = OptimizerWrapper(optimizer)
return model, optimizer, criterion, dataloader, lr_scheduler
def control_checkpoint_io(self) -> bool:
return True
def get_checkpoint_io(self) -> CheckpointIO:
return TorchDDPCheckpointIO()
from .builder import build_from_config, build_from_registry, build_gradient_handler
__all__ = ['build_gradient_handler', 'build_from_config', 'build_from_registry']
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import inspect
from colossalai.registry import *
def build_from_config(module, config: dict):
"""Returns an object of :class:`module` constructed from `config`.
Args:
module: A python or user-defined class
config: A python dict containing information used in the construction of the return object
Returns: An ``object`` of interest
Raises:
AssertionError: Raises an AssertionError if `module` is not a class
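Example (illustrative):
>>> import torch.nn as nn
>>> linear = build_from_config(nn.Linear, {'in_features': 4, 'out_features': 2})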
"""
assert inspect.isclass(module), 'module must be a class'
return module(**config)
def build_from_registry(config, registry: Registry):
r"""Returns an object constructed from `config`, the type of the object
is specified by `registry`.
Note:
the `config` is used to construct the return object of a supported type such as `LAYERS`,
`OPTIMIZERS` and other types in `registry`. The `config` should contain all required
parameters of the corresponding object. The details of the supported types in `registry`
and the `mod_type` in `config` can be found in
`registry <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/registry/__init__.py>`_.
Args:
config (dict or :class:`colossalai.context.Config`): information
used in the construction of the return object.
registry (:class:`Registry`): A registry specifying the type of the return object
Returns:
A Python object specified by `registry`.
Raises:
Exception: Raises an Exception if an error occurred when building from registry.
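Example (illustrative; assumes a 'Linear' module is registered in `LAYERS`):
>>> layer = build_from_registry({'type': 'Linear', 'in_features': 4, 'out_features': 2}, LAYERS)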
"""
config_ = config.copy() # keep the original config untouched
assert isinstance(registry, Registry), f'Expected type Registry but got {type(registry)}'
mod_type = config_.pop('type')
assert registry.has(mod_type), f'{mod_type} is not found in registry {registry.name}'
try:
obj = registry.get_module(mod_type)(**config_)
except Exception as e:
print(f'An error occurred when building {mod_type} from registry {registry.name}', flush=True)
raise e
return obj
def build_gradient_handler(config, model, optimizer):
"""Returns a gradient handler object of :class:`BaseGradientHandler` constructed from `config`,
`model` and `optimizer`.
Args:
config (dict or :class:`colossalai.context.Config`): A python dict or
a :class:`colossalai.context.Config` object containing information
used in the construction of the ``GRADIENT_HANDLER``.
model (:class:`nn.Module`): A model containing parameters for the gradient handler
optimizer (:class:`torch.optim.Optimizer`): An optimizer object containing parameters for the gradient handler
Returns:
An object of :class:`colossalai.engine.BaseGradientHandler`
"""
config_ = config.copy()
config_['model'] = model
config_['optimizer'] = optimizer
return build_from_registry(config_, GRADIENT_HANDLER)
from .checkpoint_io_base import CheckpointIO
from .general_checkpoint_io import GeneralCheckpointIO
from .index_file import CheckpointIndexFile
__all__ = ['CheckpointIO', 'CheckpointIndexFile', 'GeneralCheckpointIO']
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Union
from typing import Optional
import torch
import torch.nn as nn
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
from colossalai.interface import ModelWrapper
from .utils import has_index_file
__all__ = ['CheckpointIO']
class CheckpointIO(ABC):
"""
CheckpointIO is the base class for all checkpoint IO classes. It defines the interface for checkpoint IO.
Examples:
>>> from colossalai.checkpoint_io import GeneralCheckpointIO
>>> checkpoint_io = GeneralCheckpointIO()
>>>
>>> # load model from checkpoint
>>> model = checkpoint_io.load_model(model, 'model.pt')
>>>
>>> # save model to checkpoint, any distributed tensor is gathered by default
>>> checkpoint_io.save_model(model, 'model.pt')
>>>
>>> # if the model contains distributed tensor, and you don't want to gather it
>>> # each rank will save its own shard of the distributed tensor
>>> checkpoint_io.save_model(model, 'model.pt', gather_dtensor=False)
>>>
>>> # save model to sharded checkpoints
>>> checkpoint_io.save_model(model, './checkpoints/', shard=True)
>>>
>>> # save model to sharded and assume we don't want to gather distributed tensors
>>> checkpoint_io.save_model(model, './checkpoints/', shard=True, gather_dtensor=False)
>>>
>>> # Note:
>>> # 1. we don't support loading from distributed tensors, conversion from distributed tensors
>>> # checkpoints to full tensor checkpoint should be done offline via our CLI
>>> # 2. you don't have to specify whether the model is sharded or not when loading the model
>>> # as it will be automatically detected
>>>
>>> # load model from sharded checkpoints
>>> model = checkpoint_io.load_model(model, './checkpoints/')
>>>
>>> # load model from an unsharded checkpoint
>>> model = checkpoint_io.load_model(model, 'model.pt')
>>>
>>> # load optimizer from checkpoint
>>> optimizer = checkpoint_io.load_optimizer(optimizer, 'optimizer.pt')
>>>
>>> # save optimizer to checkpoint
>>> checkpoint_io.save_optimizer(optimizer, 'optimizer.pt')
"""
# ======================================
# Public methods
# ======================================
def load_model(self,
model: Union[nn.Module, ModelWrapper],
checkpoint: str,
strict: bool = True) -> Union[nn.Module, ModelWrapper]:
"""
Load model from checkpoint.
Args:
model (nn.Module): model to be loaded.
checkpoint (str): checkpoint path. This value is made compatible with the model checkpoints in the
mainstream model zoos such as Hugging Face and TIMM. The checkpoint path can be:
1. a file path, e.g. 'model.pt'
2. a path to a json file which defines the index to the sharded checkpoint
3. a path to a folder containing a unique .index.json file for sharded checkpoint
Distributed tensors cannot be loaded directly unless gathered offline via our CLI.
strict (bool): whether to strictly enforce that the parameter names in
the checkpoint match the keys returned by this module's `state_dict()`.
"""
# since we only support loading sharded and unsharded weight formats
# containing no distributed tensors, dtensor -> full tensor conversion
# should be done offline via our CLI
# the existence of an index file means it is a sharded checkpoint
index_file_exists, index_file_path = has_index_file(checkpoint)
# return the origin model instead of the unwrapped model
origin_model = model
if isinstance(model, ModelWrapper):
model = model.unwrap()
if index_file_exists:
self.load_sharded_model(model, index_file_path, strict)
else:
self.load_unsharded_model(model, checkpoint, strict)
return origin_model
def save_model(self,
model: Union[nn.Module, ModelWrapper],
checkpoint: str,
shard: bool = False,
gather_dtensor: bool = True,
variant: Optional[str] = None,
size_per_shard: int = 1024,
use_safetensors: bool = False):
"""
Save model to checkpoint.
Examples:
>>> from colossalai.checkpoint_io import GeneralCheckpointIO
>>> checkpoint_io = CheckpointIO()
>>>
>>> # save model to a single file
>>> save_model(model, 'model.pt')
>>>
>>> # save model to a sharded checkpoint
>>> save_model(model, './checkpoints/', shard=True)
Args:
model (nn.Module): model to be saved.
checkpoint (str): checkpoint path. The checkpoint path can be :
1. a file path, e.g. 'model.pt'
2. a directory path to save the sharded checkpoint, e.g. './checkpoints/' when shard = True.
shard (bool): whether to shard the checkpoint. Default: False. If set to True, the checkpoint will be sharded into
multiple files. The model shards will be specified by a `model.index.json` file. When shard = True, please ensure
that the checkpoint path is a directory path instead of a file path.
gather_dtensor (bool): whether to gather the distributed tensor to the first device. Default: True.
variant (str): If specified, weights are saved in the format pytorch_model.<variant>.bin. Default: None.
size_per_shard (int): size per shard in MB. Default: 1024. This value is only used when shard = True.
use_safetensors (bool): whether to use safetensors. Default: False. If set to True, the checkpoint will be saved in the safetensors format.
"""
if isinstance(model, ModelWrapper):
model = model.unwrap()
if shard:
self.save_sharded_model(model, checkpoint, gather_dtensor, variant, size_per_shard, use_safetensors)
else:
self.save_unsharded_model(model, checkpoint, gather_dtensor, use_safetensors)
def load_optimizer(self, optimizer: Optimizer, checkpoint: str):
"""
Load optimizer from checkpoint.
Args:
optimizer (Optimizer): optimizer to be loaded.
checkpoint (str): checkpoint path. It can be a single file path, or a path to a sharded
checkpoint: a .index.json file or a folder containing a unique .index.json file.
"""
index_file_exists, index_file_path = has_index_file(checkpoint)
if Path(checkpoint).is_dir() and not index_file_exists:
# if the checkpoint is a directory and there is no index file, raise error
raise ValueError(f'Cannot find index file in {checkpoint}')
if index_file_exists:
# the existence of index file means it is a sharded checkpoint
self.load_sharded_optimizer(optimizer, index_file_path)
else:
self.load_unsharded_optimizer(optimizer, checkpoint)
def save_optimizer(self,
optimizer: Optimizer,
checkpoint: str,
shard: bool = False,
gather_dtensor=True,
prefix: Optional[str] = None,
size_per_shard: int = 1024):
"""
Save optimizer to checkpoint. Optimizer states saving is not compatible with safetensors.
Args:
optimizer (Optimizer): optimizer to be saved.
checkpoint (str): checkpoint path. The checkpoint path can be :
1. a file path, e.g. 'model.pt'
2. a path to a json file which defines the index to the sharded checkpoint for the optimizer
3. a path to a folder containing a unique .index.json file for sharded checkpoint
shard (bool): whether to shard the checkpoint. Default: False. If set to True, the checkpoint will be sharded into
multiple files. The optimizer shards will be specified by an `optimizer.index.json` file.
gather_dtensor (bool): whether to gather the distributed tensor to the first device. Default: True.
prefix (str): prefix for the optimizer checkpoint when shard = True. Default: None.
size_per_shard (int): size per shard in MB. Default: 1024. This value is only used when shard is set to True.
"""
if shard:
self.save_sharded_optimizer(optimizer, checkpoint, gather_dtensor, prefix, size_per_shard)
else:
self.save_unsharded_optimizer(optimizer, checkpoint, gather_dtensor)
# ========================================================
# Abstract methods for model loading/saving implementation
# ========================================================
@abstractmethod
def load_sharded_model(self, model: nn.Module, index_file_path: str, strict: bool):
"""
Load model from sharded checkpoint.
Args:
model (nn.Module): model to be loaded.
index_file_path (str): checkpoint path. It should be path to the .index.json file or a path to a directory which contains a .index.json file.
strict (bool): whether to strictly enforce that the parameter names in
the checkpoint match the keys returned by this module's `state_dict()`.
"""
pass
@abstractmethod
def load_unsharded_model(self, model: nn.Module, checkpoint: str, strict: bool):
"""
Load model from unsharded checkpoint.
Args:
model (nn.Module): model to be loaded.
checkpoint (str): checkpoint path. It should be a single file path pointing to a model weight binary.
strict (bool): whether to strictly enforce that the parameter names in
the checkpoint match the keys returned by this module's `state_dict()`.
"""
pass
@abstractmethod
def save_sharded_model(self, model: nn.Module, checkpoint: str, gather_dtensor: bool, variant: Optional[str],
size_per_shard: int, use_safetensors: bool):
"""
Save model to sharded checkpoint.
Args:
model (nn.Module): model to be saved.
checkpoint (str): checkpoint path. It should be a directory path.
gather_dtensor (bool): whether to gather the distributed tensor to the first device.
variant (str): If specified, weights are saved in the format pytorch_model.<variant>.bin.
size_per_shard (int): size per shard in MB.
use_safetensors (bool): whether to use safe tensors.
"""
pass
@abstractmethod
def save_unsharded_model(self, model: nn.Module, checkpoint: str, gather_dtensor: bool, use_safetensors: bool):
"""
Save model to unsharded checkpoint.
Args:
model (nn.Module): model to be saved.
checkpoint (str): checkpoint path. It should be a single file path pointing to a model weight binary.
gather_dtensor (bool): whether to gather the distributed tensor to the first device.
use_safetensors (bool): whether to use safe tensors.
"""
pass
# ========================================================
# Abstract methods for optimizer loading/saving implementation
# ========================================================
@abstractmethod
def load_sharded_optimizer(self, optimizer: Optimizer, index_file_path: str, prefix: str, size_per_shard: int):
"""
Load optimizer from sharded checkpoint.
Args:
optimizer (Optimizer): optimizer to be loaded.
index_file_path (str): checkpoint path. It should be path to the .index.json file or a path to a directory which contains a .index.json file.
prefix (str): prefix for the optimizer checkpoint.
size_per_shard (int): size per shard in MB.
"""
pass
@abstractmethod
def load_unsharded_optimizer(self, optimizer: Optimizer, checkpoint: Path):
"""
Load optimizer from unsharded checkpoint.
Args:
optimizer (Optimizer): optimizer to be loaded.
checkpoint (str): checkpoint path. It should be a single file path pointing to an optimizer states binary.
"""
pass
@abstractmethod
def save_sharded_optimizer(self, optimizer: Optimizer, checkpoint: Path, gather_dtensor: bool, prefix: str,
size_per_shard: int):
"""
Save optimizer to sharded checkpoint.
Args:
optimizer (Optimizer): optimizer to be saved.
checkpoint (Path): checkpoint path. It should be a directory path.
gather_dtensor (bool): whether to gather the distributed tensor to the first device.
prefix (str): prefix for the optimizer checkpoint.
size_per_shard (int): size per shard in MB.
"""
pass
@abstractmethod
def save_unsharded_optimizer(self, optimizer: Optimizer, checkpoint: Path, gather_dtensor: bool):
"""
Save optimizer to unsharded checkpoint.
Args:
optimizer (Optimizer): optimizer to be saved.
checkpoint (str): checkpoint path. It should be a single file path pointing to an optimizer states binary.
gather_dtensor (bool): whether to gather the distributed tensor to the first device.
"""
pass
# ============================================
# methods for loading and saving lr scheduler
# as this is quite standard, there is no need
# to make them abstract
# ============================================
def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
"""
Save lr scheduler to checkpoint.
Args:
lr_scheduler (LRScheduler): lr scheduler to be saved.
checkpoint: checkpoint path. The checkpoint path can only be a file path.
"""
torch.save(lr_scheduler.state_dict(), checkpoint)
def load_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
"""
Load lr scheduler from checkpoint.
Args:
lr_scheduler (LRScheduler): lr scheduler to be loaded.
checkpoint (str): the path for a single checkpoint file.
"""
state_dict = torch.load(checkpoint)
lr_scheduler.load_state_dict(state_dict)
from pathlib import Path
import torch.nn as nn
from torch.optim import Optimizer
import logging
import os
import json
import gc
from typing import Optional
from .checkpoint_io_base import CheckpointIO
from .index_file import CheckpointIndexFile
from .utils import (
has_index_file,
load_state_dict,
save_state_dict,
is_safetensors_available,
shard_checkpoint,
load_shard_state_dict,
load_state_dict_into_model,
add_variant
)
from .utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME, SAFE_WEIGHTS_INDEX_NAME, WEIGHTS_INDEX_NAME
__all__ = ['GeneralCheckpointIO']
class GeneralCheckpointIO(CheckpointIO):
"""
General implementation of CheckpointIO. It handles unsharded model/optimizer checkpoints and Hugging Face style sharded model checkpoints; sharded optimizer checkpoints and distributed tensors are not supported yet.
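Example (a minimal unsharded model round trip; ``model`` is assumed to be an initialized nn.Module):
>>> ckpt_io = GeneralCheckpointIO()
>>> ckpt_io.save_unsharded_model(model, 'model.bin', gather_dtensor=False, use_safetensors=False)
>>> ckpt_io.load_unsharded_model(model, 'model.bin', strict=True)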
"""
def load_unsharded_model(self, model: nn.Module, checkpoint: str, strict: bool):
state_dict = load_state_dict(checkpoint)
model.load_state_dict(state_dict, strict=strict)
def save_unsharded_model(self, model: nn.Module, checkpoint: str, gather_dtensor: bool, use_safetensors: bool):
state_dict = model.state_dict()
# TODO(FrankLeeeee): add support for gather_dtensor
if gather_dtensor:
pass
# save the checkpoint
save_state_dict(state_dict, checkpoint, use_safetensors)
def load_sharded_optimizer(self, optimizer: Optimizer, checkpoint: Path, prefix: str, size_per_shard: int):
raise NotImplementedError("Sharded optimizer checkpoint is not supported yet.")
def load_unsharded_optimizer(self, optimizer: Optimizer, checkpoint: Path):
state_dict = load_state_dict(checkpoint)
optimizer.load_state_dict(state_dict)
def save_sharded_optimizer(
self,
optimizer: Optimizer,
checkpoint: Path,
gather_dtensor: bool,
prefix: str,
size_per_shard: int,
):
raise NotImplementedError("Sharded optimizer checkpoint is not supported yet.")
def save_unsharded_optimizer(
self,
optimizer: Optimizer,
checkpoint: Path,
gather_dtensor: bool,
):
# TODO(FrankLeeeee): handle distributed tensors
save_state_dict(optimizer.state_dict(), checkpoint, use_safetensors=False)
def save_sharded_model(self, model: nn.Module, checkpoint_path: str, gather_dtensor: bool = False,
variant: Optional[str] = None, max_shard_size: int = 1024, use_safetensors: bool = False):
"""
Save a model to a sharded checkpoint, i.e. split the weights across multiple files.
This method is implemented here because Hugging Face models support this layout:
weight shards plus a .index.json file mapping each parameter to its shard.
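Example (a hedged usage sketch; ``model`` and the output directory are hypothetical):
>>> ckpt_io = GeneralCheckpointIO()
>>> ckpt_io.save_sharded_model(model, 'ckpt_dir', max_shard_size=1024, use_safetensors=False)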
"""
if os.path.isfile(checkpoint_path):
logging.error(f"Provided path ({checkpoint_path}) should be a directory, not a file")
return
Path(checkpoint_path).mkdir(parents=True, exist_ok=True)
# shard checkpoint
state_dict = model.state_dict()
weights_name = SAFE_WEIGHTS_NAME if use_safetensors else WEIGHTS_NAME
weights_name = add_variant(weights_name, variant)
shards, index = shard_checkpoint(state_dict, max_shard_size=max_shard_size, weights_name=weights_name)
# Save the model
for shard_file, shard in shards.items():
checkpoint_file_path = os.path.join(checkpoint_path, shard_file)
save_state_dict(shard, checkpoint_file_path, use_safetensors)
# save index file
save_index_file = SAFE_WEIGHTS_INDEX_NAME if use_safetensors else WEIGHTS_INDEX_NAME
save_index_file = os.path.join(checkpoint_path, add_variant(save_index_file, variant))
with open(save_index_file, "w", encoding="utf-8") as f:
content = json.dumps(index, indent=2, sort_keys=True) + "\n"
f.write(content)
logging.info(
f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be "
f"split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the "
f"index located at {save_index_file}."
)
def load_sharded_model(self, model: nn.Module, checkpoint_index_file: Path, strict: bool = False, use_safetensors: bool = False):
"""
Load a model from a sharded checkpoint, i.e. read the weights from multiple shard files.
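Example (loading a checkpoint written by ``save_sharded_model``; the index file name below is an assumption based on the Hugging Face convention):
>>> from pathlib import Path
>>> ckpt_io = GeneralCheckpointIO()
>>> ckpt_io.load_sharded_model(model, Path('ckpt_dir/pytorch_model.bin.index.json'))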
"""
# the serialization format is inferred from the index file name;
# this takes precedence over the `use_safetensors` argument
use_safetensors = "safetensors" in checkpoint_index_file.name
if use_safetensors and not is_safetensors_available():
raise ImportError("`safe_serialization` requires the `safetensors` library: `pip install safetensors`.")
# read checkpoint index file
ckpt_index_file = CheckpointIndexFile.from_file(checkpoint_index_file)
checkpoint_files, _ = ckpt_index_file.get_checkpoint_filenames()
missing_keys = ckpt_index_file.get_all_param_names()
for shard_file in checkpoint_files:
state_dict = load_shard_state_dict(Path(shard_file), use_safetensors)
load_state_dict_into_model(model, state_dict, missing_keys, strict)
del state_dict
gc.collect()
if strict and len(missing_keys) > 0:
error_msgs = ['Missing key(s) in state_dict: {}. '.format(
', '.join('"{}"'.format(k) for k in missing_keys))]
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
self.__class__.__name__, "\n\t".join(error_msgs)))
import json
from pathlib import Path
from typing import Any, List, Tuple, Union
from .utils import is_dtensor_checkpoint
__all__ = ['CheckpointIndexFile']
class CheckpointIndexFile:
"""
This class is a data structure to keep the content in the index.json file for sharded checkpoint.
Example:
>>> index = CheckpointIndexFile.from_file('model.index.json')
>>> index.append_meta_data('model_type', 'bert')
>>> index.append_weight_map('bert.embeddings.word_embeddings.weight', 'model_0001-of-0002.bin')
>>> index.export('new_index.json')
"""
def __init__(self) -> None:
self.root_path = None
self.metadata: dict = dict()
self.weight_map: dict = dict()
@staticmethod
def from_file(index_path: Union[str, Path]):
"""
Create a CheckpointIndexFile object from a json file.
Args:
index_path (Union[str, Path]): path to the .index.json file.
Returns:
CheckpointIndexFile: CheckpointIndexFile object.
"""
index = CheckpointIndexFile()
index.load(index_path)
return index
def load(self, json_path: str):
"""
Load the index file from a json file.
Args:
json_path (str): path to the json file.
"""
# load the json file
with open(json_path, 'r') as f:
index = json.load(f)
# assign attributes if they exist
if "metadata" in index:
self.metadata = index["metadata"]
if "weight_map" in index:
self.weight_map = index["weight_map"]
# assign the root directory for the index file
self.root_path = Path(json_path).absolute().parent
def export(self, json_path: str):
"""
Export the index file to a json file.
Args:
json_path (str): path to the json file.
"""
# create the index file
index = dict()
index["metadata"] = self.metadata
index["weight_map"] = self.weight_map
# export the index file
with open(json_path, 'w') as f:
json.dump(index, f, indent=4)
def append_weight_map(self, param_name: str, shard_file: str):
"""
Append a weight map entry to the index file.
Args:
param_name (str): name of the parameter.
shard_file (str): name of the shard file.
"""
self.weight_map[param_name] = shard_file
def append_meta_data(self, name: str, val: Any):
"""
Append a metadata entry to the index file.
Args:
name (str): name of the metadata.
val (Any): value of the metadata.
"""
self.metadata[name] = val
def contains_dtensor(self):
"""
Check if the index file contains any distributed tensor. The distributed tensors will be stored in
`dtensor/module.linear.weight.*.bin` or `dtensor/module.linear.weight.*.safetensors` in the weight map.
Returns:
bool: True if the index file contains any distributed tensor, False otherwise.
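Example (a hypothetical dtensor entry in the weight map):
>>> index = CheckpointIndexFile()
>>> index.append_weight_map('module.linear.weight', 'dtensor/module.linear.weight.*.bin')
>>> index.contains_dtensor()
True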
"""
for value in self.weight_map.values():
if value.endswith(".*.bin") or value.endswith(".*.safetensors"):
return True
return False
def get_checkpoint_filenames(self) -> Tuple[List[str], List[str]]:
"""
Get the absolute paths of the checkpoint files in the weight map, split into regular
checkpoint files and distributed tensor (dtensor) checkpoint files.
Returns:
tuple: a list of regular checkpoint file paths and a list of dtensor checkpoint file paths.
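Example (a hypothetical index file; paths are resolved against the index file's directory):
>>> index = CheckpointIndexFile.from_file('ckpt_dir/model.index.json')
>>> checkpoint_files, dtensor_files = index.get_checkpoint_filenames()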
"""
# read the checkpoint file list from the json file and get a list of unique file names
checkpoint_files = sorted(list(set(self.weight_map.values())))
# get the absolute paths for all checkpoint files
checkpoint_files = [str(self.root_path.joinpath(f)) for f in checkpoint_files]
dtensor_list = []
checkpoint_list = []
for ckpt_file in checkpoint_files:
if is_dtensor_checkpoint(ckpt_file):
dtensor_list.append(ckpt_file)
else:
checkpoint_list.append(ckpt_file)
return checkpoint_list, dtensor_list
def assert_no_dtensor_checkpoint(self):
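"""
Raise a ValueError if the weight map references any distributed tensor checkpoint file.
"""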
for val in self.weight_map.values():
if is_dtensor_checkpoint(val):
raise ValueError(f"Checkpoint file {val} contains distributed tensor")
def get_checkpoint_file(self, param_name: str) -> str:
"""
Get the checkpoint file name for a parameter.
Args:
param_name (str): name of the parameter.
Returns:
str: checkpoint file name.
"""
ckpt_path = self.weight_map[param_name]
return ckpt_path
def get_all_param_names(self):
"""
Get all the weight keys.
"""
return list(self.weight_map.keys())