Merge branch 'main' of https://github.com/oahzxl/ColossalAI into chunk

e532679c · oahzxl · c1492e50 · 7d5640b9 · c1492e50 · e532679c
Commit e532679c authored Jan 10, 2023 by oahzxl
20 changed files
--- a/colossalai/utils/profiler/legacy/mem_profiler.py
+++ b/colossalai/utils/profiler/legacy/mem_profiler.py
-from pathlib import Path
-from typing import Union
-from colossalai.engine import Engine
-from torch.utils.tensorboard import SummaryWriter
-from colossalai.gemini.ophooks import MemTracerOpHook
-from colossalai.utils.profiler.legacy.prof_utils import BaseProfiler
-
-
-class MemProfiler(BaseProfiler):
-    """Wraper of MemOpHook, used to show GPU memory usage through each iteration
-
-    To use this profiler, you need to pass an `engine` instance. And the usage is same like
-    CommProfiler.
-
-    Usage::
-
-        mm_prof = MemProfiler(engine)
-        with ProfilerContext([mm_prof]) as prof:
-            writer = SummaryWriter("mem")
-            engine.train()
-            ...
-            prof.to_file("./log")
-            prof.to_tensorboard(writer)
-
-    """
-
-    def __init__(self, engine: Engine, warmup: int = 50, refreshrate: int = 10) -> None:
-        super().__init__(profiler_name="MemoryProfiler", priority=0)
-        self._mem_tracer = MemTracerOpHook(warmup=warmup, refreshrate=refreshrate)
-        self._engine = engine
-
-    def enable(self) -> None:
-        self._engine.add_hook(self._mem_tracer)
-
-    def disable(self) -> None:
-        self._engine.remove_hook(self._mem_tracer)
-
-    def to_tensorboard(self, writer: SummaryWriter) -> None:
-        stats = self._mem_tracer.async_mem_monitor.state_dict['mem_stats']
-        for info, i in enumerate(stats):
-            writer.add_scalar("memory_usage/GPU", info, i)
-
-    def to_file(self, data_file: Path) -> None:
-        self._mem_tracer.save_results(data_file)
-
-    def show(self) -> None:
-        stats = self._mem_tracer.async_mem_monitor.state_dict['mem_stats']
-        print(stats)
--- a/colossalai/zero/__init__.py
+++ b/colossalai/zero/__init__.py
@@ -2,10 +2,12 @@ from typing import Tuple

 import torch
 import torch.nn as nn
+
 from colossalai.logging import get_dist_logger
 from colossalai.zero.sharded_model.sharded_model_v2 import ShardedModelV2
-from colossalai.zero.sharded_optim.sharded_optim_v2 import ShardedOptimizerV2
-from .zero_optimizer import ZeroOptimizer
+from colossalai.zero.sharded_optim import LowLevelZeroOptimizer, ShardedOptimizerV2
+
+from ..nn.optimizer.zero_optimizer import ZeroOptimizer


 def convert_to_zero_v2(model: nn.Module, optimizer: torch.optim.Optimizer, model_config,
@@ -36,4 +38,4 @@ def convert_to_zero_v2(model: nn.Module, optimizer: torch.optim.Optimizer, model
    return zero_model, zero_optimizer


-__all__ = ['convert_to_zero_v2', 'ShardedModelV2', 'ShardedOptimizerV2', 'ZeroOptimizer']
+__all__ = ['convert_to_zero_v2', 'LowLevelZeroOptimizer', 'ShardedModelV2', 'ShardedOptimizerV2', 'ZeroOptimizer']
--- a/colossalai/zero/sharded_model/sharded_model_v2.py
+++ b/colossalai/zero/sharded_model/sharded_model_v2.py
 import functools
+import itertools
 from collections import OrderedDict
-from typing import Any, Optional, Iterator, Tuple
 from copy import deepcopy
-import itertools
+from typing import Any, Iterator, Optional, Tuple
+
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+from torch.distributed import ProcessGroup
+from torch.nn.parameter import Parameter
+
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.gemini.memory_tracer import MemStatsCollector, StaticMemStatsCollector
 from colossalai.gemini.ophooks import register_ophooks_recursively
-from colossalai.zero.utils import ZeroHook
 from colossalai.gemini.paramhooks import BaseParamHookMgr
+from colossalai.gemini.stateful_tensor import TensorState
+from colossalai.gemini.stateful_tensor_mgr import StatefulTensorMgr
+from colossalai.gemini.tensor_placement_policy import TensorPlacementPolicy, TensorPlacementPolicyFactory
+from colossalai.gemini.tensor_utils import colo_model_data_move_to_cpu
 from colossalai.logging import get_dist_logger
-from colossalai.utils import get_current_device, disposable
-from colossalai.gemini.memory_tracer.memstats_collector import MemStatsCollector
+from colossalai.utils import disposable, get_current_device
 from colossalai.utils.memory import colo_device_memory_capacity
 from colossalai.zero.shard_utils import BaseShardStrategy
 from colossalai.zero.sharded_model.reduce_scatter import ReduceScatterBucketer
-from torch.distributed import ProcessGroup
-from torch.nn.parameter import Parameter
-from colossalai.gemini.tensor_utils import colo_model_data_move_to_cpu
-from colossalai.gemini.stateful_tensor import TensorState
-from colossalai.gemini.stateful_tensor_mgr import StatefulTensorMgr
-from colossalai.gemini.tensor_placement_policy import TensorPlacementPolicyFactory, TensorPlacementPolicy
+from colossalai.zero.utils import ZeroHook

-from ._utils import (cast_float_arguments, cast_tensor_to_fp16, cast_tensor_to_fp32, chunk_and_pad, free_storage,
-                     get_gradient_predivide_factor)
+from ._utils import (
+    cast_float_arguments,
+    cast_tensor_to_fp16,
+    cast_tensor_to_fp32,
+    chunk_and_pad,
+    free_storage,
+    get_gradient_predivide_factor,
+)

 try:
    from torch.nn.modules.module import _EXTRA_STATE_KEY_SUFFIX
@@ -49,7 +57,7 @@ class ShardedModelV2(nn.Module):
        module (nn.Module): A sharded module, which must be initialized by `ZeroInitContext`.
        shard_strategy (BaseShardStrategy): A shard strategy to manage shard behavior.
        process_group (Optional[ProcessGroup], optional): Data parallel process group. Defaults to None.
-        reduce_scatter_process_group (Optional[ProcessGroup], optional): Reduce-scatter process group. 
+        reduce_scatter_process_group (Optional[ProcessGroup], optional): Reduce-scatter process group.
            Generally, it should be `None`, and it's the same as `process_group`. Defaults to None.
        reduce_scatter_bucket_size_mb (int, optional): Reduce-scatter bucket size in *MB*. Defaults to 25.
        fp32_reduce_scatter (bool, optional): If set to `True`, gradients are forced to FP32 before reduce-scatter. Defaults to False.
@@ -60,10 +68,10 @@ class ShardedModelV2(nn.Module):
            Note that 'auto' policy can only work well when no other processes use CUDA during your training.
            Defaults to 'cuda'.
        gradient_predivide_factor (Optional[float], optional): Gradient is divived by this value before reduce-scatter. Defaults to 1.0.
-        reuse_fp16_shard (bool, optional): Whether to reuse fp16 shard for param and grad. 
-            Enabling this can reduce GPU memory usage, but you have to make sure you disable it when using gradient accumulation. 
-            In this mode, grad will be fp16. Make sure your optimizer supports mixed precision (fp32 param and fp16 grad). 
-            We find that PyTorch's optimizers don't support mixed precision, 
+        reuse_fp16_shard (bool, optional): Whether to reuse fp16 shard for param and grad.
+            Enabling this can reduce GPU memory usage, but you have to make sure you disable it when using gradient accumulation.
+            In this mode, grad will be fp16. Make sure your optimizer supports mixed precision (fp32 param and fp16 grad).
+            We find that PyTorch's optimizers don't support mixed precision,
            so we recommend you enable this only when using our CPUAdam with CPU offload. Defaults to False.
    """

@@ -198,15 +206,14 @@ class ShardedModelV2(nn.Module):
                    f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')
                    f.write(f'cuda max allocated {torch.cuda.max_memory_allocated(get_current_device()) / 1e9} GB\n')
                    f.write('CUDA model data (GB)\n')
-                    f.write(str(self._memstats_collector.model_data_list('cuda', 'GB')))
                    f.write('\n')
                    f.write('CUDA non model data (GB)\n')
-                    f.write(str(self._memstats_collector.non_model_data_list('cuda', 'GB')))
+                    f.write(str(self._memstats_collector._memstats.non_model_data_list('cuda')))
                    f.write('CPU non model data (GB)\n')
-                    f.write(str(self._memstats_collector.non_model_data_list('cpu', 'GB')))
+                    f.write(str(self._memstats_collector._memstats.non_model_data_list('cpu')))
                    f.write('\n')

-    def _pre_forward_operations(self):
+    def _pre_forward_operations(self, *args):
        # the operation will affect the memory tracer behavior in ZeroHook
        if self._memstats_collector:
            self._start_collect_memstats()
@@ -223,7 +230,7 @@ class ShardedModelV2(nn.Module):
                p.colo_attr.sharded_data_tensor.trans_state(TensorState.HOLD)

    def forward(self, *args: Any, **kwargs: Any) -> torch.Tensor:
-        self._pre_forward_operations()
+        self._pre_forward_operations(*args)
        args, kwargs = cast_float_arguments(cast_tensor_to_fp16, *args, **kwargs)
        outputs = self.module(*args, **kwargs)
        self._post_forward_operations()
@@ -248,8 +255,8 @@ class ShardedModelV2(nn.Module):
            # the way to calculate margin space is based on the assumption that
            # model data is fixed in cuda during training.
            # cuda margin space can be used to store OS.
-            self._cuda_margin_space = colo_device_memory_capacity(get_current_device()) - max(
-                self._memstats_collector.overall_mem_stats('cuda'))
+            self._cuda_margin_space = colo_device_memory_capacity(
+                get_current_device()) - self._memstats_collector._memstats.max_overall_cuda

    @torch.no_grad()
    def _post_backward_operations(self) -> None:

--- a/colossalai/zero/sharded_optim/__init__.py
+++ b/colossalai/zero/sharded_optim/__init__.py
+from .low_level_optim import LowLevelZeroOptimizer
 from .sharded_optim_v2 import ShardedOptimizerV2

-__all__ = ['ShardedOptimizerV2']
+__all__ = ['ShardedOptimizerV2', 'LowLevelZeroOptimizer']
--- a/colossalai/zero/sharded_optim/_utils.py
+++ b/colossalai/zero/sharded_optim/_utils.py
 import math
+
 import torch
+import torch.distributed as dist
 from torch._six import inf
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
-from colossalai.core import global_context as gpc
+
 from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
 from colossalai.utils import is_model_parallel_parameter
-import torch.distributed as dist


 def flatten(input_):
@@ -99,19 +101,24 @@ def split_half_float_double(tensor_list):
    return buckets


-def reduce_tensor(tensor, dtype, dst_rank=None, parallel_mode=ParallelMode.DATA):
+def reduce_tensor(tensor, dtype=None, dst_rank=None, parallel_mode=ParallelMode.DATA):
    """
    Reduce the tensor in the data parallel process group

    :param tensor: A tensor object to reduce/all-reduce
    :param dtype: The data type used in communication
    :param dst_rank: The source rank for reduce. If dst_rank is None,
    all-reduce will be used instead of reduce. Default is None.
+    :param parallel_mode: Communication parallel mode

    :type tensor: torch.Tensor
-    :type dtype: torch.dtype
+    :type dtype: torch.dtype, optional
    :type dst_rank: int, optional
+    :type parallel_mode: ParallelMode, optional
    """
+    # use the original dtype
+    if dtype is None:
+        dtype = tensor.dtype

    # cast the data to specified dtype for reduce/all-reduce
    if tensor.dtype != dtype:
@@ -139,6 +146,7 @@ def reduce_tensor(tensor, dtype, dst_rank=None, parallel_mode=ParallelMode.DATA)
        local_rank = gpc.get_local_rank(parallel_mode)
        if use_all_reduce or dst_rank == local_rank:
            tensor.copy_(tensor_to_reduce)
+
    return tensor


@@ -238,7 +246,7 @@ def sync_param(flat_tensor, tensor_list):
    Synchronize the flattened tensor and unflattened tensor list. When
    a list of tensor are flattened with `torch._utils._unflatten_dense_tensors`,
    a new tensor is created. Thus, the flat tensor and original tensor list do not
-    share the same memory space. This function will update the tensor list so that 
+    share the same memory space. This function will update the tensor list so that
    they point to the same value.

    :param flat_tensor: A flat tensor obtained by calling `torch._utils._unflatten_dense_tensors` on a tensor lsit

--- a/colossalai/zero/sharded_optim/bookkeeping/__init__.py
+++ b/colossalai/zero/sharded_optim/bookkeeping/__init__.py
+from .bucket_store import BucketStore
+from .gradient_store import GradientStore
+from .parameter_store import ParameterStore
+from .tensor_bucket import TensorBucket
+
+__all__ = ['GradientStore', 'ParameterStore', 'BucketStore', 'TensorBucket']
--- a/colossalai/zero/sharded_optim/bookkeeping/base_store.py
+++ b/colossalai/zero/sharded_optim/bookkeeping/base_store.py
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+
+
+class BaseStore:
+    """
+    Common base of the bookkeeping stores.
+
+    It records the world size and the local rank that the global context
+    reports for the given parallel mode at construction time.
+
+    :param dp_parallel_mode: Parallel mode used to query the global context
+    """
+
+    def __init__(self, dp_parallel_mode=ParallelMode.DATA):
+        mode = dp_parallel_mode
+        self._world_size = gpc.get_world_size(mode)
+        self._local_rank = gpc.get_local_rank(mode)
+
+    @property
+    def world_size(self):
+        """World size recorded at construction time."""
+        return self._world_size
+
+    @property
+    def local_rank(self):
+        """Local rank recorded at construction time."""
+        return self._local_rank
--- a/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py
+++ b/colossalai/zero/sharded_optim/bookkeeping/bucket_store.py
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+
+from .base_store import BaseStore
+
+
+class BucketStore(BaseStore):
+    """
+    Per-rank buckets of gradients and parameters together with an element counter.
+
+    Buckets are keyed by reduce rank. The key ``None`` is kept next to one
+    key for every rank in ``range(world_size)``.
+
+    :param dp_parallel_mode: Parallel mode forwarded to :class:`BaseStore`
+    """
+
+    def __init__(self, dp_parallel_mode):
+        super().__init__(dp_parallel_mode)
+        self._grads = {}
+        self._params = {}
+        self._num_elements_in_bucket = {}
+
+        # populate one empty bucket per key
+        self.reset()
+
+    def num_elements_in_bucket(self, reduce_rank: int = None):
+        """Return the element counter kept for ``reduce_rank``."""
+        return self._num_elements_in_bucket[reduce_rank]
+
+    def add_num_elements_in_bucket(self, num_elements, reduce_rank: int = None):
+        """Increase the element counter of ``reduce_rank`` by ``num_elements``."""
+        counters = self._num_elements_in_bucket
+        counters[reduce_rank] += num_elements
+
+    def add_grad(self, tensor, reduce_rank: int = None):
+        """Append ``tensor`` to the gradient bucket of ``reduce_rank``."""
+        grad_bucket = self._grads[reduce_rank]
+        grad_bucket.append(tensor)
+
+    def add_param(self, tensor, reduce_rank: int = None):
+        """Append ``tensor`` to the parameter bucket of ``reduce_rank``."""
+        param_bucket = self._params[reduce_rank]
+        param_bucket.append(tensor)
+
+    def reset(self):
+        """Replace every bucket with an empty one and zero every counter."""
+        ranks = [None, *range(self._world_size)]
+        # each key needs its own list object, hence the comprehensions
+        self._grads = {key: [] for key in ranks}
+        self._params = {key: [] for key in ranks}
+        self._num_elements_in_bucket = dict.fromkeys(ranks, 0)
+
+    def reset_by_rank(self, reduce_rank=None):
+        """Empty the buckets and zero the counter of a single ``reduce_rank``."""
+        self._grads[reduce_rank] = []
+        self._params[reduce_rank] = []
+        self._num_elements_in_bucket[reduce_rank] = 0
+
+    def get_grad(self, reduce_rank: int = None):
+        """Return the gradient bucket (a list) of ``reduce_rank``."""
+        return self._grads[reduce_rank]
+
+    def get_param(self, reduce_rank: int = None):
+        """Return the parameter bucket (a list) of ``reduce_rank``."""
+        return self._params[reduce_rank]
--- a/colossalai/zero/sharded_optim/bookkeeping/gradient_store.py
+++ b/colossalai/zero/sharded_optim/bookkeeping/gradient_store.py
+from typing import List
+
+from torch import Tensor
+
+from .base_store import BaseStore
+
+
+class GradientStore(BaseStore):
+    """
+    Bookkeeping of averaged gradients per parameter group and of the
+    :class:`AccumulateGrad` objects used by the backward reduction hooks.
+    """
+
+    def __init__(self, *args):
+        super().__init__(*args)
+
+        # group id -> list of averaged gradients
+        self._averaged_gradients = {}
+
+        # references held for the backward reduction hooks
+        self._grad_acc_objs = []
+
+    def add_accumulate_grad_object(self, obj):
+        """
+        Hold a reference to an :class:`AccumulateGrad` object. Without the reference,
+        reduction hooks may not be attached successfully.
+
+        :param obj: An object of :class:`AccumulateGrad` class
+        :type obj: :class:`AccumulateGrad`
+        """
+        self._grad_acc_objs.append(obj)
+
+    def get_averaged_gradients_by_group(self, group_id: int) -> List[Tensor]:
+        """
+        Look up the averaged gradients stored for a parameter group
+
+        :param group_id: The index of parameter group
+        :type group_id: int
+
+        :return: The list of averaged gradients of the group. Each element is a gradient, not a parameter.
+        :rtype: List[torch.Tensor]
+        """
+        averaged = self._averaged_gradients
+        return averaged[group_id]
+
+    def add_average_gradient_by_group(self, group_id: int, tensor: Tensor) -> None:
+        """
+        Append an averaged gradient to the list of a parameter group.
+        The list is created on first use.
+
+        :param group_id: The index of a parameter group
+        :param tensor: A :class:`torch.Tensor` object
+        :type group_id: int
+        :type tensor: torch.Tensor
+        """
+        self._averaged_gradients.setdefault(group_id, []).append(tensor)
+
+    def reset_average_gradients_by_group(self, group_id: int) -> None:
+        """
+        Replace the averaged-gradient list of a parameter group with a new empty list
+
+        :param group_id: The index of a parameter group
+        :type group_id: int
+        """
+        self._averaged_gradients[group_id] = []
--- a/colossalai/zero/sharded_optim/bookkeeping/parameter_store.py
+++ b/colossalai/zero/sharded_optim/bookkeeping/parameter_store.py
+from typing import List
+
+from torch import Tensor
+
+from .base_store import BaseStore
+
+
+class ParameterStore(BaseStore):
+    """
+    Bookkeeping of the fp16 parameter partitioning and of the per-parameter reduction state.
+
+    :param dp_paralle_mode: Parallel mode forwarded to :class:`BaseStore`
+    """
+
+    def __init__(self, dp_paralle_mode):
+        super().__init__(dp_paralle_mode)
+
+        # partitioning: param -> rank, and rank -> group id -> param list / flat tensor
+        self._fp16_param_to_rank = {}
+        self._rank_groupid_to_fp16_param_list = {}
+        self._rank_group_id_to_flat_fp16_param = {}
+
+        # reduction: param -> state, plus the params recorded as previously reduced
+        self._is_param_reduced = {}
+        self._reduced_param = []
+
+    def set_param_to_rank(self, tensor: Tensor, rank: int) -> None:
+        """
+        Record which rank owns a parameter, each parameter should be owned by a rank.
+
+        :param tensor: A :class:`torch.Tensor` object
+        :type tensor: torch.Tensor
+        :param rank: The rank of which the process is responsible for updating the parameter
+        :type rank: int
+        """
+        self._fp16_param_to_rank[tensor] = rank
+
+    def get_param_rank(self, tensor: Tensor) -> int:
+        """
+        Return the rank recorded for the parameter
+
+        :param tensor: A :class:`torch.Tensor` object
+        :type tensor: torch.Tensor
+        """
+        owner = self._fp16_param_to_rank[tensor]
+        return owner
+
+    def belongs_to_current_rank(self, tensor) -> bool:
+        """
+        Tell whether the rank recorded for a parameter equals the local rank
+
+        :param tensor: A :class:`torch.Tensor` object
+        :type tensor: torch.Tensor
+
+        :return: True if the parameter should be updated by the current rank. Otherwise false.
+        :rtype: bool
+        """
+        return self._fp16_param_to_rank[tensor] == self._local_rank
+
+    def add_fp16_param_list_by_rank_group(self, rank, group_id, tensor_list) -> None:
+        """Extend the param list kept for (rank, group_id), creating missing levels on the way."""
+        groups_of_rank = self._rank_groupid_to_fp16_param_list.setdefault(rank, {})
+        groups_of_rank.setdefault(group_id, []).extend(tensor_list)
+
+    def get_fp16_params_by_rank_group(self, rank, group_id) -> List[Tensor]:
+        """Return the param list kept for (rank, group_id)."""
+        groups_of_rank = self._rank_groupid_to_fp16_param_list[rank]
+        return groups_of_rank[group_id]
+
+    def add_flat_fp16_param_by_rank_group(self, rank, group_id, tensor) -> None:
+        """Store ``tensor`` as the flat param of (rank, group_id), overwriting any previous one."""
+        self._rank_group_id_to_flat_fp16_param.setdefault(rank, {})[group_id] = tensor
+
+    def get_flat_fp16_param_by_rank_group(self, rank, group_id) -> Tensor:
+        """Return the flat param kept for (rank, group_id)."""
+        flat_of_rank = self._rank_group_id_to_flat_fp16_param[rank]
+        return flat_of_rank[group_id]
+
+    def is_param_reduced(self, tensor):
+        """Return the reduction state recorded for ``tensor``."""
+        return self._is_param_reduced[tensor]
+
+    def set_param_reduction_state(self, tensor, state):
+        """Record ``state`` as the reduction state of ``tensor``."""
+        self._is_param_reduced[tensor] = state
+
+    def get_param_reduction_states(self):
+        """Return the whole param -> reduction state mapping (the internal dict itself)."""
+        return self._is_param_reduced
+
+    def reset_previous_reduced_params(self):
+        """Forget every recorded previously reduced param."""
+        self._reduced_param = []
+
+    def add_previous_reduced_param(self, tensor):
+        """Record ``tensor`` as a previously reduced param."""
+        self._reduced_param.append(tensor)
+
+    def clear_grads_of_previous_reduced_params(self):
+        """Set ``grad`` of every recorded param to ``None`` and then forget the params."""
+        if not self._reduced_param:
+            return
+
+        for reduced in self._reduced_param:
+            reduced.grad = None
+        self.reset_previous_reduced_params()
--- a/colossalai/zero/sharded_optim/bookkeeping/tensor_bucket.py
+++ b/colossalai/zero/sharded_optim/bookkeeping/tensor_bucket.py
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
+
+class TensorBucket:
+    """
+    A list of tensors with a running element count and a size limit.
+
+    :param size: The maximum total number of elements the bucket is meant to hold
+    """
+
+    def __init__(self, size):
+        self._max_size = size
+        self._current_size = 0
+        self._bucket = []
+
+    @property
+    def max_size(self):
+        """The size limit given at construction time."""
+        return self._max_size
+
+    @property
+    def current_size(self):
+        """The summed ``numel()`` of the tensors currently in the bucket."""
+        return self._current_size
+
+    def is_full_or_oversized(self):
+        """Return True if the current size has reached or passed the limit."""
+        return self._current_size >= self._max_size
+
+    def is_empty(self):
+        """Return True if the bucket holds no tensor."""
+        return len(self._bucket) == 0
+
+    def add_to_bucket(self, tensor, allow_oversize=False):
+        """
+        Append a tensor to the bucket and add its element count to the current size.
+
+        :param tensor: The tensor to add
+        :param allow_oversize: If True, the tensor is added even when the limit would be exceeded
+        :raises RuntimeError: If the limit would be exceeded and ``allow_oversize`` is False
+        """
+        tensor_size = tensor.numel()
+
+        if not allow_oversize and self.will_exceed_max_size(tensor_size):
+            # the two message halves need a separating space
+            msg = (f"The param bucket max size {self._max_size} is exceeded "
+                   f"by tensor (size {tensor_size})")
+            raise RuntimeError(msg)
+
+        self._bucket.append(tensor)
+        self._current_size += tensor_size
+
+    def will_exceed_max_size(self, tensor_size):
+        """Return True if adding ``tensor_size`` more elements would pass the limit."""
+        expected_size = self._current_size + tensor_size
+        return expected_size > self._max_size
+
+    def get_bucket(self):
+        """Return the internal list of tensors."""
+        return self._bucket
+
+    def empty(self):
+        """Drop all tensors and reset the current size to zero."""
+        self._bucket = []
+        # the counter is `_current_size`; resetting any other attribute leaves a
+        # stale size behind, so the emptied bucket would still look (partly) full
+        self._current_size = 0
+
+    def flatten(self):
+        """Return the bucket's tensors flattened into a single tensor."""
+        return _flatten_dense_tensors(self._bucket)
+
+    def unflatten_and_copy(self, flat_tensor):
+        """
+        Split ``flat_tensor`` into pieces shaped like the bucket's tensors and
+        copy every piece into the corresponding tensor in place.
+
+        :param flat_tensor: A flat tensor laid out like the result of :meth:`flatten`
+        """
+        unflattened_tensor_list = _unflatten_dense_tensors(flat_tensor, self._bucket)
+        for old, new in zip(self._bucket, unflattened_tensor_list):
+            old.copy_(new)
--- a/colossalai/zero/sharded_optim/low_level_optim.py
+++ b/colossalai/zero/sharded_optim/low_level_optim.py
+from functools import partial
+from itertools import groupby
+
+import torch
+import torch.distributed as dist
+from torch.optim import Optimizer
+
+from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import get_dist_logger
+from colossalai.nn.optimizer import ColossalaiOptimizer
+from colossalai.utils.cuda import get_current_device
+
+from ._utils import (
+    calculate_global_norm_from_list,
+    compute_norm,
+    flatten,
+    get_grad_accumulate_object,
+    has_inf_or_nan,
+    reduce_tensor,
+    release_param_grad,
+    split_half_float_double,
+    sync_param,
+)
+from .bookkeeping import BucketStore, GradientStore, ParameterStore, TensorBucket
+
+
+class LowLevelZeroOptimizer(ColossalaiOptimizer):
+    """Optimizer used for ZeRO-1 and ZeRO-2.
+    """
+
+    def __init__(
+            self,
+            optimizer: Optimizer,
+
+    # grad scaler config
+            initial_scale=2**16,
+            min_scale=1,
+            growth_factor=2,
+            backoff_factor=0.5,
+            growth_interval=2000,
+            hysteresis=2,
+            max_scale: int = 2**24,
+
+    # grad clipping
+            clip_grad_norm=0.0,
+            verbose=False,
+
+    # communication
+            reduce_bucket_size=1024 * 1024,
+            communication_dtype=None,
+            overlap_communication=False,
+
+    # stage 2
+            partition_grad=False,
+            dp_parallel_mode=ParallelMode.DATA,
+            mp_parallel_mode=ParallelMode.MODEL,
+
+    # cpu offload
+            cpu_offload=False,
+
+    # forced dtype
+            forced_dtype=None):
+
+        # TODO: add support for
+        # 1. fp16 master weights
+        # 2. contiguous gradients
+        # 3. cpu offload
+        # 4. support when some parameters requires_grad = False
+        super(LowLevelZeroOptimizer, self).__init__(optim=optimizer)
+        self._dtype = self.optim.param_groups[0]['params'][0].dtype
+        self._logger = get_dist_logger()
+        self._verbose = verbose
+
+        # stage 2
+        self._partition_grads = partition_grad
+
+        # cpu_offload
+        self._cpu_offload = cpu_offload
+
+        # get process groups
+        self._dp_parallel_mode = dp_parallel_mode
+        self._mp_parallel_mode = mp_parallel_mode
+        self._local_rank = gpc.get_local_rank(dp_parallel_mode)
+        self._world_size = gpc.get_world_size(dp_parallel_mode)
+
+        self._dp_group = gpc.get_group(dp_parallel_mode)
+        if gpc.is_initialized(mp_parallel_mode) and gpc.get_world_size(mp_parallel_mode) > 1:
+            self._mp_group = gpc.get_group(mp_parallel_mode)
+        else:
+            self._mp_group = None
+
+        # fp16 and fp32 params for mixed precision training
+        self._fp16_param_groups = dict()
+        self._fp32_flat_param_groups_of_current_rank = dict()
+
+        # communication params
+        self._overlap_communication = overlap_communication
+        self._reduce_bucket_size = reduce_bucket_size
+        self._communication_dtype = communication_dtype
+
+        # gradient scaler
+        self.grad_scaler = DynamicGradScaler(initial_scale=initial_scale,
+                                             min_scale=min_scale,
+                                             growth_factor=growth_factor,
+                                             backoff_factor=backoff_factor,
+                                             growth_interval=growth_interval,
+                                             hysteresis=hysteresis,
+                                             max_scale=max_scale,
+                                             verbose=verbose)
+        self._found_overflow = torch.FloatTensor([0]).to(get_current_device())
+
+        # gradient clipping
+        self._clip_grad_norm = clip_grad_norm
+
+        if forced_dtype:
+            for group in self.optim.param_groups:
+                group_params = group['params']
+                for param in group_params:
+                    param.data = param.data.to(forced_dtype)
+            self._dtype = forced_dtype
+
+        # check argument conflict
+        self._sanity_checks()
+
+        # ParameterStore will manage the tensor buffers used for zero
+        # it will not manage the tensors used by mixed precision training
+        self._param_store = ParameterStore(self._dp_parallel_mode)
+        self._grad_store = GradientStore(self._dp_parallel_mode)
+        self._bucket_store = BucketStore(self._dp_parallel_mode)
+
+        # iterate over the param group in the optimizer
+        # partition these param groups for data parallel training
+        # and add buffers to parameter store for future access
+        for group_id, param_group in enumerate(self.optim.param_groups):
+            group_params = param_group['params']
+
+            # add the fp16 params to fp16_param_groups for bookkeeping
+            self._fp16_param_groups[group_id] = group_params
+
+            # assign parameters to ranks
+            # the params in the list are sorted
+            params_per_rank = self._partition_param_list(group_params)
+
+            # store the mapping between param to rank
+            # each param should belong to only one rank
+            for rank, params in enumerate(params_per_rank):
+                self._param_store.add_fp16_param_list_by_rank_group(rank, group_id, params)
+                for param in params:
+                    self._param_store.set_param_to_rank(param, rank)
+
+            # move to cpu to make room to create the flat tensor
+            # move_tensor(params, device='cpu')
+            for param in group_params:
+                param.data = param.data.cpu()
+
+            # flatten the reordered tensors
+            for rank in range(self._world_size):
+                tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id)
+                with torch.no_grad():
+                    flat_tensor = flatten(tensor_list)
+                flat_tensor = flat_tensor.data.cuda()
+                self._param_store.add_flat_fp16_param_by_rank_group(rank, group_id, flat_tensor)
+
+            # sync parameters
+            for rank in range(self._world_size):
+                flat_tensor = self._param_store.get_flat_fp16_param_by_rank_group(rank, group_id)
+                tensor_list = self._param_store.get_fp16_params_by_rank_group(rank, group_id)
+                sync_param(flat_tensor=flat_tensor, tensor_list=tensor_list)
+
+            # create a copy of fp32 weights of the parameters for which this rank is responsible
+            fp16_flat_current_rank = self._param_store.get_flat_fp16_param_by_rank_group(self._local_rank, group_id)
+            fp32_flat_current_rank = fp16_flat_current_rank.float()
+            device = 'cpu' if self._cpu_offload else get_current_device()
+            fp32_flat_current_rank = fp32_flat_current_rank.to(device)
+            fp32_flat_current_rank.requires_grad = True
+            self._fp32_flat_param_groups_of_current_rank[group_id] = fp32_flat_current_rank
+
+            # need to replace the params in the `params` field in the optimizer
+            # so that when the optimizer calls step(), it only updates the tensors
+            # managed by this data parallel rank
+            param_group['params'] = [fp32_flat_current_rank]
+
+            # set reduction state
+            for param in self._fp16_param_groups[group_id]:
+                self._param_store.set_param_reduction_state(param, False)
+
+        # initialize communication stream for
+        # communication-computation overlapping
+        if self._overlap_communication:
+            self._comm_stream = torch.cuda.Stream()
+
+        # reduction hook is only used if overlapping communication
+        # or stage 2 is used
+        # if it is stage 1 without overlapping, no hook will be attached
+        if self._overlap_communication or self._partition_grads:
+            self._attach_reduction_hook()
+
+    @property
+    def dtype(self):
+        return self._dtype
+
+    @property
+    def loss_scale(self):
+        return self.grad_scaler.scale
+
+    @property
+    def num_param_groups(self):
+        return len(self._fp16_param_groups)
+
+    def _partition_param_list(self, param_list):
+        params_per_rank = [[] for _ in range(self._world_size)]
+        numel_per_rank = [0 for _ in range(self._world_size)]
+
+        # partition the parameters in a greedy fashion
+        sorted_params = sorted(param_list, key=lambda x: x.numel(), reverse=True)
+        for param in sorted_params:
+            # allocate this parameter to the rank with
+            # the smallest numel for load balancing purpose
+            rank_to_go = numel_per_rank.index(min(numel_per_rank))
+            params_per_rank[rank_to_go].append(param)
+            numel_per_rank[rank_to_go] += param.numel()
+
+        if self._verbose:
+            self._logger.info(f'Number of elements on ranks: {numel_per_rank}',
+                              ranks=[0],
+                              parallel_mode=self._dp_parallel_mode)
+        return params_per_rank
+
+    def _sanity_checks(self):
+        """Assert that CUDA is available and that every optimizer parameter has ``self._dtype``.
+
+        :raises AssertionError: if CUDA is unavailable or a parameter in
+            ``self.optim.param_groups`` has a different dtype
+        """
+        assert torch.cuda.is_available(), 'CUDA is required'
+        for group in self.optim.param_groups:
+            for tensor in group['params']:
+                assert tensor.dtype == self._dtype, \
+                    f"Parameters are expected to have the same dtype `{self._dtype}`, but got `{tensor.dtype}`"
+
+    ###########################################################
+    # Backward Reduction Hook
+    ###########################################################
+
+    def _attach_reduction_hook(self):
+        """Register a gradient-reduction hook for every trainable fp16 parameter.
+
+        For each parameter with ``requires_grad`` set, its AccumulateGrad object
+        is obtained through ``get_grad_accumulate_object``, recorded in
+        ``self._grad_store`` and given a hook that calls
+        ``self._reduce_and_remove_grads_by_bucket`` for that parameter.
+        """
+
+        def _hook_up(target, dst_rank):
+            # the hook is attached to the AccumulateGrad object of the param itself
+            grad_acc = get_grad_accumulate_object(target)
+            self._grad_store.add_accumulate_grad_object(grad_acc)
+
+            bucket_reduce = partial(self._reduce_and_remove_grads_by_bucket, param=target, reduce_rank=dst_rank)
+
+            # whatever autograd passes to the hook is ignored
+            # (presumably not the gradient itself -- TODO confirm)
+            def reduce_grad_hook(*args):
+                bucket_reduce()
+
+            grad_acc.register_hook(reduce_grad_hook)
+
+        for group_id in range(self.num_param_groups):
+            for param in self._fp16_param_groups[group_id]:
+                if not param.requires_grad:
+                    continue
+
+                # with gradient partitioning the destination is the rank recorded
+                # in the param store; otherwise no destination rank is passed
+                dst_rank = self._param_store.get_param_rank(param) if self._partition_grads else None
+                _hook_up(param, dst_rank)
+
+    def _reduce_and_remove_grads_by_bucket(self, param, reduce_rank=None):
+        """Queue the gradient of ``param`` in the bucket kept for ``reduce_rank``.
+
+        If adding the parameter would push the bucket past
+        ``self._reduce_bucket_size``, the bucket is reduced first.
+
+        :param param: parameter whose ``grad`` is queued for reduction
+        :param reduce_rank: bucket key, forwarded unchanged to the bucket store
+        :raises RuntimeError: if the param store reports the parameter as already reduced
+        """
+        numel = param.numel()
+        store = self._bucket_store
+
+        # flush first when this parameter does not fit next to what is queued
+        pending = store.num_elements_in_bucket(reduce_rank)
+        if pending + numel > self._reduce_bucket_size:
+            self._reduce_grads_in_bucket(reduce_rank)
+
+        # a parameter must never be reduced twice
+        if self._param_store.is_param_reduced(param):
+            raise RuntimeError(f'Parameter of size ({param.size()}) has already been reduced, '
+                               'duplicate reduction will lead to arithmetic incorrectness')
+
+        # a gradient is required for reduction
+        assert param.grad is not None, f'Parameter of size ({param.size()}) has None grad, cannot be reduced'
+
+        store.add_num_elements_in_bucket(numel, reduce_rank)
+        store.add_grad(param.grad, reduce_rank)
+        store.add_param(param, reduce_rank)
+
+    def _reduce_grads_in_bucket(self, reduce_rank=None):
+        """Reduce every gradient queued in the bucket for ``reduce_rank`` and reset that bucket.
+
+        After the reduction is issued, each parameter in the bucket is flagged
+        as reduced in the param store. When gradients are partitioned, the
+        gradient of a parameter not owned by this rank is either dropped
+        immediately or, with overlapped communication, recorded in the param
+        store so it can be cleared later.
+
+        :param reduce_rank: bucket key, forwarded to the bucket store and to
+            ``_reduce_grads_by_rank``
+        :raises RuntimeError: if a parameter in the bucket is already flagged as reduced
+        """
+        # reduce grads
+        self._reduce_grads_by_rank(reduce_rank=reduce_rank,
+                                   grads=self._bucket_store.get_grad(reduce_rank=reduce_rank),
+                                   bucket_size=self._bucket_store.num_elements_in_bucket(reduce_rank))
+
+        # use communication stream if overlapping
+        # communication with computation
+        if self._overlap_communication:
+            stream = self._comm_stream
+        else:
+            stream = torch.cuda.current_stream()
+
+        with torch.cuda.stream(stream):
+            params_in_bucket = self._bucket_store.get_param(reduce_rank=reduce_rank)
+
+            for param in params_in_bucket:
+                # the is_param_reduced flag should be False, showing that
+                # this param was not reduced before calling self._reduce_grads_by_rank
+                is_param_reduced = self._param_store.is_param_reduced(param)
+
+                if is_param_reduced:
+                    msg = f'Parameter of size ({param.size()}) has been reduced, ' + \
+                          'duplicate reduction will lead to arithmetic incorrectness'
+                    raise RuntimeError(msg)
+
+                # update the flag
+                self._param_store.set_param_reduction_state(param, True)
+
+                # if partition grads = True
+                # we do not keep the gradient after reduction
+                if self._partition_grads and not self._param_store.belongs_to_current_rank(param):
+                    if self._overlap_communication:
+                        # we need to keep this gradient for now as the reduction may
+                        # not be completed yet since it is using a different cuda stream
+                        self._param_store.add_previous_reduced_param(param)
+                    else:
+                        param.grad = None
+
+        # empty the bucket for this rank so it can be filled again
+        self._bucket_store.reset_by_rank(reduce_rank)
+
+    def _reduce_grads_by_rank(self, reduce_rank, grads, bucket_size):
+        """Reduce ``grads`` group by group.
+
+        ``grads`` is split by ``split_half_float_double`` (presumably into one
+        list per dtype -- TODO confirm) and each resulting list is passed to
+        ``_reduce_no_retain``.
+
+        :param reduce_rank: destination rank forwarded to the reduction
+        :param grads: gradients to reduce
+        :param bucket_size: bucket size forwarded to ``_reduce_no_retain``
+        """
+        for grad_group in split_half_float_double(grads):
+            self._reduce_no_retain(tensor_list=grad_group, bucket_size=bucket_size, reduce_rank=reduce_rank)
+
+    ##############################
+    # Reduction Utility Function #
+    ##############################
+    def _reduce_no_retain(self, tensor_list, bucket_size, reduce_rank):
+        """Pack ``tensor_list`` into a ``TensorBucket`` and reduce it whenever it fills up.
+
+        :param tensor_list: tensors to reduce
+        :param bucket_size: size given to the temporary ``TensorBucket``
+        :param reduce_rank: destination rank forwarded to ``_reduce_and_copy``
+        """
+        bucket = TensorBucket(size=bucket_size)
+
+        for grad in tensor_list:
+            bucket.add_to_bucket(grad, allow_oversize=True)
+            if not bucket.is_full_or_oversized():
+                continue
+            self._reduce_and_copy(bucket=bucket, reduce_rank=reduce_rank)
+            bucket.empty()
+
+        # reduce whatever is still waiting in the bucket
+        if bucket.is_empty():
+            return
+        self._reduce_and_copy(bucket=bucket, reduce_rank=reduce_rank)
+
+    def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank):
+        """Flatten ``bucket``, reduce it and copy the result back into the bucket's tensors.
+
+        :param bucket: bucket holding the tensors to reduce
+        :param reduce_rank: passed to ``reduce_tensor`` as ``dst_rank``; the result
+            is copied back only when it is ``None`` or equals ``self._local_rank``
+        """
+        if self._overlap_communication:
+            # wait for all outstanding CUDA work before clearing the gradients
+            # recorded by earlier reductions, then use the communication stream
+            torch.cuda.synchronize()
+            self._param_store.clear_grads_of_previous_reduced_params()
+            stream = self._comm_stream
+        else:
+            stream = torch.cuda.current_stream()
+
+        with torch.cuda.stream(stream):
+            flat = bucket.flatten()
+            reduced_flat = reduce_tensor(tensor=flat,
+                                         dtype=self._communication_dtype,
+                                         dst_rank=reduce_rank,
+                                         parallel_mode=self._dp_parallel_mode)
+
+            # update the reduced tensor
+            if reduce_rank is None or reduce_rank == self._local_rank:
+                bucket.unflatten_and_copy(reduced_flat)
+
+    ################################
+    # torch.optim.Optimizer methods
+    ################################
+
+    def backward(self, loss, retain_graph=False):
+        """Back-propagate the scaled loss and finish the gradient reduction.
+
+        :param loss: loss tensor; it is multiplied by ``self.loss_scale`` before ``backward``
+        :param retain_graph: forwarded to ``Tensor.backward``
+        """
+        scaled_loss = self.loss_scale * loss
+        scaled_loss.backward(retain_graph=retain_graph)
+
+        # reduce whatever gradients are still pending
+        if self._partition_grads:
+            # TODO: support async comm in reduce
+            self._reduce_grad_stage2()
+        else:
+            self._reduce_grad_stage1()
+
+        # nothing is left to clear unless communication was overlapped
+        if not self._overlap_communication:
+            return
+        torch.cuda.synchronize()
+        self._param_store.clear_grads_of_previous_reduced_params()
+
+    def zero_grad(self, set_to_none=True):
+        """
+        Set parameter gradients to zero. If set_to_none = True, gradient
+        will be set to None to save memory.
+
+        :param set_to_none: Whether set the gradient to None. Default value is True.
+        :type set_to_none: bool
+        """
+        for param_group in self._fp16_param_groups.values():
+            for param in param_group:
+                if set_to_none:
+                    param.grad = None
+                elif param.grad is not None:
+                    # detach in place before zeroing, as torch.optim.Optimizer.zero_grad
+                    # does; the out-of-place `detach()` returns a new tensor and
+                    # leaves `param.grad` attached to its graph
+                    if param.grad.grad_fn is not None:
+                        param.grad.detach_()
+                    else:
+                        param.grad.requires_grad_(False)
+                    param.grad.zero_()
+
+    ####################
+    # Update Parameter #
+    ####################
+
+    def step(self, closure=None):
+        """Update the parameter partition owned by this rank and broadcast the new weights.
+
+        The step is skipped (accumulated gradients dropped, grads cleared) when
+        ``_check_overflow`` reports an inf/nan gradient. Otherwise the averaged
+        fp16 gradients of each group are flattened, cast to the dtype of the
+        flat fp32 parameter and installed as its ``.grad``; they are then
+        unscaled/clipped, the wrapped optimizer is stepped, the updated values
+        are copied into the flat fp16 partition of this rank, and every rank's
+        partition is broadcast over ``self._dp_group``.
+
+        :param closure: must be ``None``; closures are not supported
+        """
+        assert closure is None, 'closure is not supported by step()'
+
+        # check for overflow
+        found_inf = self._check_overflow()
+        self.grad_scaler.update(found_inf)
+
+        # skip this update if overflow occurs
+        if found_inf:
+            self._grad_store._averaged_gradients = dict()
+            self.zero_grad()
+            return
+
+        # copy the grad of fp16 param to fp32 param
+        single_grad_partition_groups = []
+        norm_groups = []
+
+        for group_id in range(self.num_param_groups):
+            # compute norm
+            norm_group = compute_norm(gradients=self._grad_store._averaged_gradients[group_id],
+                                      params=self._param_store.get_fp16_params_by_rank_group(group_id=group_id,
+                                                                                             rank=self._local_rank),
+                                      dp_group=self._dp_group,
+                                      mp_group=self._mp_group)
+            norm_groups.append(norm_group)
+
+            # create flat gradient for the flat fp32 params
+            fp16_avg_grads = self._grad_store.get_averaged_gradients_by_group(group_id)
+            flat_fp16_avg_grads = flatten(fp16_avg_grads)
+
+            fp32_flat_param = self._fp32_flat_param_groups_of_current_rank[group_id]
+            flat_fp32_avg_grads = flat_fp16_avg_grads.to(fp32_flat_param.dtype)
+
+            param_shape = fp32_flat_param.shape
+            assert param_shape == flat_fp32_avg_grads.shape, \
+                f'fp32 param and grad have different shape {param_shape} vs {flat_fp32_avg_grads.shape}'
+
+            # move the gradient to the parameter's device BEFORE queueing it for
+            # unscaling: `.to()` returns a new tensor when the device differs, and
+            # the in-place unscale/clip below must act on the tensor that is
+            # actually installed as `.grad`
+            flat_fp32_avg_grads = flat_fp32_avg_grads.to(fp32_flat_param.device)
+            fp32_flat_param.grad = flat_fp32_avg_grads
+            single_grad_partition_groups.append(flat_fp32_avg_grads)
+
+            # the averaged gradients of this group have been consumed
+            self._grad_store._averaged_gradients[group_id] = []
+
+        # unscale and clip grads
+        global_norm = calculate_global_norm_from_list(norm_list=norm_groups)
+        self._unscale_and_clip_grads(single_grad_partition_groups, global_norm)
+
+        # update the parameters
+        self.optim.step()
+        # release the fp32 grad
+        release_param_grad(self._fp32_flat_param_groups_of_current_rank.values())
+
+        # update fp16 partition updated by the current rank
+        for group_id in range(self.num_param_groups):
+            fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=self._local_rank, group_id=group_id)
+            fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
+            fp16_param.data.copy_(fp32_param)
+
+        # broadcast the updated model weights
+        handles = []
+        for group_id in range(self.num_param_groups):
+            for rank in range(self._world_size):
+                fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id)
+                handle = dist.broadcast(fp16_param, src=rank, group=self._dp_group, async_op=True)
+                handles.append(handle)
+
+        # wait for every asynchronous broadcast to finish
+        for handle in handles:
+            handle.wait()
+
+    ##################
+    # FP16 Utilities #
+    ##################
+
+    def _check_overflow(self):
+        """Return ``True`` if any rank holds an inf/nan averaged gradient.
+
+        The local result is written into ``self._found_overflow`` and combined
+        with a MAX all-reduce over ``self._dp_group`` and, when set,
+        ``self._mp_group``.
+
+        :return: ``True`` when the reduced overflow flag is positive, else ``False``
+        """
+        # clear previous overflow record
+        self._found_overflow.fill_(0.0)
+
+        # check for overflow; `any` stops at the first bad gradient across all
+        # groups (a plain `break` would only leave the inner loop)
+        local_overflow = any(avg_grad is not None and has_inf_or_nan(avg_grad)
+                             for group_id in range(len(self._fp16_param_groups))
+                             for avg_grad in self._grad_store.get_averaged_gradients_by_group(group_id))
+        if local_overflow:
+            self._found_overflow.fill_(1.0)
+
+        # all-reduce across dp group
+        dist.all_reduce(self._found_overflow, op=dist.ReduceOp.MAX, group=self._dp_group)
+
+        # all-reduce over model parallel group
+        if self._mp_group:
+            dist.all_reduce(self._found_overflow, op=dist.ReduceOp.MAX, group=self._mp_group)
+
+        return self._found_overflow.item() > 0
+
+    def _unscale_and_clip_grads(self, grad_groups_flat, total_norm):
+        """Divide every flat gradient in place by the loss scale, shrinking further when clipping applies.
+
+        :param grad_groups_flat: flat gradient tensors, modified in place
+        :param total_norm: gradient norm, still multiplied by the loss scale
+        """
+        scale = self.loss_scale
+        divisor = scale
+
+        if self._clip_grad_norm > 0.:
+            # total_norm still carries the loss scale, so remove it before comparing
+            ratio = ((total_norm / scale) + 1e-6) / self._clip_grad_norm
+            if ratio > 1:
+                divisor = ratio * scale
+
+        for flat_grad in grad_groups_flat:
+            flat_grad.data.mul_(1. / divisor)
+
+    ############################
+    # Gradient Synchronization #
+    ############################
+
+    def sync_grad(self):
+        """Fold the current gradients of this rank's parameters into the averaged-gradient buffer.
+
+        All reduction flags in the param store are reset to ``False``. Then,
+        for each group, every non-``None`` gradient of a parameter owned by
+        this rank is appended to the buffer, or added in place to the entry at
+        the same position when one already exists. ``zero_grad`` is called last.
+        """
+        # mark every tracked parameter as not reduced
+        states = self._param_store.get_param_reduction_states()
+        for key in states:
+            states[key] = False
+
+        # accumulate gradient
+        accumulated = self._grad_store._averaged_gradients
+        for group_id in range(self.num_param_groups):
+            owned_params = self._param_store.get_fp16_params_by_rank_group(self._local_rank, group_id)
+            slots = accumulated.setdefault(group_id, [])
+
+            slot = 0
+            for param in owned_params:
+                if param.grad is None:
+                    continue
+                if len(slots) == slot:
+                    slots.append(param.grad)
+                else:
+                    slots[slot].add_(param.grad)
+                slot += 1
+
+        # the gradients needed now live in the buffer, so the params' grads can be cleared
+        self.zero_grad()
+
+    def _reduce_grad_stage1(self):
+        """Finish gradient reduction when gradients are not partitioned.
+
+        Without overlapped communication, every parameter that has a gradient
+        is queued here for bucketed reduction; in all cases the remaining
+        bucket content is reduced at the end.
+        """
+        if not self._overlap_communication:
+            # queue the gradients manually in this case
+            for group_id in range(len(self._fp16_param_groups)):
+                for param in self._fp16_param_groups[group_id]:
+                    if param.grad is None:
+                        continue
+                    self._reduce_and_remove_grads_by_bucket(param)
+
+        # reduce the gradients left in the communication bucket
+        self._reduce_grads_in_bucket()
+
+    def _reduce_grad_stage2(self):
+        """Finish gradient reduction when gradients are partitioned.
+
+        Only the gradients still sitting in the per-rank communication buckets
+        are reduced here, one bucket per rank in ``range(self._world_size)``.
+        """
+        for rank in range(self._world_size):
+            self._reduce_grads_in_bucket(rank)
--- a/colossalai/zero/utils/zero_hook_v2.py
+++ b/colossalai/zero/utils/zero_hook_v2.py
-import torch
-from colossalai.tensor.param_op_hook import ParamOpHook
-from colossalai.gemini import TensorState
-from enum import Enum
-from typing import List
 from contextlib import contextmanager
+from enum import Enum
 from functools import partial
+from typing import List
+
+import torch
+
+from colossalai.gemini import TensorState
 from colossalai.gemini.gemini_mgr import GeminiManager
+from colossalai.tensor.param_op_hook import ColoParamOpHook


 class TrainingPhase(Enum):
@@ -13,7 +15,7 @@ class TrainingPhase(Enum):
    BACKWARD = 1


-class ZeROHookV2(ParamOpHook):
+class GeminiZeROHook(ColoParamOpHook):

    def __init__(self, gemini_manager: GeminiManager) -> None:
        super().__init__()
@@ -30,7 +32,9 @@ class ZeROHookV2(ParamOpHook):
        self._gemini_manager.adjust_layout(chunks)
        for chunk in chunks:
            self._chunk_manager.access_chunk(chunk)
-        self._gemini_manager.sample_model_data()
+
+        # record cuda model data of the current OP
+        self._gemini_manager.record_model_data_volume()

    def post_op(self, params):
        params = [p for p in params if not getattr(p, '_ddp_to_ignore', False)]

--- a/colossalai/zero/utils/zero_hook.py
+++ b/colossalai/zero/utils/zero_hook.py
@@ -2,23 +2,22 @@ from typing import Optional

 import torch
 import torch.distributed as dist
+
+from colossalai.gemini.memory_tracer import MemStatsCollector
+from colossalai.gemini.ophooks import BaseOpHook
+from colossalai.gemini.stateful_tensor import TensorState
+from colossalai.gemini.stateful_tensor_mgr import StatefulTensorMgr
 from colossalai.logging import get_dist_logger
 from colossalai.registry import OPHOOKS
-
 from colossalai.utils import get_current_device
-
 from colossalai.zero.shard_utils import BaseShardStrategy
-from colossalai.gemini.ophooks import BaseOpHook
-
-from colossalai.gemini.stateful_tensor_mgr import StatefulTensorMgr
-from colossalai.gemini.memory_tracer import MemStatsCollector
-from colossalai.gemini.stateful_tensor import TensorState


 @OPHOOKS.register_module
 class ZeroHook(BaseOpHook):
    """
    A hook to process sharded param for ZeRO method.
+    Warning: this class has been deprecated after version 0.1.12
    """

    def __init__(self,
@@ -68,7 +67,7 @@ class ZeroHook(BaseOpHook):

        # record model data statistics
        if self._memstarts_collector:
-            self._memstarts_collector.sample_model_data()
+            self._memstarts_collector.record_model_data_volume()

    def pre_fwd_exec(self, module: torch.nn.Module, *args):
        self.adjust_module_data(module)

--- a/docker/Dockerfile
+++ b/docker/Dockerfile
 FROM hpcaitech/cuda-conda:11.3

 # install torch
-RUN conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
+RUN conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch

 # install apex
 RUN git clone https://github.com/NVIDIA/apex && \
    cd apex && \
-    pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
+    pip install packaging && \
+    pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" ./

 # install colossalai
 RUN git clone https://github.com/hpcaitech/ColossalAI.git \
- && cd ./ColossalAI \
- && pip install -v --no-cache-dir .
+    && cd ./ColossalAI \
+    && CUDA_EXT=1 pip install -v --no-cache-dir .

 # install titans
 RUN pip install --no-cache-dir titans

--- a/docs/colossalai/colossalai.pipeline.layer_sepc.rst
+++ b/docs/colossalai/colossalai.pipeline.layer_sepc.rst
 colossalai.pipeline.layer\_sepc
 ===============================

-.. automodule:: colossalai.pipeline.layer_sepc
+.. automodule:: colossalai.pipeline.layer_spec
   :members:
--- a/docs/colossalai/colossalai.pipeline.rst
+++ b/docs/colossalai/colossalai.pipeline.rst
@@ -8,6 +8,6 @@ colossalai.pipeline
 .. toctree::
   :maxdepth: 2

-   colossalai.pipeline.layer_sepc
+   colossalai.pipeline.layer_spec
   colossalai.pipeline.pipelinable
   colossalai.pipeline.utils
--- a/docs/colossalai/colossalai.zero.utils.rst
+++ b/docs/colossalai/colossalai.zero.utils.rst
@@ -9,4 +9,4 @@ colossalai.zero.utils
   :maxdepth: 2

   colossalai.zero.utils.zero_hook
-   colossalai.zero.utils.zero_hook_v2
+   colossalai.zero.utils.gemini_hook
--- a/docs/colossalai/colossalai.zero.utils.zero_hook_v2.rst
+++ b/docs/colossalai/colossalai.zero.utils.zero_hook_v2.rst
 colossalai.zero.utils.zero\_hook\_v2
 ====================================

-.. automodule:: colossalai.zero.utils.zero_hook_v2
+.. automodule:: colossalai.zero.utils.gemini_hook
   :members:
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
 tensorboard 
-deepspeed 
 apex 
 sphinx 
 sphinx-rtd-theme 
-myst-parser
\ No newline at end of file
+myst-parser