Merge branch 'main' into feature/shardformer

a39a5c66 · Hongxin Liu · GitHub · e79b1e80 · aaeb520c · a39a5c66
Unverified Commit a39a5c66 authored Sep 04, 2023 by Hongxin Liu Committed by GitHub Sep 04, 2023
20 changed files
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@@ -2,19 +2,21 @@ import itertools
 from collections import OrderedDict
 from contextlib import nullcontext
 from functools import partial
-from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union

 import torch
 import torch.distributed as dist
 import torch.nn as nn
+from torch.distributed import ProcessGroup
+from torch.distributed.distributed_c10d import _get_default_group
+
+from colossalai.checkpoint_io.utils import calculate_tensor_size, StateDictSharder
+from colossalai.interface import ModelWrapper

-from colossalai.checkpoint_io.utils import StateDictSharder
 from colossalai.lazy import LazyTensor
 from colossalai.logging import get_dist_logger
-from colossalai.nn.parallel.data_parallel import ColoDDP, _cast_float, free_storage
-from colossalai.tensor import ProcessGroup as ColoProcessGroup
-from colossalai.tensor import ReplicaSpec
-from colossalai.tensor.colo_parameter import ColoParameter, ColoTensor, ColoTensorSpec
+from colossalai.nn.parallel.data_parallel import _cast_float, free_storage
+from colossalai.tensor.colo_parameter import ColoParameter
 from colossalai.tensor.param_op_hook import ColoParamOpHookManager
 from colossalai.utils import get_current_device, is_ddp_ignored

@@ -30,14 +32,13 @@ except ImportError:
    _EXTRA_STATE_KEY_SUFFIX = '_extra_state'

 __all__ = [
-    'ZeroDDP',
    'GeminiDDP',
 ]


-class ZeroDDP(ColoDDP):
-    """ZeRO DDP for ColoTensor.
-    Warning: Nested ZeroDDP is not supported now.
+class GeminiDDP(ModelWrapper):
+    """ZeRO DDP.
+    Warning: Nested GeminiDDP is not supported now.
    It is designed to be used with ChunkManager and GeminiManager.
    For more details, see the API reference of ``ChunkManager`` and ``GeminiManager``.

@@ -54,20 +55,54 @@ class ZeroDDP(ColoDDP):
        mixed_precision (torch.dtype): If set to torch.float16, the model will be trained in fp16. Otherwise, the model will be trained in bf16. Defaults to torch.float16.
    """

-    def __init__(self,
+    def __init__(
+            self,
            module: torch.nn.Module,
-                 gemini_manager: GeminiManager,
+            chunk_config_dict: Optional[dict] = None,
+            chunk_init_device: torch.device = torch.device('cpu'),
+            placement_policy: str = "static",
+            shard_param_frac: float = 1.0,    # only for static placement
+            offload_optim_frac: float = 0.0,    # only for static placement
+            offload_param_frac: float = 0.0,    # only for static placement
+            warmup_non_model_data_ratio: float = 0.8,    # only for auto placement
+            steady_cuda_cap_ratio: float = 0.9,    # only for auto placement
+            search_range_m: int = 32,    # chunk search options
+            hidden_dim: Optional[int] = None,    # chunk search options
+            min_chunk_size_m: float = 32,    # chunk search options
            pin_memory: bool = False,
            force_outputs_fp32: bool = False,
            strict_ddp_mode: bool = False,
            scatter_after_inference: bool = True,
-                 mixed_precision: torch.dtype = torch.float16) -> None:
+            mixed_precision: torch.dtype = torch.float16,
+            process_group: Optional[ProcessGroup] = None,
+            memstats: Optional[MemStats] = None,    # gemini memory stats
+            verbose: bool = False) -> None:
        assert mixed_precision in (torch.float16, torch.bfloat16)
-        self.gemini_manager = gemini_manager
-        self.chunk_manager: ChunkManager = gemini_manager.chunk_manager
+        if chunk_config_dict is not None:
+            self.chunk_manager = ChunkManager(chunk_config_dict, chunk_init_device)
+        else:
+            # some ugly hotfix for the compatibility with Lightning
+            if search_range_m is None:
+                search_range_m = 32
+            self.chunk_manager = init_chunk_manager(model=module,
+                                                    init_device=chunk_init_device,
+                                                    hidden_dim=hidden_dim,
+                                                    search_range_m=search_range_m,
+                                                    min_chunk_size_m=min_chunk_size_m,
+                                                    strict_ddp_flag=strict_ddp_mode,
+                                                    process_group=process_group,
+                                                    verbose=verbose)
+        self.gemini_manager = GeminiManager(placement_policy,
+                                            self.chunk_manager,
+                                            memstats,
+                                            shard_param_frac=shard_param_frac,
+                                            offload_optim_frac=offload_optim_frac,
+                                            offload_param_frac=offload_param_frac,
+                                            warmup_non_model_data_ratio=warmup_non_model_data_ratio,
+                                            steady_cuda_cap_ratio=steady_cuda_cap_ratio)
        self.force_outputs_fp32 = force_outputs_fp32
-        self.param_op_hook = GeminiZeROHook(gemini_manager)
-        self.fp32_params: List[ColoTensor] = list()
+        self.param_op_hook = GeminiZeROHook(self.gemini_manager)
+        self.fp32_params: List[torch.Tensor] = list()
        self.fp16_params: List[ColoParameter] = list()
        self.overflow_counter = 0
        self.grads_device: Dict[torch.Tensor, torch.device] = dict()
@@ -75,6 +110,7 @@ class ZeroDDP(ColoDDP):
        self.name2param: Dict[str, nn.Parameter] = dict()
        self.scatter_after_inference = scatter_after_inference
        self.mixed_precision = mixed_precision
+        self.dp_process_group = process_group or _get_default_group()

        self._logger = get_dist_logger()

@@ -88,20 +124,67 @@ class ZeroDDP(ColoDDP):
            for p in module.parameters():
                param_order.append(p)

-        self._init_chunks(param_order=param_order,
-                          strict_ddp_mode=strict_ddp_mode,
-                          cpu_offload=self.gemini_manager.policy_name != 'cuda',
-                          pin_memory=pin_memory)
-
        for name, param in module.named_parameters():
            self.param2name[param] = name
        for m_name, m_var in module.named_modules():
            for p_name, p_var in m_var.named_parameters(recurse=False):
                param_name = m_name + '.' + p_name if m_name else p_name
                self.name2param[param_name] = p_var
-        super().__init__(module, process_group=ColoProcessGroup())
+
+        self._init_chunks(param_order=param_order,
+                          strict_ddp_mode=strict_ddp_mode,
+                          cpu_offload=self.gemini_manager.policy_name != 'cuda',
+                          pin_memory=pin_memory)
+        super().__init__(module)
        self._non_persistent_buffers_set = self._get_non_persistent_buffers_set(module)
        self._cast_buffers()
+        # register grad hook
+        for p in module.parameters():
+            if is_ddp_ignored(p):
+                continue
+            if p.requires_grad:
+                p.register_hook(partial(self.grad_handle, p))
+
+    def parameters(self, recurse: bool = True):
+        return self.module.parameters(recurse)
+
+    def named_parameters(self, prefix: str = '', recurse: bool = True):
+        return self.module.named_parameters(prefix, recurse)
+
+    def named_buffers(self, prefix: str = '', recurse: bool = True):
+        return self.module.named_buffers(prefix, recurse)
+
+    def named_children(self):
+        return self.module.named_children()
+
+    def named_modules(self,
+                      memo: Optional[Set[torch.nn.Module]] = None,
+                      prefix: str = '',
+                      remove_duplicate: bool = True):
+        return self.module.named_modules(memo, prefix, remove_duplicate)
+
+    @staticmethod
+    def set_params_to_ignore(params_to_ignore: Iterable[torch.Tensor]) -> None:
+        """Sets parameters to be ignored by DDP.
+        This method must be called before initializing GeminiDDP.
+
+        Example:
+            >>> params_to_ignore = []
+            >>> for p in module.parameters():
+            >>>     if should_ignore(p):
+            >>>         params_to_ignore.append(p)
+            >>> GeminiDDP.set_params_to_ignore(params_to_ignore)
+            >>> module = GeminiDDP(module)
+
+        Args:
+            params_to_ignore (Iterable[torch.Tensor]): A list of parameters to be ignored.
+        """
+        for p in params_to_ignore:
+            p._ddp_to_ignore = True
+
+    def unwrap(self):
+        # as save/load state dict is overridden, only return self
+        return self

    def _get_non_persistent_buffers_set(self,
                                        module,
@@ -207,7 +290,7 @@ class ZeroDDP(ColoDDP):
                    error_params.append(self.param2name[param])
            error_str = "\n\t".join(error_params)
            raise RuntimeError("ZERO DDP error: the synchronization of gradients doesn't exit properly.",
-                               "The most possible reason is that the model is not compatible with ZeroDDP.\n",
+                               "The most possible reason is that the model is not compatible with GeminiDDP.\n",
                               f"{error_str}")
        self._setup_grads_ptr()
        self._logger.debug(
@@ -227,6 +310,7 @@ class ZeroDDP(ColoDDP):
        self._post_backward()

    def grad_handle(self, p, grad):
+        setattr(p, "_gemini_reduced", True)
        empty_grad = torch.empty_like(grad)
        free_storage(empty_grad)
        with torch._C.DisableTorchFunction():
@@ -533,7 +617,7 @@ class ZeroDDP(ColoDDP):
        for chunk_32 in chunk_list:
            chunk_16 = chunk_32.paired_chunk
            assert chunk_16 is not None
-            chunk_16.optim_update()
+            chunk_16.payload.copy_(chunk_32.payload)

        for name, buf in persistent_buffers.items():
            if buf is not None:
@@ -557,17 +641,11 @@ class ZeroDDP(ColoDDP):
                        unexpected_keys.append(key)

    def _init_chunks(self, param_order, strict_ddp_mode: bool, cpu_offload: bool, pin_memory: bool):
-        ddp_pg = ColoProcessGroup()
+        dp_world_size = dist.get_world_size(self.dp_process_group)
        for p in param_order.generate():
            self._preprocess_param(p)
            assert type(p) is ColoParameter

-            # gather sharded parameters in the strict ddp mode
-            if strict_ddp_mode:
-                if not p.is_replicate():
-                    p.set_dist_spec(ReplicaSpec())
-                p.set_process_group(pg=ddp_pg)
-
            # ignore the parameters with no gradient
            if not p.requires_grad:
                self.set_params_to_ignore([p])
@@ -578,38 +656,37 @@ class ZeroDDP(ColoDDP):
                continue

            # create a fp32 parameter
-            fp32_data = p.data.float()
-            fp32_p = ColoTensor(fp32_data, spec=ColoTensorSpec(p.process_group))
+            fp32_p = p.data.float()
            # create a fp16 parameter
            p.data = p.data.to(self.mixed_precision)

            # register the fp16 parameter and fp32 parameter in the chunk manager
-            dp_world_size = p.process_group.dp_world_size()
            self.chunk_manager.register_tensor(tensor=p,
                                               group_type='fp16_param',
                                               config_key=dp_world_size,
+                                               process_group=self.dp_process_group,
                                               cpu_offload=cpu_offload,
                                               pin_memory=pin_memory)
            self.chunk_manager.register_tensor(tensor=fp32_p,
                                               group_type='fp32_param',
                                               config_key=dp_world_size,
+                                               process_group=self.dp_process_group,
                                               cpu_offload=cpu_offload,
                                               pin_memory=pin_memory)

            self.fp16_params.append(p)
            self.fp32_params.append(fp32_p)
-            self.grads_device[p] = self.gemini_manager.default_device

        self.chunk_manager.close_all_groups()

+        self.gemini_manager.setup_grads_device(self.fp16_params, self.grads_device)
+        # move master weights to corresponding device and setup paired chunks
        for p, fp32_p in zip(self.fp16_params, self.fp32_params):
            chunk_16 = self.chunk_manager.get_chunk(p)
            chunk_32 = self.chunk_manager.get_chunk(fp32_p)
            chunk_32.init_pair(chunk_16)
-
-            # keep gathered chunks are in CUDA
-            if chunk_16.keep_gathered:
-                self.grads_device[p] = get_current_device()
+            if chunk_32.device_type != self.grads_device[p].type:
+                self.chunk_manager.move_chunk(chunk_32, self.grads_device[p])

    def _cast_buffers(self):
        for buffer in self.module.buffers():
@@ -705,65 +782,3 @@ class ZeroDDP(ColoDDP):
        yield sharder.current_block, sharder.current_block_size


-class GeminiDDP(ZeroDDP):
-
-    def __init__(self,
-                 module: torch.nn.Module,
-                 device: torch.device,
-                 placement_policy: str = "cpu",
-                 pin_memory: bool = False,
-                 force_outputs_fp32: bool = False,
-                 strict_ddp_mode: bool = False,
-                 scatter_after_inference: bool = True,
-                 search_range_m: int = 32,
-                 hidden_dim: Optional[int] = None,
-                 min_chunk_size_m: float = 32,
-                 memstats: Optional[MemStats] = None,
-                 mixed_precision: torch.dtype = torch.float16,
-                 verbose: bool = False) -> None:
-        """
-        A torch.Module wrapper using ZeRO-DP and Gemini.
-        ZeRO is for parallel. Gemini is for memory management.
-        WARNING: The class will modify the module inline!
-
-        Example:
-            model is initialized under the context of ColoInitContext
-            >>> model = GeminiDDP(model, torch.cuda.current_device(), "cuda")
-            >>> logits = model(x)
-            >>> loss = criterion(logits, labels)
-            >>> model.backward(loss)
-
-        Args:
-            module (torch.nn.Module): the model to be wrapped.
-            device (torch.device): device to place the model.
-            placement_policy (str, optional): "cpu", "cuda", "auto". Defaults to "cpu".
-            pin_memory (bool, optional): use pin memory on CPU. Defaults to False.
-            force_outputs_fp32 (bool, optional): force outputs are fp32. Defaults to False.
-            search_range_m (int, optional): chunk size searching range divided by 2^20. Defaults to 32.
-            hidden_dim (int, optional): the hidden dimension of DNN.
-                Users can provide this argument to speed up searching.
-                If users do not know this argument before training, it is ok. We will use a default value 1024.
-            min_chunk_size_m (float, optional): the minimum chunk size divided by 2^20.
-                If the aggregate size of parameters is still smaller than the minimum chunk size,
-                all parameters will be compacted into one small chunk.
-            memstats (MemStats, optional) the memory statistics collector by a runtime memory tracer.
-        """
-        # some ugly hotfix for the compatibility with Lightning
-        if search_range_m is None:
-            search_range_m = 32
-
-        chunk_manager = init_chunk_manager(model=module,
-                                           init_device=device,
-                                           hidden_dim=hidden_dim,
-                                           search_range_m=search_range_m,
-                                           min_chunk_size_m=min_chunk_size_m,
-                                           strict_ddp_flag=strict_ddp_mode,
-                                           verbose=verbose)
-        gemini_manager = GeminiManager(placement_policy, chunk_manager, memstats)
-        super().__init__(module,
-                         gemini_manager,
-                         pin_memory,
-                         force_outputs_fp32,
-                         strict_ddp_mode,
-                         scatter_after_inference,
-                         mixed_precision=mixed_precision)
--- a/colossalai/zero/gemini/gemini_mgr.py
+++ b/colossalai/zero/gemini/gemini_mgr.py
 import functools
 from time import time
-from typing import List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple

 import torch

@@ -26,7 +26,11 @@ class GeminiManager:
        memstats (MemStats, optional): a mem stats collected by a runtime mem tracer. if None then GeminiManager will collect it during a warmup iteration.
    """

-    def __init__(self, placement_policy: str, chunk_manager: ChunkManager, memstats: Optional[MemStats] = None) -> None:
+    def __init__(self,
+                 placement_policy: str,
+                 chunk_manager: ChunkManager,
+                 memstats: Optional[MemStats] = None,
+                 **placement_kwargs) -> None:

        assert placement_policy in PlacementPolicyFactory.get_policy_names()
        self.policy_name = placement_policy
@@ -37,7 +41,7 @@ class GeminiManager:
        self._memstats = memstats
        self._mem_stats_collector = ChunkMemStatsCollector(chunk_manager,
                                                           self._memstats) if policy_cls.need_mem_stats else None
-        self._placement_policy = policy_cls(chunk_manager, self._mem_stats_collector)
+        self._placement_policy = policy_cls(chunk_manager, self._mem_stats_collector, **placement_kwargs)
        self._compute_list: List[Tuple[Chunk, ...]] = []
        self._compute_idx: int = -1

@@ -133,10 +137,6 @@ class GeminiManager:
        if self._warmup and self._placement_policy.need_mem_stats:
            self._compute_list.append(chunks)

-    @property
-    def default_device(self):
-        return self._placement_policy.get_default_device()
-
    def sample_overall_data(self):
        if self._mem_stats_collector:
            self._mem_stats_collector.sample_overall_data()
@@ -159,6 +159,6 @@ class GeminiManager:
    def is_cuda_margin_mem_avail(self) -> bool:
        return self._placement_policy.need_mem_stats

-    @staticmethod
-    def get_default_device(policy_name: str) -> torch.device:
-        return PlacementPolicyFactory.get_default_device(policy_name)
+    def setup_grads_device(self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor,
+                                                                                    torch.device]) -> None:
+        self._placement_policy.setup_grads_device(params, grads_device_map)
--- a/colossalai/zero/gemini/gemini_optimizer.py
+++ b/colossalai/zero/gemini/gemini_optimizer.py
@@ -2,7 +2,7 @@
 import copy
 import math
 import warnings
-from typing import Any, Dict, Iterator, OrderedDict, Set, Tuple
+from typing import Any, Dict, Iterator, OrderedDict, Set, Tuple, Union

 import torch
 import torch.distributed as dist
@@ -10,16 +10,17 @@ from torch.nn import Parameter
 from torch.optim import Optimizer

 from colossalai.amp.naive_amp.mixed_precision_mixin import BF16MixedPrecisionMixin, FP16MixedPrecisionMixin
-from colossalai.checkpoint_io.utils import StateDictSharder
+from colossalai.checkpoint_io.utils import calculate_tensor_size, StateDictSharder
+from colossalai.interface import OptimizerWrapper
 from colossalai.logging import get_dist_logger
-from colossalai.nn.optimizer import ColossalaiOptimizer, CPUAdam, FusedAdam, HybridAdam
+from colossalai.nn.optimizer import CPUAdam, FusedAdam, HybridAdam
 from colossalai.tensor.d_tensor import is_distributed_tensor
 from colossalai.utils import disposable, get_current_device, is_ddp_ignored

 from .chunk import Chunk, ChunkManager
-from .gemini_ddp import ZeroDDP
+from .gemini_ddp import GeminiDDP

-__all__ = ['ZeroOptimizer', 'GeminiAdamOptimizer']
+__all__ = ['GeminiOptimizer', 'GeminiAdamOptimizer']

 _AVAIL_OPTIM_LIST = {FusedAdam, CPUAdam, HybridAdam}

@@ -27,7 +28,7 @@ _AVAIL_OPTIM_LIST = {FusedAdam, CPUAdam, HybridAdam}
 class GeminiFP16MixedPrecisionMixin(FP16MixedPrecisionMixin):

    def __init__(self,
-                 module: ZeroDDP,
+                 module: GeminiDDP,
                 initial_scale: float = 2**16,
                 min_scale: float = 1,
                 growth_factor: float = 2,
@@ -46,11 +47,11 @@ class GeminiFP16MixedPrecisionMixin(FP16MixedPrecisionMixin):
        self.module.overflow_counter = 0


-class ZeroOptimizer(ColossalaiOptimizer):
-    """A wrapper for optimizer. ``ZeroDDP`` and ``ZeroOptimizer`` implement Zero Redundancy Optimizer (ZeRO state-3).
+class GeminiOptimizer(OptimizerWrapper):
+    """A wrapper for optimizer. ``GeminiDDP`` and ``GeminiOptimizer`` implement Zero Redundancy Optimizer (ZeRO stage-3).

    Note:
-        You must use ``ZeroDDP`` with ``ZeroOptimizer``.
+        You must use ``GeminiDDP`` with ``GeminiOptimizer``.

    Note:
        Make sure you set ``placement_policy`` of ``GeminiManager`` to `"auto"`,
@@ -58,7 +59,7 @@ class ZeroOptimizer(ColossalaiOptimizer):

    Args:
        optim (Optimizer): An Optimizer instance.
-        module (ZeroDDP): A ``ZeroDDP`` instance.
+        module (GeminiDDP): A ``GeminiDDP`` instance.
        gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
            which will be used when using hybrid CPU optimizer.
            This argument is meaningless when `placement_policy` of `GeminiManager` is not "auto".
@@ -70,15 +71,15 @@ class ZeroOptimizer(ColossalaiOptimizer):
        growth_interval (float, optional): Growth_interval used by DynamicGradScaler. Defaults to 1000.
        hysteresis (float, optional): Hysteresis used by DynamicGradScaler. Defaults to 2.
        max_scale (int, optional): Max_scale used by DynamicGradScaler. Defaults to 2**32.
-        clipping_norm (float, optional): The norm value used to clip gradient. Defaults to 0.0.
+        max_norm (float, optional): The norm value used to clip gradient. Defaults to 0.0.
        norm_type (float, optional): The type of norm used for gradient clipping. Currently, only L2-norm (norm_type=2.0)
-            is supported in ZeroOptimizer. Defaults to 2.0.
+            is supported in GeminiOptimizer. Defaults to 2.0.
        verbose (bool, optional): Whether to print verbose information, including grad overflow info. Defaults to False.
    """

    def __init__(self,
                 optim: Optimizer,
-                 module: ZeroDDP,
+                 module: GeminiDDP,
                 gpu_margin_mem_ratio: float = 0.0,
                 initial_scale: float = 2**32,
                 min_scale: float = 1,
@@ -87,12 +88,12 @@ class ZeroOptimizer(ColossalaiOptimizer):
                 growth_interval: int = 1000,
                 hysteresis: int = 2,
                 max_scale: float = 2**32,
-                 clipping_norm: float = 0.0,
+                 max_norm: float = 0.0,
                 norm_type: float = 2.0,
                 verbose: bool = False,
                 **defaults: Any):
        super().__init__(optim)
-        assert isinstance(module, ZeroDDP)
+        assert isinstance(module, GeminiDDP)
        assert type(optim) in _AVAIL_OPTIM_LIST, "You should use an optimizer in the available list:\n" \
            f"{_AVAIL_OPTIM_LIST}"
        self.module = module
@@ -101,8 +102,8 @@ class ZeroOptimizer(ColossalaiOptimizer):
        self.param_to_range: Dict[Parameter, Tuple[int, int]] = dict()
        self.param_to_chunk32: Dict[Parameter, Chunk] = dict()
        self.chunk16_set: Set[Chunk] = set()
-        self.clipping_flag = clipping_norm > 0.0
-        self.max_norm = clipping_norm
+        self.clipping_flag = max_norm > 0.0
+        self.max_norm = max_norm
        self.verbose = verbose
        self.param_groups_backup = list()

@@ -111,7 +112,7 @@ class ZeroOptimizer(ColossalaiOptimizer):
        self.id_to_fake_params: Dict[int, Parameter] = dict()

        if self.clipping_flag:
-            assert norm_type == 2.0, "ZeroOptimizer only supports L2 norm now"
+            assert norm_type == 2.0, "GeminiOptimizer only supports L2 norm now"

        ddp_param_list = []
        for name, param in module.named_parameters():
@@ -703,8 +704,19 @@ class ZeroOptimizer(ColossalaiOptimizer):

        yield sharder.current_block, sharder.current_block_size

+    def clip_grad_by_value(self, clip_value: float, *args, **kwargs) -> None:
+        raise NotImplementedError('Gemini does not support clip_grad_by_value')

-class GeminiAdamOptimizer(ZeroOptimizer):
+    def clip_grad_by_norm(self,
+                          max_norm: Union[float, int],
+                          norm_type: Union[float, int] = 2,
+                          error_if_nonfinite: bool = False,
+                          *args,
+                          **kwargs) -> torch.Tensor:
+        warnings.warn(f'Gemini controls grad clipping by itself, so you should not use clip_grad_by_norm')
+
+
+class GeminiAdamOptimizer(GeminiOptimizer):

    def __init__(self, model: torch.nn.Module, **defaults: Any) -> None:
        optimizer = HybridAdam(model.parameters(), **defaults)

--- a/colossalai/zero/gemini/memory_tracer/memory_stats.py
+++ b/colossalai/zero/gemini/memory_tracer/memory_stats.py
@@ -9,7 +9,7 @@ class MemStats(object):

    def __init__(self) -> None:
        """
-        Store the non model data statistics used for Gemini and ZeroOptimizer.
+        Store the non model data statistics used for Gemini and GeminiOptimizer.
        """
        # (preop_step, List[param])
        self._step_param_dict = dict()

--- a/colossalai/zero/gemini/placement_policy.py
+++ b/colossalai/zero/gemini/placement_policy.py
 import functools
+import warnings
 from abc import ABC, abstractmethod
 from time import time
 from typing import Dict, List, Optional, Tuple, Type
@@ -7,6 +8,7 @@ import torch

 from colossalai.utils import get_current_device
 from colossalai.utils.memory import colo_device_memory_capacity
+from colossalai.zero.gemini.chunk import Chunk

 from .chunk import Chunk, ChunkManager
 from .memory_tracer import ChunkMemStatsCollector
@@ -17,7 +19,8 @@ class PlacementPolicy(ABC):

    def __init__(self,
                 chunk_manager: ChunkManager,
-                 mem_stats_collector: Optional[ChunkMemStatsCollector] = None) -> None:
+                 mem_stats_collector: Optional[ChunkMemStatsCollector] = None,
+                 **kwargs) -> None:
        self.chunk_manager = chunk_manager
        self.mem_stats_collector: Optional[ChunkMemStatsCollector] = mem_stats_collector

@@ -25,57 +28,87 @@ class PlacementPolicy(ABC):
    def evict_tensors(self, can_evict_chunks: List[Chunk], **kwargs) -> Tuple[int, float]:
        raise NotImplementedError

-    @staticmethod
-    def get_default_device() -> torch.device:
-        return torch.device('cpu')
+    @abstractmethod
+    def setup_grads_device(self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor,
+                                                                                    torch.device]) -> None:
+        raise NotImplementedError


-class CPUPlacementPolicy(PlacementPolicy):
+class StaticPlacementPolicy(PlacementPolicy):

    def __init__(self,
                 chunk_manager: ChunkManager,
-                 mem_stats_collector: Optional[ChunkMemStatsCollector] = None) -> None:
+                 mem_stats_collector: Optional[ChunkMemStatsCollector] = None,
+                 shard_param_frac: float = 1.0,
+                 offload_optim_frac: float = 0.0,
+                 offload_param_frac: float = 0.0,
+                 **kwargs) -> None:
        super().__init__(chunk_manager, mem_stats_collector=mem_stats_collector)
+        if offload_param_frac > 0.0 and (shard_param_frac != 1.0 or offload_optim_frac != 1.0):
+            warnings.warn('offload_param_frac is ignored when shard_param_frac != 1.0 or offload_optim_frac != 1.0')
+            offload_param_frac = 0.0
+        self.shard_param_frac = shard_param_frac
+        self.offload_optim_frac = offload_optim_frac
+        self.offload_param_frac = offload_param_frac
+        # these should be initialized in setup_grads_device
+        self.keep_gathered_chunk_mem = 0.0
+        self.keep_cuda_chunk_mem = 0.0

    def evict_tensors(self, can_evict_chunks: List[Chunk], **kwargs) -> Tuple[int, float]:
-        volume = 0
-        start = time()
+        can_shard_chunk_mem = sum(chunk.chunk_mem for chunk in can_evict_chunks)
+        can_offload_chunk_mem = can_shard_chunk_mem
        for chunk in can_evict_chunks:
+            if can_shard_chunk_mem <= self.keep_gathered_chunk_mem:
+                break
            self.chunk_manager.release_chunk(chunk)
+            # real saved mem is chunk_mem - shard_mem, for simplicity we use chunk_mem
+            can_shard_chunk_mem -= chunk.chunk_mem
+        for chunk in can_evict_chunks:
+            if can_offload_chunk_mem <= self.keep_cuda_chunk_mem:
+                break
            self.chunk_manager.move_chunk(chunk, torch.device('cpu'))
-            volume += chunk.chunk_mem
-        return volume, time() - start
+            # real saved mem is shard_mem, for simplicity we use chunk_mem
+            can_offload_chunk_mem -= chunk.chunk_mem
+        return 0, 0.0
+
+    def setup_grads_device(self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor,
+                                                                                    torch.device]) -> None:
+        total_chunk_mem = sum(self.chunk_manager.get_chunk(p).chunk_mem for p in params)
+
+        offload_optim_chunk_mem = total_chunk_mem * self.offload_optim_frac
+        offloaded_optim_chunk_mem = 0
+        chunks = set(self.chunk_manager.get_chunk(p) for p in params)
+        for chunk in chunks:
+            params = chunk.get_tensors()
+            # init offload optim settings
+            # chunks flagged keep_gathered always stay on the current (CUDA) device
+            if chunk.keep_gathered or offloaded_optim_chunk_mem >= offload_optim_chunk_mem:
+                device = get_current_device()
+            else:
+                device = torch.device('cpu')
+                # real offloaded mem is chunk.shard_mem, for simplicity we use chunk mem here
+                offloaded_optim_chunk_mem += chunk.chunk_mem
+            for p in params:
+                grads_device_map[p] = device
+        self.keep_gathered_chunk_mem = total_chunk_mem * (1 - self.shard_param_frac)
+        self.keep_cuda_chunk_mem = total_chunk_mem * (1 - self.offload_param_frac)


-class CUDAPlacementPolicy(PlacementPolicy):
+class AutoPlacementPolicy(PlacementPolicy):
+    need_mem_stats: bool = True

    def __init__(self,
                 chunk_manager: ChunkManager,
-                 mem_stats_collector: Optional[ChunkMemStatsCollector] = None) -> None:
-        assert torch.cuda.is_available(), 'Cannot use CUDATensorPlacementPolicy when CUDA is not available'
+                 mem_stats_collector: Optional[ChunkMemStatsCollector] = None,
+                 warmup_non_model_data_ratio: float = 0.8,
+                 steady_cuda_cap_ratio: float = 0.9,
+                 **kwargs) -> None:
        super().__init__(chunk_manager, mem_stats_collector=mem_stats_collector)
-
-    def evict_tensors(self, can_evict_chunks: List[Chunk], **kwargs) -> Tuple[int, float]:
-        return 0, 0
-
-    @staticmethod
-    def get_default_device() -> torch.device:
-        return get_current_device()
-
-
-class AutoPlacementPolicy(PlacementPolicy):
-
-    need_mem_stats: bool = True
        # model data will use 1-_warmup_non_model_data_ratio CUDA memory in warmup phase
        # you can set them by AutoPlacementPolicy.set_warmup_non_model_data_ratio()
        # and AutoPlacementPolicy.set_steady_cuda_cap_ratio()
-    _warmup_non_model_data_ratio: float = 0.8
-    _steady_cuda_cap_ratio: float = 0.9
-
-    def __init__(self,
-                 chunk_manager: ChunkManager,
-                 mem_stats_collector: Optional[ChunkMemStatsCollector] = None) -> None:
-        super().__init__(chunk_manager, mem_stats_collector=mem_stats_collector)
+        self._warmup_non_model_data_ratio = warmup_non_model_data_ratio
+        self._steady_cuda_cap_ratio = steady_cuda_cap_ratio

    def evict_tensors(self,
                      can_evict_chunks: List[Chunk],
@@ -105,11 +138,11 @@ class AutoPlacementPolicy(PlacementPolicy):
        used_cuda_model_data = self.chunk_manager.total_mem['cuda']
        if warmup:
            # We designate a part of CUDA memory for model data in warmup iterations.
-            max_cuda_non_model_data_per_period = cuda_capacity * AutoPlacementPolicy._warmup_non_model_data_ratio
+            max_cuda_non_model_data_per_period = cuda_capacity * self._warmup_non_model_data_ratio
        else:
            # max non-model-data cuda memory consumption of this sampling moment and the next sampling moment.
            max_cuda_non_model_data_per_period = self.mem_stats_collector.next_period_non_model_data_usage('cuda')
-            cuda_capacity *= AutoPlacementPolicy._steady_cuda_cap_ratio
+            cuda_capacity *= self._steady_cuda_cap_ratio
        total_cuda_model_data = cuda_capacity - max_cuda_non_model_data_per_period
        avail_cuda_model_data = total_cuda_model_data - used_cuda_model_data
        freed_cuda_model_data = 0
@@ -145,89 +178,22 @@ class AutoPlacementPolicy(PlacementPolicy):
        next_compute_idx = sorted(next_compute_idx.items(), key=lambda pair: pair[1], reverse=True)
        return [t for (t, idx) in next_compute_idx]

-    @staticmethod
-    def set_warmup_non_model_data_ratio(ratio: float) -> None:
-        ratio = float(ratio)
-        assert 0.0 < ratio < 1.0
-        AutoPlacementPolicy._warmup_non_model_data_ratio = ratio
-
-    @staticmethod
-    def set_steady_cuda_cap_ratio(ratio: float) -> None:
-        ratio = float(ratio)
-        assert 0.0 < ratio < 1.0
-        AutoPlacementPolicy._steady_cuda_cap_ratio = ratio
-
-
-class ConstPlacementPolicy(PlacementPolicy):
-
-    need_mem_stats: bool = False
-    _accessed_memory_boundary = 512 * 1024**2
-
-    def __init__(self,
-                 chunk_manager: ChunkManager,
-                 mem_stats_collector: Optional[ChunkMemStatsCollector] = None) -> None:
-        super().__init__(chunk_manager, mem_stats_collector=mem_stats_collector)
-
-    def evict_tensors(self,
-                      can_evict_chunks: List[Chunk],
-                      cuda_demand: int = 0,
-                      warmup: bool = True,
-                      compute_list: Optional[List[Tuple[Chunk, ...]]] = None,
-                      compute_idx: int = 0,
-                      **kwargs) -> Tuple[int, float]:
-        """
-        See the docstrings in the class `AutoPlacementPolicy`.
-        """
-        start = time()
-        used_accessed_memory = self.chunk_manager.accessed_mem
-        avail_accessed_memory = ConstPlacementPolicy._accessed_memory_boundary - used_accessed_memory
-        freed_accessed_memory = 0
-
-        if avail_accessed_memory < cuda_demand:
-            to_free_memory = cuda_demand - avail_accessed_memory
-            to_free_chunks = can_evict_chunks
-
-            if not warmup:
-                # sort all chunks
-                to_free_chunks = self._sort_can_evict_chunks(tuple(to_free_chunks), compute_idx, tuple(compute_list))
-
-            for chunk in to_free_chunks:
-                if freed_accessed_memory >= to_free_memory:
-                    break
-
-                self.chunk_manager.release_chunk(chunk)
-                self.chunk_manager.move_chunk(chunk, torch.device('cpu'))
-                freed_accessed_memory += chunk.chunk_mem
-
-            if freed_accessed_memory < to_free_memory:
-                raise RuntimeError(f"Adjust layout failed! No enough CUDA memory! "
-                                   f"Need {to_free_memory}, freed {freed_accessed_memory}")
-        return freed_accessed_memory, time() - start
-
-    @staticmethod
-    @functools.lru_cache(maxsize=None)
-    def _sort_can_evict_chunks(can_evict_chunks: tuple, compute_idx: int, compute_list: tuple) -> list:
-        next_compute_idx = {chunk: len(compute_list) for chunk in can_evict_chunks}
-        for i in range(len(compute_list) - 1, compute_idx, -1):
-            for chunk in compute_list[i]:
-                if chunk in next_compute_idx:
-                    next_compute_idx[chunk] = i
-        next_compute_idx = sorted(next_compute_idx.items(), key=lambda pair: pair[1], reverse=True)
-        return [t for (t, idx) in next_compute_idx]
-
-    @staticmethod
-    def set_const_memory_boundary(cuda_memory_mb: int) -> None:
-        boundary = int(cuda_memory_mb * 1024**2)
-        assert boundary > 0
-        ConstPlacementPolicy._accessed_memory_boundary = boundary
+    def setup_grads_device(self, params: List[torch.Tensor], grads_device_map: Dict[torch.Tensor,
+                                                                                    torch.device]) -> None:
+        for p in params:
+            chunk = self.chunk_manager.get_chunk(p)
+            # init offload optim settings
+            # gradients of keep-gathered chunks stay on the current (CUDA) device
+            if chunk.keep_gathered:
+                grads_device_map[p] = get_current_device()
+            else:
+                grads_device_map[p] = torch.device('cpu')


 class PlacementPolicyFactory:
    policies: Dict[str, Type[PlacementPolicy]] = {
-        'cpu': CPUPlacementPolicy,
-        'cuda': CUDAPlacementPolicy,
        'auto': AutoPlacementPolicy,
-        'const': ConstPlacementPolicy
+        'static': StaticPlacementPolicy,
    }

    @staticmethod
@@ -239,8 +205,3 @@ class PlacementPolicyFactory:
    @staticmethod
    def get_policy_names():
        return tuple(PlacementPolicyFactory.policies.keys())
-
-    @staticmethod
-    def get_default_device(policy_name: str) -> torch.device:
-        policy_cls = PlacementPolicyFactory.create(policy_name)
-        return policy_cls.get_default_device()
--- a/colossalai/zero/gemini/utils.py
+++ b/colossalai/zero/gemini/utils.py
@@ -64,13 +64,13 @@ def get_static_torch_model(zero_ddp_model,
                           device=torch.device("cpu"),
                           dtype=torch.float32,
                           only_rank_0=True) -> torch.nn.Module:
-    """Get a static torch.nn.Module model from the given ZeroDDP module.
-    You should notice that the original ZeroDDP model is not modified.
+    """Get a static torch.nn.Module model from the given GeminiDDP module.
+    You should notice that the original GeminiDDP model is not modified.
    Thus, you can use the original model in further training.
    But you should not use the returned torch model to train, this can cause unexpected errors.

    Args:
-        zero_ddp_model (ZeroDDP): a zero ddp model
+        zero_ddp_model (GeminiDDP): a zero ddp model
        device (torch.device): the device of the final torch model
        dtype (torch.dtype): the dtype of the final torch model
        only_rank_0 (bool): if True, only rank0 has the converted torch model
@@ -78,8 +78,8 @@ def get_static_torch_model(zero_ddp_model,
    Returns:
        torch.nn.Module: a static torch model used for saving checkpoints or numeric checks
    """
-    from colossalai.zero.gemini.gemini_ddp import ZeroDDP
-    assert isinstance(zero_ddp_model, ZeroDDP)
+    from colossalai.zero.gemini.gemini_ddp import GeminiDDP
+    assert isinstance(zero_ddp_model, GeminiDDP)

    state_dict = zero_ddp_model.state_dict(only_rank_0=only_rank_0)
    colo_model = zero_ddp_model.module

--- a/colossalai/zero/low_level/bookkeeping/gradient_store.py
+++ b/colossalai/zero/low_level/bookkeeping/gradient_store.py
@@ -57,8 +57,8 @@ class GradientStore(BaseStore):
            self._grads_of_params[group_id][param_id].append(grad)

    def add_gradients_by_param_id(self, grad: Tensor, grad_idx: int, group_id: int, param_id: int):
-        """For old gradient accumulation, not in use now.
-        Add a gradient slice on an existing slice of the parameter's gradient
+        """Add a gradient slice on an existing slice of the parameter's gradient
+        Used when no_sync is not activated.

        Args:
            grad (Tensor): The split gradient to append to list

--- a/colossalai/zero/low_level/low_level_optim.py
+++ b/colossalai/zero/low_level/low_level_optim.py
@@ -80,9 +80,6 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
            tp_process_group: Optional[ProcessGroup] = None,    # if using tp
            forced_dtype: Optional[torch.dtype] = None):

-        # TODO:
-        # 1. state_dict for checkpoint IO
-
        super(LowLevelZeroOptimizer, self).__init__(optim=optimizer)
        self._dtype = self.optim.param_groups[0]['params'][0].dtype
        self._logger = get_dist_logger()
@@ -277,7 +274,11 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
                        sync_tensor(flat_grads_per_rank[rank], grad_list)
                        for grad in grad_list:
                            param_id = self._bucket_store.get_param_id_of_grad(grad)
+                            if len(self._grad_store.get_partitioned_gradients_by_param_id(group_id,
+                                                                                          param_id)) < self._world_size:
                                self._grad_store.append_gradients_by_param_id(grad, group_id, param_id)
+                            else:
+                                self._grad_store.add_gradients_by_param_id(grad, rank, group_id, param_id)

                else:
                    flat_grads_list = list(flat_grads.split(len(flat_grads) // self._world_size))
@@ -291,7 +292,10 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
                    sync_tensor(recieved_grad, grad_in_bucket_current_rank)
                    for grad in grad_in_bucket_current_rank:
                        param_id = self._bucket_store.get_param_id_of_grad(grad)
+                        if len(self._grad_store.get_partitioned_gradients_by_param_id(group_id, param_id)) < 1:
                            self._grad_store.append_gradients_by_param_id(grad, group_id, param_id)
+                        else:
+                            self._grad_store.add_gradients_by_param_id(grad, 0, group_id, param_id)

                self._bucket_store.reset()

@@ -315,7 +319,8 @@ class LowLevelZeroOptimizer(OptimizerWrapper):

    def backward(self, loss, retain_graph=False):
        assert not(self._partition_grads and not self.require_grad_sync), \
-            "ZeRO2(partition_grads) and gradient accumulation(no_sync) are not compatible"
+            "ZeRO2(partition_grads) and no_sync are not compatible"
+
        if self.mixed_precision_mixin is not None:
            loss = self.mixed_precision_mixin.pre_backward(loss)

@@ -537,9 +542,12 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
            for k, v in state.items():
                if isinstance(v, torch.Tensor) and k != 'step':
                    working_param = self._param_store.master_to_working_param[id(param)]
-                    gather_tensor = [torch.zeros_like(v) for _ in range(self._world_size)]
-                    dist.all_gather(gather_tensor, v, group=self.dp_pg)
-                    param_state = torch.stack(gather_tensor).view(-1)[:working_param.numel()].reshape_as(working_param)
+                    gather_tensor = [
+                        torch.zeros(v.shape, device='cuda', dtype=v.dtype) for _ in range(self._world_size)
+                    ]
+                    dist.all_gather(gather_tensor, v.cuda(), group=self.dp_pg)
+                    param_state = torch.stack(gather_tensor).view(-1)[:working_param.numel()].reshape_as(
+                        working_param).cpu()
                    zero_state[param][k] = param_state

        states_dict = self._pack_state(zero_state)
@@ -562,10 +570,9 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
                        if padding_size > 0:
                            v = torch.nn.functional.pad(v, [0, padding_size])
                        v_list = v.split(v.numel() // self._world_size)
-                        zero_state_dict['state'][param_idx][k] = v_list[self._local_rank].detach()
+                        zero_state_dict['state'][param_idx][k] = v_list[self._local_rank].detach().clone()

        self.optim.load_state_dict(zero_state_dict)
-        zero_state_dict = dict()

    def state_dict_shard(self, max_shard_size: int = 1024) -> Iterator[Tuple[Dict, int]]:
        """Returns dictionaries containing a whole state of the module one by one. The max size of dictionary shard is specified by ``max_shard_size``.
@@ -594,9 +601,10 @@ class LowLevelZeroOptimizer(OptimizerWrapper):

            for k, v in states.items():
                if isinstance(v, torch.Tensor) and k != 'step':
-                    state_tensor = [torch.zeros_like(v) for _ in range(self._world_size)]
-                    dist.all_gather(state_tensor, v, group=self.dp_pg)
-                    state_tensor = torch.stack(state_tensor).view(-1)[:working_param.numel()].reshape_as(working_param)
+                    state_tensor = [torch.zeros(v.shape, device='cuda', dtype=v.dtype) for _ in range(self._world_size)]
+                    dist.all_gather(state_tensor, v.cuda(), group=self.dp_pg)
+                    state_tensor = torch.stack(state_tensor).view(-1)[:working_param.numel()].reshape_as(
+                        working_param).cpu()
                    current_block_size += state_tensor.numel()
                    current_block[k] = state_tensor


--- a/colossalai/zero/low_level/readme.md
+++ b/colossalai/zero/low_level/readme.md
 # Low Level ZeRO
 >Low Level ZeRO == ZeRO-DP stage 1 and 2, we would denote it as ZeRO.
+## Examples of ZeRO and gradient accumulation
+
+The code below only shows a typical gradient accumulation process, and it drops a lot of details, such as the processing of loss.
+
+```python
+# examples of ZeRO1 with gradient accumulation
+...
+outputs = model(input)
+loss = SomeLoss(outputs)
+if (idx + 1) % ACCUMULATE_STEP != 0:
+    with booster.no_sync(model, optimizer):
+        # Under this context, gradients are not synchronized during backward,
+        # so each rank is left holding different gradients.
+        # Skipping the sync saves backward time.
+        booster.backward(loss, optimizer)
+        continue
+else:
+    # need to sync all the accumulated gradient
+    booster.backward(loss, optimizer)
+    optimizer.step()
+    ...
+```
+
+```python
+# example of ZeRO2 with gradient accumulation
+
+...
+outputs = model(input)
+loss = SomeLoss(outputs)
+# ZeRO2 partitions the gradients, so it can NOT accumulate gradients without syncing (no_sync is unsupported).
+booster.backward(loss, optimizer)
+if (idx + 1) % ACCUMULATE_STEP == 0:
+    optimizer.step()
+...
+```
+

 ## Design:
 ### Notion
@@ -25,11 +61,11 @@ The data structure looks like this:
 ```
 After that, the gradients would be flattened by rank, and the data structure looks like this:
 ```
-# g-0 means flatten([g-00, g-10])
+# g-X0 means flatten([g-00, g-10])
 {
-0: [g-0],
-1: [g-1],
-2: [g-2]
+0: [g-X0],
+1: [g-X1],
+2: [g-X2]
 }
 ```
 For zero1, we iterate the dictionary and do `all_reduce`. For zero2, we can just do `reduce-scatter`.

--- a/colossalai/zero/wrapper.py
+++ b/colossalai/zero/wrapper.py
@@ -109,6 +109,6 @@ def zero_optim_wrapper(model: nn.Module,
        config_dict['clip_grad_norm'] = max_norm
        return LowLevelZeroOptimizer(optimizer, **config_dict, verbose=verbose)
    else:
-        from colossalai.zero.gemini.gemini_optimizer import ZeroOptimizer
+        from colossalai.zero.gemini.gemini_optimizer import GeminiOptimizer
        config_dict['clipping_norm'] = max_norm
-        return ZeroOptimizer(optimizer, model, **config_dict, verbose=verbose)
+        return GeminiOptimizer(optimizer, model, **config_dict, verbose=verbose)
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -18,7 +18,7 @@ RUN apt-get update && \
    rm -rf /var/lib/apt/lists/*

 # install torch
-RUN conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
+RUN conda install -y pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch

 # install ninja
 RUN apt-get update && \
@@ -43,8 +43,9 @@ RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \
 RUN pip install --no-cache-dir titans

 # install tensornvme
-RUN conda install cmake && \
+RUN conda install -y cmake && \
    git clone https://github.com/hpcaitech/TensorNVMe.git && \
    cd TensorNVMe && \
+    apt update -y && apt install -y libaio-dev && \
    pip install -r requirements.txt && \
    pip install -v --no-cache-dir .
--- a/docs/README-zh-Hans.md
+++ b/docs/README-zh-Hans.md
@@ -24,6 +24,7 @@
 </div>

 ## 新闻
+* [2023/09] [70 Billion Parameter LLaMA2 Model Training Accelerated by 195%](https://www.hpc-ai.tech/blog/70b-llama2-training)
 * [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth)
 * [2023/07] [65B Model Pretraining Accelerated by 38%, Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining)
 * [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
@@ -49,7 +50,7 @@
 <li>
   <a href="#并行训练样例展示">并行训练样例展示</a>
   <ul>
-     <li><a href="#LLaMA">LLaMA</a></li>
+     <li><a href="#LLaMA2">LLaMA 1/2</a></li>
     <li><a href="#GPT-3">GPT-3</a></li>
     <li><a href="#GPT-2">GPT-2</a></li>
     <li><a href="#BERT">BERT</a></li>
@@ -210,7 +211,16 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 <p align="right">(<a href="#top">返回顶端</a>)</p>

 ## 并行训练样例展示
-### LLaMA
+### LLaMA2
+<p align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/llama2_pretraining.png" width=600/>
+</p>
+
+- 700亿参数LLaMA2训练加速195%
+[[code]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama)
+[[blog]](https://www.hpc-ai.tech/blog/70b-llama2-training)
+
+### LLaMA1
 <p align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/images/LLaMA_pretraining.png" width=600/>
 </p>

--- a/docs/source/en/features/zero_with_chunk.md
+++ b/docs/source/en/features/zero_with_chunk.md
@@ -54,32 +54,38 @@ We also provide a lightweight chunk search mechanism to help users automatically

 We will use `GeminiDDP` to use ZeRO with chunk-based memory management. This is our new torch.Module wrapper which uses ZeRO-DP and Gemini. ZeRO is for parallelism and Gemini is for memory management.

-Also Make sure that your model is initialized under the context of ColoInitContext.
+Gemini supports LazyInitContext, which saves memory when initializing large models on multiple GPUs.

+If your model has `N` billion parameters and your GPU memory is `M` GB, we recommend you use LazyInitContext when `4N >= M`. Otherwise, LazyInitContext is optional.
+
+<!--- doc-test-ignore-start -->
 ```python
-with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg):
+with LazyInitContext(default_device=torch.device('cuda')):
  model = gpt2_medium(checkpoint=True)
 ```
+<!--- doc-test-ignore-end -->
+
+We provide the user-friendly `Booster` API and recommend using it. If you still want to use the low-level API, read the rest of this section.

-Define the model parameters as follows:
+Wrap the model with `GeminiDDP`.

+<!--- doc-test-ignore-start -->
 ```python
-chunk_manager = init_chunk_manager(model=module,
-                                           init_device=device,
-                                           hidden_dim=hidden_dim,
-                                           search_range_m=search_range_m,
-                                           min_chunk_size_m=min_chunk_size_m)
-gemini_manager = GeminiManager(placement_policy, chunk_manager)
+model = GeminiDDP(model, hidden_dim=hidden_dim, min_chunk_size_m=min_chunk_size_m)
 ```
+<!--- doc-test-ignore-end -->

 `hidden_dim` is the hidden dimension of DNN. Users can provide this argument to speed up searching. If users do not know this argument before training, it is ok. We will use a default value 1024. `min_chunk_size_m` is a floating point, being the minimum chunk size divided by 2^20 (e.g., if min_chunk_size_m=2.5, then the minimum chunk size should be 2.5*(2^20)).If the aggregate size of parameters is still smaller than the minimum chunk size, all parameters will be compacted into one small chunk.

 Initialization of the optimizer.
+<!--- doc-test-ignore-start -->
 ```python
 optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)
 ```
+<!--- doc-test-ignore-end -->

 Training
+<!--- doc-test-ignore-start -->
 ```python
 optimizer.zero_grad()
 outputs = model(input_ids, attn_mask)
@@ -87,6 +93,7 @@ loss = criterion(outputs, input_ids)
 optimizer.backward(loss)
 optimizer.step()
 ```
+<!--- doc-test-ignore-end -->
 > ⚠️ Note: Please do not use `loss.backward()`, the standard way of writing is `optimizer.backward(loss)`.

 ### Train GPT
@@ -142,46 +149,6 @@ class GPTLMLoss(nn.Module):
        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
 ```

-Define tensor parallel and parameter sharding strategies for tensor parallelism:
-
-```python
-def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
-    for mn, module in model.named_modules():
-        for pn, param in module.named_parameters(recurse=False):
-            if hasattr(param, 'visited'):
-                continue
-            param.set_dist_spec(ReplicaSpec())
-            if 'mlp.c_fc' in mn:
-                if 'weight' in pn or 'bias' in pn:
-                    split_param_col_tp1d(param, pg)
-                    param.compute_spec.set_output_replicate(False)
-                else:
-                    param.set_dist_spec(ReplicaSpec())
-            elif 'mlp.c_proj' in mn:
-                if 'weight' in pn:
-                    split_param_row_tp1d(param, pg)
-                else:
-                    param.set_dist_spec(ReplicaSpec())
-            elif 'wte' in mn or 'wpe' in mn:
-                split_param_col_tp1d(param, pg)
-            elif 'c_attn' in mn or 'c_proj' in mn:
-                split_param_col_tp1d(param, pg)
-            else:
-                param.set_dist_spec(ReplicaSpec())
-
-            param.visited = True
-def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup):
-    spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
-    param.set_tensor_spec(*spec)
-
-
-def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup):
-    split_param_single_dim_tp1d(0, param, pg)
-
-
-def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
-    split_param_single_dim_tp1d(-1, param, pg)
-```

 Write a function to get random inputs:

@@ -198,7 +165,7 @@ Finally, we define a model which uses Gemini + ZeRO DDP and define our training
 from colossalai.nn.optimizer import HybridAdam

 from colossalai.booster import Booster
-from colossalai.zero import ColoInitContext
+from colossalai.lazy import LazyInitContext
 from colossalai.booster.plugin import GeminiPlugin

 def main():
@@ -214,17 +181,13 @@ def main():
    optimizer = HybridAdam(model.parameters(), lr=0.001)

    torch.manual_seed(123)
-    default_pg = ProcessGroup(tp_degree=args.tp_degree)
-    default_dist_spec = ShardSpec([-1], [args.tp_degree])
    # build GPT model
-    with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg):
+    with LazyInitContext(default_device=torch.device('cuda')):
      model = gpt2_medium(checkpoint=True)
-    pg = default_pg
-    # Tensor Parallelism (TP)
-    tensor_parallelize(model, pg)

-    # Gemini + ZeRO DP, Note it must be used after TP
-    plugin = GeminiPlugin(placement_policy='cuda', max_norm=1.0, initial_scale=2**5)
+
+    # Gemini + ZeRO DP
+    plugin = GeminiPlugin(max_norm=1.0, initial_scale=2**5)
    booster = Booster(plugin=plugin)
    model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)


--- a/docs/source/zh-Hans/features/zero_with_chunk.md
+++ b/docs/source/zh-Hans/features/zero_with_chunk.md
@@ -53,32 +53,37 @@

 我们将运用`GeminiDDP`的方式来使用基于Chunk内存管理的ZeRO。这是我们新包装的torch.Module ，它使用 ZeRO-DP 和 Gemini，其中ZeRO 用于并行，Gemini 用于内存管理。

-同样需要确保你的模型是在 `ColoInitContext` 的上下文中初始化的。
+Gemini支持惰性初始化, 它可以节省多卡初始化大模型时的显存使用.

+如果你的模型有 `N` billion 个参数，你的 GPU 内存为 `M` GB, 当 `4N >= M` 时，我们推荐使用 LazyInitContext。否则，LazyInitContext 是可选的。
+
+<!--- doc-test-ignore-start -->
 ```python
-with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg):
+with LazyInitContext(default_device=torch.device('cuda')):
  model = gpt2_medium(checkpoint=True)
 ```
+<!--- doc-test-ignore-end -->
+
+我们提供了 `Booster` API，它用户友好。我们推荐你使用 `Booster` API。如果您仍然想使用底层 API，您可以继续阅读本节其他内容。

-定义模型参数如下:
+使用 `GeminiDDP` 包装模型。

+<!--- doc-test-ignore-start -->
 ```python
-chunk_manager = init_chunk_manager(model=module,
-                                   init_device=device,
-                                   hidden_dim=hidden_dim,
-                                   search_range_m=search_range_m,
-                                   min_chunk_size_m=min_chunk_size_m)
-gemini_manager = GeminiManager(placement_policy, chunk_manager)
-model = ZeroDDP(model, gemini_manager)
+model = GeminiDDP(model, hidden_dim=hidden_dim, min_chunk_size_m=min_chunk_size_m)
 ```
+<!--- doc-test-ignore-end -->

 `hidden dim`是DNN的隐藏维度。用户可以提供这个参数来加快搜索速度。如果用户在训练前不知道这个参数也可以。 我们将使用默认值 1024。`min_chunk_size_m`是以兆（2^20）为单位的最小块大小。如果参数的总大小仍然小于最小块大小，则所有参数将被压缩为一个小块。

 初始化优化器。
+<!--- doc-test-ignore-start -->
 ```python
 optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)
 ```
+<!--- doc-test-ignore-end -->

+<!--- doc-test-ignore-start -->
 训练
 ```python
 optimizer.zero_grad()
@@ -87,6 +92,7 @@ loss = criterion(outputs, input_ids)
 optimizer.backward(loss)
 optimizer.step()
 ```
+<!--- doc-test-ignore-end -->
 > ⚠️ 注意：请不要使用`loss.backward()`，规范写法是`optimizer.backward(loss)`。

 ### 训练GPT
@@ -143,47 +149,6 @@ class GPTLMLoss(nn.Module):
        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
 ```

-定义张量并行和参数分片策略：
-
-```python
-def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
-    for mn, module in model.named_modules():
-        for pn, param in module.named_parameters(recurse=False):
-            if hasattr(param, 'visited'):
-                continue
-            param.set_dist_spec(ReplicaSpec())
-            if 'mlp.c_fc' in mn:
-                if 'weight' in pn or 'bias' in pn:
-                    split_param_col_tp1d(param, pg)
-                    param.compute_spec.set_output_replicate(False)
-                else:
-                    param.set_dist_spec(ReplicaSpec())
-            elif 'mlp.c_proj' in mn:
-                if 'weight' in pn:
-                    split_param_row_tp1d(param, pg)
-                else:
-                    param.set_dist_spec(ReplicaSpec())
-            elif 'wte' in mn or 'wpe' in mn:
-                split_param_col_tp1d(param, pg)
-            elif 'c_attn' in mn or 'c_proj' in mn:
-                split_param_col_tp1d(param, pg)
-            else:
-                param.set_dist_spec(ReplicaSpec())
-
-            param.visited = True
-def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup):
-    spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
-    param.set_tensor_spec(*spec)
-
-
-def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup):
-    split_param_single_dim_tp1d(0, param, pg)
-
-
-def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
-    split_param_single_dim_tp1d(-1, param, pg)
-```
-
 写一个获得随机输入的函数:

 ```python
@@ -200,7 +165,7 @@ def get_data(batch_size, seq_len, vocab_size):
 from colossalai.nn.optimizer import HybridAdam

 from colossalai.booster import Booster
-from colossalai.zero import ColoInitContext
+from colossalai.lazy import LazyInitContext
 from colossalai.booster.plugin import GeminiPlugin

 def main():
@@ -216,17 +181,13 @@ def main():
    optimizer = HybridAdam(model.parameters(), lr=0.001)

    torch.manual_seed(123)
-    default_pg = ProcessGroup(tp_degree=args.tp_degree)
-    default_dist_spec = ShardSpec([-1], [args.tp_degree])
    # build GPT model
-    with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg):
+    with LazyInitContext(default_device=torch.device('cuda')):
      model = gpt2_medium(checkpoint=True)
-    pg = default_pg
-    # Tensor Parallelism (TP)
-    tensor_parallelize(model, pg)

-    # Gemini + ZeRO DP, Note it must be used after TP
-    plugin = GeminiPlugin(placement_policy='cuda', max_norm=1.0, initial_scale=2**5)
+
+    # Gemini + ZeRO DP
+    plugin = GeminiPlugin(max_norm=1.0, initial_scale=2**5)
    booster = Booster(plugin=plugin)
    model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)


--- a/examples/community/roberta/pretraining/run_pretraining.py
+++ b/examples/community/roberta/pretraining/run_pretraining.py
@@ -22,7 +22,7 @@ from colossalai.nn.parallel import GeminiDDP, zero_model_wrapper, zero_optim_wra
 from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
 from colossalai.utils import get_current_device
 from colossalai.utils.model.colo_init_context import ColoInitContext
-from colossalai.zero import ZeroOptimizer
+from colossalai.zero import GeminiOptimizer


 def main():
@@ -46,7 +46,7 @@ def main():
        args.local_rank = -1
        args.log_interval = 1
    else:
-        colossalai.launch_from_torch(config={})    #args.colossal_config
+        colossalai.launch_from_torch(config={})    # args.colossal_config
        args.local_rank = int(os.environ["LOCAL_RANK"])
        logger.info(
            f'launch_from_torch, world size: {torch.distributed.get_world_size()} | ' +
@@ -123,7 +123,8 @@ def main():
    get_tflops_func = partial(get_tflops, numel, args.train_micro_batch_size_per_gpu, args.max_seq_length)

    # 144003367 is is the length of the entire dataset
-    steps_per_epoch = 144003367 // world_size // args.train_micro_batch_size_per_gpu // args.gradient_accumulation_steps // args.refresh_bucket_size    #len(dataloader)
+    # len(dataloader)
+    steps_per_epoch = 144003367 // world_size // args.train_micro_batch_size_per_gpu // args.gradient_accumulation_steps // args.refresh_bucket_size
    total_steps = steps_per_epoch * args.epoch

    lr_scheduler = get_lr_scheduler(optimizer, total_steps=total_steps, last_epoch=-1)

--- a/examples/images/diffusion/requirements.txt
+++ b/examples/images/diffusion/requirements.txt
@@ -7,7 +7,7 @@ imageio-ffmpeg==0.4.2
 torchmetrics==0.7
 omegaconf==2.1.1
 test-tube>=0.7.5
-streamlit>=0.73.1
+streamlit>=1.11.1
 einops==0.3.0
 transformers
 webdataset==0.2.5

--- a/examples/images/dreambooth/test_ci.sh
+++ b/examples/images/dreambooth/test_ci.sh
@@ -20,6 +20,5 @@ for plugin in "gemini"; do
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
  --test_run=True \
-  --num_class_images=200 \
-  --placement="auto" # "cuda"
+  --num_class_images=200
 done
--- a/examples/images/dreambooth/train_dreambooth_colossalai.py
+++ b/examples/images/dreambooth/train_dreambooth_colossalai.py
@@ -2,9 +2,9 @@ import argparse
 import hashlib
 import math
 import os
+import shutil
 from pathlib import Path
 from typing import Optional
-import shutil

 import torch
 import torch.nn.functional as F
@@ -19,6 +19,8 @@ from tqdm.auto import tqdm
 from transformers import AutoTokenizer, PretrainedConfig

 import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.logging import disable_existing_loggers, get_dist_logger
@@ -26,8 +28,6 @@ from colossalai.nn.optimizer import HybridAdam
 from colossalai.utils import get_current_device
 from colossalai.zero import ColoInitContext
 from colossalai.zero.gemini import get_static_torch_model
-from colossalai.booster import Booster
-from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin

 disable_existing_loggers()
 logger = get_dist_logger()
@@ -138,10 +138,10 @@ def parse_args(input_args=None):
              " resolution"),
    )
    parser.add_argument(
-        "--placement",
-        type=str,
-        default="cpu",
-        help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
+        "--offload_optim_frac",
+        type=float,
+        default=1.0,
+        help="Fraction of optimizer states to be offloaded. Valid when using colossalai as dist plan.",
    )
    parser.add_argument(
        "--center_crop",
@@ -461,7 +461,6 @@ def main(args):
        revision=args.revision,
    )

-
    if args.externel_unet_path is None:
        logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0])
        unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path,
@@ -491,30 +490,31 @@ def main(args):
    if args.plugin.startswith('torch_ddp'):
        plugin = TorchDDPPlugin()
    elif args.plugin == 'gemini':
-        plugin = GeminiPlugin(placement_policy=args.placement, strict_ddp_mode=True, initial_scale=2 ** 5)
+        plugin = GeminiPlugin(offload_optim_frac=args.offload_optim_frac, strict_ddp_mode=True, initial_scale=2**5)
    elif args.plugin == 'low_level_zero':
-        plugin = LowLevelZeroPlugin(initial_scale=2 ** 5)
+        plugin = LowLevelZeroPlugin(initial_scale=2**5)

    booster = Booster(plugin=plugin, **booster_kwargs)

    # config optimizer for colossalai zero
-    optimizer = HybridAdam(unet.parameters(), lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm)
+    optimizer = HybridAdam(unet.parameters(),
+                           lr=args.learning_rate,
+                           initial_scale=2**5,
+                           clipping_norm=args.max_grad_norm)

    # load noise_scheduler
    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")

    # prepare dataset
    logger.info(f"Prepare dataset from {args.instance_data_dir}", ranks=[0])
-    train_dataset = DreamBoothDataset(
-        instance_data_root=args.instance_data_dir,
+    train_dataset = DreamBoothDataset(instance_data_root=args.instance_data_dir,
                                      instance_prompt=args.instance_prompt,
                                      class_data_root=args.class_data_dir if args.with_prior_preservation else None,
                                      class_prompt=args.class_prompt,
                                      tokenizer=tokenizer,
                                      size=args.resolution,
                                      center_crop=args.center_crop,
-        test=args.test_run
-    )
+                                      test=args.test_run)

    def collate_fn(examples):
        input_ids = [example["instance_prompt_ids"] for example in examples]
@@ -690,6 +690,7 @@ def main(args):
        if args.push_to_hub:
            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)

+
 if __name__ == "__main__":
    args = parse_args()
    main(args)
--- a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py
+++ b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py
@@ -2,9 +2,9 @@ import argparse
 import hashlib
 import math
 import os
+import shutil
 from pathlib import Path
 from typing import Optional
-import shutil

 import torch
 import torch.nn.functional as F
@@ -21,6 +21,8 @@ from tqdm.auto import tqdm
 from transformers import AutoTokenizer, PretrainedConfig

 import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.logging import disable_existing_loggers, get_dist_logger
@@ -28,8 +30,6 @@ from colossalai.nn.optimizer import HybridAdam
 from colossalai.utils import get_current_device
 from colossalai.zero import ColoInitContext, GeminiAdamOptimizer
 from colossalai.zero.gemini import get_static_torch_model
-from colossalai.booster import Booster
-from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin

 disable_existing_loggers()
 logger = get_dist_logger()
@@ -459,7 +459,6 @@ def main(args):
        revision=args.revision,
    )

-
    if args.externel_unet_path is None:
        logger.info(f"Loading UNet2DConditionModel from {args.pretrained_model_name_or_path}", ranks=[0])
        unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path,
@@ -490,8 +489,7 @@ def main(args):
            block_id = int(name[len("down_blocks.")])
            hidden_size = unet.config.block_out_channels[block_id]

-        lora_attn_procs[name] = LoRACrossAttnProcessor(hidden_size=hidden_size,
-                                                       cross_attention_dim=cross_attention_dim)
+        lora_attn_procs[name] = LoRACrossAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)

    unet.set_attn_processor(lora_attn_procs)
    lora_layers = AttnProcsLayers(unet.attn_processors)
@@ -513,14 +511,17 @@ def main(args):
    if args.plugin.startswith('torch_ddp'):
        plugin = TorchDDPPlugin()
    elif args.plugin == 'gemini':
-        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2 ** 5)
+        plugin = GeminiPlugin(strict_ddp_mode=True, initial_scale=2**5)
    elif args.plugin == 'low_level_zero':
-        plugin = LowLevelZeroPlugin(initial_scale=2 ** 5)
+        plugin = LowLevelZeroPlugin(initial_scale=2**5)

    booster = Booster(plugin=plugin, **booster_kwargs)

    # config optimizer for colossalai zero
-    optimizer = HybridAdam(unet.parameters(), lr=args.learning_rate, initial_scale=2**5, clipping_norm=args.max_grad_norm)
+    optimizer = HybridAdam(unet.parameters(),
+                           lr=args.learning_rate,
+                           initial_scale=2**5,
+                           clipping_norm=args.max_grad_norm)

    # load noise_scheduler
    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
@@ -711,6 +712,7 @@ def main(args):
        if args.push_to_hub:
            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)

+
 if __name__ == "__main__":
    args = parse_args()
    main(args)
--- a/examples/images/resnet/README.md
+++ b/examples/images/resnet/README.md
@@ -49,8 +49,8 @@ python eval.py -c ./ckpt-low_level_zero -e 80

 Expected accuracy performance will be:

-| Model     | Single-GPU Baseline FP32 | Booster DDP with FP32 | Booster DDP with FP16 | Booster Low Level Zero |
-| --------- | ------------------------ | --------------------- | --------------------- | ---------------------- |
-| ResNet-18 | 85.85%                   | 84.91%                | 85.46%                | 84.50%                 |
+| Model     | Single-GPU Baseline FP32 | Booster DDP with FP32 | Booster DDP with FP16 | Booster Low Level Zero | Booster Gemini |
+| --------- | ------------------------ | --------------------- | --------------------- | ---------------------- | -------------- |
+| ResNet-18 | 85.85%                   | 84.91%                | 85.46%                | 84.50%                 | 84.60%         |

 **Note: the baseline is adapted from the [script](https://pytorch-tutorial.readthedocs.io/en/latest/tutorial/chapter03_intermediate/3_2_2_cnn_resnet_cifar10/) to use `torchvision.models.resnet18`**