[zero] reorganize zero/gemini folder structure (#3424)

* [zero] refactor low-level zero folder structure
* [zero] fix legacy zero import path
* [zero] fix legacy zero import path
* [zero] remove useless import
* [zero] refactor gemini folder structure
* [zero] refactor gemini folder structure
* [zero] refactor legacy zero import path
* [zero] refactor gemini folder structure
* [zero] refactor gemini folder structure
* [zero] refactor gemini folder structure
* [zero] refactor legacy zero import path
* [zero] fix test import path
* [zero] fix test
* [zero] fix circular import
* [zero] update import

[zero] reorganize zero/gemini folder structure (#3424)
* [zero] refactor low-level zero folder structure
* [zero] fix legacy zero import path
* [zero] fix legacy zero import path
* [zero] remove useless import
* [zero] refactor gemini folder structure
* [zero] refactor gemini folder structure
* [zero] refactor legacy zero import path
* [zero] refactor gemini folder structure
* [zero] refactor gemini folder structure
* [zero] refactor gemini folder structure
* [zero] refactor legacy zero import path
* [zero] fix test import path
* [zero] fix test
* [zero] fix circular import
* [zero] update import
26b7aac0 · ver217 · GitHub · b09adff7 · 26b7aac0 · 26b7aac0
Unverified Commit 26b7aac0 authored Apr 04, 2023 by ver217 Committed by GitHub Apr 04, 2023
20 changed files
--- a/colossalai/gemini/ophooks/__init__.py
+++ b/colossalai/gemini/ophooks/__init__.py
--- a/colossalai/gemini/ophooks/_shard_grad_ophook.py
+++ b/colossalai/gemini/ophooks/_shard_grad_ophook.py
--- a/colossalai/gemini/ophooks/_shard_param_ophook.py
+++ b/colossalai/gemini/ophooks/_shard_param_ophook.py
 import torch
 from colossalai.registry import OPHOOKS
 from . import BaseOpHook

--- a/colossalai/gemini/ophooks/runtime_mem_tracer_hook.py
+++ b/colossalai/gemini/ophooks/runtime_mem_tracer_hook.py
@@ -5,9 +5,9 @@ from typing import List
 import torch
-from colossalai.gemini.memory_tracer import MemStats, SyncCudaMemoryMonitor
-from colossalai.gemini.tensor_utils import alloc_storage, free_storage
 from colossalai.tensor.param_op_hook import ColoParamOpHook
+from colossalai.zero.gemini.memory_tracer import MemStats, SyncCudaMemoryMonitor
+from colossalai.zero.legacy.gemini.tensor_utils import alloc_storage, free_storage
 class TrainingPhase(Enum):

--- a/colossalai/gemini/ophooks/utils.py
+++ b/colossalai/gemini/ophooks/utils.py
--- a/colossalai/gemini/paramhooks/__init__.py
+++ b/colossalai/gemini/paramhooks/__init__.py
--- a/colossalai/gemini/paramhooks/_param_hookmgr.py
+++ b/colossalai/gemini/paramhooks/_param_hookmgr.py
--- a/colossalai/gemini/stateful_tensor.py
+++ b/colossalai/gemini/stateful_tensor.py
 from enum import Enum
-from typing import Optional
+from typing import Optional, Union
 import torch
-from typing import Union
-from colossalai.gemini.gemini_context import GeminiMemoryManager
+from .gemini_context import GeminiMemoryManager
 def sizeof_tensor(tensor: torch.Tensor):
@@ -19,7 +19,7 @@ class TensorState(Enum):
 class StatefulTensor(object):
-    """A Structure stores a Torch Tensor and labeled states. 
+    """A Structure stores a Torch Tensor and labeled states.
    Inspired from the paper:
    PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management

--- a/colossalai/gemini/stateful_tensor_mgr.py
+++ b/colossalai/gemini/stateful_tensor_mgr.py
 import functools
-import torch
 import types
-from colossalai.utils.cuda import get_current_device
+from time import time
-from colossalai.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
-from colossalai.gemini.stateful_tensor import StatefulTensor, TensorState
-from colossalai.gemini.tensor_placement_policy import TensorPlacementPolicy
 from typing import List
+import torch
 from colossalai.logging import get_dist_logger
-from time import time
+from colossalai.utils.cuda import get_current_device
+from .stateful_tensor import StatefulTensor, TensorState
+from .tensor_placement_policy import TensorPlacementPolicy
+from .tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
 class StatefulTensorMgr(object):

--- a/colossalai/gemini/tensor_placement_policy.py
+++ b/colossalai/gemini/tensor_placement_policy.py
@@ -5,11 +5,12 @@ from typing import List, Optional, Type
 import torch
-from colossalai.gemini.memory_tracer import MemStatsCollector
-from colossalai.gemini.stateful_tensor import StatefulTensor
-from colossalai.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
 from colossalai.utils import get_current_device
 from colossalai.utils.memory import colo_device_memory_capacity
+from colossalai.zero.gemini.memory_tracer import MemStatsCollector
+from .stateful_tensor import StatefulTensor
+from .tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
 class TensorPlacementPolicy(ABC):

--- a/colossalai/gemini/tensor_utils.py
+++ b/colossalai/gemini/tensor_utils.py
+from typing import Tuple, Union
 import torch
-from colossalai.gemini.stateful_tensor import StatefulTensor
-from typing import Union, Tuple
+from .stateful_tensor import StatefulTensor
 def is_storage_empty(tensor: torch.Tensor) -> bool:

--- a/colossalai/zero/init_ctx/__init__.py
+++ b/colossalai/zero/init_ctx/__init__.py
--- a/colossalai/zero/init_ctx/init_context.py
+++ b/colossalai/zero/init_ctx/init_context.py
@@ -13,10 +13,10 @@ from colossalai.context.singleton_meta import SingletonMeta
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses
-from colossalai.zero.shard_utils import BaseShardStrategy
+from colossalai.zero.legacy.shard_utils import BaseShardStrategy
-from colossalai.zero.sharded_model._utils import cast_tensor_to_fp16
+from colossalai.zero.legacy.sharded_model._utils import cast_tensor_to_fp16
-from colossalai.zero.sharded_model.sharded_model_v2 import ShardedModelV2
+from colossalai.zero.legacy.sharded_model.sharded_model_v2 import ShardedModelV2
-from colossalai.zero.sharded_param import ShardedParamV2
+from colossalai.zero.legacy.sharded_param import ShardedParamV2
 @dataclass

--- a/colossalai/zero/shard_utils/__init__.py
+++ b/colossalai/zero/shard_utils/__init__.py
--- a/colossalai/zero/shard_utils/base_shard_strategy.py
+++ b/colossalai/zero/shard_utils/base_shard_strategy.py
@@ -2,7 +2,8 @@ from abc import ABC, abstractmethod
 from typing import List, Optional
 import torch.distributed as dist
-from colossalai.zero.sharded_param.sharded_tensor import ShardedTensor
+from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor
 class BaseShardStrategy(ABC):

--- a/colossalai/zero/shard_utils/bucket_tensor_shard_strategy.py
+++ b/colossalai/zero/shard_utils/bucket_tensor_shard_strategy.py
@@ -2,17 +2,18 @@ from typing import List, Optional
 import torch
 import torch.distributed as dist
-from colossalai.utils import get_current_device
-from colossalai.zero.sharded_param.sharded_tensor import ShardedTensor
 from torch._utils import _flatten_dense_tensors as flatten
+from colossalai.utils import get_current_device
+from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor
 from .tensor_shard_strategy import TensorShardStrategy
 class BucketTensorShardStrategy(TensorShardStrategy):
-    """Use the same shard scheme as `TensorShardStrategy`'s, but it gathers tensors of a sub-module together, 
+    """Use the same shard scheme as `TensorShardStrategy`'s, but it gathers tensors of a sub-module together,
-    which will fully utilize network bandwidth. 
+    which will fully utilize network bandwidth.
-    It is especially useful when sub-module contains bias, 
+    It is especially useful when sub-module contains bias,
    since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usually small).
    """

--- a/colossalai/zero/shard_utils/commons.py
+++ b/colossalai/zero/shard_utils/commons.py
-import torch
-import torch.nn.functional as F
 from typing import Tuple
+import torch
 def get_shard(tensor: torch.Tensor, rank: int, world_size: int) -> Tuple[torch.Tensor, int]:
    """Return the local shard of a full tensor."""

--- a/colossalai/zero/shard_utils/tensor_shard_strategy.py
+++ b/colossalai/zero/shard_utils/tensor_shard_strategy.py
@@ -2,11 +2,12 @@ from typing import List, Optional
 import torch
 import torch.distributed as dist
 from colossalai.utils import get_current_device
-from colossalai.zero.shard_utils import BaseShardStrategy
+from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move_inline
-from colossalai.zero.shard_utils.commons import get_shard
+from colossalai.zero.legacy.shard_utils import BaseShardStrategy
-from colossalai.zero.sharded_param.sharded_tensor import ShardedTensor
+from colossalai.zero.legacy.shard_utils.commons import get_shard
-from colossalai.gemini.tensor_utils import colo_model_data_tensor_move_inline
+from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor
 class TensorShardStrategy(BaseShardStrategy):
@@ -27,7 +28,7 @@ class TensorShardStrategy(BaseShardStrategy):
        Args:
            t (ShardedTensor): a tensor to be sharded.
-            process_group (Optional[dist.ProcessGroup], optional): the process group among which tensor shards. 
+            process_group (Optional[dist.ProcessGroup], optional): the process group among which tensor shards.
            Defaults to None.
        """
        if t.is_sharded:

--- a/colossalai/zero/sharded_model/__init__.py
+++ b/colossalai/zero/sharded_model/__init__.py
 from .sharded_model_v2 import ShardedModelV2
 __all__ = ['ShardedModelV2']
\ No newline at end of file
--- a/colossalai/zero/sharded_model/_utils.py
+++ b/colossalai/zero/sharded_model/_utils.py
-from typing import Any, Callable, List, Tuple
+from typing import Any, Callable, List, Tuple, Union
 import torch
 import torch.nn.functional as F
-from typing import Union
-from colossalai.gemini.stateful_tensor import StatefulTensor
+from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor
 def get_gradient_predivide_factor(world_size: int) -> float: