Unverified commit b5f9e37c, authored by Hongxin Liu and committed by GitHub

[legacy] clean up legacy code (#4743)

* [legacy] remove outdated code of pipeline (#4692)

* [legacy] remove cli of benchmark and update optim (#4690)

* [legacy] remove cli of benchmark and update optim

* [doc] fix cli doc test

* [legacy] fix engine clip grad norm

* [legacy] remove outdated colo tensor (#4694)

* [legacy] remove outdated colo tensor

* [test] fix test import

* [legacy] move outdated zero to legacy (#4696)

* [legacy] clean up utils (#4700)

* [legacy] clean up utils

* [example] update examples

* [legacy] clean up amp

* [legacy] fix amp module

* [legacy] clean up gpc (#4742)

* [legacy] clean up context

* [legacy] clean core, constants and global vars

* [legacy] refactor initialize

* [example] fix examples ci

* [example] fix examples ci

* [legacy] fix tests

* [example] fix gpt example

* [example] fix examples ci

* [devops] fix ci installation

* [example] fix examples ci
parent 32e7f994
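Note on the pattern below: every diff in this commit rewrites imports from a top-level colossalai module to its new home under colossalai.legacy. A minimal before/after sketch, using paths taken from the diffs (whether a non-legacy alias is still re-exported is an assumption to check per module; e.g. MultiTimer and get_current_device remain in colossalai.utils):

# old paths (pre-commit), now moved:
#   from colossalai.pipeline.pipeline_process_group import ppg
#   from colossalai.tensor import ColoTensorSpec
#   from colossalai.core import global_context as gpc
# new paths after this commit:
from colossalai.legacy.pipeline.pipeline_process_group import ppg
from colossalai.legacy.tensor import ColoTensorSpec
from colossalai.legacy.core import global_context as gpc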
@@ -6,8 +6,8 @@ import torch.distributed as dist
from torch._C._distributed_rpc import PyRRef
from torch.futures import Future
from colossalai.pipeline.pipeline_process_group import ppg
from colossalai.pipeline.rpc._pipeline_base import Phase, PipelineEngineBase, UniqueKey, WorkerBase, WorkItem
from colossalai.legacy.pipeline.pipeline_process_group import ppg
from colossalai.legacy.pipeline.rpc._pipeline_base import Phase, PipelineEngineBase, UniqueKey, WorkerBase, WorkItem
# Implementation of different Pipeline schedule
# <strategy>Worker defines the worker for each stage
......
@@ -10,7 +10,7 @@ from torch._C._distributed_rpc import _is_current_rpc_agent_set
from torch.futures import Future
from colossalai.initialize import launch
from colossalai.pipeline.pipeline_process_group import ppg
from colossalai.legacy.pipeline.pipeline_process_group import ppg
def pyobj_map(obj: Any, fn: Callable, process_types: Union[Type, Tuple[Type]] = ()) -> Any:
......
from . import distspec
from .compute_spec import ComputePattern, ComputeSpec
from .dist_spec_mgr import DistSpecManager
from .distspec import ReplicaSpec, ShardSpec
from .process_group import ProcessGroup
from .tensor_spec import ColoTensorSpec
__all__ = [
'ComputePattern',
'ComputeSpec',
'distspec',
'DistSpecManager',
'ProcessGroup',
'ColoTensorSpec',
'ShardSpec',
'ReplicaSpec',
]
@@ -4,12 +4,12 @@ import torch
import torch.distributed as dist
from numpy import prod
from colossalai.tensor.distspec import DistPlacementPattern, _DistSpec
from colossalai.tensor.process_group import ProcessGroup
from colossalai.legacy.tensor.distspec import DistPlacementPattern, _DistSpec
from colossalai.legacy.tensor.process_group import ProcessGroup
# TODO(jiaruifang) circle import, move the divide to colossalai.commons.
# colossalai.tensor shall not import any submodule from colossal.nn
# colossalai.legacy.tensor shall not import any submodule from colossal.nn
def divide(numerator, denominator):
"""Only allow exact division.
......
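The hunk above stops at the divide helper; a hedged sketch of what "Only allow exact division" means in practice (the exact assertion messages are assumptions, not the library's wording):

def divide(numerator, denominator):
    """Only allow exact division."""
    # Reject zero denominators and any division that would leave a remainder.
    assert denominator != 0, 'denominator can not be zero'
    assert numerator % denominator == 0, f'{numerator} is not divisible by {denominator}'
    return numerator // denominator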
from typing import (
Callable,
Dict,
)
import functools
from typing import Callable, Dict
# Custom sharded ops
_COLOSSAL_OPS: Dict[str, Callable] = {}
......
from dataclasses import dataclass
from typing import Optional
from colossalai.tensor.distspec import DistPlacementPattern, _DistSpec
from colossalai.tensor.process_group import ProcessGroup
from colossalai.legacy.tensor.distspec import DistPlacementPattern, _DistSpec
from colossalai.legacy.tensor.process_group import ProcessGroup
from .compute_spec import ComputeSpec
......
@@ -6,8 +6,9 @@ from tqdm import tqdm
from colossalai.legacy.engine import Engine
from colossalai.legacy.trainer.hooks import BaseHook
from colossalai.legacy.utils import is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0
from colossalai.logging import DistributedLogger
from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0
from colossalai.utils import MultiTimer
class Trainer:
......
@@ -4,8 +4,8 @@ import torch
from colossalai.legacy.registry import HOOKS
from colossalai.legacy.trainer.hooks import BaseHook
from colossalai.legacy.utils.checkpointing import save_checkpoint
from colossalai.logging import get_dist_logger
from colossalai.utils.checkpointing import save_checkpoint
from ._lr_scheduler_hook import LRSchedulerHook
......
@@ -5,12 +5,13 @@ import os
import os.path as osp
from typing import List
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.registry import HOOKS
from colossalai.legacy.trainer.hooks._metric_hook import ThroughputMetric
from colossalai.legacy.utils import is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0, report_memory_usage
from colossalai.logging import DistributedLogger
from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0, report_memory_usage
from colossalai.utils import MultiTimer
from ._base_hook import BaseHook
from ._commons_ import _format_number
@@ -112,8 +113,8 @@ class TensorboardHook(BaseHook):
Args:
log_dir (str): Directory of log.
ranks (list): Ranks of processors.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode used in trainer,
defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL.
parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`, optional): Parallel mode used in trainer,
defaults to colossalai.legacy.context.parallel_mode.ParallelMode.GLOBAL.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
......
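For reference, a hedged construction of the hook documented above; the import path of TensorboardHook and its re-export from the hooks package are assumptions inferred from this file, not verified:

from colossalai.legacy.context import ParallelMode
from colossalai.legacy.trainer import hooks

tb_hook = hooks.TensorboardHook(
    log_dir='./tb_logs',                # directory of log
    ranks=[0],                          # ranks of processors that write events
    parallel_mode=ParallelMode.GLOBAL,  # default per the updated docstring
    priority=10,                        # default priority per the docstring
)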
@@ -7,11 +7,12 @@ from typing import Callable
import torch
import torch.distributed as dist
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.communication import all_reduce
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.registry import HOOKS
from colossalai.utils import get_current_device, is_no_pp_or_last_stage
from colossalai.legacy.utils import is_no_pp_or_last_stage
from colossalai.utils import get_current_device
from ._base_hook import BaseHook
from ._commons_ import _format_number
......
from .checkpointing import load_checkpoint, save_checkpoint
from .common import (
clip_grad_norm_fp32,
copy_tensor_parallel_attributes,
count_zeros_fp32,
is_dp_rank_0,
is_model_parallel_parameter,
is_no_pp_or_last_stage,
is_tp_rank_0,
is_using_ddp,
is_using_pp,
is_using_sequence,
param_is_not_tensor_parallel_duplicate,
print_rank_0,
switch_virtual_pipeline_parallel_rank,
sync_model_param,
)
from .data_sampler import DataParallelSampler, get_dataloader
from .memory import (
colo_device_memory_capacity,
colo_device_memory_used,
colo_get_cpu_memory_capacity,
colo_set_cpu_memory_capacity,
colo_set_process_memory_fraction,
report_memory_usage,
)
__all__ = [
'DataParallelSampler',
'get_dataloader',
'save_checkpoint',
'load_checkpoint',
'colo_device_memory_capacity',
'colo_device_memory_used',
'colo_get_cpu_memory_capacity',
'colo_set_cpu_memory_capacity',
'colo_set_process_memory_fraction',
'report_memory_usage',
'clip_grad_norm_fp32',
'copy_tensor_parallel_attributes',
'count_zeros_fp32',
'is_dp_rank_0',
'is_model_parallel_parameter',
'is_no_pp_or_last_stage',
'is_tp_rank_0',
'is_using_ddp',
'is_using_pp',
'is_using_sequence',
'param_is_not_tensor_parallel_duplicate',
'print_rank_0',
'switch_virtual_pipeline_parallel_rank',
'sync_model_param',
]
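A hedged sketch of how the rank helpers re-exported above are typically combined; the guard below mirrors what the trainer hooks appear to do, but the exact combination is an assumption:

from colossalai.legacy.utils import is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0

def should_log_metrics() -> bool:
    # Report metrics from a single representative rank: first data-parallel rank,
    # first tensor-parallel rank, and only the last pipeline stage (or no pipeline).
    return is_dp_rank_0() and is_tp_rank_0() and is_no_pp_or_last_stage()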
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import weakref
import torch
from torch.utils.checkpoint import check_backward_validity, detach_variable
from colossalai.context.random import get_states, get_current_mode, set_seed_states, set_mode, sync_states
from .cuda import get_current_device
import weakref
from colossalai.legacy.context.random import get_current_mode, get_states, set_mode, set_seed_states, sync_states
from colossalai.utils import get_current_device
def copy_to_device(obj, device):
......
from .module_checkpoint import save_checkpoint, load_checkpoint
from .module_checkpoint import load_checkpoint, save_checkpoint
__all__ = ['save_checkpoint', 'load_checkpoint']
from typing import Dict, Optional
import torch
import torch.distributed as dist
from colossalai.interface import OptimizerWrapper
from colossalai.tensor import ColoTensor
from colossalai.nn.optimizer import ColossalaiOptimizer
from colossalai.utils.checkpoint.utils import gather_tensor, scatter_tensor
from typing import Optional, Dict
from .utils import gather_tensor, scatter_tensor
def save_checkpoint(path: str,
epoch: int,
model: torch.nn.Module,
optimizer: Optional[ColossalaiOptimizer] = None,
optimizer: Optional[OptimizerWrapper] = None,
lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
*args,
**kwargs):
@@ -19,7 +22,7 @@ def save_checkpoint(path: str,
path (str): directory to save the checkpoint files.
epoch (int): the number of epoch
model (torch.nn.Module): a torch module initialized by ColoInitContext
optimizer (ColossalaiOptimizer, optional): optimizers. Defaults to None.
optimizer (OptimizerWrapper, optional): optimizers. Defaults to None.
lr_scheduler (torch.optim.lr_scheduler._LRScheduler, optional): lr schedule. Defaults to None.
"""
rank = dist.get_rank()
@@ -74,7 +77,7 @@ def save_checkpoint(path: str,
def load_checkpoint(path: str,
epoch: int,
model: torch.nn.Module,
optimizer: Optional[ColossalaiOptimizer] = None,
optimizer: Optional[OptimizerWrapper] = None,
lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
torch_load_kwargs: Optional[Dict] = None,
load_state_dict_kwargs: Optional[Dict] = None):
@@ -84,7 +87,7 @@ def load_checkpoint(path: str,
path (str): directory to save the checkpoint files.
epoch (int): the number of epoch
model (torch.nn.Module): a torch module initialized by ColoInitContext
optimizer (ColossalaiOptimizer, optional): optimizers. Defaults to None.
optimizer (OptimizerWrapper, optional): optimizers. Defaults to None.
lr_scheduler (torch.optim.lr_scheduler._LRScheduler, optional): lr schedule. Defaults to None.
torch_load_kwargs: (dict, optional): The kwargs of torch.load inside the function
load_state_dict_kwargs (dict, optional): The kwargs of load_state_dict inside the function
......
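A hedged sketch of a caller adapting to the new optimizer type in the signatures above; the module path colossalai.legacy.utils.checkpoint is an assumption, the model is expected to come from ColoInitContext, and a distributed process group must already be initialized:

import torch

from colossalai.interface import OptimizerWrapper
# assumed location after the move into legacy; verify against the package layout
from colossalai.legacy.utils.checkpoint import load_checkpoint, save_checkpoint

def checkpoint_round_trip(path: str, epoch: int, model: torch.nn.Module) -> None:
    # Wrap a plain torch optimizer; the updated signature takes OptimizerWrapper
    # instead of the removed ColossalaiOptimizer.
    optimizer = OptimizerWrapper(torch.optim.Adam(model.parameters(), lr=1e-3))
    save_checkpoint(path, epoch, model, optimizer=optimizer)
    load_checkpoint(path, epoch, model, optimizer=optimizer)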
import torch
import torch.distributed as dist
from colossalai.tensor import ColoTensor, ColoTensorSpec
from colossalai.tensor.distspec import _DistSpec, DistPlacementPattern
from colossalai.legacy.tensor import ColoTensorSpec
from colossalai.legacy.tensor.distspec import DistPlacementPattern, _DistSpec
from colossalai.tensor import ColoTensor
def robust_broadcast(tensor):
......