"tools/git@developer.sourcefind.cn:OpenDAS/openpcdet.git" did not exist on "9ff29e248fcb6f17cc73aaabd9e30a25be016ea4"
Unverified commit b5f9e37c authored by Hongxin Liu, committed by GitHub

[legacy] clean up legacy code (#4743)

* [legacy] remove outdated codes of pipeline (#4692)

* [legacy] remove cli of benchmark and update optim (#4690)

* [legacy] remove cli of benchmark and update optim

* [doc] fix cli doc test

* [legacy] fix engine clip grad norm

* [legacy] remove outdated colo tensor (#4694)

* [legacy] remove outdated colo tensor

* [test] fix test import

* [legacy] move outdated zero to legacy (#4696)

* [legacy] clean up utils (#4700)

* [legacy] clean up utils

* [example] update examples

* [legacy] clean up amp

* [legacy] fix amp module

* [legacy] clean up gpc (#4742)

* [legacy] clean up context

* [legacy] clean core, constants and global vars

* [legacy] refactor initialize

* [example] fix examples ci

* [example] fix examples ci

* [legacy] fix tests

* [example] fix gpt example

* [example] fix examples ci

* [devops] fix ci installation

* [example] fix examples ci
parent 32e7f994
@@ -89,7 +89,7 @@ jobs:
       - name: Install ColossalAI
         run: |
           source activate pytorch
-          pip install -v .
+          CUDA_EXT=1 pip install -v .
       - name: Test the Doc
         run: |
...
@@ -32,7 +32,7 @@ jobs:
       - name: Install ColossalAI
         run: |
-          pip install -v .
+          CUDA_EXT=1 pip install -v .
       - name: Install Doc Test Requirements
         run: |
...
@@ -53,7 +53,7 @@ jobs:
         uses: actions/checkout@v3
       - name: Install Colossal-AI
         run: |
-          pip install -v .
+          CUDA_EXT=1 pip install -v .
       - name: Test the example
         run: |
           dir=${{ matrix.directory }}
...
@@ -88,7 +88,7 @@ jobs:
       - name: Install Colossal-AI
         run: |
-          pip install -v .
+          CUDA_EXT=1 pip install -v .
       - name: Test the example
         run: |
...
@@ -42,7 +42,7 @@ jobs:
       - name: Install Colossal-AI
         run: |
-          pip install -v .
+          CUDA_EXT=1 pip install -v .
       - name: Traverse all files
         run: |
...
-from .initialize import (
-    get_default_parser,
-    initialize,
-    launch,
-    launch_from_openmpi,
-    launch_from_slurm,
-    launch_from_torch,
-)
+from .initialize import launch, launch_from_openmpi, launch_from_slurm, launch_from_torch

 try:
     # .version will be created by setup.py
@@ -15,3 +8,5 @@ except ModuleNotFoundError:
     # and directly set PYTHONPATH to use Colossal-AI which is a bad practice
     __version__ = '0.0.0'
     print('please install Colossal-AI from https://www.colossalai.org/download or from source')
+
+__all__ = ['launch', 'launch_from_openmpi', 'launch_from_slurm', 'launch_from_torch', '__version__']
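Note: get_default_parser and initialize are no longer re-exported from the top-level package. Scripts that relied on colossalai.get_default_parser() can rebuild an equivalent parser themselves; a minimal sketch (the helper name is illustrative, the argument set mirrors the removed function shown later in this diff):

import argparse

def build_launch_parser() -> argparse.ArgumentParser:
    # Hypothetical stand-in for the removed colossalai.get_default_parser().
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, help='path to the config file')
    parser.add_argument('--host', type=str, help='the master address for distributed training')
    parser.add_argument('--port', type=int, help='the master port for distributed training')
    parser.add_argument('--world_size', type=int, help='world size for distributed training')
    parser.add_argument('--rank', type=int, help='rank for the default process group')
    parser.add_argument('--local_rank', type=int, help='local rank on the node')
    parser.add_argument('--backend', type=str, default='nccl', help='backend for distributed communication')
    return parser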
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch.nn as nn
from torch.nn.modules.loss import _Loss
from torch.optim import Optimizer
from colossalai.context import Config
from .amp_type import AMP_TYPE
from .apex_amp import convert_to_apex_amp
from .naive_amp import convert_to_naive_amp
from .torch_amp import convert_to_torch_amp
__all__ = ['convert_to_amp', 'convert_to_naive_amp', 'convert_to_apex_amp', 'convert_to_torch_amp', 'AMP_TYPE']
def convert_to_amp(model: nn.Module, optimizer: Optimizer, criterion: _Loss, mode: AMP_TYPE, amp_config: Config = None):
"""A helper function to wrap training components with Torch AMP modules.
Args:
model (:class:`torch.nn.Module`): your model object.
optimizer (:class:`torch.optim.Optimizer`): your optimizer object.
criterion (:class:`torch.nn.modules.loss._Loss`): your loss function object.
mode (:class:`colossalai.amp.AMP_TYPE`): amp mode.
amp_config (Union[:class:`colossalai.context.Config`, dict]): configuration for different amp modes.
Returns:
A tuple (model, optimizer, criterion).
Note:
``amp_config`` varies with the mode you choose. Check the corresponding AMP mode
for more details about ``amp_config``.
For ``apex_amp``, please check
`apex_amp config <https://nvidia.github.io/apex/amp.html?highlight=apex%20amp>`_.
For ``naive_amp``, please check
`naive_amp config <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/amp/naive_amp/_fp16_optimizer.py#L42>`_.
For ``torch_amp``, please check
`torch_amp config <https://github.com/pytorch/pytorch/blob/master/torch/cuda/amp/grad_scaler.py#L97>`_.
"""
assert isinstance(mode, AMP_TYPE), \
f'expected the argument mode be AMP_TYPE, but got {type(mode)}'
if amp_config is None:
amp_config = Config()
if mode == AMP_TYPE.TORCH:
model, optimizer, criterion = convert_to_torch_amp(model, optimizer, criterion, amp_config)
elif mode == AMP_TYPE.APEX:
model, optimizer = convert_to_apex_amp(model, optimizer, amp_config)
elif mode == AMP_TYPE.NAIVE:
model, optimizer = convert_to_naive_amp(model, optimizer, amp_config)
return model, optimizer, criterion
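For reference, a minimal sketch of how the helper above is typically called; the import path assumes this module now lives under the legacy package, and the model, optimizer and criterion are placeholders:

import torch.nn as nn
from torch.optim import SGD

# Assumed import location after this cleanup; adjust if the module lives elsewhere.
from colossalai.legacy.amp import AMP_TYPE, convert_to_amp

model = nn.Linear(16, 4)
optimizer = SGD(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# AMP_TYPE.TORCH wraps all three components with torch.cuda.amp; as the code
# above shows, the APEX and NAIVE branches return the criterion unchanged.
model, optimizer, criterion = convert_to_amp(model, optimizer, criterion, mode=AMP_TYPE.TORCH)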
import inspect
import torch.nn as nn
from torch.optim import Optimizer
from colossalai.utils import is_no_pp_or_last_stage
from ._fp16_optimizer import FP16Optimizer
from .grad_scaler import ConstantGradScaler, DynamicGradScaler
from .naive_amp import NaiveAMPModel, NaiveAMPOptimizer
def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config):
"""A helper function to wrap training components with naive AMP modules. In this mode,
we forcibly cast the model weights and inputs to FP16, and cast the model outputs to FP32 to calculate loss,
which is equivalent to Apex O3.
Args:
model (:class:`torch.nn.Module`): your model object
optimizer (:class:`torch.optim.Optimizer`): your optimizer object
amp_config (:class:`colossalai.context.Config` or dict): configuration for naive mode amp.
Returns:
Tuple: A tuple (model, optimizer)
The ``amp_config`` should contain parameters below::
verbose (bool, optional): if set to `True`, will print debug info (Default: False).
clip_grad_norm (float, optional): clip gradients with this global L2 norm (Default 0).
Note that clipping is ignored if clip_grad_norm == 0.
dynamic_grad_scale (bool): whether to use dynamic grad scaler.
"""
if isinstance(model, nn.ModuleList):
# interleaved pipeline
module_list = []
for chunk, m in enumerate(model):
output_to_fp32 = is_no_pp_or_last_stage() and chunk == len(model) - 1
module_list.append(NaiveAMPModel(m, output_to_fp32=output_to_fp32))
model = nn.ModuleList(module_list)
else:
output_to_fp32 = is_no_pp_or_last_stage()
model = NaiveAMPModel(model, output_to_fp32=output_to_fp32)
use_dynamic_grad_scaler = amp_config.pop('dynamic_grad_scale', True)
if use_dynamic_grad_scaler:
scaler_class = DynamicGradScaler
else:
scaler_class = ConstantGradScaler
sig = inspect.signature(scaler_class.__init__)
kwargs = dict()
for param in sig.parameters.values():
if param.name in amp_config:
kwargs[param.name] = amp_config.pop(param.name)
grad_scaler = scaler_class(**kwargs)
optimizer = NaiveAMPOptimizer(optimizer, grad_scaler, **amp_config)
return model, optimizer
__all__ = ['convert_to_naive_amp', 'NaiveAMPOptimizer', 'FP16Optimizer']
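A sketch of an amp_config accepted by convert_to_naive_amp, following the docstring above; the values are illustrative only:

from colossalai.context import Config

# Keys listed in the docstring above; any extra keys that match the grad
# scaler's constructor signature are forwarded to it, the rest go to
# NaiveAMPOptimizer.
amp_config = Config(dict(
    verbose=False,            # print debug info
    clip_grad_norm=1.0,       # 0 disables clipping
    dynamic_grad_scale=True,  # DynamicGradScaler instead of ConstantGradScaler
))
# model, optimizer = convert_to_naive_amp(model, optimizer, amp_config)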
@@ -5,8 +5,8 @@ import torch
 from torch.optim import Optimizer

 from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
+from colossalai.interface import OptimizerWrapper
 from colossalai.logging import get_dist_logger
-from colossalai.nn.optimizer import ColossalaiOptimizer
 from colossalai.utils import get_current_device

 from .base_offload_module import BaseOffloadModule
@@ -19,7 +19,7 @@ class OptimState(Enum):
     UNSCALED = 1

-class AMPOptimizer(ColossalaiOptimizer):
+class AMPOptimizer(OptimizerWrapper):
     """
     A wrapper for Optimizer.
     Code reference: https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/optimizer/zero_optimizer.py
...
@@ -13,7 +13,6 @@ import torch.nn as nn
 from torch.optim import Optimizer

 from colossalai.interface import ModelWrapper, OptimizerWrapper
-from colossalai.nn.optimizer import ColossalaiOptimizer
 from colossalai.tensor.d_tensor import (
     is_customized_distributed_tensor,
     is_distributed_tensor,
@@ -130,10 +129,7 @@ def unwrap_optimizer(optimizer: OptimizerWrapper):
     This method should be used before saving/loading it to/from sharded checkpoints.
     '''
-    # TODO(Baizhou): ColossalaiOptimizer will be replaced with OptimizerWrapper in the future
     unwrapped_optim = optimizer.optim
-    if isinstance(unwrapped_optim, ColossalaiOptimizer):
-        unwrapped_optim = unwrapped_optim.optim
     return unwrapped_optim
...
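With ColossalaiOptimizer removed, unwrapping stops at a single level. A small illustration of the behaviour after this change (the wrapped SGD instance is only an example):

import torch.nn as nn
from torch.optim import SGD

from colossalai.interface import OptimizerWrapper

inner = SGD(nn.Linear(2, 2).parameters(), lr=0.1)
wrapper = OptimizerWrapper(inner)

# unwrap_optimizer now simply returns wrapper.optim: the inner torch optimizer,
# with no extra ColossalaiOptimizer level left to peel off.
assert wrapper.optim is inner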
import click
from colossalai.context import Config
from .benchmark import run_benchmark
from .utils import *
__all__ = ['benchmark']
@click.command()
@click.option("-g", "--gpus", type=int, default=None, help="Total number of devices to use.")
@click.option("-b", "--batch_size", type=int, default=8, help="Batch size of the input tensor.")
@click.option("-s", "--seq_len", type=int, default=512, help="Sequence length of the input tensor.")
@click.option("-d", "--dimension", type=int, default=1024, help="Hidden dimension of the input tensor.")
@click.option("-w", "--warmup_steps", type=int, default=10, help="The number of warmup steps.")
@click.option("-p", "--profile_steps", type=int, default=50, help="The number of profiling steps.")
@click.option("-l", "--layers", type=int, default=2)
@click.option("-m",
"--model",
type=click.Choice(['mlp'], case_sensitive=False),
default='mlp',
help="Select the model to benchmark, currently only supports MLP")
def benchmark(gpus: int, batch_size: int, seq_len: int, dimension: int, warmup_steps: int, profile_steps: int,
layers: int, model: str):
args_dict = locals()
args = Config(args_dict)
run_benchmark(args)
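For anyone who still needs the removed command, it was exposed as a click subcommand (colossalai benchmark -g <gpus> ...). A sketch of driving it programmatically on a revision that still ships this module:

from click.testing import CliRunner

# Only works on a checkout that still contains colossalai.cli.benchmark.
from colossalai.cli.benchmark import benchmark

runner = CliRunner()
result = runner.invoke(benchmark, ['--gpus', '2', '--batch_size', '8', '--seq_len', '512'])
print(result.output)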
from functools import partial
from typing import Dict, List
import click
import torch.multiprocessing as mp
import colossalai
from colossalai.cli.benchmark.utils import find_all_configs, get_batch_data, profile_model
from colossalai.context import Config
from colossalai.context.random import reset_seeds
from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.testing import free_port
from colossalai.utils import MultiTimer
from .models import MLP
def run_benchmark(args: Config) -> None:
"""
Run benchmarking with torch.multiprocessing.
"""
# sanity checks
if args.gpus is None:
click.echo("Error: --num_gpus is not given")
exit()
if args.gpus <= 1:
click.echo("Warning: tensor parallel will be activated with at least 2 devices.")
click.echo("=== Benchmarking Parameters ===")
for k, v in args.items():
click.echo(f'{k}: {v}')
click.echo('')
config_list = find_all_configs(args.gpus)
avail_ports = [free_port() for _ in range(len(config_list))]
run_func = partial(run_dist_profiling,
world_size=args.gpus,
port_list=avail_ports,
config_list=config_list,
hyperparams=args)
mp.spawn(run_func, nprocs=args.gpus)
def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_list: List[Dict],
hyperparams: Config) -> None:
"""
A function executed for profiling, this function should be spawn by torch.multiprocessing.
Args:
rank (int): rank of the process
world_size (int): the number of processes
port_list (List[int]): a list of free ports for initializing distributed networks
config_list (List[Dict]): a list of configurations
hyperparams (Config): the hyperparameters given by the user
"""
# disable logging for clean output
disable_existing_loggers()
logger = get_dist_logger()
logger.set_level('WARNING')
for config, port in zip(config_list, port_list):
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
timer = MultiTimer()
# 1D parallel should be skipped if in_features or out_features is not able to be divided exactly by 1D parallel size.
if config.parallel.tensor.mode == '1d' and hyperparams.dimension % config.parallel.tensor.size != 0:
click.echo(
"1D parallel will be skipped because in_features or out_features is not able to be divided exactly by 1D parallel size."
)
continue
if hyperparams.model == 'mlp':
model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers)
else:
if gpc.get_global_rank() == 0:
click.echo("Error: Invalid argument for --model")
exit()
data_func = partial(get_batch_data,
dim=hyperparams.dimension,
batch_size=hyperparams.batch_size,
seq_length=hyperparams.seq_len,
mode=config.parallel.tensor.mode)
fwd_time, bwd_time, max_allocated, max_cached = profile_model(model=model,
warmup_steps=hyperparams.warmup_steps,
profile_steps=hyperparams.profile_steps,
data_func=data_func,
timer=timer)
gpc.destroy()
reset_seeds()
if gpc.get_global_rank() == 0:
config_str = ', '.join([f'{k}: {v}' for k, v in config.parallel.tensor.items()])
click.echo(f"=== {config_str} ===")
click.echo(f"Average forward time: {fwd_time}")
click.echo(f"Average backward time: {bwd_time}")
click.echo(f"Max allocated GPU memory: {max_allocated}")
click.echo(f"Max cached GPU memory: {max_cached}\n")
import torch
import colossalai.legacy.nn as col_nn
class MLP(torch.nn.Module):
def __init__(self, dim: int, layers: int):
super().__init__()
self.layers = torch.nn.ModuleList()
for _ in range(layers):
self.layers.append(col_nn.Linear(dim, dim))
def forward(self, x):
for layer in self.layers:
x = layer(x)
return x
import math
import time
from typing import Callable, Dict, List, Tuple
import torch
from colossalai.context import Config, ParallelMode
from colossalai.utils import MultiTimer
def get_time_stamp() -> int:
"""
Return the time stamp for profiling.
Returns:
time_stamp (int): the time given by time.time()
"""
torch.cuda.synchronize()
time_stamp = time.time()
return time_stamp
def get_memory_states() -> Tuple[float]:
"""
Return the memory statistics.
Returns:
max_allocated (float): the allocated CUDA memory
max_cached (float): the cached CUDA memory
"""
max_allocated = torch.cuda.max_memory_allocated() / (1024**3)
max_cached = torch.cuda.max_memory_reserved() / (1024**3)
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()
return max_allocated, max_cached
def find_all_configs(device_cnt: int) -> List[Dict]:
"""
Find all possible configurations for tensor parallelism
Args:
device_cnt (int): the number of devices
Returns:
config_list (List[Dict]): a list of configurations
"""
def _is_square(num):
# 2D parallel should be implemented with at least 2 devices.
if num <= 1:
return False
return math.floor(math.sqrt(num))**2 == num
def _is_cube(num):
# 3D parallel should be implemented with at least 2 devices.
if num <= 1:
return False
return math.floor(num**(1. / 3.))**3 == num
config_list = []
# add non-parallel config
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode=None)))
config_list.append(config)
# add 1D config
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d')))
config_list.append(config)
# add 2D config only if device_cnt is a square
if _is_square(device_cnt):
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d')))
config_list.append(config)
# check for 2.5D
# iterate over depth
for depth in range(1, device_cnt):
if device_cnt % depth == 0 and _is_square(device_cnt // depth):
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2.5d', depth=depth)))
config_list.append(config)
# check for 3D if device_cnt is a cube
if _is_cube(device_cnt):
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='3d')))
config_list.append(config)
config_list = [Config(cfg) for cfg in config_list]
return config_list
def profile_model(model: torch.nn.Module, warmup_steps: int, profile_steps: int, data_func: Callable,
timer: MultiTimer) -> Tuple[float]:
"""
Profile the forward and backward of a model
Args:
model (torch.nn.Module): a PyTorch model
warmup_steps (int): the number of steps for warmup
profile_steps (int): the number of steps for profiling
data_func (Callable): a function to generate random data
timer (colossalai.utils.MultiTimer): a timer instance for time recording
Returns:
fwd_time (float): the average time taken by the forward pass, in seconds
bwd_time (float): the average time taken by the backward pass, in seconds
max_allocated (float): the maximum GPU memory allocated in GB
max_cached (float): the maximum GPU memory cached in GB
"""
def _run_step(data):
timer.start('forward')
out = model(data)
timer.stop('forward', keep_in_history=True)
timer.start('backward')
out.mean().backward()
timer.stop('backward', keep_in_history=True)
data_list = [data_func() for _ in range(warmup_steps)]
for data in data_list:
_run_step(data)
timer.reset('forward')
timer.reset('backward')
for _ in range(profile_steps):
data = data_func()
_run_step(data)
max_allocated, max_cached = get_memory_states()
fwd_time = timer.get_timer('forward').get_history_mean()
bwd_time = timer.get_timer('backward').get_history_mean()
return fwd_time, bwd_time, max_allocated, max_cached
def get_batch_data(dim: int, batch_size: int, seq_length: int, mode: ParallelMode) -> torch.Tensor:
"""
Return random data of shape (batch_size, seq_length, dim) for profiling.
Args:
dim (int): hidden size
batch_size (int): the number of data samples
seq_length (int): the number of tokens
mode (ParallelMode): Colossal-AI ParallelMode enum
Returns:
data (torch.Tensor): random data
"""
if mode in ['2d', '2.5d']:
batch_size = batch_size // 2
dim = dim // 2
elif mode == '3d':
batch_size = batch_size // 4
dim = dim // 2
data = torch.rand(batch_size, seq_length, dim).cuda()
return data
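As a worked example of find_all_configs above: for device_cnt=4 it yields four candidates; mode None, '1d', '2d' (4 is a perfect square) and '2.5d' with depth=1 (the only depth in 1..3 whose quotient is a perfect square), while '3d' is skipped because 4 is not a cube. Roughly:

# Expected content of find_all_configs(4); each entry is wrapped in a Config
# object in the real return value.
expected = [
    dict(parallel=dict(tensor=dict(size=4, mode=None))),
    dict(parallel=dict(tensor=dict(size=4, mode='1d'))),
    dict(parallel=dict(tensor=dict(size=4, mode='2d'))),
    dict(parallel=dict(tensor=dict(size=4, mode='2.5d', depth=1))),
]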
 import click

-from .benchmark import benchmark
 from .check import check
 from .launcher import run
@@ -19,7 +18,6 @@ def cli():
 cli.add_command(run)
 cli.add_command(check)
-cli.add_command(benchmark)

 if __name__ == '__main__':
     cli()
 from .config import Config, ConfigException
-from .parallel_context import ParallelContext
-from .parallel_mode import ParallelMode
-from .moe_context import MOE_CONTEXT
-from .process_group_initializer import *
-from .random import *
+
+# from .moe_context import MOE_CONTEXT
+
+__all__ = [
+    'Config',
+    'ConfigException',
+]
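After this change only the config helpers remain importable from colossalai.context; the parallel context, ParallelMode, MOE_CONTEXT and the process-group initializers move under colossalai.legacy (the hunks below show the new paths for the pieces touched here). A minimal check of what still works:

# Still importable from the slimmed-down package:
from colossalai.context import Config, ConfigException

cfg = Config(dict(seed=42))
print(cfg.seed)  # Config keeps dict-with-attribute-access behaviour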
@@ -3,13 +3,12 @@ from typing import Tuple
 import torch
 import torch.distributed as dist

-from colossalai.context.parallel_mode import ParallelMode
 from colossalai.context.singleton_meta import SingletonMeta
-from colossalai.tensor import ProcessGroup
+from colossalai.legacy.tensor import ProcessGroup

 def _check_sanity():
-    from colossalai.core import global_context as gpc
+    from colossalai.legacy.core import global_context as gpc
     if gpc.tensor_parallel_size > 1 or gpc.pipeline_parallel_size > 1:
         raise NotImplementedError("Moe is not compatible with tensor or "
                                   "pipeline parallel at present.")
@@ -61,7 +60,7 @@ class MoeContext(metaclass=SingletonMeta):
         self.world_size = dist.get_world_size()

-        from colossalai.core import global_context as gpc
+        from colossalai.legacy.core import global_context as gpc
         self.max_ep_size = gpc.config.get('max_ep_size', self.world_size)
         assert self.world_size % self.max_ep_size == 0, \
             "Maximum expert parallel size must be a factor of the number of GPUs"
...
+import operator
+
 import torch
 import torch.nn as nn
-import operator

-from colossalai.tensor import ProcessGroup
-from colossalai.tensor.distspec import ShardSpec
-from colossalai.tensor.compute_spec import ComputePattern, ComputeSpec
+from colossalai.legacy.tensor import ProcessGroup
+from colossalai.legacy.tensor.compute_spec import ComputePattern, ComputeSpec
+from colossalai.legacy.tensor.distspec import ShardSpec

 ELEMENTWISE_MODULE_OP = [torch.nn.Dropout, torch.nn.ReLU]
 ELEMENTWISE_FUNC_OP = [
@@ -13,7 +15,7 @@ ELEMENTWISE_FUNC_OP = [
 def weight_split(weight: torch.nn.parameter.Parameter, dim: int, col_normal: bool) -> torch.nn.parameter.Parameter:
     """weight_split
     split a nn.Parameter

     Args:
@@ -60,9 +62,9 @@ def row_shard_linear_pass(gm: torch.fx.GraphModule):
 def transformer_mlp_pass(graph_module: torch.fx.GraphModule, process_group: ProcessGroup):
     """
     This IR pass checks for transformer MLP like structure and annotate column and row sharding to the linear layers.
     """
-    #TODO: Needs to handle special cases, like x = linear(x) + linear(x)
+    # TODO: Needs to handle special cases, like x = linear(x) + linear(x)
     graph = graph_module.graph
     world_size = process_group.world_size()
...
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-

-import argparse
 import os
-import pprint
+import warnings
 from pathlib import Path
-from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Dict, Union

 import torch
-import torch.nn as nn
-from torch.nn.modules.loss import _Loss
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.optim.lr_scheduler import _LRScheduler
-from torch.optim.optimizer import Optimizer
-from torch.utils.data import DataLoader
+import torch.distributed as dist

-from colossalai.amp import AMP_TYPE, convert_to_amp
-from colossalai.amp.naive_amp import NaiveAMPModel
-from colossalai.context import Config, ConfigException, ParallelMode
-from colossalai.context.moe_context import MOE_CONTEXT
-from colossalai.core import global_context as gpc
-from colossalai.legacy.builder.builder import build_gradient_handler
-from colossalai.legacy.engine import Engine
-from colossalai.legacy.engine.gradient_accumulation import accumulate_gradient
-from colossalai.legacy.engine.schedule import (
-    InterleavedPipelineSchedule,
-    NonPipelineSchedule,
-    PipelineSchedule,
-    get_tensor_shape,
-)
+from colossalai.context import Config
 from colossalai.logging import get_dist_logger
-from colossalai.nn.optimizer.colossalai_optimizer import ColossalaiOptimizer
-from colossalai.utils import get_current_device, is_using_ddp, is_using_pp, is_using_sequence, sync_model_param
-from colossalai.utils.moe import sync_moe_model_param
-from colossalai.zero.legacy import ShardedOptimizerV2, convert_to_zero_v2
-from colossalai.zero.legacy.gemini.ophooks import BaseOpHook
+from colossalai.utils import set_device, set_seed

-def get_default_parser():
-    """Reads user command line and uses an argument parser to parse the input arguments.
-    Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.
-    Returns:
-        Namespace: Returns the parser with the default arguments, the user may add customized arguments into this parser.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--config', type=str, help='path to the config file')
-    parser.add_argument('--host', type=str, help='the master address for distributed training')
-    parser.add_argument('--port', type=int, help='the master port for distributed training')
-    parser.add_argument('--world_size', type=int, help='world size for distributed training')
-    parser.add_argument('--rank', type=int, help='rank for the default process group')
-    parser.add_argument('--local_rank', type=int, help='local rank on the node')
-    parser.add_argument('--backend', type=str, default='nccl', help='backend for distributed communication')
-    return parser

 def launch(config: Union[str, Path, Config, Dict],
@@ -83,40 +42,23 @@ def launch(config: Union[str, Path, Config, Dict],
     Raises:
         Exception: Raise exception when config type is wrong
     """
-    gpc.verbose = verbose
-
-    # set config
-    assert isinstance(config, (Config, str, Path, dict)), \
-        f'expected argument config to be Config, str or Path, but got {type(config)}'
-    if not isinstance(config, Config) and isinstance(config, dict):
-        config = Config(config)
-    if isinstance(config, (str, Path)):
-        config = Config.from_file(config)
-    gpc.load_config(config)
+    if rank == 0:
+        warnings.warn("`config` is deprecated and will be removed soon.")

     # init default process group
-    gpc.init_global_dist(rank, world_size, backend, host, port)
-
-    # init process groups for different parallel modes from config
-    gpc.init_parallel_groups()
+    init_method = f'tcp://[{host}]:{port}'
+    dist.init_process_group(rank=rank, world_size=world_size, backend=backend, init_method=init_method)

     # set cuda device
     if torch.cuda.is_available():
         # if local rank is not given, calculate automatically
-        gpc.set_device(local_rank)
-
-    # set the number of processes running on the same node
-    gpc.detect_num_processes_on_current_node()
+        set_device(local_rank)

-    gpc.set_seed(seed)
+    set_seed(seed)

     if verbose:
         logger = get_dist_logger()
-        logger.info(
-            f'Distributed environment is initialized, '
-            f'data parallel size: {gpc.data_parallel_size}, pipeline parallel size: {gpc.pipeline_parallel_size}, '
-            f'tensor parallel size: {gpc.tensor_parallel_size}',
-            ranks=[0])
+        logger.info(f'Distributed environment is initialized, world size: {dist.get_world_size()}', ranks=[0])

 def launch_from_slurm(config: Union[str, Path, Config, Dict],
@@ -224,247 +166,3 @@ def launch_from_torch(config: Union[str, Path, Config, Dict],
                           backend=backend,
                           seed=seed,
                           verbose=verbose)
def initialize(model: nn.Module,
optimizer: Optimizer,
criterion: Optional[_Loss] = None,
train_dataloader: Optional[Iterable] = None,
test_dataloader: Optional[Iterable] = None,
lr_scheduler: Optional[_LRScheduler] = None,
ophooks: Optional[List[BaseOpHook]] = None,
verbose: bool = True) -> Tuple[Engine, DataLoader, DataLoader, _LRScheduler]:
"""Core function to wrap the essential training components with our functionality based on the config which is
loaded into gpc.config.
Args:
model (:class:`torch.nn.Module` or Callable): Your model instance or a function to build the model.
optimizer (:class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`):
Your optimizer instance.
criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training.
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
lr_scheduler (:class:`torch.nn.lr_scheduler._LRScheduler`, optional): Your lr scheduler instance, optional.
verbose (bool, optional): Whether to print logs.
Returns:
Tuple (engine, train_dataloader, test_dataloader, lr_scheduler):
A tuple of ``(engine, train_dataloader, test_dataloader, lr_scheduler)``
where only ``engine`` could not be None.
"""
# get logger
logger = get_dist_logger()
gpc.verbose = verbose
# get config from gpc
config = gpc.config
# print config
if verbose:
logger.info(
f"\n========== Your Config ========\n"
f"{pprint.pformat(gpc.config)}\n"
f"================================\n",
ranks=[0])
# cudnn
cudnn_benchmark = config.get('cudnn_benchmark', False)
cudnn_deterministic = config.get('cudnn_deterministic', False)
torch.backends.cudnn.benchmark = cudnn_benchmark
torch.backends.cudnn.deterministic = cudnn_deterministic
if verbose:
logger.info(f"cuDNN benchmark = {cudnn_benchmark}, deterministic = {cudnn_deterministic}", ranks=[0])
# zero
use_zero = hasattr(gpc.config, 'zero')
if use_zero:
zero_cfg = gpc.config.get('zero', None)
if zero_cfg is not None:
cfg_ = zero_cfg.copy()
else:
cfg_ = {}
optimizer_config = zero_cfg.get('optimizer_config', None)
model_config = zero_cfg.get('model_config', None)
model, optimizer = convert_to_zero_v2(model,
optimizer,
model_config=model_config,
optimizer_config=optimizer_config)
logger.info("Initializing ZeRO model and optimizer finished!", ranks=[0])
else:
if isinstance(model, nn.Module):
# first sync model across dp ranks
model.to(get_current_device())
elif isinstance(model, Callable):
model = model().to(get_current_device())
# optimizer maybe a optimizer_cls
if isinstance(optimizer, Callable):
optimizer = optimizer(model.parameters())
logger.warning("Initializing an non ZeRO model with optimizer class")
if not use_zero:
if is_using_sequence():
sync_model_param(model, ParallelMode.SEQUENCE_DP)
elif MOE_CONTEXT.is_initialized:
sync_moe_model_param(model)
elif is_using_ddp():
sync_model_param(model, ParallelMode.DATA)
else:
logger.warning(
"The parameters of models is not automatically synchronized.\n"
"Please make sure that all parameters are the same in data parallel group.",
ranks=[0])
# check amp and zero
fp16_cfg = gpc.config.get('fp16', None)
if fp16_cfg is not None and fp16_cfg.mode is not None and use_zero:
raise ConfigException(
"It is not allowed to set fp16 and zero configuration in your config file at the same time")
# clip grad norm
clip_grad_norm = gpc.config.get('clip_grad_norm', 0.0)
# initialize amp
amp_mode = None
if fp16_cfg is not None and fp16_cfg.mode is not None:
cfg_ = fp16_cfg.copy()
amp_mode = cfg_.pop('mode')
if is_using_pp():
assert amp_mode == AMP_TYPE.NAIVE, 'Pipeline only support NaiveAMP currently'
if amp_mode == AMP_TYPE.NAIVE:
cfg_['clip_grad_norm'] = clip_grad_norm
model, optimizer, criterion = convert_to_amp(model=model,
optimizer=optimizer,
criterion=criterion,
mode=amp_mode,
amp_config=cfg_)
# get torch ddp config
torch_ddp_cfg = gpc.config.get('torch_ddp', dict())
# gradient handler
gradient_handler_cfg = gpc.config.get('gradient_handler', None)
if gradient_handler_cfg is None:
# if gradient handler is not specified in the configuration file,
# check in the following order
# 1. if optimizer is ZERO, then use zero grad handler
# 2. if dp size is larger than 1 and pipeline is not used, use pytorch ddp
# 3. if using pipeline and dp size larger than 1, use data parallel grad handler
if isinstance(optimizer, ShardedOptimizerV2):
gradient_handler_cfg = [dict(type='ZeROGradientHandler')]
if verbose:
logger.info(
"Training with zero is detected, ZeROGradientHandler is automatically "
"added even though not specified in the configuration",
ranks=[0])
elif is_using_ddp() and MOE_CONTEXT.is_initialized:
gradient_handler_cfg = [dict(type='MoeGradientHandler')]
if verbose:
logger.info(
"Data parallel training is detected with moe parallel, MoeGradientHandler is automatically "
"added even though not specified in the configuration",
ranks=[0])
elif is_using_sequence():
model = DDP(model,
process_group=gpc.get_group(ParallelMode.SEQUENCE_DP),
device_ids=[torch.cuda.current_device()],
**torch_ddp_cfg)
if verbose:
logger.info('Model is using torch.nn.parallel.DistributedDataParallel for Sequence Parallelism',
ranks=[0])
elif is_using_ddp() and not is_using_pp() and amp_mode != AMP_TYPE.NAIVE:
model = DDP(model,
process_group=gpc.get_group(ParallelMode.DATA),
device_ids=[torch.cuda.current_device()],
**torch_ddp_cfg)
if verbose:
logger.info('Model is using torch.nn.parallel.DistributedDataParallel for Data Parallelism', ranks=[0])
elif is_using_ddp():
gradient_handler_cfg = [dict(type='DataParallelGradientHandler')]
if verbose:
logger.info(
"Data parallel training is detected when using pipeline parallel, "
"DataParallelGradientHandler is automatically "
"added even though not specified in the configuration",
ranks=[0])
# add pipeline parallel gradient handler, if pipeline shared module is detected
for param in model.parameters():
if getattr(param, 'pipeline_shared_module_pg', None) is not None:
if gradient_handler_cfg is None:
gradient_handler_cfg = [dict(type='PipelineSharedModuleGradientHandler')]
else:
gradient_handler_cfg.append(dict(type='PipelineSharedModuleGradientHandler'))
if verbose:
logger.info(
"pipeline_shared_module is detected, PipelineSharedModuleGradientHandler is automatically "
"added even though not specified in the configuration",
ranks=[0])
break
else:
if not isinstance(gradient_handler_cfg, list):
raise ConfigException(
f"expected gradient_handler in the configuration file to be a list but got {type(gradient_handler_cfg)}"
)
# turn off sync buffer for NaiveAMPModel if using torch DDP and NaiveAMPModel at the same time
# to avoid duplicated buffer synchronization
if isinstance(model, DDP) and isinstance(model.module, NaiveAMPModel):
model.module.sync_buffer = False
# initialize schedule for engine
if is_using_pp():
tensor_shape = get_tensor_shape()
use_interleaved = hasattr(gpc.config, 'model') and hasattr(gpc.config.model, 'num_chunks')
if gpc.is_initialized(ParallelMode.PARALLEL_1D):
scatter_gather = True
else:
scatter_gather = False
if use_interleaved:
if isinstance(model, nn.Sequential):
model = nn.ModuleList([model])
schedule = InterleavedPipelineSchedule(gpc.config.NUM_MICRO_BATCHES,
gpc.config.model.num_chunks,
tensor_shape=tensor_shape,
scatter_gather_tensors=scatter_gather)
else:
schedule = PipelineSchedule(gpc.config.NUM_MICRO_BATCHES,
tensor_shape=tensor_shape,
scatter_gather_tensors=scatter_gather)
else:
schedule = NonPipelineSchedule()
if gradient_handler_cfg is None:
gradient_handlers = None
if verbose and not isinstance(model, DDP):
logger.warning(
"No PyTorch DDP or gradient handler is set up, please make sure you do not need "
"to all-reduce the gradients after a training step.",
ranks=[0])
else:
gradient_handlers = [build_gradient_handler(cfg, model, optimizer) for cfg in gradient_handler_cfg]
# check if optimizer is ColossalaiOptimizer
if not isinstance(optimizer, (ColossalaiOptimizer, ShardedOptimizerV2)):
optimizer = ColossalaiOptimizer(optim=optimizer)
# gradient accumulation
grad_accum_size = gpc.config.get('gradient_accumulation', None)
if grad_accum_size is not None:
optimizer, train_dataloader, gradient_handlers, lr_scheduler = accumulate_gradient(
model=model,
optimizer=optimizer,
dataloader=train_dataloader,
accumulate_size=grad_accum_size,
gradient_handlers=gradient_handlers,
lr_scheduler=lr_scheduler)
engine = Engine(model=model,
optimizer=optimizer,
criterion=criterion,
gradient_handlers=gradient_handlers,
clip_grad_norm=clip_grad_norm,
ophook_list=ophooks,
schedule=schedule)
return engine, train_dataloader, test_dataloader, lr_scheduler
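With gpc gone from the launch path, the new launch shown earlier in this diff only warns about the deprecated config, initializes the default process group, and sets the device and seed. A minimal usage sketch, meant to be run under torchrun (the empty config merely satisfies the deprecated parameter):

import colossalai

# torchrun provides RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT for
# launch_from_torch; the config argument is deprecated and unused here.
colossalai.launch_from_torch(config={})

# Model/optimizer setup now typically goes through the booster/plugin API; the
# old colossalai.initialize() engine flow shown above no longer exists at top level.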
from .initialize import initialize, launch, launch_from_openmpi, launch_from_slurm, launch_from_torch
__all__ = [
'launch',
'launch_from_openmpi',
'launch_from_slurm',
'launch_from_torch',
'initialize',
]
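Assuming this last listing is the legacy package's __init__ (it re-exports initialize alongside the launch helpers), code that still depends on the old config-driven engine flow can import from there. A hedged sketch, to be run under a distributed launcher:

import torch.nn as nn
from torch.optim import SGD

# Assumed legacy location of the old entry points after this commit.
from colossalai.legacy import initialize, launch_from_torch

launch_from_torch(config=dict())  # the legacy launch still loads a config into gpc
model = nn.Linear(8, 8)
optimizer = SGD(model.parameters(), lr=1e-3)
engine, *_ = initialize(model, optimizer, criterion=nn.MSELoss())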