[legacy] move engine to legacy (#4560)

* [legacy] move engine to legacy * [example] fix seq parallel example * [example] fix seq parallel example * [test] test gemini pluging hang * [test] test gemini pluging hang * [test] test gemini pluging hang * [test] test gemini pluging hang * [test] test gemini pluging hang * [example] update seq parallel requirements

[legacy] move engine to legacy (#4560)
* [legacy] move engine to legacy * [example] fix seq parallel example * [example] fix seq parallel example * [test] test gemini pluging hang * [test] test gemini pluging hang * [test] test gemini pluging hang * [test] test gemini pluging hang * [test] test gemini pluging hang * [example] update seq parallel requirements
8accecd5 · Hongxin Liu · 89fe0277 · 8accecd5 · 8accecd5 · 8accecd5
Commit 8accecd5 authored Sep 04, 2023 by Hongxin Liu
20 changed files
--- a/colossalai/builder/builder.py
+++ b/colossalai/builder/builder.py
@@ -71,7 +71,7 @@ def build_gradient_handler(config, model, optimizer):
        optimizer (:class:`torch.optim.Optimizer`): An optimizer object containing parameters for the gradient handler
    Returns:
-        An object of :class:`colossalai.engine.BaseGradientHandler`
+        An object of :class:`colossalai.legacy.engine.BaseGradientHandler`
    """
    config_ = config.copy()
    config_['model'] = model

--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -21,9 +21,9 @@ from colossalai.builder.builder import build_gradient_handler
 from colossalai.context import Config, ConfigException, ParallelMode
 from colossalai.context.moe_context import MOE_CONTEXT
 from colossalai.core import global_context as gpc
-from colossalai.engine import Engine
+from colossalai.legacy.engine import Engine
-from colossalai.engine.gradient_accumulation import accumulate_gradient
+from colossalai.legacy.engine.gradient_accumulation import accumulate_gradient
-from colossalai.engine.schedule import (
+from colossalai.legacy.engine.schedule import (
    InterleavedPipelineSchedule,
    NonPipelineSchedule,
    PipelineSchedule,

--- a/colossalai/engine/__init__.py
+++ b/colossalai/engine/__init__.py
--- a/colossalai/engine/_base_engine.py
+++ b/colossalai/engine/_base_engine.py
@@ -8,11 +8,17 @@ from torch import Tensor
 from torch.nn import Module
 from torch.nn.modules.loss import _Loss
-from colossalai.engine.gradient_handler import BaseGradientHandler
+from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
-from colossalai.engine.schedule import BaseSchedule, InterleavedPipelineSchedule, NonPipelineSchedule, PipelineSchedule
+from colossalai.legacy.engine.schedule import (
+    BaseSchedule,
+    InterleavedPipelineSchedule,
+    NonPipelineSchedule,
+    PipelineSchedule,
+)
 from colossalai.logging import get_dist_logger
-from colossalai.zero.legacy.gemini import BaseOpHook, register_ophooks_recursively
 from colossalai.nn.optimizer import ColossalaiOptimizer
+from colossalai.zero.legacy.gemini import BaseOpHook, register_ophooks_recursively
 class Engine:
    """Basic engine class for training and evaluation. It runs a specific process method

--- a/colossalai/engine/gradient_accumulation/__init__.py
+++ b/colossalai/engine/gradient_accumulation/__init__.py
@@ -4,7 +4,7 @@ import torch.nn as nn
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
-from colossalai.engine import BaseGradientHandler
+from colossalai.legacy.engine import BaseGradientHandler
 from ._gradient_accumulation import (
    GradAccumDataloader,
@@ -33,7 +33,7 @@ def accumulate_gradient(model: nn.Module,
        dataloader (:class:`torch.utils.data.DataLoader` or iterable objects):
            your dataloader object, would be called like iter(dataloader)
        accumulate_size (int): the number of steps to accumulate gradients
-        gradient_handlers (List[:class:`colossalai.engine.BaseGradientHandler`]):
+        gradient_handlers (List[:class:`colossalai.legacy.engine.BaseGradientHandler`]):
            list of gradient handler objects. Default is None.
        lr_scheduler (`torch.optim.lr_scheduler` or `colossalai.nn.lr_scheduler`):
            your ``lr_scheduler`` object for gradient accumulation. Defaults to None.

--- a/colossalai/engine/gradient_accumulation/_gradient_accumulation.py
+++ b/colossalai/engine/gradient_accumulation/_gradient_accumulation.py
@@ -10,7 +10,7 @@ from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.data import DataLoader
-from colossalai.engine import BaseGradientHandler
+from colossalai.legacy.engine import BaseGradientHandler
 from colossalai.nn.optimizer import ColossalaiOptimizer
 from colossalai.utils import conditional_context
@@ -262,7 +262,7 @@ class GradAccumGradientHandler:
    before accumulation size is reached.
    Args:
-        grad_handler (:class:`colossalai.engine.BaseGradientHandler`):
+        grad_handler (:class:`colossalai.legacy.engine.BaseGradientHandler`):
            Your ``gradient_handler`` object for gradient accumulation, would be called when achieving `accumulate_size`.
        accumulate_size (int): The number of steps to accumulate gradients.

--- a/colossalai/engine/gradient_handler/__init__.py
+++ b/colossalai/engine/gradient_handler/__init__.py
--- a/colossalai/engine/gradient_handler/_base_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_base_gradient_handler.py
--- a/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
+from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.registry import GRADIENT_HANDLER
-from ...context.parallel_mode import ParallelMode
 from ._base_gradient_handler import BaseGradientHandler
 from .utils import bucket_allreduce

--- a/colossalai/engine/gradient_handler/_moe_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_moe_gradient_handler.py
 from colossalai.context.moe_context import MOE_CONTEXT
+from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.registry import GRADIENT_HANDLER
 from colossalai.utils.moe import get_moe_epsize_param_dict
-from ...context.parallel_mode import ParallelMode
 from ._base_gradient_handler import BaseGradientHandler
 from .utils import bucket_allreduce

--- a/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
--- a/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py
+from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.registry import GRADIENT_HANDLER
-from ...context.parallel_mode import ParallelMode
 from ._base_gradient_handler import BaseGradientHandler
 from .utils import bucket_allreduce

--- a/colossalai/engine/gradient_handler/_zero_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_zero_gradient_handler.py
--- a/colossalai/engine/gradient_handler/utils.py
+++ b/colossalai/engine/gradient_handler/utils.py
--- a/colossalai/engine/schedule/__init__.py
+++ b/colossalai/engine/schedule/__init__.py
--- a/colossalai/engine/schedule/_base_schedule.py
+++ b/colossalai/engine/schedule/_base_schedule.py
@@ -95,7 +95,7 @@ class BaseSchedule(ABC):
        """The process function over a batch of dataset for training or evaluation.
        Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
            data_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
            forward_only (bool): If True, the process won't include backward.
            return_loss (bool, optional): If False, the loss won't be returned.

--- a/colossalai/engine/schedule/_non_pipeline_schedule.py
+++ b/colossalai/engine/schedule/_non_pipeline_schedule.py
@@ -54,7 +54,7 @@ class NonPipelineSchedule(BaseSchedule):
        The returned labels and loss will None if :attr:`return_loss` is False.
        Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
            data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
            forward_only (bool, optional):
                If True, the model is run for the forward pass, else back propagation will be executed.

--- a/colossalai/engine/schedule/_pipeline_schedule.py
+++ b/colossalai/engine/schedule/_pipeline_schedule.py
@@ -236,7 +236,7 @@ class PipelineSchedule(BaseSchedule):
        Returns output tensor. This is a helper function and can be ignored by users.
        Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
            input_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Input tensor for this pipeline stage.
            return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
            return_output_label (bool, optional): Whether returns output labels.
@@ -274,7 +274,7 @@ class PipelineSchedule(BaseSchedule):
        This is a helper function and can be ignored by users.
        Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
            input_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): input tensor for this pipeline stage.
            output_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): output tensor for this pipeline stage.
            output_obj_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): gradient of output tensor for this pipeline stage.
@@ -314,7 +314,7 @@ class PipelineSchedule(BaseSchedule):
        Returns a tuple with losses if the last stage, an empty tuple otherwise.
        Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
            data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
            forward_only (bool, optional):
                Whether run forward step only. Default is false. If true, no backward will be run.
@@ -518,7 +518,7 @@ class InterleavedPipelineSchedule(PipelineSchedule):
        Returns output tensor. This is a helper function and can be ignored by users.
        Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
            model_chunk_id (int): The id of model chunks.
            input_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Input tensor for this pipeline stage.
            return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
@@ -555,7 +555,7 @@ class InterleavedPipelineSchedule(PipelineSchedule):
        communication between pipeline stages as needed.
        Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
            data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
            forward_only (bool, optional):
                Whether run forward step only. Default is false. If true, no backward will be run.

--- a/colossalai/engine/schedule/_pipeline_schedule_v2.py
+++ b/colossalai/engine/schedule/_pipeline_schedule_v2.py
@@ -69,7 +69,7 @@ class PipelineScheduleV2(PipelineSchedule):
        Returns a tuple with losses if the last stage, an empty tuple otherwise.
        Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
            data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
            forward_only (bool, optional):
                Whether run forward step only. Default is false. If true, no backward will be run.

--- a/colossalai/legacy/trainer/_trainer.py
+++ b/colossalai/legacy/trainer/_trainer.py
@@ -4,7 +4,7 @@ import torch
 from torch.utils.data import DataLoader
 from tqdm import tqdm
-from colossalai.engine import Engine
+from colossalai.legacy.engine import Engine
 from colossalai.legacy.trainer.hooks import BaseHook
 from colossalai.logging import DistributedLogger
 from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0