"examples/images/diffusion/test_ci.sh" did not exist on "4df6471f5d044150edfedfe1cf8370a604e096ca"
Unverified commit fae6c92e, authored by Hongxin Liu and committed by GitHub

Merge branch 'main' into feature/shardformer

parents bd186784 ac178ca5
@@ -4,7 +4,7 @@ import torch.nn as nn
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
-from colossalai.engine import BaseGradientHandler
+from colossalai.legacy.engine import BaseGradientHandler
 from ._gradient_accumulation import (
     GradAccumDataloader,
@@ -33,7 +33,7 @@ def accumulate_gradient(model: nn.Module,
         dataloader (:class:`torch.utils.data.DataLoader` or iterable objects):
             your dataloader object, would be called like iter(dataloader)
         accumulate_size (int): the number of steps to accumulate gradients
-        gradient_handlers (List[:class:`colossalai.engine.BaseGradientHandler`]):
+        gradient_handlers (List[:class:`colossalai.legacy.engine.BaseGradientHandler`]):
             list of gradient handler objects. Default is None.
         lr_scheduler (`torch.optim.lr_scheduler` or `colossalai.nn.lr_scheduler`):
             your ``lr_scheduler`` object for gradient accumulation. Defaults to None.
...
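For orientation, this is roughly how the accumulate_gradient helper described by that docstring is called once the paths settle. Treat it as a hedged sketch: the module path, the optimizer keyword, and the four-part return shape are assumptions to verify against the function definition, not guaranteed by this diff.

import torch
import torch.nn as nn
from torch.optim import SGD
from torch.utils.data import DataLoader, TensorDataset

from colossalai.utils.gradient_accumulation import accumulate_gradient  # module path assumed

model = nn.Linear(16, 4)
optimizer = SGD(model.parameters(), lr=0.1)
dataset = TensorDataset(torch.randn(64, 16), torch.randint(0, 4, (64,)))
dataloader = DataLoader(dataset, batch_size=8)

# Assumed return: wrapped (optimizer, dataloader, gradient_handlers, lr_scheduler).
optimizer, dataloader, _, _ = accumulate_gradient(
    model=model,
    optimizer=optimizer,
    dataloader=dataloader,
    accumulate_size=4,  # parameters update once every 4 micro-batches
)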
@@ -10,7 +10,7 @@ from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.data import DataLoader
-from colossalai.engine import BaseGradientHandler
+from colossalai.legacy.engine import BaseGradientHandler
 from colossalai.nn.optimizer import ColossalaiOptimizer
 from colossalai.utils import conditional_context
@@ -262,7 +262,7 @@ class GradAccumGradientHandler:
     before accumulation size is reached.
     Args:
-        grad_handler (:class:`colossalai.engine.BaseGradientHandler`):
+        grad_handler (:class:`colossalai.legacy.engine.BaseGradientHandler`):
             Your ``gradient_handler`` object for gradient accumulation, would be called when achieving `accumulate_size`.
         accumulate_size (int): The number of steps to accumulate gradients.
...
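The docstring change above is mechanical, but the wrapper's behavior is easy to miss: the inner handler must not fire on intermediate micro-batches. A minimal sketch of that gating logic (a hypothetical re-implementation for illustration, not the library source):

class GatedGradientHandler:
    """Sketch: invoke the wrapped handler only once per accumulation cycle."""

    def __init__(self, grad_handler, accumulate_size: int):
        self.grad_handler = grad_handler
        self.accumulate_size = accumulate_size
        self.accumulate_step = 0

    def handle_gradient(self) -> None:
        self.accumulate_step += 1
        if self.accumulate_step == self.accumulate_size:
            # A full accumulation cycle finished: synchronize gradients now.
            self.accumulate_step = 0
            self.grad_handler.handle_gradient()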
-from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.registry import GRADIENT_HANDLER
+from colossalai.legacy.registry import GRADIENT_HANDLER
+from ...context.parallel_mode import ParallelMode
 from ._base_gradient_handler import BaseGradientHandler
 from .utils import bucket_allreduce
...
 from colossalai.context.moe_context import MOE_CONTEXT
-from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.registry import GRADIENT_HANDLER
+from colossalai.legacy.registry import GRADIENT_HANDLER
 from colossalai.utils.moe import get_moe_epsize_param_dict
+from ...context.parallel_mode import ParallelMode
 from ._base_gradient_handler import BaseGradientHandler
 from .utils import bucket_allreduce
...
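All of the handler modules in these hunks register themselves through the GRADIENT_HANDLER registry, which is what actually moves to colossalai.legacy.registry in this merge. A hedged sketch of how a downstream custom handler would register under the new path (the handler body and the _model attribute are assumptions based on the BaseGradientHandler convention):

import torch.distributed as dist

from colossalai.legacy.engine import BaseGradientHandler
from colossalai.legacy.registry import GRADIENT_HANDLER


@GRADIENT_HANDLER.register_module
class NaiveAllReduceGradientHandler(BaseGradientHandler):
    """Hypothetical handler: all-reduce every gradient tensor one by one."""

    def handle_gradient(self):
        # self._model is assumed to be set by BaseGradientHandler.__init__.
        for param in self._model.parameters():
            if param.grad is not None:
                dist.all_reduce(param.grad.data)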
@@ -7,7 +7,7 @@ import torch.distributed as dist
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 from colossalai.core import global_context as gpc
-from colossalai.registry import GRADIENT_HANDLER
+from colossalai.legacy.registry import GRADIENT_HANDLER
 from ._base_gradient_handler import BaseGradientHandler
...
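The _flatten_dense_tensors import in this hunk is the heart of the bucketed synchronization these handlers share: many small gradients are coalesced into one flat buffer, all-reduced once, then scattered back. An illustrative stand-alone version of that pattern (not the library's bucket_allreduce; assumes torch.distributed is already initialized):

import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors


def bucket_allreduce_sketch(grads, group=None):
    # Coalesce the gradient tensors so a single collective is issued
    # instead of one per tensor.
    coalesced = _flatten_dense_tensors(grads)
    coalesced /= dist.get_world_size(group=group)
    dist.all_reduce(coalesced, group=group)
    # Scatter the averaged values back into the original tensors.
    for grad, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
        grad.copy_(synced)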
-from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.registry import GRADIENT_HANDLER
+from colossalai.legacy.registry import GRADIENT_HANDLER
+from ...context.parallel_mode import ParallelMode
 from ._base_gradient_handler import BaseGradientHandler
 from .utils import bucket_allreduce
...
-from colossalai.registry import GRADIENT_HANDLER
+from colossalai.legacy.registry import GRADIENT_HANDLER
 from ._base_gradient_handler import BaseGradientHandler
...
@@ -95,7 +95,7 @@ class BaseSchedule(ABC):
     """The process function over a batch of dataset for training or evaluation.
     Args:
-        engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+        engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
         data_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
         forward_only (bool): If True, the process won't include backward.
         return_loss (bool, optional): If False, the loss won't be returned.
...
@@ -54,7 +54,7 @@ class NonPipelineSchedule(BaseSchedule):
     The returned labels and loss will None if :attr:`return_loss` is False.
     Args:
-        engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+        engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
         data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
         forward_only (bool, optional):
             If True, the model is run for the forward pass, else back propagation will be executed.
...
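For reference, the contract those docstrings describe is small: one batch in, forward (and optionally backward), loss returned or None. A simplified sketch of a NonPipelineSchedule-style step, assuming the legacy Engine's callable, criterion, and backward interface:

def forward_backward_step_sketch(engine, data_iter, forward_only=False, return_loss=True):
    data, label = next(data_iter)
    output = engine(data)  # forward through the engine-wrapped model
    loss = engine.criterion(output, label) if return_loss else None
    if not forward_only:
        engine.backward(loss)  # backward via the engine, not loss.backward()
    return output, label, loss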
@@ -236,7 +236,7 @@ class PipelineSchedule(BaseSchedule):
     Returns output tensor. This is a helper function and can be ignored by users.
     Args:
-        engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+        engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
         input_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Input tensor for this pipeline stage.
         return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
         return_output_label (bool, optional): Whether returns output labels.
@@ -274,7 +274,7 @@ class PipelineSchedule(BaseSchedule):
     This is a helper function and can be ignored by users.
     Args:
-        engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+        engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
         input_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): input tensor for this pipeline stage.
         output_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): output tensor for this pipeline stage.
         output_obj_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): gradient of output tensor for this pipeline stage.
@@ -314,7 +314,7 @@ class PipelineSchedule(BaseSchedule):
     Returns a tuple with losses if the last stage, an empty tuple otherwise.
     Args:
-        engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+        engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
         data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
         forward_only (bool, optional):
             Whether run forward step only. Default is false. If true, no backward will be run.
@@ -518,7 +518,7 @@ class InterleavedPipelineSchedule(PipelineSchedule):
     Returns output tensor. This is a helper function and can be ignored by users.
     Args:
-        engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+        engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
         model_chunk_id (int): The id of model chunks.
         input_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Input tensor for this pipeline stage.
         return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
@@ -555,7 +555,7 @@ class InterleavedPipelineSchedule(PipelineSchedule):
     communication between pipeline stages as needed.
     Args:
-        engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+        engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
         data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
         forward_only (bool, optional):
             Whether run forward step only. Default is false. If true, no backward will be run.
...
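The backward-step hunk above only renames the engine type, but the helper's role is worth spelling out: each stage backpropagates locally from the gradient handed back by the next stage. A simplified sketch, single tensors only; engine.backward_by_grad is assumed from the legacy Engine API:

def backward_step_sketch(engine, input_obj, output_obj, output_obj_grad):
    if input_obj is not None:
        input_obj.retain_grad()
    if output_obj_grad is None:
        # Last pipeline stage: output_obj is the loss itself.
        engine.backward(output_obj)
    else:
        # Intermediate stage: seed autograd with the downstream gradient.
        engine.backward_by_grad(output_obj, output_obj_grad)
    # Gradient to hand to the previous stage.
    return input_obj.grad if input_obj is not None else None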
@@ -69,7 +69,7 @@ class PipelineScheduleV2(PipelineSchedule):
     Returns a tuple with losses if the last stage, an empty tuple otherwise.
     Args:
-        engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+        engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
         data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
         forward_only (bool, optional):
             Whether run forward step only. Default is false. If true, no backward will be run.
...
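Taken together, the schedule hunks are one mechanical rewrite: every class keeps its name and only the import root changes. In sketch form, with the pre-merge paths in comments:

# from colossalai.engine import Engine, BaseGradientHandler   # before this merge
from colossalai.legacy.engine import Engine, BaseGradientHandler

# from colossalai.registry import GRADIENT_HANDLER            # before this merge
from colossalai.legacy.registry import GRADIENT_HANDLER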
-from typing import Union, List, Any
+from typing import Any, List, Union
 import torch
 from torch.utils.data import DataLoader
 from tqdm import tqdm
-from colossalai.engine import Engine
+from colossalai.legacy.engine import Engine
+from colossalai.legacy.trainer.hooks import BaseHook
 from colossalai.logging import DistributedLogger
-from colossalai.utils import MultiTimer
-from colossalai.utils import is_dp_rank_0, is_tp_rank_0, is_no_pp_or_last_stage
-from colossalai.trainer.hooks import BaseHook
+from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0
 class Trainer:
...
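With the trainer relocated as well, a minimal loop under the new paths would look like the sketch below. Hedged throughout: the colossalai.legacy.trainer package path is inferred from this diff, and the Trainer constructor and fit() keywords follow the public Trainer API but should be re-checked against the class.

from colossalai.legacy.trainer import Trainer  # package path assumed from this diff
from colossalai.legacy.trainer.hooks import LossHook, LRSchedulerHook
from colossalai.logging import get_dist_logger
from colossalai.utils import MultiTimer

# `engine`, `train_dataloader`, and `lr_scheduler` stand in for objects a
# real script obtains from colossalai.initialize(...); placeholders here.
trainer = Trainer(engine=engine, timer=MultiTimer(), logger=get_dist_logger())
trainer.fit(
    train_dataloader=train_dataloader,
    epochs=10,
    hooks=[LossHook(), LRSchedulerHook(lr_scheduler, by_epoch=True)],
    display_progress=True,
)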
 from ._base_hook import BaseHook
 from ._checkpoint_hook import SaveCheckpointHook
-from ._log_hook import (LogMemoryByEpochHook, LogMetricByEpochHook, LogMetricByStepHook, LogTimingByEpochHook,
-                        TensorboardHook)
+from ._log_hook import (
+    LogMemoryByEpochHook,
+    LogMetricByEpochHook,
+    LogMetricByStepHook,
+    LogTimingByEpochHook,
+    TensorboardHook,
+)
 from ._lr_scheduler_hook import LRSchedulerHook
 from ._metric_hook import AccuracyHook, LossHook, MetricHook, ThroughputHook
...
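Finally, a hedged sketch of a downstream custom hook built on the re-exported BaseHook. The priority argument and the after_train_iter signature follow the hook convention used by these modules, but treat both as assumptions:

from colossalai.legacy.trainer.hooks import BaseHook


class PrintLossHook(BaseHook):
    """Hypothetical hook: print the loss every 100 training iterations."""

    def __init__(self, priority: int = 10):
        super().__init__(priority)

    def after_train_iter(self, trainer, output, label, loss):
        if loss is not None and trainer.cur_step % 100 == 0:
            print(f"step {trainer.cur_step}: loss={loss.item():.4f}")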