Unverified commit 554aa959, authored by Hongxin Liu, committed by GitHub

[legacy] move communication and nn to legacy and refactor logger (#4671)

* [legacy] move communication to legacy (#4640)

* [legacy] refactor logger and clean up legacy codes (#4654)

* [legacy] make logger independent to gpc

* [legacy] make optim independent to registry

* [legacy] move test engine to legacy

* [legacy] move nn to legacy (#4656)

* [legacy] move nn to legacy

* [checkpointio] fix save hf config

* [test] remove useless rpc pp test

* [legacy] fix nn init

* [example] skip tutorial hybrid parallel example

* [devops] test doc check

* [devops] test doc check
parent 536397cc
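At call sites the change is mechanical: modules that lived under colossalai.communication and colossalai.nn are now imported from the colossalai.legacy namespace. Two pairs taken directly from this diff:

# Old imports (removed by this PR):
from colossalai.communication import all_reduce
from colossalai.nn._ops._utils import dual_all_to_all_tablewise

# New imports (added by this PR):
from colossalai.legacy.communication import all_reduce
from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise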
+import abc
+from typing import List
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from torch.profiler import record_function
-from .cached_embedding import CachedEmbeddingBag
+from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
 from colossalai.tensor import ProcessGroup
-from colossalai.nn._ops._utils import dual_all_to_all_tablewise
-from .embedding_config import TablewiseEmbeddingBagConfig
-from .cache_mgr import EvictionStrategy
-from typing import List
-import abc
+from .cache_mgr import EvictionStrategy
+from .cached_embedding import CachedEmbeddingBag
+from .embedding_config import TablewiseEmbeddingBagConfig
 class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module):
......
-from colossalai.tensor.distspec import _DistSpec
+from typing import Dict, List
 from colossalai.tensor import ComputePattern
-from typing import List, Dict
+from colossalai.tensor.distspec import _DistSpec
 class ColoModule(object):
......
+from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec
 from .colo_module import ColoModule
-from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec
 class ColoEmbedding(ColoModule):
......
+from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec
 from .colo_module import ColoModule
-from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec
 class ColoLinear(ColoModule):
......
 from typing import Dict
-from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup
-from colossalai.tensor import distspec
-from . import ColoModule
 import torch
+from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup, distspec
+from . import ColoModule
 _COLOSSAL_MODULES: Dict[type, ColoModule] = {}
......
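For context, a minimal sketch of how this mapping gets populated. register_colo_module is assumed to be the helper defined further down in the same file (collapsed in this view), and the registration calls are illustrative rather than part of this diff:

import torch

def register_colo_module(module_type: type, colo_module: ColoModule) -> None:
    # Associate a torch module class with its sharding descriptor.
    _COLOSSAL_MODULES[module_type] = colo_module

# ColoLinear and ColoEmbedding are the descriptors from the files above.
register_colo_module(torch.nn.Linear, ColoLinear())
register_colo_module(torch.nn.Embedding, ColoEmbedding())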
@@ -7,9 +7,9 @@ from typing import Callable
 import torch
 import torch.distributed as dist

-from colossalai.communication import all_reduce
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.communication import all_reduce
 from colossalai.legacy.registry import HOOKS
 from colossalai.utils import get_current_device, is_no_pp_or_last_stage
......
@@ -6,8 +6,7 @@ import logging
 from pathlib import Path
 from typing import List, Union

-import colossalai
-from colossalai.context.parallel_mode import ParallelMode
+import torch.distributed as dist


 class DistributedLogger:
@@ -63,6 +62,7 @@ class DistributedLogger:
         self._logger.propagate = False

         DistributedLogger.__instances[name] = self
+        self.rank = dist.get_rank() if dist.is_initialized() else 0

     @staticmethod
     def __get_call_info():
@@ -109,16 +109,10 @@ class DistributedLogger:
         # create log directory
         path.mkdir(parents=True, exist_ok=True)

         # set the default file name if path is a directory
-        if not colossalai.core.global_context.is_initialized(ParallelMode.GLOBAL):
-            rank = 0
-        else:
-            rank = colossalai.core.global_context.get_global_rank()
-
         if suffix is not None:
-            log_file_name = f'rank_{rank}_{suffix}.log'
+            log_file_name = f'rank_{self.rank}_{suffix}.log'
         else:
-            log_file_name = f'rank_{rank}.log'
+            log_file_name = f'rank_{self.rank}.log'
         path = path.joinpath(log_file_name)

         # add file handler
@@ -128,19 +122,14 @@ class DistributedLogger:
         file_handler.setFormatter(formatter)
         self._logger.addHandler(file_handler)

-    def _log(self,
-             level,
-             message: str,
-             parallel_mode: ParallelMode = ParallelMode.GLOBAL,
-             ranks: List[int] = None) -> None:
+    def _log(self, level, message: str, ranks: List[int] = None) -> None:
         if ranks is None:
             getattr(self._logger, level)(message)
         else:
-            local_rank = colossalai.core.global_context.get_local_rank(parallel_mode)
-            if local_rank in ranks:
+            if self.rank in ranks:
                 getattr(self._logger, level)(message)

-    def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def info(self, message: str, ranks: List[int] = None) -> None:
         """Log an info message.

         Args:
@@ -150,10 +139,10 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('info', message_prefix, parallel_mode, ranks)
-        self._log('info', message, parallel_mode, ranks)
+        self._log('info', message_prefix, ranks)
+        self._log('info', message, ranks)

-    def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def warning(self, message: str, ranks: List[int] = None) -> None:
         """Log a warning message.

         Args:
@@ -163,10 +152,10 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('warning', message_prefix, parallel_mode, ranks)
-        self._log('warning', message, parallel_mode, ranks)
+        self._log('warning', message_prefix, ranks)
+        self._log('warning', message, ranks)

-    def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def debug(self, message: str, ranks: List[int] = None) -> None:
         """Log a debug message.

         Args:
@@ -176,10 +165,10 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('debug', message_prefix, parallel_mode, ranks)
-        self._log('debug', message, parallel_mode, ranks)
+        self._log('debug', message_prefix, ranks)
+        self._log('debug', message, ranks)

-    def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def error(self, message: str, ranks: List[int] = None) -> None:
         """Log an error message.

         Args:
@@ -189,5 +178,5 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('error', message_prefix, parallel_mode, ranks)
-        self._log('error', message, parallel_mode, ranks)
+        self._log('error', message_prefix, ranks)
+        self._log('error', message, ranks)
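Net effect of the logger refactor: rank filtering no longer needs gpc or a ParallelMode, only the global torch.distributed rank captured at construction time. A minimal usage sketch, assuming the process group was already initialized by the launcher and using the get_dist_logger helper from colossalai.logging:

from colossalai.logging import get_dist_logger

# torch.distributed.init_process_group(...) has been called by the launcher;
# otherwise the logger falls back to rank 0.
logger = get_dist_logger()

# ranks now filters on dist.get_rank() instead of gpc's local rank.
logger.info('training started', ranks=[0])    # emitted only on global rank 0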
from ._ops import *
from .init import *
from .layer import *
from .loss import *
from .lr_scheduler import *
from .metric import *
from .optimizer import *
from .colossalai_layer import *
from .parallel_1d import *
from .parallel_2d import *
from .parallel_2p5d import *
from .parallel_3d import *
from .parallel_sequence import *
from .moe import *
from .utils import *
from .vanilla import *
from .wrapper import *
def divide(numerator, denominator):
    """Only allow exact division.

    Args:
        numerator (int): Numerator of the division.
        denominator (int): Denominator of the division.

    Returns:
        int: the result of exact division.
    """
    assert denominator != 0, 'denominator can not be zero'
    assert numerator % denominator == 0, \
        '{} is not divisible by {}'.format(numerator, denominator)
    return numerator // denominator
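For instance:

divide(12, 4)    # -> 3
divide(12, 5)    # AssertionError: 12 is not divisible by 5
divide(12, 0)    # AssertionError: denominator can not be zero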
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.nn.layer.utils import get_tensor_parallel_mode
from torch import nn
from torch.nn.modules.loss import *
from torch.nn.modules.loss import _Loss
from .loss_1d import VocabParallelCrossEntropyLoss1D
from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
from .loss_moe import MoeCrossEntropyLoss, MoeLoss
_parallel_cross_entropy = {
'2d': CrossEntropyLoss2D,
'2.5d': CrossEntropyLoss2p5D,
'3d': CrossEntropyLoss3D,
}
_vocab_parallel_cross_entropy = {
'1d': VocabParallelCrossEntropyLoss1D,
'2d': VocabParallelCrossEntropyLoss2D,
'2.5d': VocabParallelCrossEntropyLoss2p5D,
'3d': VocabParallelCrossEntropyLoss3D,
}
class CrossEntropyLoss(_Loss):

    def __init__(self, reduction: bool = True, *args, **kwargs):
        super().__init__()
        tensor_parallel = get_tensor_parallel_mode()
        if tensor_parallel is not None and env.vocab_parallel:
            self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
        elif tensor_parallel is None or tensor_parallel == '1d':
            reduction = 'mean' if reduction else 'none'
            self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
        else:
            self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)

    def forward(self, *args):
        return self.loss(*args)
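A usage sketch: the implementation is chosen once at construction time from the lookup tables above, based on the configured tensor-parallel mode; with no tensor parallelism (or '1d' mode) it degrades to torch's own loss:

import torch

criterion = CrossEntropyLoss(reduction=True)    # e.g. CrossEntropyLoss2D under '2d' mode

logits = torch.randn(8, 10)
labels = torch.randint(0, 10, (8,))
loss = criterion(logits, labels)    # forward() simply delegates to self.loss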
 from torch.optim.lr_scheduler import CosineAnnealingLR as _CosineAnnealingLR

-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import DelayerScheduler, WarmupDelayerScheduler, WarmupScheduler


-@LR_SCHEDULERS.register_module
 class CosineAnnealingLR(_CosineAnnealingLR):
     r"""Set the learning rate of each parameter group using a cosine annealing
     schedule, where :math:`\eta_{max}` is set to the initial lr and
@@ -49,7 +46,6 @@ class CosineAnnealingLR(_CosineAnnealingLR):
         super().__init__(optimizer, total_steps, eta_min=eta_min, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
 class CosineAnnealingWarmupLR(WarmupScheduler):
     """Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied.
@@ -70,7 +66,6 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
         super().__init__(optimizer, warmup_steps, base_scheduler)


-@LR_SCHEDULERS.register_module
 class FlatAnnealingLR(DelayerScheduler):
     """Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay.
@@ -91,7 +86,6 @@ class FlatAnnealingLR(DelayerScheduler):
         super().__init__(optimizer, flat_steps, base_scheduler, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
 class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
     """Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
     applied, and then the learning rate will be a fixed value before starting decay.
......
 from torch.optim.lr_scheduler import _LRScheduler

-from colossalai.legacy.registry import LR_SCHEDULERS
-

-@LR_SCHEDULERS.register_module
 class LinearWarmupLR(_LRScheduler):
     """Linearly warmup learning rate and then linearly decay.
......
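With the registry decorators gone, the schedulers are used as plain torch _LRScheduler subclasses. A sketch, assuming LinearWarmupLR's (optimizer, total_steps, warmup_steps) constructor from this file:

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = LinearWarmupLR(optimizer, total_steps=100, warmup_steps=10)

for _ in range(100):
    optimizer.step()    # update parameters
    scheduler.step()    # lr ramps up for 10 steps, then decays linearly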
@@ -2,12 +2,9 @@ from typing import List
 from torch.optim.lr_scheduler import MultiStepLR as _MultiStepLR

-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import WarmupScheduler


-@LR_SCHEDULERS.register_module
 class MultiStepLR(_MultiStepLR):
     """Decays the learning rate of each parameter group by gamma once the
     number of epoch reaches one of the milestones. Notice that such decay can
@@ -33,7 +30,6 @@ class MultiStepLR(_MultiStepLR):
         super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
 class MultiStepWarmupLR(WarmupScheduler):
     """Multistep learning rate scheduler with warmup.
......
 from torch.optim.lr_scheduler import OneCycleLR as _OneCycleLR

-from colossalai.legacy.registry import LR_SCHEDULERS
-

-@LR_SCHEDULERS.register_module
 class OneCycleLR(_OneCycleLR):
     r"""Sets the learning rate of each parameter group according to the
     1cycle learning rate policy. The 1cycle policy anneals the learning
......
 from torch.optim.lr_scheduler import _LRScheduler

-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import WarmupScheduler


-@LR_SCHEDULERS.register_module
 class PolynomialLR(_LRScheduler):
     """Polynomial learning rate scheduler.
@@ -41,7 +38,6 @@ class PolynomialLR(_LRScheduler):
                 for base_lr in self.base_lrs]


-@LR_SCHEDULERS.register_module
 class PolynomialWarmupLR(WarmupScheduler):
     """Polynomial learning rate scheduler with warmup.
......
@@ -3,10 +3,7 @@ from torch.optim.lr_scheduler import LambdaLR as _LambdaLR
 from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR
 from torch.optim.lr_scheduler import StepLR as _StepLR

-from colossalai.legacy.registry import LR_SCHEDULERS
-

-@LR_SCHEDULERS.register_module
 class LambdaLR(_LambdaLR):
     """Sets the learning rate of each parameter group to the initial lr
     times a given function. When last_epoch=-1, sets initial lr as lr.
@@ -24,7 +21,6 @@ class LambdaLR(_LambdaLR):
         super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
 class MultiplicativeLR(_MultiplicativeLR):
     """Multiply the learning rate of each parameter group by the factor given
     in the specified function. When last_epoch=-1, sets initial lr as lr.
@@ -42,7 +38,6 @@ class MultiplicativeLR(_MultiplicativeLR):
         super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
 class StepLR(_StepLR):
     """Decays the learning rate of each parameter group by gamma every
     step_size epochs. Notice that such decay can happen simultaneously with
@@ -61,7 +56,6 @@ class StepLR(_StepLR):
         super().__init__(optimizer, step_size, gamma=gamma, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
 class ExponentialLR(_ExponentialLR):
     """Decays the learning rate of each parameter group by gamma every epoch.
     When last_epoch=-1, sets initial lr as lr
......
@@ -4,12 +4,10 @@ from typing import Optional
 import torch

 from colossalai.kernel.op_builder import CPUAdamBuilder
-from colossalai.legacy.registry import OPTIMIZERS

 from .nvme_optimizer import NVMeOptimizer


-@OPTIMIZERS.register_module
 class CPUAdam(NVMeOptimizer):
     """Implements Adam algorithm.
......
@@ -8,11 +8,9 @@ Licensed under the MIT License.
 '''
 import torch

-from colossalai.legacy.registry import OPTIMIZERS
 from colossalai.utils import multi_tensor_applier


-@OPTIMIZERS.register_module
 class FusedAdam(torch.optim.Optimizer):
     """Implements Adam algorithm.
......
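Likewise, the optimizers are now constructed directly instead of being looked up through the OPTIMIZERS registry. A minimal sketch for CPUAdam, assuming the CPU Adam extension can be built by CPUAdamBuilder on the host:

import torch

model = torch.nn.Linear(16, 16)    # parameters stay on CPU for CPUAdam
optimizer = CPUAdam(model.parameters(), lr=1e-3)

loss = model(torch.randn(2, 16)).sum()
loss.backward()
optimizer.step()    # Adam update runs in the fused CPU kernel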