Unverified commit 554aa959, authored by Hongxin Liu, committed by GitHub

[legacy] move communication and nn to legacy and refactor logger (#4671)

* [legacy] move communication to legacy (#4640)

* [legacy] refactor logger and clean up legacy codes (#4654)

* [legacy] make logger independent to gpc

* [legacy] make optim independent to registry

* [legacy] move test engine to legacy

* [legacy] move nn to legacy (#4656)

* [legacy] move nn to legacy

* [checkpointio] fix save hf config

* [test] remove useless rpc pp test

* [legacy] fix nn init

* [example] skip tutorial hybrid parallel example

* [devops] test doc check

* [devops] test doc check
parent 536397cc
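At call sites the change is mechanical: modules that lived under colossalai.communication and colossalai.nn are now imported from the colossalai.legacy namespace. Two pairs taken directly from this diff:

# Old imports (removed by this PR):
from colossalai.communication import all_reduce
from colossalai.nn._ops._utils import dual_all_to_all_tablewise

# New imports (added by this PR):
from colossalai.legacy.communication import all_reduce
from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise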
+import abc
+from typing import List
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from torch.profiler import record_function
-from .cached_embedding import CachedEmbeddingBag
+from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
 from colossalai.tensor import ProcessGroup
-from colossalai.nn._ops._utils import dual_all_to_all_tablewise
-from .embedding_config import TablewiseEmbeddingBagConfig
-from .cache_mgr import EvictionStrategy
-from typing import List
-import abc
+from .cache_mgr import EvictionStrategy
+from .cached_embedding import CachedEmbeddingBag
+from .embedding_config import TablewiseEmbeddingBagConfig
 class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module):
......
-from colossalai.tensor.distspec import _DistSpec
+from typing import Dict, List
 from colossalai.tensor import ComputePattern
-from typing import List, Dict
+from colossalai.tensor.distspec import _DistSpec
 class ColoModule(object):
......
+from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec
 from .colo_module import ColoModule
-from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec
 class ColoEmbedding(ColoModule):
......
+from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec
 from .colo_module import ColoModule
-from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec
 class ColoLinear(ColoModule):
......
 from typing import Dict
-from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup
-from colossalai.tensor import distspec
-from . import ColoModule
 import torch
+from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup, distspec
+from . import ColoModule
 _COLOSSAL_MODULES: Dict[type, ColoModule] = {}
......
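For context, a minimal sketch of how this mapping gets populated. register_colo_module is assumed to be the helper defined further down in the same file (collapsed in this view), and the registration calls are illustrative rather than part of this diff:

import torch

def register_colo_module(module_type: type, colo_module: ColoModule) -> None:
    # Associate a torch module class with its sharding descriptor.
    _COLOSSAL_MODULES[module_type] = colo_module

# ColoLinear and ColoEmbedding are the descriptors from the files above.
register_colo_module(torch.nn.Linear, ColoLinear())
register_colo_module(torch.nn.Embedding, ColoEmbedding())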
@@ -7,9 +7,9 @@ from typing import Callable
 import torch
 import torch.distributed as dist

-from colossalai.communication import all_reduce
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.communication import all_reduce
 from colossalai.legacy.registry import HOOKS
 from colossalai.utils import get_current_device, is_no_pp_or_last_stage
......
@@ -6,8 +6,7 @@ import logging
 from pathlib import Path
 from typing import List, Union

-import colossalai
-from colossalai.context.parallel_mode import ParallelMode
+import torch.distributed as dist


 class DistributedLogger:
@@ -63,6 +62,7 @@ class DistributedLogger:
         self._logger.propagate = False

         DistributedLogger.__instances[name] = self
+        self.rank = dist.get_rank() if dist.is_initialized() else 0

     @staticmethod
     def __get_call_info():
@@ -109,16 +109,10 @@ class DistributedLogger:
         # create log directory
         path.mkdir(parents=True, exist_ok=True)

         # set the default file name if path is a directory
-        if not colossalai.core.global_context.is_initialized(ParallelMode.GLOBAL):
-            rank = 0
-        else:
-            rank = colossalai.core.global_context.get_global_rank()
-
         if suffix is not None:
-            log_file_name = f'rank_{rank}_{suffix}.log'
+            log_file_name = f'rank_{self.rank}_{suffix}.log'
         else:
-            log_file_name = f'rank_{rank}.log'
+            log_file_name = f'rank_{self.rank}.log'
         path = path.joinpath(log_file_name)

         # add file handler
@@ -128,19 +122,14 @@ class DistributedLogger:
         file_handler.setFormatter(formatter)
         self._logger.addHandler(file_handler)

-    def _log(self,
-             level,
-             message: str,
-             parallel_mode: ParallelMode = ParallelMode.GLOBAL,
-             ranks: List[int] = None) -> None:
+    def _log(self, level, message: str, ranks: List[int] = None) -> None:
         if ranks is None:
             getattr(self._logger, level)(message)
         else:
-            local_rank = colossalai.core.global_context.get_local_rank(parallel_mode)
-            if local_rank in ranks:
+            if self.rank in ranks:
                 getattr(self._logger, level)(message)

-    def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def info(self, message: str, ranks: List[int] = None) -> None:
         """Log an info message.

         Args:
@@ -150,10 +139,10 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('info', message_prefix, parallel_mode, ranks)
-        self._log('info', message, parallel_mode, ranks)
+        self._log('info', message_prefix, ranks)
+        self._log('info', message, ranks)

-    def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def warning(self, message: str, ranks: List[int] = None) -> None:
         """Log a warning message.

         Args:
@@ -163,10 +152,10 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('warning', message_prefix, parallel_mode, ranks)
-        self._log('warning', message, parallel_mode, ranks)
+        self._log('warning', message_prefix, ranks)
+        self._log('warning', message, ranks)

-    def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def debug(self, message: str, ranks: List[int] = None) -> None:
         """Log a debug message.

         Args:
@@ -176,10 +165,10 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('debug', message_prefix, parallel_mode, ranks)
-        self._log('debug', message, parallel_mode, ranks)
+        self._log('debug', message_prefix, ranks)
+        self._log('debug', message, ranks)

-    def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def error(self, message: str, ranks: List[int] = None) -> None:
         """Log an error message.

         Args:
@@ -189,5 +178,5 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('error', message_prefix, parallel_mode, ranks)
-        self._log('error', message, parallel_mode, ranks)
+        self._log('error', message_prefix, ranks)
+        self._log('error', message, ranks)
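Net effect of the logger refactor: rank filtering no longer needs gpc or a ParallelMode, only the global torch.distributed rank captured at construction time. A minimal usage sketch, assuming the process group was already initialized by the launcher and using the get_dist_logger helper from colossalai.logging:

from colossalai.logging import get_dist_logger

# torch.distributed.init_process_group(...) has been called by the launcher;
# otherwise the logger falls back to rank 0.
logger = get_dist_logger()

# ranks now filters on dist.get_rank() instead of gpc's local rank.
logger.info('training started', ranks=[0])    # emitted only on global rank 0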
from ._ops import *
from .init import *
from .layer import *
from .loss import *
from .lr_scheduler import *
from .metric import *
from .optimizer import *
from .colossalai_layer import *
from .parallel_1d import *
from .parallel_2d import *
from .parallel_2p5d import *
from .parallel_3d import *
from .parallel_sequence import *
from .moe import *
from .utils import *
from .vanilla import *
from .wrapper import *
def divide(numerator, denominator):
    """Only allow exact division.

    Args:
        numerator (int): Numerator of the division.
        denominator (int): Denominator of the division.

    Returns:
        int: the result of exact division.
    """
    assert denominator != 0, 'denominator can not be zero'
    assert numerator % denominator == 0, \
        '{} is not divisible by {}'.format(numerator, denominator)
    return numerator // denominator
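For instance:

divide(12, 4)    # -> 3
divide(12, 5)    # AssertionError: 12 is not divisible by 5
divide(12, 0)    # AssertionError: denominator can not be zero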
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.nn.layer.utils import get_tensor_parallel_mode
from torch import nn
from torch.nn.modules.loss import *
from torch.nn.modules.loss import _Loss
from .loss_1d import VocabParallelCrossEntropyLoss1D
from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
from .loss_moe import MoeCrossEntropyLoss, MoeLoss
_parallel_cross_entropy = {
'2d': CrossEntropyLoss2D,
'2.5d': CrossEntropyLoss2p5D,
'3d': CrossEntropyLoss3D,
}
_vocab_parallel_cross_entropy = {
'1d': VocabParallelCrossEntropyLoss1D,
'2d': VocabParallelCrossEntropyLoss2D,
'2.5d': VocabParallelCrossEntropyLoss2p5D,
'3d': VocabParallelCrossEntropyLoss3D,
}
class CrossEntropyLoss(_Loss):

    def __init__(self, reduction: bool = True, *args, **kwargs):
        super().__init__()
        tensor_parallel = get_tensor_parallel_mode()
        if tensor_parallel is not None and env.vocab_parallel:
            self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
        elif tensor_parallel is None or tensor_parallel == '1d':
            reduction = 'mean' if reduction else 'none'
            self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
        else:
            self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)

    def forward(self, *args):
        return self.loss(*args)
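A usage sketch: the implementation is chosen once at construction time from the lookup tables above, based on the configured tensor-parallel mode; with no tensor parallelism (or '1d' mode) it degrades to torch's own loss:

import torch

criterion = CrossEntropyLoss(reduction=True)    # e.g. CrossEntropyLoss2D under '2d' mode

logits = torch.randn(8, 10)
labels = torch.randint(0, 10, (8,))
loss = criterion(logits, labels)    # forward() simply delegates to self.loss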
 from torch.optim.lr_scheduler import CosineAnnealingLR as _CosineAnnealingLR

-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import DelayerScheduler, WarmupDelayerScheduler, WarmupScheduler


-@LR_SCHEDULERS.register_module
 class CosineAnnealingLR(_CosineAnnealingLR):
     r"""Set the learning rate of each parameter group using a cosine annealing
     schedule, where :math:`\eta_{max}` is set to the initial lr and
@@ -49,7 +46,6 @@ class CosineAnnealingLR(_CosineAnnealingLR):
         super().__init__(optimizer, total_steps, eta_min=eta_min, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
 class CosineAnnealingWarmupLR(WarmupScheduler):
     """Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied.
@@ -70,7 +66,6 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
         super().__init__(optimizer, warmup_steps, base_scheduler)


-@LR_SCHEDULERS.register_module
 class FlatAnnealingLR(DelayerScheduler):
     """Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay.
@@ -91,7 +86,6 @@ class FlatAnnealingLR(DelayerScheduler):
         super().__init__(optimizer, flat_steps, base_scheduler, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
 class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
     """Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
     applied, and then the learning rate will be a fixed value before starting decay.
......
 from torch.optim.lr_scheduler import _LRScheduler

-from colossalai.legacy.registry import LR_SCHEDULERS
-

-@LR_SCHEDULERS.register_module
 class LinearWarmupLR(_LRScheduler):
     """Linearly warmup learning rate and then linearly decay.
......
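With the registry decorators gone, the schedulers are used as plain torch _LRScheduler subclasses. A sketch, assuming LinearWarmupLR's (optimizer, total_steps, warmup_steps) constructor from this file:

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = LinearWarmupLR(optimizer, total_steps=100, warmup_steps=10)

for _ in range(100):
    optimizer.step()    # update parameters
    scheduler.step()    # lr ramps up for 10 steps, then decays linearly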
@@ -2,12 +2,9 @@ from typing import List
 from torch.optim.lr_scheduler import MultiStepLR as _MultiStepLR

-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import WarmupScheduler


-@LR_SCHEDULERS.register_module
 class MultiStepLR(_MultiStepLR):
     """Decays the learning rate of each parameter group by gamma once the
     number of epoch reaches one of the milestones. Notice that such decay can
@@ -33,7 +30,6 @@ class MultiStepLR(_MultiStepLR):
         super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
 class MultiStepWarmupLR(WarmupScheduler):
     """Multistep learning rate scheduler with warmup.
......
 from torch.optim.lr_scheduler import OneCycleLR as _OneCycleLR

-from colossalai.legacy.registry import LR_SCHEDULERS
-

-@LR_SCHEDULERS.register_module
 class OneCycleLR(_OneCycleLR):
     r"""Sets the learning rate of each parameter group according to the
     1cycle learning rate policy. The 1cycle policy anneals the learning
......
 from torch.optim.lr_scheduler import _LRScheduler

-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import WarmupScheduler


-@LR_SCHEDULERS.register_module
 class PolynomialLR(_LRScheduler):
     """Polynomial learning rate scheduler.
@@ -41,7 +38,6 @@ class PolynomialLR(_LRScheduler):
                 for base_lr in self.base_lrs]


-@LR_SCHEDULERS.register_module
 class PolynomialWarmupLR(WarmupScheduler):
     """Polynomial learning rate scheduler with warmup.
......
@@ -3,10 +3,7 @@ from torch.optim.lr_scheduler import LambdaLR as _LambdaLR
 from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR
 from torch.optim.lr_scheduler import StepLR as _StepLR

-from colossalai.legacy.registry import LR_SCHEDULERS
-

-@LR_SCHEDULERS.register_module
 class LambdaLR(_LambdaLR):
     """Sets the learning rate of each parameter group to the initial lr
     times a given function. When last_epoch=-1, sets initial lr as lr.
@@ -24,7 +21,6 @@ class LambdaLR(_LambdaLR):
         super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
 class MultiplicativeLR(_MultiplicativeLR):
     """Multiply the learning rate of each parameter group by the factor given
     in the specified function. When last_epoch=-1, sets initial lr as lr.
@@ -42,7 +38,6 @@ class MultiplicativeLR(_MultiplicativeLR):
         super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
 class StepLR(_StepLR):
     """Decays the learning rate of each parameter group by gamma every
     step_size epochs. Notice that such decay can happen simultaneously with
@@ -61,7 +56,6 @@ class StepLR(_StepLR):
         super().__init__(optimizer, step_size, gamma=gamma, last_epoch=last_epoch)


-@LR_SCHEDULERS.register_module
 class ExponentialLR(_ExponentialLR):
     """Decays the learning rate of each parameter group by gamma every epoch.
     When last_epoch=-1, sets initial lr as lr
......
@@ -4,12 +4,10 @@ from typing import Optional
 import torch

 from colossalai.kernel.op_builder import CPUAdamBuilder
-from colossalai.legacy.registry import OPTIMIZERS

 from .nvme_optimizer import NVMeOptimizer


-@OPTIMIZERS.register_module
 class CPUAdam(NVMeOptimizer):
     """Implements Adam algorithm.
......
@@ -8,11 +8,9 @@ Licensed under the MIT License.
 '''
 import torch

-from colossalai.legacy.registry import OPTIMIZERS
 from colossalai.utils import multi_tensor_applier


-@OPTIMIZERS.register_module
 class FusedAdam(torch.optim.Optimizer):
     """Implements Adam algorithm.
......
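Likewise, the optimizers are now constructed directly instead of being looked up through the OPTIMIZERS registry. A minimal sketch for CPUAdam, assuming the CPU Adam extension can be built by CPUAdamBuilder on the host:

import torch

model = torch.nn.Linear(16, 16)    # parameters stay on CPU for CPUAdam
optimizer = CPUAdam(model.parameters(), lr=1e-3)

loss = model(torch.randn(2, 16)).sum()
loss.backward()
optimizer.step()    # Adam update runs in the fused CPU kernel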