Unverified commit 554aa959, authored by Hongxin Liu and committed by GitHub

[legacy] move communication and nn to legacy and refactor logger (#4671)

* [legacy] move communication to legacy (#4640)

* [legacy] refactor logger and clean up legacy codes (#4654)

* [legacy] make logger independent of gpc

* [legacy] make optim independent of registry

* [legacy] move test engine to legacy

* [legacy] move nn to legacy (#4656)

* [legacy] move nn to legacy

* [checkpointio] fix save hf config

* [test] remove useless rpc pp test

* [legacy] fix nn init

* [example] skip tutorial hybrid parallel example

* [devops] test doc check

* [devops] test doc check
parent 536397cc
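Taken together, the bullets above describe a namespace move: symbols that previously lived under colossalai.communication and colossalai.nn now live under the colossalai.legacy package, while the logger and optimizer lose their dependence on gpc and the registry. A minimal before/after sketch of the import migration that the hunks below apply (module and symbol names are taken from the diff; the pairing is illustrative only):

# Old import paths, removed by this commit:
#   from colossalai.communication import all_reduce, broadcast
#   from colossalai.nn.layer.base_layer import ParallelLayer

# New import paths, added by this commit:
from colossalai.legacy.communication import all_reduce, broadcast
from colossalai.legacy.nn.layer.base_layer import ParallelLayer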
@@ -8,10 +8,10 @@ import torch.nn.functional as F
from torch import Tensor
from torch.nn import Parameter
from colossalai.communication import broadcast
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.communication import broadcast
from colossalai.legacy.registry import LAYERS
from colossalai.nn import init as init
from colossalai.utils.checkpointing import gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict
......
from ._operation import reduce_by_batch_2p5d, split_batch_2p5d
from .layers import (Classifier2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D, PatchEmbedding2p5D,
                     VocabParallelClassifier2p5D, VocabParallelEmbedding2p5D)
from .layers import (
    Classifier2p5D,
    Embedding2p5D,
    LayerNorm2p5D,
    Linear2p5D,
    PatchEmbedding2p5D,
    VocabParallelClassifier2p5D,
    VocabParallelEmbedding2p5D,
)

__all__ = [
    'split_batch_2p5d', 'reduce_by_batch_2p5d', 'Linear2p5D', 'LayerNorm2p5D', 'Classifier2p5D', 'PatchEmbedding2p5D',
......
@@ -2,12 +2,13 @@ from typing import Any, Tuple
import torch
import torch.distributed as dist
from colossalai.communication.collective import (all_gather, all_reduce, reduce_scatter)
from torch import Tensor
from torch.cuda.amp import custom_bwd, custom_fwd
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter
from colossalai.utils import get_current_device
from torch import Tensor
from torch.cuda.amp import custom_bwd, custom_fwd
def get_parallel_group(parallel_mode: ParallelMode):
......
@@ -8,10 +8,10 @@ import torch.nn.functional as F
from torch import Tensor
from torch.nn import Parameter
from colossalai.communication import broadcast
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.communication import broadcast
from colossalai.legacy.registry import LAYERS
from colossalai.nn import init as init
from colossalai.utils.checkpointing import (
......
from ._operation import reduce_by_batch_3d, split_batch_3d, split_tensor_3d
from .layers import (Classifier3D, Embedding3D, LayerNorm3D, Linear3D, PatchEmbedding3D, VocabParallelClassifier3D,
                     VocabParallelEmbedding3D)
from .layers import (
    Classifier3D,
    Embedding3D,
    LayerNorm3D,
    Linear3D,
    PatchEmbedding3D,
    VocabParallelClassifier3D,
    VocabParallelEmbedding3D,
)

__all__ = [
    'reduce_by_batch_3d', 'split_tensor_3d', 'split_batch_3d', 'Linear3D', 'LayerNorm3D', 'PatchEmbedding3D',
......
@@ -7,10 +7,10 @@ import torch
from torch import Tensor
from torch.cuda.amp import custom_bwd, custom_fwd
from colossalai.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter
from ._utils import get_parallel_mode_from_env, push_async_grad
......
@@ -8,14 +8,14 @@ import torch.nn.functional as F
from torch import Tensor
from torch.nn import Parameter
from colossalai.communication import all_reduce, broadcast
from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.communication import all_reduce, broadcast
from colossalai.legacy.nn.layer.base_layer import ParallelLayer
from colossalai.legacy.registry import LAYERS
from colossalai.nn import init as init
from colossalai.nn.layer.base_layer import ParallelLayer
from colossalai.utils.checkpointing import (
    broadcast_state_dict,
    gather_tensor_parallel_state_dict,
......
from ._operation import RingQK, RingAV
from ._operation import RingAV, RingQK
from .layers import TransformerSelfAttentionRing
__all__ = ['TransformerSelfAttentionRing', 'RingAV', 'RingQK']
@@ -3,13 +3,13 @@
import torch
from torch import distributed as dist
from torch.cuda.amp import custom_bwd, custom_fwd
from colossalai.communication import ring_forward
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.nn.layer.parallel_sequence._utils import _calc_incoming_device_range, _calc_current_device_range
from colossalai.legacy.communication import ring_forward
from colossalai.legacy.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range
from colossalai.utils import get_current_device
from torch.cuda.amp import custom_bwd, custom_fwd
class RingQK(torch.autograd.Function):
......
@@ -14,8 +14,8 @@ from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.kernel import FusedScaleMaskSoftmax
from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
from colossalai.legacy.nn.layer.parallel_sequence._operation import RingAV, RingQK
from colossalai.legacy.registry import LAYERS
from colossalai.nn.layer.parallel_sequence._operation import RingAV, RingQK
@LAYERS.register_module
......
from .common import (ACT2FN, CheckpointModule, _ntuple, divide, get_tensor_parallel_mode,
                     set_tensor_parallel_attribute_by_partition, set_tensor_parallel_attribute_by_size, to_2tuple)

__all__ = [
    'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size',
    'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple'
]
from .common import (
    ACT2FN,
    CheckpointModule,
    _ntuple,
    divide,
    get_tensor_parallel_mode,
    set_tensor_parallel_attribute_by_partition,
    set_tensor_parallel_attribute_by_size,
    to_2tuple,
)

__all__ = [
    'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size',
    'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple'
]
@@ -6,10 +6,11 @@ from itertools import repeat
import numpy as np
import torch
from torch import Tensor, nn
from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.utils import checkpoint
from torch import Tensor, nn
class CheckpointModule(nn.Module):
......
import torch.nn as nn
import torch.distributed as dist
from typing import List, Tuple, Union
import torch.distributed as dist
import torch.nn as nn
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
......
from torch import nn
from torch.nn.modules.loss import *
from torch.nn.modules.loss import _Loss
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
from .loss_1d import VocabParallelCrossEntropyLoss1D
from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
_parallel_cross_entropy = {
    '2d': CrossEntropyLoss2D,
    '2.5d': CrossEntropyLoss2p5D,
    '3d': CrossEntropyLoss3D,
}

_vocab_parallel_cross_entropy = {
    '1d': VocabParallelCrossEntropyLoss1D,
    '2d': VocabParallelCrossEntropyLoss2D,
    '2.5d': VocabParallelCrossEntropyLoss2p5D,
    '3d': VocabParallelCrossEntropyLoss3D,
}


class CrossEntropyLoss(_Loss):

    def __init__(self, reduction: bool = True, *args, **kwargs):
        super().__init__()
        tensor_parallel = get_tensor_parallel_mode()
        if tensor_parallel is not None and env.vocab_parallel:
            self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
        elif tensor_parallel is None or tensor_parallel == '1d':
            reduction = 'mean' if reduction else 'none'
            self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
        else:
            self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)

    def forward(self, *args):
        return self.loss(*args)
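For orientation: the wrapper above resolves the concrete loss implementation once, at construction time, from the current tensor parallel mode, so calling code stays identical across 1D/2D/2.5D/3D tensor parallelism. A hedged usage sketch follows; the import path is assumed from the post-refactor legacy layout and a launched Colossal-AI parallel context is presumed, so treat it as illustrative rather than authoritative.

import torch

from colossalai.legacy.nn.loss import CrossEntropyLoss    # assumed location after this move

criterion = CrossEntropyLoss(reduction=True)    # dispatch happens here via get_tensor_parallel_mode()
logits = torch.randn(8, 1000)                   # (batch, num_classes) dummy predictions
labels = torch.randint(0, 1000, (8,))           # dummy targets
loss = criterion(logits, labels)                # forwards to the selected (vocab-)parallel loss

Note that reduction here is a boolean flag rather than the string torch.nn.CrossEntropyLoss expects; it is only translated to 'mean'/'none' on the non-parallel and 1D paths shown above.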