Unverified Commit 554aa959 authored by Hongxin Liu, committed by GitHub

[legacy] move communication and nn to legacy and refactor logger (#4671)

* [legacy] move communication to legacy (#4640)

* [legacy] refactor logger and clean up legacy codes (#4654)

* [legacy] make logger independent of gpc

* [legacy] make optim independent of registry

* [legacy] move test engine to legacy

* [legacy] move nn to legacy (#4656)

* [legacy] move nn to legacy

* [checkpointio] fix save hf config

* [test] remove useless rpc pp test

* [legacy] fix nn init

* [example] skip tutorial hybrid parallel example

* [devops] test doc check

* [devops] test doc check
parent 536397cc
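The diff below touches many files, but the recurring change is mechanical: the communication and nn modules now live under the `colossalai.legacy` namespace, so downstream imports shift accordingly. A hedged before/after sketch of that migration (both "after" paths appear verbatim in the hunks below):

# Before this commit:
#   from colossalai.communication import broadcast
#   from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
# After this commit:
from colossalai.legacy.communication import broadcast
from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D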
-import torch.nn.functional as F
from typing import Optional
+import torch.nn.functional as F
from torch import Tensor
+from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec
from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ComputePattern, ComputePattern, ComputeSpec, ColoTensor, distspec, ColoTensorSpec, \
-    ShardSpec, ReplicaSpec
from ._utils import GeneralTensor, convert_to_colo_tensor
...
from typing import List, Optional
import torch.nn.functional as F
+from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec, distspec
from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ColoTensor, distspec, ColoTensorSpec, ReplicaSpec
from ._utils import GeneralTensor, convert_to_colo_tensor
...
+from typing import Optional
import torch
import torch.nn.functional as F
-from typing import Optional
-from colossalai.tensor.op_wrapper import colo_op_impl
+from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
from colossalai.tensor import ColoTensor, ColoTensorSpec
-from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
+from colossalai.tensor.op_wrapper import colo_op_impl
from ._utils import GeneralTensor, convert_to_colo_tensor
...
from .colossalai_layer import *
from .parallel_1d import *
from .parallel_2d import *
from .parallel_2p5d import *
from .parallel_3d import *
from .parallel_sequence import *
from .utils import *
from .vanilla import *
from .wrapper import *
from ._utils import partition_batch
from .dropout import Dropout
from .embedding import Embedding, PatchEmbedding
from .linear import Classifier, Linear
from .normalization import LayerNorm

__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch']
import math
from typing import Callable

-from colossalai.utils import get_current_device
from torch import dtype, nn

-from ... import init as init
+from colossalai.nn import init
+from colossalai.utils import get_current_device

from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D
from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D
from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D
from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D
from ..utils import get_tensor_parallel_mode
from ..vanilla import VanillaPatchEmbedding
from ._utils import ColossalaiModule

_parallel_embedding = {
    '1d': Embedding1D,
    '2d': Embedding2D,
    '2.5d': Embedding2p5D,
    '3d': Embedding3D,
}

_vocab_parallel_embedding = {
    '1d': VocabParallelEmbedding1D,
    '2d': VocabParallelEmbedding2D,
    '2.5d': VocabParallelEmbedding2p5D,
    '3d': VocabParallelEmbedding3D
}

_parallel_patchembedding = {
    None: VanillaPatchEmbedding,
    '1d': PatchEmbedding1D,
    '2d': PatchEmbedding2D,
    '2.5d': PatchEmbedding2p5D,
    '3d': PatchEmbedding3D
}


class Embedding(ColossalaiModule):
    r"""Embedding for colossalai.

    Args:
        num_embeddings (int): number of embeddings.
        embedding_dim (int): dimension of embedding.
        padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
            therefore, the embedding vector at padding_idx is not updated during training,
            i.e. it remains as a fixed “pad”, defaults to None.
        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
        weight_initializer (:class:`typing.Callable`, optional):
            The initializer of weight, defaults to normal initializer.

    The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
    ::

        max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
            renormalized to have norm max_norm. Note: this will modify weight in-place.
        norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
            of frequency of the words in the mini-batch. Default False.
        sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.

    More details about ``args`` and ``kwargs`` could be found in
    `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.

    More details about ``initializer`` please refer to
    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
    """

    def __init__(self,
                 num_embeddings: int,
                 embedding_dim: int,
                 padding_idx: int = None,
                 dtype: dtype = None,
                 weight_initializer: Callable = init.normal_(),
                 vocab_parallel_limit: int = 2048,
                 *args,
                 **kwargs) -> None:
        tensor_parallel = get_tensor_parallel_mode()
        if tensor_parallel is None:
            embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args,
                                 **kwargs).to(dtype).to(get_current_device())
            weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim)
        elif num_embeddings <= vocab_parallel_limit:
            embed = _parallel_embedding[tensor_parallel](
                num_embeddings,
                embedding_dim,
                padding_idx=padding_idx,
                dtype=dtype,
                weight_initializer=weight_initializer,
                *args,
                **kwargs,
            )
        else:
            embed = _vocab_parallel_embedding[tensor_parallel](
                num_embeddings,
                embedding_dim,
                padding_idx=padding_idx,
                dtype=dtype,
                weight_initializer=weight_initializer,
                *args,
                **kwargs,
            )
        super().__init__(embed)


class PatchEmbedding(ColossalaiModule):
    """2D Image to Patch Embedding.

    Args:
        img_size (int): image size.
        patch_size (int): patch size.
        in_chans (int): number of channels of input image.
        embed_size (int): size of embedding.
        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
        flatten (bool, optional): whether to flatten output tensor, defaults to True.
        weight_initializer (:class:`typing.Callable`, optional):
            The initializer of weight, defaults to kaiming uniform initializer.
        bias_initializer (:class:`typing.Callable`, optional):
            The initializer of bias, defaults to xavier uniform initializer.
        position_embed_initializer (:class:`typing.Callable`, optional):
            The initializer of position embedding, defaults to zeros initializer.

    More details about ``initializer`` please refer to
    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
    """

    def __init__(
        self,
        img_size: int,
        patch_size: int,
        in_chans: int,
        embed_size: int,
        dtype: dtype = None,
        flatten: bool = True,
        weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
        bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
        position_embed_initializer: Callable = init.zeros_()
    ) -> None:
        tensor_parallel = get_tensor_parallel_mode()
        embed = _parallel_patchembedding[tensor_parallel](
            img_size,
            patch_size,
            in_chans,
            embed_size,
            dtype=dtype,
            flatten=flatten,
            weight_initializer=weight_initializer,
            bias_initializer=bias_initializer,
            position_embed_initializer=position_embed_initializer,
        )
        super().__init__(embed)
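The constructor above dispatches on the tensor-parallel mode: with no parallelism it wraps a plain torch.nn.Embedding; at or below `vocab_parallel_limit` it picks the mode-specific `_parallel_embedding` variant; above it, the `_vocab_parallel_embedding` variant, which partitions the vocabulary dimension instead. A minimal usage sketch, assuming a 1D tensor-parallel context has already been initialized and that `Embedding` is re-exported from the moved `colossalai.legacy.nn` package (both are assumptions for illustration):

# Minimal sketch; the 1D parallel context and the re-export path are assumptions.
from colossalai.legacy.nn import Embedding

# num_embeddings <= vocab_parallel_limit (default 2048):
# resolves to _parallel_embedding['1d'], i.e. Embedding1D.
small = Embedding(num_embeddings=1024, embedding_dim=512)

# num_embeddings > vocab_parallel_limit:
# resolves to _vocab_parallel_embedding['1d'], i.e. VocabParallelEmbedding1D.
large = Embedding(num_embeddings=50304, embedding_dim=512)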
@@ -4,9 +4,9 @@ from typing import Callable
from torch import dtype, nn

+from colossalai.nn import init
from colossalai.utils import get_current_device

-from ... import init as init
from ..parallel_1d import *
from ..parallel_2d import *
from ..parallel_2p5d import *
...
-from colossalai.utils import get_current_device
from torch import nn

+from colossalai.utils import get_current_device
+
from ..parallel_1d import LayerNorm1D
from ..parallel_2d import LayerNorm2D
from ..parallel_2p5d import LayerNorm2p5D
from ..parallel_3d import LayerNorm3D
from ..utils import get_tensor_parallel_mode
from ..vanilla import VanillaLayerNorm
from ._utils import ColossalaiModule

_parallel_layernorm = {
    None: VanillaLayerNorm,
    "1d": LayerNorm1D,
    "2d": LayerNorm2D,
    "2.5d": LayerNorm2p5D,
    "3d": LayerNorm3D,
}


class LayerNorm(ColossalaiModule):
    r"""Layer Normalization for colossalai.

    Args:
        normalized_shape (int): input shape from an expected input of size.
            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
            \times \ldots \times \text{normalized_shape}[-1]]`
            If a single integer is used, it is treated as a singleton list, and this module will
            normalize over the last dimension which is expected to be of that specific size.
        eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
        bias (bool, optional): Whether to add a bias, defaults to ``True``.
        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
    """

    def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None:
        tensor_parallel = get_tensor_parallel_mode()
        if tensor_parallel is None:
            norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device())
        else:
            norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype)
        super().__init__(norm)
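A usage sketch of the branch above, assuming no tensor parallelism is configured and that `LayerNorm` is re-exported from the moved package (both assumptions are for illustration):

# With get_tensor_parallel_mode() returning None, the wrapper falls back to a
# plain torch.nn.LayerNorm moved to the current device.
from colossalai.legacy.nn import LayerNorm

norm = LayerNorm(normalized_shape=768, eps=1e-05)

# With a '1d' tensor-parallel config, the same call would instead construct
# _parallel_layernorm['1d'], i.e. LayerNorm1D(768, eps=1e-05, dtype=None).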
-from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row,
-                     PatchEmbedding1D, VocabParallelClassifier1D, VocabParallelEmbedding1D)
+from .layers import (
+    Classifier1D,
+    Dropout1D,
+    Embedding1D,
+    LayerNorm1D,
+    Linear1D,
+    Linear1D_Col,
+    Linear1D_Row,
+    PatchEmbedding1D,
+    VocabParallelClassifier1D,
+    VocabParallelEmbedding1D,
+)

__all__ = [
    'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D',
...
@@ -3,6 +3,7 @@
import torch
import torch.distributed as dist

from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env

@@ -124,7 +125,7 @@ class _ReduceInput(torch.autograd.Function):

class _SplitForwardGatherBackward(torch.autograd.Function):
    """
    Split the input and keep only the corresponding chunk to the rank.

    Args:
        input_: input matrix.
        parallel_mode: parallel mode.
...
@@ -10,11 +10,11 @@ import torch.nn.functional as F
from torch import Tensor
from torch.nn.parameter import Parameter

-from colossalai.communication import broadcast
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.kernel import LayerNorm
+from colossalai.legacy.communication import broadcast
from colossalai.legacy.registry import LAYERS
from colossalai.nn import init as init
from colossalai.utils.checkpointing import (
...
from ._operation import reduce_by_batch_2d, split_batch_2d
-from .layers import (Classifier2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D, VocabParallelClassifier2D,
-                     VocabParallelEmbedding2D)
+from .layers import (
+    Classifier2D,
+    Embedding2D,
+    LayerNorm2D,
+    Linear2D,
+    PatchEmbedding2D,
+    VocabParallelClassifier2D,
+    VocabParallelEmbedding2D,
+)

__all__ = [
    'split_batch_2d', 'reduce_by_batch_2d', 'Linear2D', 'LayerNorm2D', 'Classifier2D', 'PatchEmbedding2D',
...
@@ -2,13 +2,14 @@ from typing import Any, Optional, Tuple
import torch
import torch.distributed as dist
-from colossalai.communication.collective import (all_gather, all_reduce, reduce, reduce_scatter)
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_current_device
from torch import Tensor
from torch.cuda.amp import custom_bwd, custom_fwd

+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce, reduce_scatter
+from colossalai.utils import get_current_device

def matmul_2d(
...

@@ -226,9 +227,9 @@ class Matmul_AB_2D(torch.autograd.Function):
        col_group = gpc.get_group(col_parallel_mode)

        src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size
        src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size

        opa = [None] * 2
        opb = [None] * 2
...

@@ -351,9 +352,9 @@ class Matmul_ABT_2D(torch.autograd.Function):
        col_group = gpc.get_group(col_parallel_mode)

        src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size
        src_c = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size

        opb = [None] * 2
        opr = [None] * 2
...

@@ -484,9 +485,9 @@ class Matmul_ATB_2D(torch.autograd.Function):
        col_group = gpc.get_group(col_parallel_mode)

        src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size
        src_c = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size

        opa = [None] * 2
        opr = [None] * 2
...
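In the `Matmul_*_2D` hunks above, `src_a`/`src_b`/`src_c` compute the global rank holding the sub-block to be communicated at each SUMMA step: a grid-local offset (`summa_dim * row_rank` for a row's first column, `col_rank` for a column's first row) plus the first global rank of the enclosing data/pipeline-parallel group. A small sketch of that arithmetic; all concrete sizes here are assumptions for illustration, not from the commit:

# Illustrative re-derivation of the src_a/src_b rank arithmetic above.
summa_dim = 2                           # 2D grid is summa_dim x summa_dim
tensor_parallel_size = summa_dim ** 2   # = 4
pipeline_parallel_size = 2
data_parallel_rank = 1
pipeline_parallel_rank = 0
row_rank, col_rank = 1, 0               # this rank's coordinates in the 2D grid

# First global rank of this (data, pipeline) group.
group_start = (data_parallel_rank * pipeline_parallel_size * tensor_parallel_size
               + pipeline_parallel_rank * tensor_parallel_size)

src_a = summa_dim * row_rank + group_start    # grid-local 2 + offset 8 -> 10
src_b = col_rank + group_start                # grid-local 0 + offset 8 -> 8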