Unverified commit b5f9e37c, authored by Hongxin Liu, committed by GitHub

[legacy] clean up legacy code (#4743)

* [legacy] remove outdated codes of pipeline (#4692)

* [legacy] remove cli of benchmark and update optim (#4690)

* [legacy] remove cli of benchmark and update optim

* [doc] fix cli doc test

* [legacy] fix engine clip grad norm

* [legacy] remove outdated colo tensor (#4694)

* [legacy] remove outdated colo tensor

* [test] fix test import

* [legacy] move outdated zero to legacy (#4696)

* [legacy] clean up utils (#4700)

* [legacy] clean up utils

* [example] update examples

* [legacy] clean up amp

* [legacy] fix amp module

* [legacy] clean up gpc (#4742)

* [legacy] clean up context

* [legacy] clean core, constants and global vars

* [legacy] refactor initialize

* [example] fix examples ci

* [example] fix examples ci

* [legacy] fix tests

* [example] fix gpt example

* [example] fix examples ci

* [devops] fix ci installation

* [example] fix examples ci
parent 32e7f994
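
Every hunk in this diff repeats one mechanical change: modules that used to live at the package root (colossalai.context, colossalai.core, colossalai.constants, colossalai.global_variables, colossalai.utils.checkpointing) now resolve under colossalai.legacy. A minimal before/after sketch of what the migration looks like for downstream imports, restating only paths that appear in the hunks below:

```python
# Before #4743: context objects lived at the package root.
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env

# After #4743: the same objects resolve under the legacy namespace.
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.global_variables import tensor_parallel_env as env
```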
@@ -4,8 +4,8 @@
 import torch
 import torch.distributed as dist
-from colossalai.core import global_context as gpc
-from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.core import global_context as gpc
+from colossalai.legacy.global_variables import tensor_parallel_env as env
 from ..utils import divide
...
@@ -10,18 +10,18 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn.parameter import Parameter
-from colossalai.context import ParallelMode, seed
-from colossalai.core import global_context as gpc
-from colossalai.global_variables import tensor_parallel_env as env
 from colossalai.kernel import LayerNorm
 from colossalai.legacy.communication import broadcast
+from colossalai.legacy.context import ParallelMode, seed
+from colossalai.legacy.context.parallel_context import global_context as gpc
+from colossalai.legacy.global_variables import tensor_parallel_env as env
 from colossalai.legacy.registry import LAYERS
-from colossalai.nn import init as init
-from colossalai.utils.checkpointing import (
+from colossalai.legacy.utils.checkpointing import (
     broadcast_state_dict,
     gather_tensor_parallel_state_dict,
     partition_tensor_parallel_state_dict,
 )
+from colossalai.nn import init as init
 from colossalai.utils.cuda import get_current_device
 from ..base_layer import ParallelLayer
...
@@ -5,10 +5,10 @@ import torch.distributed as dist
 from torch import Tensor
 from torch.cuda.amp import custom_bwd, custom_fwd
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.global_variables import tensor_parallel_env as env
 from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce, reduce_scatter
+from colossalai.legacy.context.parallel_mode import ParallelMode
+from colossalai.legacy.core import global_context as gpc
+from colossalai.legacy.global_variables import tensor_parallel_env as env
 from colossalai.utils import get_current_device
@@ -31,9 +31,9 @@ def matmul_2d(
     out_shape (:class:`torch.size`): shape of output tensor.
     row_rank (int, optional): the rank of row, defaults to None.
     col_rank (int, optional): the rank of column, defaults to None.
-    row_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
+    row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`, optional):
         row parallel mode, defaults to ParallelMode.PARALLEL_2D_ROW.
-    col_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
+    col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`, optional):
         column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL.
 Returns:
@@ -146,8 +146,8 @@ def classifier_2d(A: Tensor, B: Tensor, bias: Optional[Tensor], summa_dim: int,
     out_shape (:class:`torch.size`): shape of output tensor.
     row_rank (int, optional): the rank of row, defaults to None.
     col_rank (int, optional): the rank of column, defaults to None.
-    row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
-    col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+    row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
+    col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
     data_parallel_rank (int): data parallel rank.
     pipeline_parallel_rank (int): pipeline parallel rank
     pipeline_parallel_size (int): pipeline parallel size.
@@ -172,8 +172,8 @@ class Matmul_AB_2D(torch.autograd.Function):
     out_shape (:class:`torch.size`): shape of output tensor.
     row_rank (int, optional): the rank of row, defaults to None.
     col_rank (int, optional): the rank of column, defaults to None.
-    row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
-    col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+    row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
+    col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
     data_parallel_rank (int): data parallel rank.
     pipeline_parallel_rank (int): pipeline parallel rank
     pipeline_parallel_size (int): pipeline parallel size.
@@ -299,8 +299,8 @@ class Matmul_ABT_2D(torch.autograd.Function):
     out_shape (:class:`torch.size`): shape of output tensor.
     row_rank (int, optional): the rank of row, defaults to None.
     col_rank (int, optional): the rank of column, defaults to None.
-    row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
-    col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+    row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
+    col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
         column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL.
     data_parallel_rank (int): data parallel rank.
     pipeline_parallel_rank (int): pipeline parallel rank
@@ -433,8 +433,8 @@ class Matmul_ATB_2D(torch.autograd.Function):
     out_shape (:class:`torch.size`): shape of output tensor.
     row_rank (int, optional): the rank of row, defaults to None.
     col_rank (int, optional): the rank of column, defaults to None.
-    row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
-    col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+    row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
+    col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
     data_parallel_rank (int): data parallel rank.
     pipeline_parallel_rank (int): pipeline parallel rank
     pipeline_parallel_size (int): pipeline parallel size.
@@ -620,8 +620,8 @@ def add_bias_2d(input_: Tensor, bias: Tensor, output_size_per_partition: int, ro
     output_size_per_partition (int): size of output per partition.
     row_rank (int, optional): the rank of row, defaults to None.
     col_rank (int, optional): the rank of column, defaults to None.
-    row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
-    col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+    row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
+    col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
     skip_bias_add (bool):
         If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion.
     data_parallel_rank (int): data parallel rank.
@@ -685,8 +685,8 @@ def layernorm_2d(input_: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, r
     E_x (:class:`torch.tensor`): mean.
     Var_x (:class:`torch.tensor`): variance.
     hidden_size (int): hidden size.
-    row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
-    col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+    row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
+    col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -719,7 +719,7 @@ def all_gather_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode)
 Args:
     tensor (:class:`torch.tensor`): Input tensor.
     dim (int): Dimension to gather.
-    parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
+    parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode tensor used.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -767,7 +767,7 @@ def reduce_tensor_2d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
 Args:
     input_ (:class:`torch.tensor`): Input tensor.
-    parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
+    parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode tensor used.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -795,7 +795,7 @@ def reduce_scatter_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMo
 Args:
     tensor (:class:`torch.tensor`): Input tensor.
     dim (int): Dimension to reduce.
-    parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
+    parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode tensor used.
...
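
The `*_2d` collectives in the hunks above share one pattern: resolve the `ParallelMode` argument to a `torch.distributed` process group through the global context, then issue the matching collective. A minimal sketch of that pattern, not the library code itself; `reduce_tensor_sketch` is a hypothetical stand-in for `reduce_tensor_2d`, and the group is passed in directly rather than looked up through `gpc`:

```python
import torch
import torch.distributed as dist


def reduce_tensor_sketch(input_: torch.Tensor, group: dist.ProcessGroup) -> torch.Tensor:
    # All-reduce across the ranks of one parallel mode (e.g. PARALLEL_2D_ROW).
    # The legacy op resolves `group` from its ParallelMode argument via the
    # global context; here the caller supplies the group explicitly.
    dist.all_reduce(input_, op=dist.ReduceOp.SUM, group=group)
    return input_
```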
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.context.parallel_mode import ParallelMode
+from colossalai.legacy.core import global_context as gpc
+from colossalai.legacy.global_variables import tensor_parallel_env as env
 def get_summa_dim_from_env() -> int:
...
@@ -8,13 +8,16 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import Parameter
-from colossalai.context import ParallelMode, seed
-from colossalai.core import global_context as gpc
-from colossalai.global_variables import tensor_parallel_env as env
 from colossalai.legacy.communication import broadcast
+from colossalai.legacy.context import ParallelMode, seed
+from colossalai.legacy.core import global_context as gpc
+from colossalai.legacy.global_variables import tensor_parallel_env as env
 from colossalai.legacy.registry import LAYERS
+from colossalai.legacy.utils.checkpointing import (
+    gather_tensor_parallel_state_dict,
+    partition_tensor_parallel_state_dict,
+)
 from colossalai.nn import init as init
-from colossalai.utils.checkpointing import gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict
 from colossalai.utils.cuda import get_current_device
 from ..base_layer import ParallelLayer
...
@@ -5,9 +5,9 @@ import torch.distributed as dist
 from torch import Tensor
 from torch.cuda.amp import custom_bwd, custom_fwd
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
 from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter
+from colossalai.legacy.context.parallel_mode import ParallelMode
+from colossalai.legacy.core import global_context as gpc
 from colossalai.utils import get_current_device
@@ -112,8 +112,8 @@ def classifier_2p5d(A: Tensor, B: Tensor, bias, tesseract_dim: int, out_shape: T
     out_shape (:class:`torch.size`): shape of output tensor.
     row_rank (int): the rank of row.
     col_rank (int): the rank of column.
-    row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
-    col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+    row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
+    col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
     data_parallel_rank (int): data parallel rank.
     pipeline_parallel_rank (int): pipeline parallel rank
     pipeline_parallel_size (int): pipeline parallel size.
@@ -139,8 +139,8 @@ class Matmul_AB_2p5D(torch.autograd.Function):
     row_rank (int): the rank of row.
     col_rank (int): the rank of column.
     dep_rank (int): the rank of depth.
-    row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
-    col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+    row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
+    col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
     data_parallel_rank (int): data parallel rank.
     pipeline_parallel_rank (int): pipeline parallel rank
     pipeline_parallel_size (int): pipeline parallel size.
@@ -264,8 +264,8 @@ class Matmul_ABT_2p5D(torch.autograd.Function):
     row_rank (int): the rank of row.
     col_rank (int): the rank of column.
     dep_rank (int): the rank of depth.
-    row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
-    col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+    row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
+    col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
     data_parallel_rank (int): data parallel rank.
     pipeline_parallel_rank (int): pipeline parallel rank
     pipeline_parallel_size (int): pipeline parallel size.
@@ -394,8 +394,8 @@ class Matmul_ATB_2p5D(torch.autograd.Function):
     row_rank (int): the rank of row.
     col_rank (int): the rank of column.
     dep_rank (int): the rank of depth.
-    row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
-    col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+    row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
+    col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
     data_parallel_rank (int): data parallel rank.
     pipeline_parallel_rank (int): pipeline parallel rank
     pipeline_parallel_size (int): pipeline parallel size.
@@ -606,7 +606,7 @@ def add_bias_2p5d(input: Tensor, bias: Tensor, output_size_per_partition: int, t
     row_rank (int): the rank of row.
     col_rank (int): the rank of column.
     dep_rank (int): the rank of depth.
-    col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+    col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
     skip_bias_add (bool): If set to ``True``, it will skip bias add for linear layer,
         which is preserved for kernel fusion.
     data_parallel_rank (int): data parallel rank.
@@ -631,7 +631,7 @@ class _Layernorm2p5D(torch.autograd.Function):
     E_x (:class:`torch.tensor`): mean.
     Var_x (:class:`torch.tensor`): variance.
     hidden_size (int): hidden size.
-    row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
+    row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -682,7 +682,7 @@ def layernorm_2p5d(input: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int,
     E_x (:class:`torch.tensor`): mean.
     Var_x (:class:`torch.tensor`): variance.
     hidden_size (int): hidden size.
-    row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
+    row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -715,7 +715,7 @@ def all_gather_tensor_2p5d(inputs: Tensor, dim: int, col_parallel_mode: Parallel
 Args:
     inputs (:class:`torch.tensor`): input tensor.
     dim (int): dimension of all-gather.
-    col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+    col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -730,7 +730,7 @@ class SplitFirst(torch.autograd.Function):
 Args:
     inputs (:class:`torch.tensor`): input tensor.
     tesseract_dim (int): dimension of TESSERACT fo 2.5D parallelism
-    col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+    col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -798,7 +798,7 @@ def reduce_tensor_2p5d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
 Args:
     input_ (:class:`torch.tensor`): Input tensor.
-    parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
+    parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode tensor used.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -826,7 +826,7 @@ def reduce_scatter_tensor_2p5d(input_: Tensor, dim: int, parallel_mode: Parallel
 Args:
     input_ (:class:`torch.tensor`): Input tensor.
     dim (int): Dimension to reduce.
-    parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
+    parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode tensor used.
...
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.context.parallel_mode import ParallelMode
+from colossalai.legacy.core import global_context as gpc
+from colossalai.legacy.global_variables import tensor_parallel_env as env
 def get_tesseract_dim_dep_from_env():
...
@@ -8,17 +8,17 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import Parameter
-from colossalai.context import ParallelMode, seed
-from colossalai.core import global_context as gpc
-from colossalai.global_variables import tensor_parallel_env as env
 from colossalai.legacy.communication import broadcast
+from colossalai.legacy.context import ParallelMode, seed
+from colossalai.legacy.core import global_context as gpc
+from colossalai.legacy.global_variables import tensor_parallel_env as env
 from colossalai.legacy.registry import LAYERS
-from colossalai.nn import init as init
-from colossalai.utils.checkpointing import (
+from colossalai.legacy.utils.checkpointing import (
     broadcast_state_dict,
     gather_tensor_parallel_state_dict,
     partition_tensor_parallel_state_dict,
 )
+from colossalai.nn import init as init
 from colossalai.utils.cuda import get_current_device
 from ..base_layer import ParallelLayer
...
@@ -7,10 +7,10 @@ import torch
 from torch import Tensor
 from torch.cuda.amp import custom_bwd, custom_fwd
-from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
 from colossalai.legacy.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter
+from colossalai.legacy.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
+from colossalai.legacy.context.parallel_mode import ParallelMode
+from colossalai.legacy.core import global_context as gpc
 from ._utils import get_parallel_mode_from_env, push_async_grad
@@ -73,9 +73,9 @@ def linear_3d(
 Args:
     input_ (:class:`torch.tensor`): input matrix.
     weight (:class:`torch.tensor`): matrix of weight.
-    input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
-    weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
-    output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
+    input_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): input parallel mode.
+    weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): weight parallel mode.
+    output_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): output parallel mode.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -166,9 +166,9 @@ def classifier_3d(
     input_ (:class:`torch.tensor`): input matrix.
     weight (:class:`torch.tensor`): matrix of weight.
     bias (:class:`torch.tensor`): matrix of bias.
-    input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
-    weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
-    output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
+    input_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): input parallel mode.
+    weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): weight parallel mode.
+    output_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): output parallel mode.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -260,9 +260,9 @@ def vocab_parallel_classifier_3d(
     input_ (:class:`torch.tensor`): input matrix.
     weight (:class:`torch.tensor`): matrix of weight.
     bias (:class:`torch.tensor`): matrix of bias.
-    input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
-    weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
-    output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
+    input_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): input parallel mode.
+    weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): weight parallel mode.
+    output_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): output parallel mode.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -378,8 +378,8 @@ def layernorm_3d(
     If a single integer is used, it is treated as a singleton list, and this module will
         normalize over the last dimension which is expected to be of that specific size.
     eps (float): a value added to the denominator for numerical stability
-    output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
-    input_x_weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input x weight parallel mode.
+    output_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): output parallel mode.
+    input_x_weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): input x weight parallel mode.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -404,7 +404,7 @@ def split_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Te
 Args:
     tensor (:class:`torch.tensor`): Input tensor.
     dim (int): Specified dimension in which to split.
-    parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode.
+    parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`, optional): Parallel mode.
 Returns:
     :class:`torch.tensor`: The tensor has been split.
@@ -434,8 +434,8 @@ def split_batch_3d(input_: Tensor,
 Args:
     input_ (:class:`torch.tensor`): Input tensor.
     dim (int): Specified dimension in which to split.
-    input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): input parallel mode.
-    weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): weight parallel mode.
+    input_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`, optional): input parallel mode.
+    weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`, optional): weight parallel mode.
 Returns:
     :class:`torch.tensor`: The tensor has been split.
@@ -471,7 +471,7 @@ def reduce_tensor_3d(tensor: Tensor, parallel_mode: ParallelMode) -> Tensor:
 Args:
     tensor (:class:`torch.tensor`): Input tensor.
-    parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
+    parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): Parallel mode.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -501,7 +501,7 @@ def all_gather_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode)
 Args:
     tensor (:class:`torch.tensor`): Input tensor.
     dim (int): Dimension to gather.
-    parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
+    parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): Parallel mode.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -530,7 +530,7 @@ def reduce_scatter_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMo
 Args:
     tensor (:class:`torch.tensor`): Input tensor.
     dim (int): Dimension to scatter.
-    parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
+    parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): Parallel mode.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -578,8 +578,8 @@ def reduce_by_batch_3d(tensor: Tensor,
 r"""All-reduce the input from the model parallel region.
 Args:
-    input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
-    weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
+    input_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): input parallel mode.
+    weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): weight parallel mode.
     reduce_mean (bool, optional): If set to ``True``, it will divide the output by
         (input parallel size * weight parallel size), default to False.
...
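
split_batch_3d and reduce_by_batch_3d are duals: one keeps only this rank's shard of a batch dimension, the other sums partial results over the same ranks and optionally averages, as the reduce_mean docstring above describes. A hedged sketch of the two halves, assuming the caller already holds the relevant torch.distributed process group; names ending in _sketch are hypothetical:

```python
import torch
import torch.distributed as dist


def split_batch_sketch(input_: torch.Tensor, dim: int, group: dist.ProcessGroup) -> torch.Tensor:
    # Keep only this rank's chunk of the batch dimension; split_batch_3d does
    # this once per parallel mode (input, then weight) over the group's ranks.
    world_size = dist.get_world_size(group)
    rank = dist.get_rank(group)
    return torch.chunk(input_, world_size, dim=dim)[rank].contiguous()


def reduce_by_batch_sketch(tensor: torch.Tensor, group: dist.ProcessGroup,
                           reduce_mean: bool = False) -> torch.Tensor:
    # The inverse bookkeeping: sum partial results over the same ranks and,
    # if reduce_mean is set, divide by the number of participating ranks.
    dist.all_reduce(tensor, group=group)
    if reduce_mean:
        tensor = tensor / dist.get_world_size(group)
    return tensor
```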
@@ -4,9 +4,15 @@ from functools import partial
 import torch
 from torch import Tensor
-from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D
-from colossalai.core import global_context as gpc
-from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.constants import (
+    INPUT_GROUP_3D,
+    INPUT_X_WEIGHT_3D,
+    OUTPUT_GROUP_3D,
+    OUTPUT_X_WEIGHT_3D,
+    WEIGHT_GROUP_3D,
+)
+from colossalai.legacy.core import global_context as gpc
+from colossalai.legacy.global_variables import tensor_parallel_env as env
 def get_depth_from_env() -> int:
...
@@ -8,19 +8,25 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import Parameter
-from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D
-from colossalai.context import ParallelMode, seed
-from colossalai.core import global_context as gpc
-from colossalai.global_variables import tensor_parallel_env as env
 from colossalai.legacy.communication import all_reduce, broadcast
+from colossalai.legacy.constants import (
+    INPUT_GROUP_3D,
+    INPUT_X_WEIGHT_3D,
+    OUTPUT_GROUP_3D,
+    OUTPUT_X_WEIGHT_3D,
+    WEIGHT_GROUP_3D,
+)
+from colossalai.legacy.context import ParallelMode, seed
+from colossalai.legacy.core import global_context as gpc
+from colossalai.legacy.global_variables import tensor_parallel_env as env
 from colossalai.legacy.nn.layer.base_layer import ParallelLayer
 from colossalai.legacy.registry import LAYERS
-from colossalai.nn import init as init
-from colossalai.utils.checkpointing import (
+from colossalai.legacy.utils.checkpointing import (
     broadcast_state_dict,
     gather_tensor_parallel_state_dict,
     partition_tensor_parallel_state_dict,
 )
+from colossalai.nn import init as init
 from colossalai.utils.cuda import get_current_device
 from ..utils import divide, set_tensor_parallel_attribute_by_partition, to_2tuple
...
@@ -5,9 +5,9 @@ import torch
 from torch import distributed as dist
 from torch.cuda.amp import custom_bwd, custom_fwd
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
 from colossalai.legacy.communication import ring_forward
+from colossalai.legacy.context.parallel_mode import ParallelMode
+from colossalai.legacy.core import global_context as gpc
 from colossalai.legacy.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range
 from colossalai.utils import get_current_device
...
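
The sequence-parallel ops imported here (RingAV, RingQK) are built on ring_forward: each iteration ships the local chunk to the next rank while receiving the previous rank's chunk. A minimal single-step sketch over the default process group (ring_forward itself targets the sequence-parallel group); ring_pass_sketch is a hypothetical name, not the library function:

```python
import torch
import torch.distributed as dist


def ring_pass_sketch(tensor: torch.Tensor) -> torch.Tensor:
    # One step of a ring exchange: send the local chunk to rank+1 and
    # receive rank-1's chunk, overlapping both transfers.
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    recv_buf = torch.empty_like(tensor)
    ops = [
        dist.P2POp(dist.isend, tensor, (rank + 1) % world_size),
        dist.P2POp(dist.irecv, recv_buf, (rank - 1) % world_size),
    ]
    for req in dist.batch_isend_irecv(ops):
        req.wait()
    return recv_buf
```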
@@ -9,11 +9,11 @@ import torch.nn.functional as F
 from torch.nn import Parameter
 import colossalai
-from colossalai.context import seed
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
 from colossalai.kernel import FusedScaleMaskSoftmax
 from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
+from colossalai.legacy.context import seed
+from colossalai.legacy.context.parallel_mode import ParallelMode
+from colossalai.legacy.core import global_context as gpc
 from colossalai.legacy.nn.layer.parallel_sequence._operation import RingAV, RingQK
 from colossalai.legacy.registry import LAYERS
...
@@ -8,9 +8,9 @@ import numpy as np
 import torch
 from torch import Tensor, nn
-from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS
-from colossalai.global_variables import tensor_parallel_env as env
-from colossalai.utils import checkpoint
+from colossalai.legacy.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS
+from colossalai.legacy.global_variables import tensor_parallel_env as env
+from colossalai.legacy.utils import checkpoint
 class CheckpointModule(nn.Module):
...
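
CheckpointModule, whose checkpoint helper moves to colossalai.legacy.utils here, trades memory for recompute: activations are discarded in forward and rebuilt during backward. A minimal sketch of the same trade using stock torch.utils.checkpoint rather than the legacy helper; CheckpointBlockSketch is a hypothetical wrapper, not the class above:

```python
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint


class CheckpointBlockSketch(nn.Module):
    """Recompute a block's activations in backward instead of storing them."""

    def __init__(self, block: nn.Module, use_checkpoint: bool = True):
        super().__init__()
        self.block = block
        self.use_checkpoint = use_checkpoint

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Checkpointing only pays off during training, when gradients flow.
        if self.use_checkpoint and self.training:
            return checkpoint(self.block, x, use_reentrant=False)
        return self.block(x)
```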
@@ -7,7 +7,7 @@ from torch import Tensor
 from torch import nn as nn
 from torch.nn.parameter import Parameter
-from colossalai.context import seed
+from colossalai.legacy.context import seed
 from colossalai.legacy.registry import LAYERS
 from colossalai.nn import init as init
 from colossalai.utils.cuda import get_current_device
@@ -64,7 +64,7 @@ class WrappedDropout(nn.Module):
 Args:
     p (float, optional): probability of an element to be zeroed, defaults 0.5.
     inplace (bool, optional): whether to do dropout in-place, default to be False.
-    mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
+    mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -101,7 +101,7 @@ class WrappedDropPath(nn.Module):
 Args:
     p (float, optional): probability of dropping path, defaults 0.0.
-    mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
+    mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
 Note:
     The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
...
@@ -3,8 +3,8 @@ from typing import List, Tuple, Union
 import torch.distributed as dist
 import torch.nn as nn
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
+from colossalai.legacy.context import ParallelMode
+from colossalai.legacy.core import global_context as gpc
 class PipelineSharedModuleWrapper:
...
@@ -2,7 +2,7 @@ from torch import nn
 from torch.nn.modules.loss import *
 from torch.nn.modules.loss import _Loss
-from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.global_variables import tensor_parallel_env as env
 from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
 from .loss_1d import VocabParallelCrossEntropyLoss1D
...
@@ -3,8 +3,8 @@ import torch.distributed as dist
 from torch.cuda.amp import custom_bwd, custom_fwd
 from torch.nn.modules.loss import _Loss
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
+from colossalai.legacy.context import ParallelMode
+from colossalai.legacy.core import global_context as gpc
 from colossalai.legacy.registry import LOSSES
...
@@ -4,8 +4,8 @@ from torch.cuda.amp import custom_bwd, custom_fwd
 from torch.nn.functional import cross_entropy
 from torch.nn.modules.loss import _Loss
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
+from colossalai.legacy.context import ParallelMode
+from colossalai.legacy.core import global_context as gpc
 from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
 from colossalai.legacy.nn.layer.parallel_2d._utils import assert_summa_initialization
 from colossalai.legacy.registry import LOSSES
...
@@ -4,8 +4,8 @@ from torch.cuda.amp import custom_bwd, custom_fwd
 from torch.nn.functional import cross_entropy
 from torch.nn.modules.loss import _Loss
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
+from colossalai.legacy.context import ParallelMode
+from colossalai.legacy.core import global_context as gpc
 from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
 from colossalai.legacy.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
 from colossalai.legacy.registry import LOSSES
...