delete unused files

da3f0934 · zhuwenwen · c4dd1fd4 · c4dd1fd4 · c4dd1fd4 · c4dd1fd4
Commit da3f0934 authored Apr 23, 2023 by zhuwenwen
20 changed files
--- a/colossalai/communication/__pycache__/utils.cpython-36.pyc
+++ b/colossalai/communication/__pycache__/utils.cpython-36.pyc
--- a/colossalai/communication/__pycache__/utils.cpython-37.pyc
+++ b/colossalai/communication/__pycache__/utils.cpython-37.pyc
--- a/colossalai/communication/collective.py
+++ b/colossalai/communication/collective.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-import torch
-import torch.distributed as dist
-from torch.distributed import ReduceOp
-from torch import Tensor
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_current_device
-def all_gather(tensor: Tensor, dim: int, parallel_mode: ParallelMode, async_op: bool = False) -> Tensor:
-    """Gathers all tensors from the parallel group and concatenates them in a 
-    specific dimension.
-    :param tensor: Tensor to be gathered
-    :param dim: The dimension concatenating in
-    :param parallel_mode: Parallel group mode used in this communication
-    :param async_op: Whether operations are asynchronous
-    :type tensor: :class:`torch.Tensor`
-    :type dim: int
-    :type parallel_mode: :class:`colossalai.context.ParallelMode`
-    :type async_op: bool, optional
-    :return: The tensor generated by all-gather
-    :rtype: :class:`torch.Tensor`
-    """
-    depth = gpc.get_world_size(parallel_mode)
-    if depth == 1:
-        out = tensor
-        work = None
-    else:
-        shape = list(tensor.shape)
-        shape[0], shape[dim] = shape[dim], shape[0]
-        shape[0] *= depth
-        out = torch.empty(shape, dtype=tensor.dtype, device=get_current_device())
-        temp = list(torch.chunk(out, depth, dim=0))
-        work = dist.all_gather(tensor_list=temp,
-                               tensor=tensor.transpose(0, dim).contiguous(),
-                               group=gpc.get_group(parallel_mode),
-                               async_op=async_op)
-        out = torch.transpose(out, 0, dim)
-    if async_op:
-        return out, work
-    else:
-        return out
-def reduce_scatter(tensor: Tensor,
-                   dim: int,
-                   parallel_mode: ParallelMode,
-                   op: ReduceOp = ReduceOp.SUM,
-                   async_op: bool = False) -> Tensor:
-    """Reduces all tensors then scatters it in a specific dimension to all 
-    members in the parallel group.
-    :param tensor: Tensor to be reduced and scattered
-    :param dim: The dimension scattering in
-    :param parallel_mode: Parallel group mode used in this communication
-    :param op: The type of reduce operation
-    :param async_op: Whether operations are asynchronous
-    :type tensor: :class:`torch.Tensor`
-    :type dim: int
-    :type parallel_mode: :class:`colossalai.context.ParallelMode`
-    :type op: ReduceOp, optional
-    :type async_op: bool, optional
-    :return: The tensor generated by reduce-scatter
-    :rtype: :class:`Tensor`
-    """
-    depth = gpc.get_world_size(parallel_mode)
-    if depth == 1:
-        out = tensor
-        work = None
-    else:
-        temp = list(map(lambda x: x.contiguous(), torch.chunk(tensor, depth, dim=dim)))
-        out = torch.empty(temp[0].shape, dtype=tensor.dtype, device=get_current_device())
-        work = dist.reduce_scatter(output=out,
-                                   input_list=temp,
-                                   op=op,
-                                   group=gpc.get_group(parallel_mode),
-                                   async_op=async_op)
-    if async_op:
-        return out, work
-    else:
-        return out
-def all_reduce(tensor: Tensor,
-               parallel_mode: ParallelMode,
-               op: ReduceOp = ReduceOp.SUM,
-               async_op: bool = False) -> Tensor:
-    depth = gpc.get_world_size(parallel_mode)
-    if depth == 1:
-        out = tensor
-        work = None
-    else:
-        out = tensor.contiguous()
-        work = dist.all_reduce(out, op=op, group=gpc.get_group(parallel_mode), async_op=async_op)
-    if async_op:
-        return out, work
-    else:
-        return out
-def broadcast(tensor: Tensor, src: int, parallel_mode: ParallelMode, async_op: bool = False):
-    depth = gpc.get_world_size(parallel_mode)
-    if depth == 1:
-        out = tensor
-        work = None
-    else:
-        out = tensor.contiguous()
-        work = dist.broadcast(out, src=src, group=gpc.get_group(parallel_mode), async_op=async_op)
-    if async_op:
-        return out, work
-    else:
-        return out
-def reduce(tensor: Tensor, dst: int, parallel_mode: ParallelMode, op: ReduceOp = ReduceOp.SUM, async_op: bool = False):
-    depth = gpc.get_world_size(parallel_mode)
-    if depth == 1:
-        out = tensor
-        work = None
-    else:
-        out = tensor.contiguous()
-        work = dist.reduce(out, dst=dst, op=op, group=gpc.get_group(parallel_mode), async_op=async_op)
-    if async_op:
-        return out, work
-    else:
-        return out
--- a/colossalai/communication/p2p.py
+++ b/colossalai/communication/p2p.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-from typing import List, Tuple, Union
-import torch
-import torch.distributed as dist
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_current_device
-from functools import reduce
-import operator
-from .utils import split_tensor_into_1d_equal_chunks, gather_split_1d_tensor
-TensorShape = Union[torch.Size, List[int], Tuple[int]]
-def _get_tensor_shape(tensor_shape: TensorShape, chunk_tensor: bool = False) -> Tuple[TensorShape, bool]:
-    """get the exact tensor shape when communicating and return whether the tensor is a chunk
-    :param tensor_shape: shape of tensor
-    :type tensor_shape: TensorShape
-    :param chunk_tensor: whether to chunk tensor, defaults to False
-    :type chunk_tensor: bool, optional
-    :return: exact tensor shape, whether to chunk tensor
-    :rtype: Tuple[Union[torch.Size, List[int], Tuple[int]], bool]
-    """
-    if chunk_tensor:
-        tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1)
-        tensor_parallel_world_size = gpc.get_world_size(ParallelMode.TENSOR)
-        if tensor_chunk_shape % tensor_parallel_world_size == 0:
-            tensor_chunk_shape = tensor_chunk_shape // tensor_parallel_world_size
-        else:
-            tensor_chunk_shape = tensor_shape
-            chunk_tensor = False
-    else:
-        tensor_chunk_shape = tensor_shape
-    return tensor_chunk_shape, chunk_tensor
-def _communicate(tensor_send_next=None,
-                 tensor_send_prev=None,
-                 recv_prev=False,
-                 recv_next=False,
-                 recv_prev_shape=None,
-                 recv_next_shape=None,
-                 prev_rank=None,
-                 next_rank=None,
-                 dtype=None,
-                 scatter_gather_tensors=False):
-    """
-    Adapted from megatron.p2p_communication.
-    Communicate tensors between stages. Used as helper method in other
-    communication methods that are used in pipeline schedule.
-    Takes the following arguments:
-        tensor_send_next: tensor to send to next rank (no tensor sent if
-                          set to None).
-        tensor_send_prev: tensor to send to prev rank (no tensor sent if
-                          set to None).
-        recv_prev: boolean for whether tensor should be received from
-                   previous rank.
-        recv_next: boolean for whether tensor should be received from
-                   next rank.
-    Returns:
-        (tensor_recv_prev, tensor_recv_next)
-    """
-    # Create placeholder tensors for receive in forward and backward directions
-    # if needed.
-    tensor_recv_prev = None
-    tensor_recv_next = None
-    if recv_prev:
-        assert recv_prev_shape is not None
-        recv_prev_chunk_shape, recv_prev_split = _get_tensor_shape(recv_prev_shape, scatter_gather_tensors)
-        tensor_recv_prev = torch.empty(recv_prev_chunk_shape,
-                                       requires_grad=True,
-                                       device=get_current_device(),
-                                       dtype=dtype)
-    if recv_next:
-        assert recv_next_shape is not None
-        recv_next_chunk_shape, recv_next_split = _get_tensor_shape(recv_next_shape, scatter_gather_tensors)
-        tensor_recv_next = torch.empty(recv_next_chunk_shape,
-                                       requires_grad=True,
-                                       device=get_current_device(),
-                                       dtype=dtype)
-    if tensor_send_prev is not None or recv_prev:
-        if prev_rank is None:
-            prev_rank = gpc.get_prev_global_rank(
-                ParallelMode.PIPELINE)
-    if tensor_send_next is not None or recv_next:
-        if next_rank is None:
-            next_rank = gpc.get_next_global_rank(
-                ParallelMode.PIPELINE)
-    if tensor_send_prev is not None:
-        send_prev_split = _get_tensor_shape(tensor_send_prev.shape, scatter_gather_tensors)[1]
-        if send_prev_split:
-            tensor_send_prev = split_tensor_into_1d_equal_chunks(tensor_send_prev)
-    if tensor_send_next is not None:
-        send_next_split = _get_tensor_shape(tensor_send_next.shape, scatter_gather_tensors)[1]
-        if send_next_split:
-            tensor_send_next = split_tensor_into_1d_equal_chunks(tensor_send_next)
-    ops = []
-    if tensor_send_prev is not None:
-        send_prev_op = dist.P2POp(dist.isend, tensor_send_prev, prev_rank)
-        ops.append(send_prev_op)
-    if tensor_recv_prev is not None:
-        recv_prev_op = dist.P2POp(dist.irecv, tensor_recv_prev, prev_rank)
-        ops.append(recv_prev_op)
-    if tensor_recv_next is not None:
-        recv_next_op = dist.P2POp(dist.irecv, tensor_recv_next, next_rank)
-        ops.append(recv_next_op)
-    if tensor_send_next is not None:
-        send_next_op = dist.P2POp(dist.isend, tensor_send_next, next_rank)
-        ops.append(send_next_op)
-    if len(ops) > 0:
-        reqs = dist.batch_isend_irecv(ops)
-        for req in reqs:
-            req.wait()
-    # To protect against race condition when using batch_isend_irecv().
-    torch.cuda.synchronize()
-    if recv_prev and recv_prev_split:
-        tensor_recv_prev = gather_split_1d_tensor(tensor_recv_prev).view(recv_prev_shape).requires_grad_()
-    if recv_next and recv_next_split:
-        tensor_recv_next = gather_split_1d_tensor(tensor_recv_next).view(recv_next_shape).requires_grad_()
-    return tensor_recv_prev, tensor_recv_next
-def recv_forward(input_tensor_shape, prev_rank=None, dtype=torch.float, scatter_gather_tensors=False):
-    """Receives the input tensor from the previous member in pipeline.
-    :param input_tensor_shape: The shape of the tensor to be recieved
-    :param prev_rank: The rank of the source of the tensor
-    :type input_tensor_shape: torch.Size
-    :type prev_rank: int, optional
-    :return: The input tensor in forward step
-    :rtype: :class:`torch.Tensor`
-    """
-    if gpc.is_pipeline_first_stage():
-        input_tensor = None
-    else:
-        input_tensor, _ = _communicate(recv_prev=True,
-                                       recv_prev_shape=input_tensor_shape,
-                                       prev_rank=prev_rank,
-                                       dtype=dtype,
-                                       scatter_gather_tensors=scatter_gather_tensors)
-    return input_tensor
-def recv_backward(output_grad_shape, next_rank=None, dtype=torch.float, scatter_gather_tensors=False):
-    """Receives the grad tensor from the next member in pipeline.
-    :param output_grad_shape: The shape of the tensor to be recieved
-    :param next_rank: The rank of the source of the tensor
-    :type output_grad_shape: torch.Size
-    :type next_rank: int, optional
-    :return: The grad of output tensor in forward step
-    :rtype: :class:`torch.Tensor`
-    """
-    if gpc.is_pipeline_last_stage():
-        output_tensor_grad = None
-    else:
-        _, output_tensor_grad = _communicate(recv_next=True,
-                                             recv_next_shape=output_grad_shape,
-                                             next_rank=next_rank,
-                                             dtype=dtype,
-                                             scatter_gather_tensors=scatter_gather_tensors)
-    return output_tensor_grad
-def send_forward(output_tensor, next_rank=None, scatter_gather_tensors=False):
-    """Sends the input tensor to the next member in pipeline.
-    :param output_tensor: Tensor to be sent
-    :param next_rank: The rank of the recipient of the tensor
-    :type output_tensor: :class:`torch.Tensor`
-    :type next_rank: int, optional
-    """
-    if not gpc.is_pipeline_last_stage():
-        _communicate(tensor_send_next=output_tensor,
-                     next_rank=next_rank,
-                     scatter_gather_tensors=scatter_gather_tensors)
-def send_backward(input_tensor_grad, prev_rank=None, scatter_gather_tensors=False):
-    """Sends the grad tensor to the previous member in pipeline.
-    :param input_tensor_grad: Tensor to be sent
-    :param prev_rank: The rank of the recipient of the tensor
-    :type input_tensor_grad: :class:`torch.Tensor`
-    :type prev_rank: int, optional
-    """
-    if not gpc.is_pipeline_first_stage():
-        _communicate(tensor_send_prev=input_tensor_grad,
-                     prev_rank=prev_rank,
-                     scatter_gather_tensors=scatter_gather_tensors)
-def send_forward_recv_backward(output_tensor,
-                               output_grad_shape,
-                               recv_next=True,
-                               next_rank=None,
-                               dtype=torch.float,
-                               scatter_gather_tensors=False):
-    """Batched communication operation. Sends the input tensor to the 
-    next member in pipeline, while recieves the grad tensor from the
-    next member in pipeline.
-    :param output_tensor: Tensor to be sent
-    :param output_grad_shape: The shape of the tensor to be recieved
-    :type output_tensor: :class:`torch.Tensor`
-    :type output_grad_shape: :class:`torch.Size`
-    :return: The grad of output tensor in forward step
-    :rtype: :class:`torch.Tensor`
-    """
-    if gpc.is_pipeline_last_stage():
-        output_tensor_grad = None
-    else:
-        _, output_tensor_grad = _communicate(tensor_send_next=output_tensor,
-                                             recv_next=recv_next,
-                                             recv_next_shape=output_grad_shape,
-                                             next_rank=next_rank,
-                                             dtype=dtype,
-                                             scatter_gather_tensors=scatter_gather_tensors)
-    return output_tensor_grad
-def send_backward_recv_forward(input_tensor_grad,
-                               input_tensor_shape,
-                               recv_prev=True,
-                               prev_rank=None,
-                               dtype=torch.float,
-                               scatter_gather_tensors=False):
-    """Batched communication operation. Sends the grad tensor to the 
-    previous member in pipeline, while recieves the input tensor from the
-    previous member in pipeline.
-    :param input_tensor_grad: Tensor to be sent
-    :param input_tensor_shape: The shape of the tensor to be recieved
-    :type input_tensor_grad: :class:`torch.Tensor`
-    :type input_tensor_shape: :class:`torch.Size`
-    :return: The input tensor in forward step
-    :rtype: :class:`torch.Tensor`
-    """
-    if gpc.is_pipeline_first_stage():
-        input_tensor = None
-    else:
-        input_tensor, _ = _communicate(tensor_send_prev=input_tensor_grad,
-                                       recv_prev=recv_prev,
-                                       recv_prev_shape=input_tensor_shape,
-                                       prev_rank=prev_rank,
-                                       dtype=dtype,
-                                       scatter_gather_tensors=scatter_gather_tensors)
-    return input_tensor
-def send_forward_recv_forward(output_tensor,
-                              input_tensor_shape,
-                              recv_prev=True,
-                              prev_rank=None,
-                              next_rank=None,
-                              dtype=torch.float,
-                              scatter_gather_tensors=False):
-    """Batched communication operation. Sends the input tensor to the 
-    next member in pipeline, while recieves the input tensor from the
-    previous member in pipeline.
-    :param output_tensor: Tensor to be sent
-    :param input_tensor_shape: The shape of the tensor to be recieved
-    :type output_tensor: :class:`torch.Tensor`
-    :type input_tensor_shape: :class:`torch.Size`
-    :return: The input tensor in forward step
-    :rtype: :class:`torch.Tensor`
-    """
-    input_tensor, _ = _communicate(tensor_send_next=output_tensor,
-                                   recv_prev=recv_prev,
-                                   recv_prev_shape=input_tensor_shape,
-                                   prev_rank=prev_rank,
-                                   next_rank=next_rank,
-                                   dtype=dtype,
-                                   scatter_gather_tensors=scatter_gather_tensors)
-    return input_tensor
-def send_backward_recv_backward(input_tensor_grad,
-                                output_grad_shape,
-                                recv_next=True,
-                                prev_rank=None,
-                                next_rank=None,
-                                dtype=torch.float,
-                                scatter_gather_tensors=False):
-    """Batched communication operation. Sends the grad tensor to the 
-    previous member in pipeline, while recieves the grad tensor from the
-    next member in pipeline.
-    :param input_tensor_grad: Tensor to be sent
-    :param output_grad_shape: The shape of the tensor to be recieved
-    :type input_tensor_grad: :class:`torch.Tensor`
-    :type output_grad_shape: :class:`torch.Size`
-    :return: The grad of output tensor in forward step
-    :rtype: :class:`torch.Tensor`
-    """
-    _, output_tensor_grad = _communicate(tensor_send_prev=input_tensor_grad,
-                                         recv_next=recv_next,
-                                         recv_next_shape=output_grad_shape,
-                                         prev_rank=prev_rank,
-                                         next_rank=next_rank,
-                                         dtype=dtype,
-                                         scatter_gather_tensors=scatter_gather_tensors)
-    return output_tensor_grad
-def send_forward_backward_recv_forward_backward(output_tensor,
-                                                input_tensor_grad,
-                                                input_tensor_shape,
-                                                output_grad_shape,
-                                                recv_prev=True,
-                                                recv_next=True,
-                                                prev_rank=None,
-                                                next_rank=None,
-                                                dtype=torch.float,
-                                                scatter_gather_tensors=False):
-    """Batched communication operation. Sends the input tensor to the next and 
-    the grad tensor to the previous, while recieves the grad tensor from the
-    next and the input tensor from the previous.
-    :param output_tensor: Tensor sent to the next
-    :param input_tensor_grad: Tensor sent to the previous
-    :param input_tensor_shape: The shape of the tensor recieved from the previous
-    :param output_grad_shape: The shape of the tensor recieved from the next
-    :type output_tensor: :class:`torch.Tensor`
-    :type input_tensor_grad: :class:`torch.Tensor`
-    :type input_tensor_shape: :class:`torch.Size`
-    :type output_grad_shape: :class:`torch.Size`
-    :return: (the input tensor in forward step, the grad of output tensor in forward step)
-    :rtype: (Tensor, Tensor)
-    """
-    input_tensor, output_tensor_grad = _communicate(
-        tensor_send_next=output_tensor,
-        tensor_send_prev=input_tensor_grad,
-        recv_prev=recv_prev,
-        recv_next=recv_next,
-        recv_prev_shape=input_tensor_shape,
-        recv_next_shape=output_grad_shape,
-        prev_rank=prev_rank,
-        next_rank=next_rank,
-        dtype=dtype,
-        scatter_gather_tensors=scatter_gather_tensors)
-    return input_tensor, output_tensor_grad
--- a/colossalai/communication/ring.py
+++ b/colossalai/communication/ring.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-import torch
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_current_device, synchronize
-def ring_forward(tensor_send_next: torch.Tensor, parallel_mode: ParallelMode):
-    """Sends a tensor to the next member and recieves a tensor from the previous member.
-    This function returns the recieved tensor from the previous member.
-    :param tensor_send_next: Tensor sent to next member
-    :param parallel_mode: Parallel group mode used in this communication
-    :type tensor_send_next: :class:`torch.Tensor`
-    :type parallel_mode: :class:`colossalai.context.ParallelMode`
-    :return: The tensor recieved from the previous
-    :rtype: :class:`torch.Tensor`
-    """
-    buffer_shape = tensor_send_next.size()
-    ops = []
-    current_rank = gpc.get_global_rank()
-    tensor_recv_prev = torch.empty(buffer_shape,
-                                   requires_grad=True,
-                                   device=get_current_device(),
-                                   dtype=tensor_send_next.dtype)
-    # send to next rank
-    send_next_op = torch.distributed.P2POp(
-        torch.distributed.isend, tensor_send_next,
-        gpc.get_next_global_rank(parallel_mode))
-    ops.append(send_next_op)
-    # receive from prev rank
-    recv_prev_op = torch.distributed.P2POp(
-        torch.distributed.irecv, tensor_recv_prev,
-        gpc.get_prev_global_rank(parallel_mode))
-    ops.append(recv_prev_op)
-    if current_rank % 2 == 0:
-        ops = ops[::-1]
-    reqs = torch.distributed.batch_isend_irecv(ops)
-    for req in reqs:
-        req.wait()
-    # To protect against race condition when using batch_isend_irecv().
-    synchronize()
-    return tensor_recv_prev
--- a/colossalai/communication/utils.py
+++ b/colossalai/communication/utils.py
-import torch
-import torch.distributed as dist
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_current_device
-def send_tensor_meta(tensor, need_meta=True, next_rank=None):
-    """Sends tensor meta information before sending a specific tensor. 
-    Since the recipient must know the shape of the tensor in p2p communications,
-    meta information of the tensor should be sent before communications. This function
-    synchronizes with :func:`recv_tensor_meta`.
-    :param tensor: Tensor to be sent
-    :param need_meta: If False, meta information won't be sent
-    :param next_rank: The rank of the next member in pipeline parallel group
-    :type tensor: Tensor
-    :type need_meta: bool, optional
-    :type next_rank: int
-    :return: False
-    :rtype: bool
-    """
-    if need_meta:
-        if next_rank is None:
-            next_rank = gpc.get_next_global_rank(ParallelMode.PIPELINE)
-        tensor_kwargs = {'dtype': torch.long, 'device': get_current_device()}
-        send_shape = torch.tensor(tensor.size(), **tensor_kwargs)
-        send_ndims = torch.tensor(len(tensor.size()), **tensor_kwargs)
-        dist.send(send_ndims, next_rank)
-        dist.send(send_shape, next_rank)
-    return False
-def recv_tensor_meta(tensor_shape, prev_rank=None):
-    """Recieves tensor meta information before recieving a specific tensor. 
-    Since the recipient must know the shape of the tensor in p2p communications,
-    meta information of the tensor should be recieved before communications. This function
-    synchronizes with :func:`send_tensor_meta`.
-    :param tensor_shape: The shape of the tensor to be recieved
-    :param prev_rank: The rank of the source of the tensor
-    :type tensor_shape: torch.Size
-    :type prev_rank: int, optional
-    :return: The shape of the tensor to be recieved
-    :rtype: torch.Size
-    """
-    if tensor_shape is None:
-        if prev_rank is None:
-            prev_rank = gpc.get_prev_global_rank(ParallelMode.PIPELINE)
-        tensor_kwargs = {'dtype': torch.long, 'device': get_current_device()}
-        recv_ndims = torch.empty((), **tensor_kwargs)
-        dist.recv(recv_ndims, prev_rank)
-        recv_shape = torch.empty(recv_ndims, **tensor_kwargs)
-        dist.recv(recv_shape, prev_rank)
-        tensor_shape = torch.Size(recv_shape)
-    return tensor_shape
-def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
-    """Break a tensor into equal 1D chunks.
-    :param tensor: Tensor to be splitted before communication
-    :param new_buffer: Whether uses a new buffer to store sliced tensor
-    :type tensor: torch.Tensor
-    :type new_buffer: bool, optional
-    :return splitted_tensor: The splitted tensor
-    :rtype splitted_tensor: torch.Tensor
-    """
-    partition_size = torch.numel(tensor) // gpc.get_world_size(ParallelMode.PARALLEL_1D)
-    start_index = partition_size * gpc.get_local_rank(ParallelMode.PARALLEL_1D)
-    end_index = start_index + partition_size
-    if new_buffer:
-        data = torch.empty(partition_size, dtype=tensor.dtype,
-                           device=torch.cuda.current_device(),
-                           requires_grad=False)
-        data.copy_(tensor.view(-1)[start_index:end_index])
-    else:
-        data = tensor.view(-1)[start_index:end_index]
-    return data
-def gather_split_1d_tensor(tensor):
-    """Opposite of above function, gather values from model parallel ranks.
-    :param tensor: Tensor to be gathered after communication
-    :type tensor: torch.Tensor
-    :return gathered: The gathered tensor
-    :rtype gathered: torch.Tensor
-    """
-    world_size = gpc.get_world_size(ParallelMode.PARALLEL_1D)
-    numel = torch.numel(tensor)
-    numel_gathered = world_size * numel
-    gathered = torch.empty(numel_gathered, dtype=tensor.dtype,
-                           device=torch.cuda.current_device(),
-                           requires_grad=False)
-    chunks = [gathered[i*numel:(i+1)*numel] for i in range(world_size)]
-    dist.all_gather(chunks, tensor, group=gpc.get_group(ParallelMode.PARALLEL_1D))
-    return gathered
--- a/colossalai/constants.py
+++ b/colossalai/constants.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-ALLOWED_MODES = [None, '1d', '2d', '2.5d', '3d', 'sequence']
-TENSOR_PARALLEL_MODE = 'tensor_parallel_mode'
-# intializer
-INITIALIZER_MAPPING = {
-    'data': 'Initializer_Data',
-    'tensor': 'Initializer_Tensor',
-    'pipeline': 'Initializer_Pipeline',
-    'embedding': 'Initializer_Embedding',
-    '1d': 'Initializer_1D',
-    '2d': 'Initializer_2D',
-    '2.5d': 'Initializer_2p5D',
-    '3d': 'Initializer_3D',
-    'sequence': 'Initializer_Sequence',
-    'model': 'Initializer_Model',
-    'moe': 'Initializer_Moe'
-}
-# 3D parallelism groups
-INPUT_GROUP_3D = 'input_group_3d'
-WEIGHT_GROUP_3D = 'weight_group_3d'
-OUTPUT_GROUP_3D = 'output_group_3d'
-# Attributes of tensor parallel parameters 
-IS_TENSOR_PARALLEL = 'is_tensor_parallel'
-NUM_PARTITIONS = 'num_partitions'
-TENSOR_PARALLEL_ATTRIBUTES = [IS_TENSOR_PARALLEL, NUM_PARTITIONS]
--- a/colossalai/context/__init__.py
+++ b/colossalai/context/__init__.py
-from .config import Config, ConfigException
-from .parallel_context import ParallelContext
-from .parallel_mode import ParallelMode
-from .process_group_initializer import *
-from .random import *
--- a/colossalai/context/__pycache__/__init__.cpython-36.pyc
+++ b/colossalai/context/__pycache__/__init__.cpython-36.pyc
--- a/colossalai/context/__pycache__/__init__.cpython-37.pyc
+++ b/colossalai/context/__pycache__/__init__.cpython-37.pyc
--- a/colossalai/context/__pycache__/config.cpython-36.pyc
+++ b/colossalai/context/__pycache__/config.cpython-36.pyc
--- a/colossalai/context/__pycache__/config.cpython-37.pyc
+++ b/colossalai/context/__pycache__/config.cpython-37.pyc
--- a/colossalai/context/__pycache__/parallel_context.cpython-36.pyc
+++ b/colossalai/context/__pycache__/parallel_context.cpython-36.pyc
--- a/colossalai/context/__pycache__/parallel_context.cpython-37.pyc
+++ b/colossalai/context/__pycache__/parallel_context.cpython-37.pyc
--- a/colossalai/context/__pycache__/parallel_mode.cpython-36.pyc
+++ b/colossalai/context/__pycache__/parallel_mode.cpython-36.pyc
--- a/colossalai/context/__pycache__/parallel_mode.cpython-37.pyc
+++ b/colossalai/context/__pycache__/parallel_mode.cpython-37.pyc
--- a/colossalai/context/config.py
+++ b/colossalai/context/config.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-import inspect
-import sys
-from importlib.machinery import SourceFileLoader
-from pathlib import Path
-from colossalai.logging import get_dist_logger
-class Config(dict):
-    """This is a wrapper class for dict objects so that values of which can be
-    accessed as attributes.
-    :param config: The dict object to be wrapped
-    :type config: dict
-    """
-    def __init__(self, config: dict = None):
-        if config is not None:
-            for k, v in config.items():
-                self._add_item(k, v)
-    def __missing__(self, key):
-        raise KeyError(key)
-    def __getattr__(self, key):
-        try:
-            value = super(Config, self).__getitem__(key)
-            return value
-        except KeyError:
-            raise AttributeError(key)
-    def __setattr__(self, key, value):
-        super(Config, self).__setitem__(key, value)
-    def _add_item(self, key, value):
-        if isinstance(value, dict):
-            self.__setattr__(key, Config(value))
-        else:
-            self.__setattr__(key, value)
-    def update(self, config):
-        assert isinstance(config, (Config, dict)), 'can only update dictionary or Config objects.'
-        for k, v in config.items():
-            self._add_item(k, v)
-        return self
-    @staticmethod
-    def from_file(filename: str):
-        """Reads a python file and constructs a corresponding :class:`Config` object.
-        :param filename: Name of the file to construct the return object
-        :type filename: str
-        :raises AssertionError: Raises an AssertionError if the file does not exist, or the file
-            is not .py file
-        :return: A :class:`Config` object constructed with information in the file
-        :rtype: :class:`Config`
-        """
-        # check config path
-        if isinstance(filename, str):
-            filepath = Path(filename).absolute()
-        elif isinstance(filename, Path):
-            filepath = filename.absolute()
-        assert filepath.exists(), f'{filename} is not found, please check your configuration path'
-        # check extension
-        extension = filepath.suffix
-        assert extension == '.py', 'only .py files are supported'
-        # import the config as module
-        remove_path = False
-        if filepath.parent not in sys.path:
-            sys.path.insert(0, (filepath))
-            remove_path = True
-        module_name = filepath.stem
-        source_file = SourceFileLoader(fullname=str(module_name), path=str(filepath))
-        module = source_file.load_module()
-        # load into config
-        config = Config()
-        for k, v in module.__dict__.items():
-            if k.startswith('__') or inspect.ismodule(v) or inspect.isclass(v):
-                continue
-            else:
-                config._add_item(k, v)
-        logger = get_dist_logger()
-        logger.debug('variables which starts with __, is a module or class declaration are omitted in config file')
-        # remove module
-        del sys.modules[module_name]
-        if remove_path:
-            sys.path.pop(0)
-        return config
-class ConfigException(Exception):
-    pass
--- a/colossalai/context/parallel_context.py
+++ b/colossalai/context/parallel_context.py
--- a/colossalai/context/parallel_mode.py
+++ b/colossalai/context/parallel_mode.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-from enum import Enum
-# parallel modes
-class ParallelMode(Enum):
-    """This is an enumeration class containing all possible parallel modes.
-    """
-    GLOBAL = 'global'
-    # common parallel
-    DATA = 'data'
-    # model parallel - containing tensor and pipeline parallel groups
-    # this is added to facilitate amp and grad clipping in hybrid parallel
-    MODEL = 'model'
-    # pipeline parallel
-    PIPELINE = 'pipe'
-    # containing all ranks in tensor parallel
-    TENSOR = 'tensor'
-    # sequence parallel
-    SEQUENCE = 'sequence'
-    SEQUENCE_DP = 'sequence_dp'
-    # 1D Parallel
-    PARALLEL_1D = '1d'
-    # 2D parallel
-    PARALLEL_2D_ROW = '2d_row'
-    PARALLEL_2D_COL = '2d_col'
-    # 3D parallel
-    PARALLEL_3D_INPUT = '3d_input'
-    PARALLEL_3D_WEIGHT = '3d_weight'
-    PARALLEL_3D_OUTPUT = '3d_output'
-    # 2.5D parallel
-    PARALLEL_2P5D_ROW = '2p5d_row'
-    PARALLEL_2P5D_COL = '2p5d_col'
-    PARALLEL_2P5D_DEP = '2p5d_dep'
-    PARALLEL_2P5D_XZ = '2p5d_xz'
-    # MOE parallel
-    MOE_DATA = 'moe_data'
-    MOE_MODEL = 'moe_model'
--- a/colossalai/context/process_group_initializer/__init__.py
+++ b/colossalai/context/process_group_initializer/__init__.py
-from .initializer_1d import Initializer_1D
-from .initializer_2d import Initializer_2D
-from .initializer_2p5d import Initializer_2p5D
-from .initializer_3d import Initializer_3D
-from .initializer_data import Initializer_Data
-from .initializer_pipeline import Initializer_Pipeline
-from .initializer_sequence import Initializer_Sequence
-from .initializer_tensor import Initializer_Tensor
-from .initializer_model import Initializer_Model
-from .initializer_moe import Initializer_Moe
-from .process_group_initializer import ProcessGroupInitializer
-__all__ = [
-    'Initializer_Tensor', 'Initializer_Sequence', 'Initializer_Pipeline',
-    'Initializer_Data', 'Initializer_2p5D', 'Initializer_2D', 'Initializer_3D',
-    'Initializer_1D', 'ProcessGroupInitializer', 'Initializer_Model',
-    'Initializer_Moe'
-]