delete unused files

da3f0934 · zhuwenwen · c4dd1fd4 · c4dd1fd4 · c4dd1fd4 · c4dd1fd4
Commit da3f0934 authored Apr 23, 2023 by zhuwenwen
20 changed files
--- a/colossalai/engine/gradient_handler/__pycache__/_pipeline_parallel_gradient_handler.cpython-36.pyc
+++ b/colossalai/engine/gradient_handler/__pycache__/_pipeline_parallel_gradient_handler.cpython-36.pyc
--- a/colossalai/engine/gradient_handler/__pycache__/_pipeline_parallel_gradient_handler.cpython-37.pyc
+++ b/colossalai/engine/gradient_handler/__pycache__/_pipeline_parallel_gradient_handler.cpython-37.pyc
--- a/colossalai/engine/gradient_handler/__pycache__/_sequence_parallel_gradient_handler.cpython-36.pyc
+++ b/colossalai/engine/gradient_handler/__pycache__/_sequence_parallel_gradient_handler.cpython-36.pyc
--- a/colossalai/engine/gradient_handler/__pycache__/_sequence_parallel_gradient_handler.cpython-37.pyc
+++ b/colossalai/engine/gradient_handler/__pycache__/_sequence_parallel_gradient_handler.cpython-37.pyc
--- a/colossalai/engine/gradient_handler/__pycache__/_zero_gradient_handler.cpython-36.pyc
+++ b/colossalai/engine/gradient_handler/__pycache__/_zero_gradient_handler.cpython-36.pyc
--- a/colossalai/engine/gradient_handler/__pycache__/_zero_gradient_handler.cpython-37.pyc
+++ b/colossalai/engine/gradient_handler/__pycache__/_zero_gradient_handler.cpython-37.pyc
--- a/colossalai/engine/gradient_handler/_base_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_base_gradient_handler.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-from abc import ABC, abstractmethod
-
-
-class BaseGradientHandler(ABC):
-    """Abstract base for helpers that process gradients before optimization.
-
-    The constructor only stores the two objects it receives; the actual
-    gradient handling is supplied by subclasses through
-    :meth:`handle_gradient`.
-
-    :param model: Model where the gradients accumulate
-    :param optimizer: Optimizer for updating the parameters
-    :type model: Module
-    :type optimizer: Optimizer
-    """
-
-    def __init__(self, model, optimizer):
-        self._model, self._optimizer = model, optimizer
-
-    @abstractmethod
-    def handle_gradient(self):
-        """Process the gradients of the stored model.
-
-        Abstract: subclasses must override this method, either with their own
-        logic or by using one of the pre-defined subclasses. The base
-        implementation does nothing and returns ``None``.
-        """
--- a/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
-#!/usr/bin/env python
-
-import torch.distributed as dist
-from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
-
-from colossalai.core import global_context as gpc
-from colossalai.registry import GRADIENT_HANDLER
-from ._base_gradient_handler import BaseGradientHandler
-from ...context.parallel_mode import ParallelMode
-
-
-@GRADIENT_HANDLER.register_module
-class DataParallelGradientHandler(BaseGradientHandler):
-    """Gradient handler that all-reduces gradients over the ``ParallelMode.DATA`` group.
-
-    Parameters are bucketed by the type string of their data, and every bucket
-    is flattened into one buffer so that a single collective call is issued
-    per tensor type rather than one per parameter.
-    """
-
-    def handle_gradient(self):
-        """Synchronize the gradients of ``self._model`` across the data parallel group.
-
-        Nothing happens unless ``gpc.data_parallel_size`` is greater than 1.
-        Only parameters that require grad and currently hold a gradient take
-        part; their gradient tensors are overwritten in place.
-        """
-        # TODO: add memory buffer
-        if gpc.data_parallel_size <= 1:
-            return
-
-        # Bucket the parameters by tensor type.
-        params_by_type = {}
-        for p in self._model.parameters():
-            if not p.requires_grad or p.grad is None:
-                continue
-            params_by_type.setdefault(p.data.type(), []).append(p)
-
-        # One flattened all-reduce per bucket, then scatter the result back.
-        for bucket in params_by_type.values():
-            grads = [p.grad.data for p in bucket]
-            flat = _flatten_dense_tensors(grads)
-            # Scale by the group size before the reduction.
-            flat /= gpc.get_world_size(ParallelMode.DATA)
-
-            dist.all_reduce(flat, group=gpc.get_group(ParallelMode.DATA))
-
-            reduced = _unflatten_dense_tensors(flat, grads)
-            for grad, synced in zip(grads, reduced):
-                grad.copy_(synced)
--- a/colossalai/engine/gradient_handler/_moe_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_moe_gradient_handler.py
-import torch.distributed as dist
-from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
-from colossalai.core import global_context as gpc
-from colossalai.registry import GRADIENT_HANDLER
-from colossalai.global_variables import moe_env
-from ._base_gradient_handler import BaseGradientHandler
-from ...context.parallel_mode import ParallelMode
-
-
-@GRADIENT_HANDLER.register_module
-class MoeGradientHandler(BaseGradientHandler):
-    """Gradient handler for models that mix regular and MoE (expert) parameters.
-
-    Parameters without a ``moe_param`` attribute are all-reduced over the
-    ``ParallelMode.DATA`` group, bucketed by tensor type so that one flattened
-    buffer is communicated per type. Parameters that carry ``moe_param`` are
-    all-reduced one by one over the ``ParallelMode.MOE_DATA`` group.
-    """
-
-    def handle_gradient(self):
-        """Synchronize regular gradients first, then the expert gradients.
-
-        Nothing happens unless ``gpc.data_parallel_size`` is greater than 1.
-        The expert pass additionally requires ``moe_env.data_parallel_size``
-        to be greater than 1. Gradients are modified in place.
-        """
-        moe_data = moe_env.data_parallel_size
-        global_data = gpc.data_parallel_size
-
-        if global_data <= 1:
-            return
-
-        # Phase 1: bucket every non-expert parameter by tensor type.
-        params_by_type = {}
-        for p in self._model.parameters():
-            if not p.requires_grad or p.grad is None or hasattr(p, 'moe_param'):
-                continue
-            params_by_type.setdefault(p.data.type(), []).append(p)
-
-        # One flattened all-reduce per bucket, then scatter the result back.
-        for bucket in params_by_type.values():
-            grads = [p.grad.data for p in bucket]
-            flat = _flatten_dense_tensors(grads)
-            # Scale by the group size before the reduction.
-            flat /= gpc.get_world_size(ParallelMode.DATA)
-
-            dist.all_reduce(flat, group=gpc.get_group(ParallelMode.DATA))
-
-            reduced = _unflatten_dense_tensors(flat, grads)
-            for grad, synced in zip(grads, reduced):
-                grad.copy_(synced)
-
-        # Phase 2: expert parameters are reduced individually in the MoE data group.
-        for p in self._model.parameters():
-            if (p.requires_grad and p.grad is not None
-                    and moe_data > 1 and hasattr(p, 'moe_param')):
-                p.grad.data /= moe_data
-                dist.all_reduce(p.grad.data,
-                                group=gpc.get_group(ParallelMode.MOE_DATA))
--- a/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
-#!/usr/bin/env python
-
-import torch.distributed as dist
-from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
-
-from colossalai.core import global_context as gpc
-from colossalai.registry import GRADIENT_HANDLER
-from ._base_gradient_handler import BaseGradientHandler
-from collections import defaultdict
-
-
-@GRADIENT_HANDLER.register_module
-class PipelineSharedModuleGradientHandler(BaseGradientHandler):
-    """Gradient handler for parameters tagged with ``pipeline_shared_module_pg``.
-
-    Each tagged parameter names the process group its gradient is reduced in.
-    Gradients are bucketed per group and, inside a group, per tensor type, so
-    that one flattened buffer is communicated for every (group, type) pair.
-    """
-
-    def handle_gradient(self):
-        """Sum the gradients of tagged parameters within their process groups.
-
-        Nothing happens unless ``gpc.pipeline_parallel_size`` is greater than 1.
-        Parameters without the ``pipeline_shared_module_pg`` attribute, without
-        ``requires_grad``, or without a gradient are ignored. Gradients are
-        overwritten in place.
-        """
-        if gpc.pipeline_parallel_size <= 1:
-            return
-
-        # process group -> tensor type -> list of parameters
-        shared = {}
-        for p in self._model.parameters():
-            pg = getattr(p, 'pipeline_shared_module_pg', None)
-            if not p.requires_grad or p.grad is None or pg is None:
-                continue
-            shared.setdefault(pg, {}).setdefault(p.data.type(), []).append(p)
-
-        # One flattened all-reduce per (group, type) bucket, then scatter back.
-        for pg, typed_buckets in shared.items():
-            for bucket in typed_buckets.values():
-                grads = [p.grad.data for p in bucket]
-                flat = _flatten_dense_tensors(grads)
-                dist.all_reduce(flat, op=dist.ReduceOp.SUM, group=pg)
-                reduced = _unflatten_dense_tensors(flat, grads)
-                for grad, synced in zip(grads, reduced):
-                    grad.copy_(synced)
--- a/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py
-#!/usr/bin/env python
-from functools import total_ordering
-import torch
-import torch.distributed as dist
-from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
-
-from colossalai.core import global_context as gpc
-from colossalai.registry import GRADIENT_HANDLER
-from ._base_gradient_handler import BaseGradientHandler
-from ...context.parallel_mode import ParallelMode
-import colossalai
-
-
-@GRADIENT_HANDLER.register_module
-class SequenceParallelGradientHandler(BaseGradientHandler):
-    """Gradient handler that all-reduces gradients over the ``ParallelMode.SEQUENCE_DP`` group.
-
-    Parameters are bucketed by the type string of their data, and every bucket
-    is flattened into one buffer so that a single collective call is issued
-    per tensor type rather than one per parameter.
-    """
-
-    def handle_gradient(self):
-        """Synchronize the gradients of ``self._model`` across the ``SEQUENCE_DP`` group.
-
-        Runs unconditionally (there is no group-size check). Only parameters
-        that require grad and currently hold a gradient take part; their
-        gradient tensors are overwritten in place.
-        """
-        # Bucket the parameters by tensor type.
-        params_by_type = {}
-        for p in self._model.parameters():
-            if not p.requires_grad or p.grad is None:
-                continue
-            params_by_type.setdefault(p.data.type(), []).append(p)
-
-        # One flattened all-reduce per bucket, then scatter the result back.
-        for bucket in params_by_type.values():
-            grads = [p.grad.data for p in bucket]
-            flat = _flatten_dense_tensors(grads)
-            # Scale by the group size before the reduction.
-            flat /= gpc.get_world_size(ParallelMode.SEQUENCE_DP)
-
-            dist.all_reduce(flat, group=gpc.get_group(ParallelMode.SEQUENCE_DP))
-
-            reduced = _unflatten_dense_tensors(flat, grads)
-            for grad, synced in zip(grads, reduced):
-                grad.copy_(synced)
--- a/colossalai/engine/gradient_handler/_zero_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_zero_gradient_handler.py
-from colossalai.registry import GRADIENT_HANDLER
-from ._base_gradient_handler import BaseGradientHandler
-
-
-@GRADIENT_HANDLER.register_module
-class ZeROGradientHandler(BaseGradientHandler):
-    """A helper class to handle all-reduce operations in a data parallel group.
-    An all-reduce collective communication will be operated in
-    :func:`handle_gradient` among a data parallel group.
-    This class is specialized with ZeRO optimization.
-
-    NOTE(review): no communication is performed in this class itself; it only
-    forwards to the optimizer, which presumably is a ZeRO optimizer exposing
-    ``allreduce_gradients`` -- confirm against the caller.
-    """
-
-    def handle_gradient(self):
-        """Delegate gradient synchronization to ``self._optimizer.allreduce_gradients()``.
-        """
-        # The handler holds no reduction logic of its own.
-        self._optimizer.allreduce_gradients()
--- a/colossalai/engine/ophooks/__init__.py
+++ b/colossalai/engine/ophooks/__init__.py
-from ._base_ophook import BaseOpHook
-from ._memtracer_ophook import MemTracerOpHook
-import torch
-from typing import List
-
-# Public API of this package. ``__all__`` is the name Python consults for
-# ``from ... import *``; a plain ``all`` would merely shadow the builtin.
-__all__ = ["BaseOpHook", "MemTracerOpHook", "register_ophooks_recursively"]
-
-
-def _apply_to_tensors_only(module, functional, backward_function, outputs):
-    """Run ``functional.apply`` on every tensor found in ``outputs``.
-
-    ``outputs`` is walked recursively through plain tuples. An object whose
-    type is exactly ``torch.Tensor`` is replaced by
-    ``functional.apply(module, backward_function, tensor)``. Everything else,
-    including lists and subclasses of tuple or Tensor, is returned untouched.
-    """
-    kind = type(outputs)
-    if kind is torch.Tensor:
-        return functional.apply(module, backward_function, outputs)
-    if kind is not tuple:
-        return outputs
-    return tuple(
-        _apply_to_tensors_only(module, functional, backward_function, item)
-        for item in outputs
-    )
-
-
-class PreBackwardFunction(torch.autograd.Function):
-    """Autograd function that passes a tensor through and runs a callback in backward.
-
-    ``forward`` returns a detached version of its tensor argument and resets
-    ``module.applied_pre_backward`` to ``False``. ``backward`` calls the stored
-    callback with the stored module and hands the incoming gradients on unchanged.
-    """
-
-    @staticmethod
-    def forward(ctx, module, pre_backward_function, outputs):
-        # Keep what backward() needs on the autograd context.
-        ctx.pre_backward_function = pre_backward_function
-        ctx.module = module
-        module.applied_pre_backward = False
-        return outputs.detach()
-
-    @staticmethod
-    def backward(ctx, *args):
-        callback, target = ctx.pre_backward_function, ctx.module
-        callback(target)
-        # No gradient for ``module`` and the callback; pass the rest through.
-        return (None, None) + args
-
-
-class PostBackwardFunction(torch.autograd.Function):
-    """Autograd function that passes a tensor through and runs a callback in backward.
-
-    ``forward`` returns a detached version of its tensor argument. ``backward``
-    calls the stored callback with the stored module and hands the incoming
-    gradients on unchanged.
-    """
-
-    @staticmethod
-    def forward(ctx, module, pre_backward_function, output):
-        # Keep what backward() needs on the autograd context.
-        ctx.pre_backward_function = pre_backward_function
-        ctx.module = module
-        return output.detach()
-
-    @staticmethod
-    def backward(ctx, *args):
-        """
-        Args:
-            activation_grad of the next layer.
-        Returns:
-            grad of the input activation.
-        """
-        callback, target = ctx.pre_backward_function, ctx.module
-        callback(target)
-        # No gradient for ``module`` and the callback; pass the rest through.
-        return (None, None) + args
-
-
-def register_ophooks_recursively(module: torch.nn.Module,
-                                 ophook_list: List[BaseOpHook] = None,
-                                 name: str = ""):
-    r"""Recursively register pre/post hooks for all submodules in the module in FWD and BWD.
-
-    Hooks are attached only to leaf modules (modules without children) that
-    directly own at least one parameter or buffer. Container modules are
-    handled through the recursion into their children.
-
-    :param module: Root module whose submodules are visited
-    :param ophook_list: Hooks to dispatch to; when ``None`` nothing is registered
-    :param name: Name prefix accumulated during recursion (only passed along)
-    :type module: torch.nn.Module
-    :type ophook_list: List[BaseOpHook]
-    :type name: str
-    """
-    assert isinstance(module, torch.nn.Module)
-
-    # With no hook list there is nothing to dispatch to. Registering the
-    # wrappers anyway would make every forward call fail while iterating
-    # over ``None``.
-    if ophook_list is None:
-        return
-
-    has_children = False
-    for child_name, child in module.named_children():
-        register_ophooks_recursively(child, ophook_list, name + child_name)
-        has_children = True
-
-    # Early return on modules with no parameters or buffers that
-    # are not in their children.
-    if (len(list(module.named_parameters(recurse=False))) == 0
-            and len(list(module.named_buffers(recurse=False))) == 0):
-        return
-
-    # Only leaf modules get hooks; modules with children were already
-    # covered by the recursion above.
-    if has_children:
-        return
-
-    for hook in ophook_list:
-        assert (isinstance(hook, BaseOpHook))
-
-    def _pre_forward_module_hook(submodule, *args):
-        for hook in ophook_list:
-            assert isinstance(submodule, torch.nn.Module)
-            hook.pre_fwd_exec(submodule, *args)
-
-    def _post_forward_module_hook(submodule, *args):
-        for hook in ophook_list:
-            assert isinstance(submodule, torch.nn.Module)
-            hook.post_fwd_exec(submodule, *args)
-
-    def _pre_backward_module_hook(submodule, inputs, output):
-        def _run_before_backward_function(submodule):
-            for hook in ophook_list:
-                assert isinstance(submodule, torch.nn.Module)
-                hook.pre_bwd_exec(submodule, inputs, output)
-
-        # Wrap the forward outputs so the callback fires when their gradient
-        # is computed.
-        return _apply_to_tensors_only(submodule, PreBackwardFunction,
-                                      _run_before_backward_function, output)
-
-    def _post_backward_module_hook(submodule, inputs):
-        def _run_after_backward_function(submodule):
-            for hook in ophook_list:
-                assert isinstance(submodule, torch.nn.Module)
-                hook.post_bwd_exec(submodule, inputs)
-
-        # Wrap the forward inputs so the callback fires when their gradient
-        # is computed.
-        return _apply_to_tensors_only(submodule, PostBackwardFunction,
-                                      _run_after_backward_function, inputs)
-
-    module.register_forward_pre_hook(_pre_forward_module_hook)
-    module.register_forward_hook(_post_forward_module_hook)
-
-    module.register_forward_hook(_pre_backward_module_hook)
-    module.register_forward_pre_hook(_post_backward_module_hook)
--- a/colossalai/engine/ophooks/__pycache__/__init__.cpython-36.pyc
+++ b/colossalai/engine/ophooks/__pycache__/__init__.cpython-36.pyc
--- a/colossalai/engine/ophooks/__pycache__/__init__.cpython-37.pyc
+++ b/colossalai/engine/ophooks/__pycache__/__init__.cpython-37.pyc
--- a/colossalai/engine/ophooks/__pycache__/_base_ophook.cpython-36.pyc
+++ b/colossalai/engine/ophooks/__pycache__/_base_ophook.cpython-36.pyc
--- a/colossalai/engine/ophooks/__pycache__/_base_ophook.cpython-37.pyc
+++ b/colossalai/engine/ophooks/__pycache__/_base_ophook.cpython-37.pyc
--- a/colossalai/engine/ophooks/__pycache__/_memtracer_ophook.cpython-36.pyc
+++ b/colossalai/engine/ophooks/__pycache__/_memtracer_ophook.cpython-36.pyc
--- a/colossalai/engine/ophooks/__pycache__/_memtracer_ophook.cpython-37.pyc
+++ b/colossalai/engine/ophooks/__pycache__/_memtracer_ophook.cpython-37.pyc
--- a/colossalai/engine/ophooks/_base_ophook.py
+++ b/colossalai/engine/ophooks/_base_ophook.py
-from abc import ABC, abstractmethod
-import torch
-
-
-class BaseOpHook(ABC):
-    """Interface for customized operations run before and after the
-    execution of a PyTorch submodule.
-
-    Every method is abstract; a concrete hook has to implement all five.
-    """
-
-    def __init__(self):
-        """Nothing to initialize in the base class."""
-
-    @abstractmethod
-    def pre_fwd_exec(self, module: torch.nn.Module, *args):
-        """Hook point before the forward execution of ``module``."""
-
-    @abstractmethod
-    def post_fwd_exec(self, module: torch.nn.Module, *args):
-        """Hook point after the forward execution of ``module``."""
-
-    @abstractmethod
-    def pre_bwd_exec(self, module: torch.nn.Module, input, output):
-        """Hook point before the backward execution of ``module``."""
-
-    @abstractmethod
-    def post_bwd_exec(self, module: torch.nn.Module, input):
-        """Hook point after the backward execution of ``module``."""
-
-    @abstractmethod
-    def post_iter(self):
-        """Hook point presumably meant for the end of an iteration (the caller is not shown here)."""