Unverified Commit 554aa959 authored by Hongxin Liu's avatar Hongxin Liu Committed by GitHub
Browse files

[legacy] move communication and nn to legacy and refactor logger (#4671)

* [legacy] move communication to legacy (#4640)

* [legacy] refactor logger and clean up legacy codes (#4654)

* [legacy] make logger independent to gpc

* [legacy] make optim independent to registry

* [legacy] move test engine to legacy

* [legacy] move nn to legacy (#4656)

* [legacy] move nn to legacy

* [checkpointio] fix save hf config

* [test] remove useless rpc pp test

* [legacy] fix nn init

* [example] skip tutorial hybrid parallel example

* [devops] test doc check

* [devops] test doc check
parent 536397cc
......@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
from colossalai.legacy.nn.layer.parallel_2d._utils import assert_summa_initialization
from colossalai.legacy.registry import LOSSES
from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization
from colossalai.utils import get_current_device
......
......@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
from colossalai.legacy.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
from colossalai.legacy.registry import LOSSES
from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
from colossalai.utils import get_current_device
......
......@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D
from colossalai.core import global_context as gpc
from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
from colossalai.legacy.registry import LOSSES
from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
from colossalai.utils import get_current_device
......
from torch import nn
from ._utils import calc_acc
from .accuracy_2d import Accuracy2D
from .accuracy_2p5d import Accuracy2p5D
from .accuracy_3d import Accuracy3D
from colossalai.nn.layer.utils import get_tensor_parallel_mode
_parallel_accuracy = {
'2d': Accuracy2D,
'2.5d': Accuracy2p5D,
'3d': Accuracy3D,
}
class Accuracy(nn.Module):
    """Accuracy metric that dispatches to a tensor-parallel-aware variant.

    At construction time, selects the implementation registered for the
    active tensor-parallel mode in ``_parallel_accuracy`` ('2d', '2.5d'
    or '3d'). Any other mode falls back to the plain ``calc_acc`` helper,
    which counts correct predictions locally.
    """

    def __init__(self):
        super().__init__()
        # Resolve the mode once; it is fixed for the lifetime of the run.
        tensor_parallel = get_tensor_parallel_mode()
        if tensor_parallel not in _parallel_accuracy:
            # No specialized metric for this mode: use the local counter.
            self.acc = calc_acc
        else:
            # Instantiate the mode-specific accuracy module.
            self.acc = _parallel_accuracy[tensor_parallel]()

    def forward(self, *args):
        """Delegate directly to the selected accuracy callable."""
        return self.acc(*args)
from torch import nn
from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
from ._utils import calc_acc
from .accuracy_2d import Accuracy2D
from .accuracy_2p5d import Accuracy2p5D
from .accuracy_3d import Accuracy3D
_parallel_accuracy = {
'2d': Accuracy2D,
'2.5d': Accuracy2p5D,
'3d': Accuracy3D,
}
class Accuracy(nn.Module):
    """Top-1 accuracy metric.

    Chooses the accuracy implementation matching the active
    tensor-parallel mode ('2d', '2.5d' or '3d' as registered in
    ``_parallel_accuracy``); every other mode uses the plain local
    ``calc_acc`` helper.
    """

    def __init__(self):
        super().__init__()
        mode = get_tensor_parallel_mode()
        # Fall back to the non-parallel counter when no specialized class exists.
        impl_cls = _parallel_accuracy.get(mode)
        self.acc = calc_acc if impl_cls is None else impl_cls()

    def forward(self, *args):
        # Forward everything to the chosen accuracy callable unchanged.
        return self.acc(*args)
import torch
def calc_acc(logits, targets):
    """Count the number of correct top-1 predictions.

    Args:
        logits (torch.Tensor): raw class scores with the class dimension last.
        targets (torch.Tensor): ground-truth class indices.

    Returns:
        torch.Tensor: 0-dim integer tensor holding the number of matches.
    """
    predictions = logits.argmax(dim=-1)
    return (predictions == targets).sum()
import torch
def calc_acc(logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """Return the number of samples whose argmax prediction equals the target.

    Counts matches locally only — no distributed reduction is performed here.
    """
    # Top-1 prediction: index of the highest score along the last (class) dim.
    preds = torch.argmax(logits, dim=-1)
    # 0-dim tensor counting exact matches.
    correct = torch.sum(targets == preds)
    return correct
import torch
from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
from torch import nn
from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
from ._utils import calc_acc
......
import torch
from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
from torch import nn
from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
from ._utils import calc_acc
......
import torch
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
from torch import nn
from ._utils import calc_acc
class Accuracy3D(nn.Module):
    """Accuracy for 3D parallelism.

    Splits the targets to match the sharded logits, counts correct
    predictions locally, then reduces the count across the 3D parallel
    groups.
    """

    def __init__(self):
        super().__init__()
        # Parallel modes are resolved from env-configured groups once at init.
        self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
        self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)

    def forward(self, logits, targets):
        """Calculate the accuracy of predicted labels.

        Args:
            logits (:class:`torch.tensor`): Predicted labels.
            targets (:class:`torch.tensor`): True labels from data.

        Returns:
            float: the accuracy of prediction.
        """
        with torch.no_grad():
            # Shard targets along dim 0 (batch): weight mode first, then
            # input mode — presumably aligning targets with this rank's
            # logits shard; order mirrors the 3D split convention.
            targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
            targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
            # Local correct count, then summed over both parallel groups.
            correct = calc_acc(logits, targets)
            correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode)
        return correct
import torch
from torch import nn
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
from ._utils import calc_acc
class Accuracy3D(nn.Module):
    """Top-1 accuracy metric for the 3D tensor-parallel layout."""

    def __init__(self):
        super().__init__()
        # Look up the parallel modes configured through environment groups.
        self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
        self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)

    def forward(self, logits, targets):
        """Calculate the accuracy of predicted labels.

        Args:
            logits (:class:`torch.tensor`): Predicted labels.
            targets (:class:`torch.tensor`): True labels from data.

        Returns:
            float: the accuracy of prediction.
        """
        with torch.no_grad():
            # Split targets on the batch axis: weight mode first, input mode second.
            for mode in (self.weight_parallel_mode, self.input_parallel_mode):
                targets = split_tensor_3d(targets, 0, mode)
            local_correct = calc_acc(logits, targets)
            # Aggregate the per-rank counts over both parallel groups.
            return reduce_by_batch_3d(local_correct, self.input_parallel_mode, self.weight_parallel_mode)
from .cache_embedding import (
CachedEmbeddingBag,
CachedParamMgr,
EvictionStrategy,
LimitBuffIndexCopyer,
ParallelCachedEmbeddingBag,
ParallelCachedEmbeddingBagTablewise,
ParallelCachedEmbeddingBagTablewiseSpiltCache,
TablewiseEmbeddingBagConfig,
)
from .colo_module import ColoModule
from .linear import ColoLinear
from .embedding import ColoEmbedding
from .module_utils import register_colo_module, is_colo_module, get_colo_module, init_colo_module, check_colo_module
from .cache_embedding import CachedEmbeddingBag, ParallelCachedEmbeddingBag, CachedParamMgr, LimitBuffIndexCopyer, EvictionStrategy, \
ParallelCachedEmbeddingBagTablewise, TablewiseEmbeddingBagConfig, ParallelCachedEmbeddingBagTablewiseSpiltCache
from .linear import ColoLinear
from .module_utils import check_colo_module, get_colo_module, init_colo_module, is_colo_module, register_colo_module
__all__ = [
'ColoModule', 'register_colo_module', 'is_colo_module', 'get_colo_module', 'init_colo_module', 'check_colo_module',
......
from .cache_mgr import CachedParamMgr, EvictionStrategy
from .copyer import LimitBuffIndexCopyer
from .cached_embedding import CachedEmbeddingBag
from .parallel_cached_embedding import ParallelCachedEmbeddingBag
from .copyer import LimitBuffIndexCopyer
from .embedding_config import TablewiseEmbeddingBagConfig
from .parallel_cached_embedding import ParallelCachedEmbeddingBag
from .parallel_cached_embedding_tablewise import ParallelCachedEmbeddingBagTablewise
from .parallel_cached_embedding_tablewise_split_cache import ParallelCachedEmbeddingBagTablewiseSpiltCache
......
import sys
from contextlib import contextmanager
from enum import Enum
from typing import List, Optional
import numpy as np
import torch
from torch.profiler import record_function
from typing import List, Optional
from contexttimer import Timer
from torch.profiler import record_function
from .copyer import LimitBuffIndexCopyer
from enum import Enum
import sys
from contextlib import contextmanager
class EvictionStrategy(Enum):
......@@ -35,7 +37,7 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None:
class CachedParamMgr(torch.nn.Module):
"""
Manage Embedding Weights on CPU and CUDA memory uses a software cache.
CPU maintains the entire original weight.
CPU maintains the entire original weight.
CUDA maintains a fraction of the weights used in the upcoming computation. The row number in CUDA is controlled by `cuda_row_num`.
During training, GPU needs to transmit embedding rows between CPU and GPU.
Args:
......@@ -115,7 +117,7 @@ class CachedParamMgr(torch.nn.Module):
self._elapsed_dict[name] += t.elapsed
def _find_evict_gpu_idxs(self, evict_num: int) -> torch.Tensor:
"""_find_evict_gpu_idxs
"""_find_evict_gpu_idxs
Find the gpu idxs to be evicted, according to their freq.
Args:
evict_num (int): how many rows has to be evicted
......@@ -202,7 +204,7 @@ class CachedParamMgr(torch.nn.Module):
"""reorder
reorder the weight according to ids' frequency in dataset before training.
Execute only once before training, also known as warmup phase.
Note:
If you would like to use the DATASET as the eviction strategy, you must call this function.
Note:
......@@ -516,7 +518,7 @@ class CachedParamMgr(torch.nn.Module):
"""
deprecated
evict one row from cuda to cpu.
Returns:
Returns:
(int) : the slot id be evicted.
"""
mask = torch.logical_or(torch.isin(self.cached_idx_map, self.evict_backlist), self.cached_idx_map == -1)
......
from typing import Iterator, List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
from typing import List, Optional, Iterator, Tuple, Union
from torch.nn.parameter import Parameter
from .base_embedding import BaseEmbeddingBag
from .cache_mgr import CachedParamMgr, EvictionStrategy
from torch.nn.parameter import Parameter
class CachedEmbeddingBag(BaseEmbeddingBag):
......@@ -27,7 +28,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False.
dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None.
warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7.
buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
......@@ -85,10 +86,10 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
buffer_size=50_000,
pin_weight=False):
"""
Called after initialized.
Called after initialized.
Reorder the weight rows according to the ids_freq_mapping.
Then, let the weights of the Module be managed by a CachedParamMgr.
Args:
cuda_row_num (int): number of rows can be hosted in CUDA memory
ids_freq_mapping (List[int]): a list, idx is id number, value is freq
......
......@@ -3,7 +3,7 @@ from torch import LongTensor
class LimitBuffIndexCopyer(object):
"""LimitBuffIndexCopyer
"""LimitBuffIndexCopyer
Index Copy using limited temp buffer on CUDA.
Args:
......@@ -15,7 +15,7 @@ class LimitBuffIndexCopyer(object):
@torch.no_grad()
def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
"""copy
"""copy
src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered.
......
from typing import Iterator, List, Optional, Tuple
import torch
import torch.nn.functional as F
from typing import List, Optional, Iterator, Tuple
from .cached_embedding import CachedEmbeddingBag
from colossalai.nn._ops._utils import dual_all_to_all
from colossalai.legacy.nn._ops._utils import dual_all_to_all
from colossalai.tensor import ColoParameter, ColoTensor, ColoTensorSpec, ComputePattern, ProcessGroup, ShardSpec
from colossalai.tensor import ColoParameter, ShardSpec, ComputePattern, ProcessGroup, ColoTensorSpec, ColoTensor
from .cache_mgr import CachedParamMgr, EvictionStrategy
from .cached_embedding import CachedEmbeddingBag
def get_partition(embedding_dim, rank, world_size) -> Tuple[int, int, bool]:
......
import time
from typing import List
import torch
import torch.distributed as dist
import torch.nn.functional as F
from .cached_embedding import CachedEmbeddingBag
from .cache_mgr import EvictionStrategy
from .embedding_config import TablewiseEmbeddingBagConfig
from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
from colossalai.tensor import ProcessGroup
from colossalai.nn._ops._utils import dual_all_to_all_tablewise
from typing import List
import time
from .cache_mgr import EvictionStrategy
from .cached_embedding import CachedEmbeddingBag
from .embedding_config import TablewiseEmbeddingBagConfig
class ParallelCachedEmbeddingBagTablewise(CachedEmbeddingBag):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment