Unverified commit 554aa959 authored by Hongxin Liu, committed by GitHub

[legacy] move communication and nn to legacy and refactor logger (#4671)

* [legacy] move communication to legacy (#4640)

* [legacy] refactor logger and clean up legacy codes (#4654)

* [legacy] make logger independent of gpc

* [legacy] make optim independent of registry

* [legacy] move test engine to legacy

* [legacy] move nn to legacy (#4656)

* [legacy] move nn to legacy

* [checkpointio] fix save hf config

* [test] remove useless rpc pp test

* [legacy] fix nn init

* [example] skip tutorial hybrid parallel example

* [devops] test doc check

* [devops] test doc check
parent 536397cc
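
Every hunk below follows the same pattern: parallel layer utilities that used to be imported from `colossalai.nn` now come from `colossalai.legacy.nn`. A minimal before/after sketch of a caller's import, using a path taken from the first hunk:

```python
# Before this commit:
# from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d

# After this commit:
from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
```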
@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
+from colossalai.legacy.nn.layer.parallel_2d._utils import assert_summa_initialization
 from colossalai.legacy.registry import LOSSES
-from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
-from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization
 from colossalai.utils import get_current_device
......
@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
+from colossalai.legacy.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
 from colossalai.legacy.registry import LOSSES
-from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
-from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
 from colossalai.utils import get_current_device
......
@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
 from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
+from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
 from colossalai.legacy.registry import LOSSES
-from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
-from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
 from colossalai.utils import get_current_device
......
 from torch import nn

+from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
+
 from ._utils import calc_acc
 from .accuracy_2d import Accuracy2D
 from .accuracy_2p5d import Accuracy2p5D
 from .accuracy_3d import Accuracy3D
-from colossalai.nn.layer.utils import get_tensor_parallel_mode

 _parallel_accuracy = {
     '2d': Accuracy2D,
     '2.5d': Accuracy2p5D,
     '3d': Accuracy3D,
 }


 class Accuracy(nn.Module):

     def __init__(self):
         super().__init__()
         tensor_parallel = get_tensor_parallel_mode()
         if tensor_parallel not in _parallel_accuracy:
             self.acc = calc_acc
         else:
             self.acc = _parallel_accuracy[tensor_parallel]()

     def forward(self, *args):
         return self.acc(*args)
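
As the dispatch above shows, `Accuracy` picks a tensor-parallel-specific implementation when one is registered and otherwise falls back to `calc_acc`. A minimal sketch of the fallback path; note that `calc_acc` returns the count of correct predictions, not a ratio, and the import path below is an assumption based on this refactor:

```python
import torch

# Assumed post-refactor location of the plain accuracy helper shown above.
from colossalai.legacy.nn.metric._utils import calc_acc

logits = torch.tensor([[2.0, 0.5], [0.1, 1.5], [3.0, 0.2]])
targets = torch.tensor([0, 1, 1])

correct = calc_acc(logits, targets)           # tensor(2): number of correct predictions
accuracy = correct.item() / targets.numel()   # 0.667: fraction correct
```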
 import torch


 def calc_acc(logits, targets):
     preds = torch.argmax(logits, dim=-1)
     correct = torch.sum(targets == preds)
     return correct
 import torch
-from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
 from torch import nn
+from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
 from ._utils import calc_acc
......
 import torch
-from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
 from torch import nn
+from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
 from ._utils import calc_acc
......
 import torch
-from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
-from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
-from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
 from torch import nn

+from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
+from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
+from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
+
 from ._utils import calc_acc


 class Accuracy3D(nn.Module):
     """Accuracy for 3D parallelism
     """

     def __init__(self):
         super().__init__()
         self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
         self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)

     def forward(self, logits, targets):
         """Calculate the accuracy of predicted labels.

         Args:
             logits (:class:`torch.tensor`): Predicted labels.
             targets (:class:`torch.tensor`): True labels from data.

         Returns:
             float: the accuracy of prediction.
         """
         with torch.no_grad():
             targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
             targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
             correct = calc_acc(logits, targets)
             correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode)
         return correct
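
The forward pass above splits the targets along the batch dimension for the input and weight parallel groups, counts correct predictions locally, then reduces the counts across the batch. A small sketch of that split-then-reduce pattern, written with plain torch.distributed rather than ColossalAI's 3D parallel context (illustration only, assuming the process group is already initialized):

```python
import torch
import torch.distributed as dist


def sharded_correct_count(local_logits: torch.Tensor, full_targets: torch.Tensor) -> torch.Tensor:
    rank, world_size = dist.get_rank(), dist.get_world_size()
    # Keep only this rank's slice of the batch, mirroring split_tensor_3d(targets, 0, ...).
    local_targets = full_targets.chunk(world_size, dim=0)[rank]
    preds = torch.argmax(local_logits, dim=-1)
    correct = torch.sum(preds == local_targets)
    # Sum the per-rank correct counts across the group, mirroring reduce_by_batch_3d.
    dist.all_reduce(correct, op=dist.ReduceOp.SUM)
    return correct
```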
+from .cache_embedding import (
+    CachedEmbeddingBag,
+    CachedParamMgr,
+    EvictionStrategy,
+    LimitBuffIndexCopyer,
+    ParallelCachedEmbeddingBag,
+    ParallelCachedEmbeddingBagTablewise,
+    ParallelCachedEmbeddingBagTablewiseSpiltCache,
+    TablewiseEmbeddingBagConfig,
+)
 from .colo_module import ColoModule
-from .linear import ColoLinear
 from .embedding import ColoEmbedding
-from .module_utils import register_colo_module, is_colo_module, get_colo_module, init_colo_module, check_colo_module
-from .cache_embedding import CachedEmbeddingBag, ParallelCachedEmbeddingBag, CachedParamMgr, LimitBuffIndexCopyer, EvictionStrategy, \
-    ParallelCachedEmbeddingBagTablewise, TablewiseEmbeddingBagConfig, ParallelCachedEmbeddingBagTablewiseSpiltCache
+from .linear import ColoLinear
+from .module_utils import check_colo_module, get_colo_module, init_colo_module, is_colo_module, register_colo_module

 __all__ = [
     'ColoModule', 'register_colo_module', 'is_colo_module', 'get_colo_module', 'init_colo_module', 'check_colo_module',
......
 from .cache_mgr import CachedParamMgr, EvictionStrategy
-from .copyer import LimitBuffIndexCopyer
 from .cached_embedding import CachedEmbeddingBag
-from .parallel_cached_embedding import ParallelCachedEmbeddingBag
+from .copyer import LimitBuffIndexCopyer
 from .embedding_config import TablewiseEmbeddingBagConfig
+from .parallel_cached_embedding import ParallelCachedEmbeddingBag
 from .parallel_cached_embedding_tablewise import ParallelCachedEmbeddingBagTablewise
 from .parallel_cached_embedding_tablewise_split_cache import ParallelCachedEmbeddingBagTablewiseSpiltCache
......
+import sys
+from contextlib import contextmanager
+from enum import Enum
+from typing import List, Optional
+
 import numpy as np
 import torch
-from torch.profiler import record_function
-from typing import List, Optional
 from contexttimer import Timer
+from torch.profiler import record_function
+
 from .copyer import LimitBuffIndexCopyer
-from enum import Enum
-import sys
-from contextlib import contextmanager


 class EvictionStrategy(Enum):
@@ -35,7 +37,7 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None:
 class CachedParamMgr(torch.nn.Module):
     """
     Manage Embedding Weights on CPU and CUDA memory uses a software cache.
     CPU maintains the entire original weight.
     CUDA maintains a fraction of the weights used in the upcoming computation. The row number in CUDA is controlled by `cuda_row_num`.
     During training, GPU needs to transmit embedding rows between CPU and GPU.

     Args:
@@ -115,7 +117,7 @@ class CachedParamMgr(torch.nn.Module):
             self._elapsed_dict[name] += t.elapsed

     def _find_evict_gpu_idxs(self, evict_num: int) -> torch.Tensor:
         """_find_evict_gpu_idxs
         Find the gpu idxs to be evicted, according to their freq.

         Args:
             evict_num (int): how many rows has to be evicted
@@ -202,7 +204,7 @@ class CachedParamMgr(torch.nn.Module):
         """reorder
         reorder the weight according to ids' frequency in dataset before training.
         Execute only once before training, also known as warmup phase.

         Note:
             If you would like to use the DATASET as the eviction strategy, you must call this function.
         Note:
@@ -516,7 +518,7 @@ class CachedParamMgr(torch.nn.Module):
         """
         deprecated
         evict one row from cuda to cpu.

         Returns:
             (int) : the slot id be evicted.
         """
         mask = torch.logical_or(torch.isin(self.cached_idx_map, self.evict_backlist), self.cached_idx_map == -1)
......
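
For intuition, here is a small, self-contained sketch of the software-cache idea the `CachedParamMgr` docstring describes: the full table stays on CPU, a fixed number of rows is mirrored on the device, and the least frequently used cached row is evicted when a new row is needed. This is an illustration, not ColossalAI's implementation:

```python
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'


class TinyRowCache:
    """Illustrative LFU row cache (a sketch, not CachedParamMgr)."""

    def __init__(self, cpu_weight: torch.Tensor, cuda_row_num: int):
        self.cpu_weight = cpu_weight                                    # full table on CPU
        self.device_rows = torch.empty(cuda_row_num, cpu_weight.size(1), device=device)
        self.slot_of_row = {}                                           # row id -> device slot
        self.freq = {}                                                  # row id -> access count

    def fetch(self, row_id: int) -> torch.Tensor:
        self.freq[row_id] = self.freq.get(row_id, 0) + 1
        if row_id not in self.slot_of_row:
            if len(self.slot_of_row) < self.device_rows.size(0):
                slot = len(self.slot_of_row)                            # free slot available
            else:
                # Evict the least frequently used cached row back to the CPU copy.
                victim = min(self.slot_of_row, key=lambda r: self.freq[r])
                slot = self.slot_of_row.pop(victim)
                self.cpu_weight[victim].copy_(self.device_rows[slot])
            self.device_rows[slot].copy_(self.cpu_weight[row_id])       # admit the new row
            self.slot_of_row[row_id] = slot
        return self.device_rows[self.slot_of_row[row_id]]


cache = TinyRowCache(torch.randn(10_000, 64), cuda_row_num=256)
row = cache.fetch(42)   # first access copies row 42 to the device; later accesses hit the cache
```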
+from typing import Iterator, List, Optional, Tuple, Union
+
 import torch
 import torch.nn.functional as F
-from typing import List, Optional, Iterator, Tuple, Union
+from torch.nn.parameter import Parameter

 from .base_embedding import BaseEmbeddingBag
 from .cache_mgr import CachedParamMgr, EvictionStrategy
-from torch.nn.parameter import Parameter


 class CachedEmbeddingBag(BaseEmbeddingBag):
@@ -27,7 +28,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
         include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False.
         dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
         device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
         cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
         ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None.
         warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7.
         buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
@@ -85,10 +86,10 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
                       buffer_size=50_000,
                       pin_weight=False):
         """
         Called after initialized.
         Reorder the weight rows according to the ids_freq_mapping.
         Then, let the weights of the Module be managed by a CachedParamMgr.

         Args:
             cuda_row_num (int): number of rows can be hosted in CUDA memory
             ids_freq_mapping (List[int]): a list, idx is id number, value is freq
......
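
A minimal usage sketch of `CachedEmbeddingBag`, using the post-refactor import path and the keyword arguments named in the docstring above (cache_ratio, ids_freq_mapping, warmup_ratio, buffer_size); both the path and the exact signature are assumptions for illustration:

```python
import torch

# Assumed import path after this refactor; previously colossalai.nn.parallel.layers.
from colossalai.legacy.nn.parallel.layers import CachedEmbeddingBag

num_embeddings, embedding_dim = 100_000, 64

bag = CachedEmbeddingBag(
    num_embeddings,
    embedding_dim,
    cache_ratio=0.05,        # keep roughly 5% of the rows resident on the GPU
    ids_freq_mapping=None,   # optionally pass per-id frequencies for cache warmup
    warmup_ratio=0.7,
    buffer_size=0,           # 0 disables the limited transfer buffer
)

indices = torch.randint(0, num_embeddings, (1024,), device='cuda')
offsets = torch.arange(0, 1024, 8, device='cuda')
out = bag(indices, offsets)  # pooled embeddings, one row per offset segment
```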
@@ -3,7 +3,7 @@ from torch import LongTensor

 class LimitBuffIndexCopyer(object):
     """LimitBuffIndexCopyer
     Index Copy using limited temp buffer on CUDA.

     Args:
@@ -15,7 +15,7 @@ class LimitBuffIndexCopyer(object):
     @torch.no_grad()
     def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
         """copy
         src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
         The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered.
......
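
The docstring above describes a gather-into-buffer, scatter-out pattern. A short sketch of the same idea with plain torch ops (an illustration, not ColossalAI's `LimitBuffIndexCopyer`):

```python
import torch


def limited_buffer_index_copy(dim, src_index, tgt_index, src, tgt, buff_size=1024):
    """Copy src[src_index] into tgt[tgt_index] through a bounded temporary buffer."""
    for start in range(0, src_index.numel(), buff_size):
        src_chunk = src_index[start:start + buff_size]
        tgt_chunk = tgt_index[start:start + buff_size]
        tmp = src.index_select(dim, src_chunk)   # gather a bounded slice into the buffer
        tgt.index_copy_(dim, tgt_chunk, tmp)     # scatter the buffer into the target rows


src = torch.randn(10_000, 64)
tgt = torch.zeros(10_000, 64)
idx = torch.randperm(10_000)
limited_buffer_index_copy(0, idx, idx, src, tgt, buff_size=256)
```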
+from typing import Iterator, List, Optional, Tuple
+
 import torch
 import torch.nn.functional as F
-from typing import List, Optional, Iterator, Tuple

-from .cached_embedding import CachedEmbeddingBag
-from colossalai.nn._ops._utils import dual_all_to_all
-from colossalai.tensor import ColoParameter, ShardSpec, ComputePattern, ProcessGroup, ColoTensorSpec, ColoTensor
+from colossalai.legacy.nn._ops._utils import dual_all_to_all
+from colossalai.tensor import ColoParameter, ColoTensor, ColoTensorSpec, ComputePattern, ProcessGroup, ShardSpec
+
 from .cache_mgr import CachedParamMgr, EvictionStrategy
+from .cached_embedding import CachedEmbeddingBag


 def get_partition(embedding_dim, rank, world_size) -> Tuple[int, int, bool]:
......
+import time
+from typing import List
+
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F

-from .cached_embedding import CachedEmbeddingBag
-from .cache_mgr import EvictionStrategy
-from .embedding_config import TablewiseEmbeddingBagConfig
+from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
 from colossalai.tensor import ProcessGroup
-from colossalai.nn._ops._utils import dual_all_to_all_tablewise
-from typing import List
-import time
+
+from .cache_mgr import EvictionStrategy
+from .cached_embedding import CachedEmbeddingBag
+from .embedding_config import TablewiseEmbeddingBagConfig


 class ParallelCachedEmbeddingBagTablewise(CachedEmbeddingBag):
......