Unverified Commit 554aa959 authored by Hongxin Liu's avatar Hongxin Liu Committed by GitHub
Browse files

[legacy] move communication and nn to legacy and refactor logger (#4671)

* [legacy] move communication to legacy (#4640)

* [legacy] refactor logger and clean up legacy codes (#4654)

* [legacy] make logger independent to gpc

* [legacy] make optim independent to registry

* [legacy] move test engine to legacy

* [legacy] move nn to legacy (#4656)

* [legacy] move nn to legacy

* [checkpointio] fix save hf config

* [test] remove useless rpc pp test

* [legacy] fix nn init

* [example] skip tutorial hybrid parallel example

* [devops] test doc check

* [devops] test doc check
parent 536397cc
......@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
from colossalai.legacy.nn.layer.parallel_2d._utils import assert_summa_initialization
from colossalai.legacy.registry import LOSSES
from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization
from colossalai.utils import get_current_device
......
......@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
from colossalai.legacy.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
from colossalai.legacy.registry import LOSSES
from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
from colossalai.utils import get_current_device
......
......@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D
from colossalai.core import global_context as gpc
from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
from colossalai.legacy.registry import LOSSES
from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
from colossalai.utils import get_current_device
......
from torch import nn
from ._utils import calc_acc
from .accuracy_2d import Accuracy2D
from .accuracy_2p5d import Accuracy2p5D
from .accuracy_3d import Accuracy3D
from colossalai.nn.layer.utils import get_tensor_parallel_mode
_parallel_accuracy = {
'2d': Accuracy2D,
'2.5d': Accuracy2p5D,
'3d': Accuracy3D,
}
class Accuracy(nn.Module):
    """Accuracy metric that dispatches to a tensor-parallel-aware variant.

    At construction time, selects the implementation registered for the
    active tensor-parallel mode in ``_parallel_accuracy`` ('2d', '2.5d'
    or '3d'). Any other mode falls back to the plain ``calc_acc`` helper,
    which counts correct predictions locally.
    """

    def __init__(self):
        super().__init__()
        # Resolve the mode once; it is fixed for the lifetime of the run.
        tensor_parallel = get_tensor_parallel_mode()
        if tensor_parallel not in _parallel_accuracy:
            # No specialized metric for this mode: use the local counter.
            self.acc = calc_acc
        else:
            # Instantiate the mode-specific accuracy module.
            self.acc = _parallel_accuracy[tensor_parallel]()

    def forward(self, *args):
        """Delegate directly to the selected accuracy callable."""
        return self.acc(*args)
from torch import nn
from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
from ._utils import calc_acc
from .accuracy_2d import Accuracy2D
from .accuracy_2p5d import Accuracy2p5D
from .accuracy_3d import Accuracy3D
_parallel_accuracy = {
'2d': Accuracy2D,
'2.5d': Accuracy2p5D,
'3d': Accuracy3D,
}
class Accuracy(nn.Module):
    """Top-1 accuracy metric.

    Chooses the accuracy implementation matching the active
    tensor-parallel mode ('2d', '2.5d' or '3d' as registered in
    ``_parallel_accuracy``); every other mode uses the plain local
    ``calc_acc`` helper.
    """

    def __init__(self):
        super().__init__()
        mode = get_tensor_parallel_mode()
        # Fall back to the non-parallel counter when no specialized class exists.
        impl_cls = _parallel_accuracy.get(mode)
        self.acc = calc_acc if impl_cls is None else impl_cls()

    def forward(self, *args):
        # Forward everything to the chosen accuracy callable unchanged.
        return self.acc(*args)
import torch
def calc_acc(logits, targets):
    """Count the number of correct top-1 predictions.

    Args:
        logits (torch.Tensor): raw class scores with the class dimension last.
        targets (torch.Tensor): ground-truth class indices.

    Returns:
        torch.Tensor: 0-dim integer tensor holding the number of matches.
    """
    predictions = logits.argmax(dim=-1)
    return (predictions == targets).sum()
import torch
def calc_acc(logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """Return the number of samples whose argmax prediction equals the target.

    Counts matches locally only — no distributed reduction is performed here.
    """
    # Top-1 prediction: index of the highest score along the last (class) dim.
    preds = torch.argmax(logits, dim=-1)
    # 0-dim tensor counting exact matches.
    correct = torch.sum(targets == preds)
    return correct
import torch
from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
from torch import nn
from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
from ._utils import calc_acc
......
import torch
from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
from torch import nn
from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
from ._utils import calc_acc
......
import torch
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
from torch import nn
from ._utils import calc_acc
class Accuracy3D(nn.Module):
    """Accuracy for 3D parallelism.

    Splits the targets to match the sharded logits, counts correct
    predictions locally, then reduces the count across the 3D parallel
    groups.
    """

    def __init__(self):
        super().__init__()
        # Parallel modes are resolved from env-configured groups once at init.
        self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
        self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)

    def forward(self, logits, targets):
        """Calculate the accuracy of predicted labels.

        Args:
            logits (:class:`torch.tensor`): Predicted labels.
            targets (:class:`torch.tensor`): True labels from data.

        Returns:
            float: the accuracy of prediction.
        """
        with torch.no_grad():
            # Shard targets along dim 0 (batch): weight mode first, then
            # input mode — presumably aligning targets with this rank's
            # logits shard; order mirrors the 3D split convention.
            targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
            targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
            # Local correct count, then summed over both parallel groups.
            correct = calc_acc(logits, targets)
            correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode)
        return correct
import torch
from torch import nn
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
from ._utils import calc_acc
class Accuracy3D(nn.Module):
    """Top-1 accuracy metric for the 3D tensor-parallel layout."""

    def __init__(self):
        super().__init__()
        # Look up the parallel modes configured through environment groups.
        self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
        self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)

    def forward(self, logits, targets):
        """Calculate the accuracy of predicted labels.

        Args:
            logits (:class:`torch.tensor`): Predicted labels.
            targets (:class:`torch.tensor`): True labels from data.

        Returns:
            float: the accuracy of prediction.
        """
        with torch.no_grad():
            # Split targets on the batch axis: weight mode first, input mode second.
            for mode in (self.weight_parallel_mode, self.input_parallel_mode):
                targets = split_tensor_3d(targets, 0, mode)
            local_correct = calc_acc(logits, targets)
            # Aggregate the per-rank counts over both parallel groups.
            return reduce_by_batch_3d(local_correct, self.input_parallel_mode, self.weight_parallel_mode)
from .cache_embedding import (
CachedEmbeddingBag,
CachedParamMgr,
EvictionStrategy,
LimitBuffIndexCopyer,
ParallelCachedEmbeddingBag,
ParallelCachedEmbeddingBagTablewise,
ParallelCachedEmbeddingBagTablewiseSpiltCache,
TablewiseEmbeddingBagConfig,
)
from .colo_module import ColoModule
from .linear import ColoLinear
from .embedding import ColoEmbedding
from .module_utils import register_colo_module, is_colo_module, get_colo_module, init_colo_module, check_colo_module
from .cache_embedding import CachedEmbeddingBag, ParallelCachedEmbeddingBag, CachedParamMgr, LimitBuffIndexCopyer, EvictionStrategy, \
ParallelCachedEmbeddingBagTablewise, TablewiseEmbeddingBagConfig, ParallelCachedEmbeddingBagTablewiseSpiltCache
from .linear import ColoLinear
from .module_utils import check_colo_module, get_colo_module, init_colo_module, is_colo_module, register_colo_module
__all__ = [
'ColoModule', 'register_colo_module', 'is_colo_module', 'get_colo_module', 'init_colo_module', 'check_colo_module',
......
from .cache_mgr import CachedParamMgr, EvictionStrategy
from .copyer import LimitBuffIndexCopyer
from .cached_embedding import CachedEmbeddingBag
from .parallel_cached_embedding import ParallelCachedEmbeddingBag
from .copyer import LimitBuffIndexCopyer
from .embedding_config import TablewiseEmbeddingBagConfig
from .parallel_cached_embedding import ParallelCachedEmbeddingBag
from .parallel_cached_embedding_tablewise import ParallelCachedEmbeddingBagTablewise
from .parallel_cached_embedding_tablewise_split_cache import ParallelCachedEmbeddingBagTablewiseSpiltCache
......
import sys
from contextlib import contextmanager
from enum import Enum
from typing import List, Optional
import numpy as np
import torch
from torch.profiler import record_function
from typing import List, Optional
from contexttimer import Timer
from torch.profiler import record_function
from .copyer import LimitBuffIndexCopyer
from enum import Enum
import sys
from contextlib import contextmanager
class EvictionStrategy(Enum):
......@@ -35,7 +37,7 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None:
class CachedParamMgr(torch.nn.Module):
"""
Manage Embedding Weights on CPU and CUDA memory uses a software cache.
CPU maintains the entire original weight.
CPU maintains the entire original weight.
CUDA maintains a fraction of the weights used in the upcoming computation. The row number in CUDA is controlled by `cuda_row_num`.
During training, GPU needs to transmit embedding rows between CPU and GPU.
Args:
......@@ -115,7 +117,7 @@ class CachedParamMgr(torch.nn.Module):
self._elapsed_dict[name] += t.elapsed
def _find_evict_gpu_idxs(self, evict_num: int) -> torch.Tensor:
"""_find_evict_gpu_idxs
"""_find_evict_gpu_idxs
Find the gpu idxs to be evicted, according to their freq.
Args:
evict_num (int): how many rows has to be evicted
......@@ -202,7 +204,7 @@ class CachedParamMgr(torch.nn.Module):
"""reorder
reorder the weight according to ids' frequency in dataset before training.
Execute only once before training, also known as warmup phase.
Note:
If you would like to use the DATASET as the eviction strategy, you must call this function.
Note:
......@@ -516,7 +518,7 @@ class CachedParamMgr(torch.nn.Module):
"""
deprecated
evict one row from cuda to cpu.
Returns:
Returns:
(int) : the slot id be evicted.
"""
mask = torch.logical_or(torch.isin(self.cached_idx_map, self.evict_backlist), self.cached_idx_map == -1)
......
from typing import Iterator, List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
from typing import List, Optional, Iterator, Tuple, Union
from torch.nn.parameter import Parameter
from .base_embedding import BaseEmbeddingBag
from .cache_mgr import CachedParamMgr, EvictionStrategy
from torch.nn.parameter import Parameter
class CachedEmbeddingBag(BaseEmbeddingBag):
......@@ -27,7 +28,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False.
dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None.
warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7.
buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
......@@ -85,10 +86,10 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
buffer_size=50_000,
pin_weight=False):
"""
Called after initialized.
Called after initialized.
Reorder the weight rows according to the ids_freq_mapping.
Then, let the weights of the Module be managed by a CachedParamMgr.
Args:
cuda_row_num (int): number of rows can be hosted in CUDA memory
ids_freq_mapping (List[int]): a list, idx is id number, value is freq
......
......@@ -3,7 +3,7 @@ from torch import LongTensor
class LimitBuffIndexCopyer(object):
"""LimitBuffIndexCopyer
"""LimitBuffIndexCopyer
Index Copy using limited temp buffer on CUDA.
Args:
......@@ -15,7 +15,7 @@ class LimitBuffIndexCopyer(object):
@torch.no_grad()
def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
"""copy
"""copy
src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered.
......
from typing import Iterator, List, Optional, Tuple
import torch
import torch.nn.functional as F
from typing import List, Optional, Iterator, Tuple
from .cached_embedding import CachedEmbeddingBag
from colossalai.nn._ops._utils import dual_all_to_all
from colossalai.legacy.nn._ops._utils import dual_all_to_all
from colossalai.tensor import ColoParameter, ColoTensor, ColoTensorSpec, ComputePattern, ProcessGroup, ShardSpec
from colossalai.tensor import ColoParameter, ShardSpec, ComputePattern, ProcessGroup, ColoTensorSpec, ColoTensor
from .cache_mgr import CachedParamMgr, EvictionStrategy
from .cached_embedding import CachedEmbeddingBag
def get_partition(embedding_dim, rank, world_size) -> Tuple[int, int, bool]:
......
import time
from typing import List
import torch
import torch.distributed as dist
import torch.nn.functional as F
from .cached_embedding import CachedEmbeddingBag
from .cache_mgr import EvictionStrategy
from .embedding_config import TablewiseEmbeddingBagConfig
from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
from colossalai.tensor import ProcessGroup
from colossalai.nn._ops._utils import dual_all_to_all_tablewise
from typing import List
import time
from .cache_mgr import EvictionStrategy
from .cached_embedding import CachedEmbeddingBag
from .embedding_config import TablewiseEmbeddingBagConfig
class ParallelCachedEmbeddingBagTablewise(CachedEmbeddingBag):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment