Unverified Commit 35813ed3 authored by Frank Lee's avatar Frank Lee Committed by GitHub
Browse files

update examples and sphnix docs for the new api (#63)

parent 7d371105
......@@ -6,5 +6,11 @@ __all__ = ['get_dist_logger', 'DistributedLogger']
def get_dist_logger(name='root'):
"""Get logger instance based on name. The DistributedLogger will create singleton instances,
which means that only one logger instance is created per name.
:param name: name of the logger, name must be unique
:type name: str
:return: a distributed logger instance
:rtype: :class:`colossalai.logging.DistributedLogger`
"""
return DistributedLogger.get_instance(name=name)
......@@ -47,9 +47,24 @@ class ViTBlock(nn.Module):
@LAYERS.register_module
class VanillaViTPatchEmbedding(nn.Module):
""" 2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: size of a patch
:type patch_size: int
:param in_chans: input channels
:type in_chans: int
:param embed_dim: embedding dimension
:type embed_dim: int
:param norm_layer: layer norm class, defaults to None
:type norm_layer: Callable
:param flatten: whether to flatten the output
:type flatten: bool
:param drop: dropout rate
:type drop: float
"""
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True, drop=0.):
def __init__(self, img_size, patch_size, in_chans, embed_dim, norm_layer=None, flatten=True, drop=0.):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
......@@ -84,12 +99,22 @@ class VanillaViTPatchEmbedding(nn.Module):
@LAYERS.register_module
class VanillaViTMLP(nn.Module):
""" MLP as used in Vision Transformer, MLP-Mixer and related networks
:param in_features: input channels
:type in_features: int
:param hidden_features: channels of the output of the first dense layer
:type hidden_features: int
:param out_features: channels of the output of the second dense layer
:type out_features: int
:param act_layer: activation function
:type act_layer: Callable
:param drop: dropout rate
:type drop: float
"""
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
def __init__(self, in_features, hidden_features, out_features, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
......@@ -113,6 +138,11 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument.
:param drop_prob: probability for dropout
:type drop_prob: float
:param training: whether it is training mode
:type training: bool
"""
if drop_prob == 0. or not training:
return x
......@@ -129,6 +159,9 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):
@LAYERS.register_module
class VanillaViTDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
:param drop_prob: probability for dropout
:type drop_prob: float
"""
def __init__(self, drop_prob=0.):
......@@ -145,7 +178,7 @@ class VanillaViTAttention(nn.Module):
:param dim: dimension of input tensor
:type dim: int
:param num_heads: number of attention heads, defaults to 8
:param num_heads: number of attention heads
:type num_heads: int, optional
:param qkv_bias: enable bias for qkv if True, defaults to False
:type qkv_bias: bool, optional
......@@ -155,7 +188,7 @@ class VanillaViTAttention(nn.Module):
:type proj_drop: float, optional
"""
def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
def __init__(self, dim, num_heads, qkv_bias=False, attn_drop=0., proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
......
......@@ -109,15 +109,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
added functionality to handle model parallel parameters. Note that
the gradients are modified in place.
Arguments:
parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
single Tensor that will have gradients normalized
max_norm (float or int): max norm of the gradients
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
infinity norm.
Returns:
Total norm of the parameters (viewed as a single vector).
:param parameters: an iterable of Tensors or a single Tensor that will have gradients normalized
:type parameters: (Iterable[Tensor] or Tensor)
:param max_norm: max norm of the gradients
:type max_norm: float or int
:param norm_type: type of the used p-norm. Can be ``'inf'`` for infinity norm.
:type norm_type: float or int
:return: Total norm of the parameters (viewed as a single vector).
:rtype: float
"""
if isinstance(parameters, torch.Tensor):
......
......@@ -123,12 +123,23 @@ def get_dataloader(dataset,
stage and label on the last stage
:param dataset: a :class:`torch.utils.data.Dataset` object
:param shuffle: whether to shuffle the dataset
:param seed: random worker seed, defaults to 1024
:type seed: int, optional
:param add_sampler_if_possible: [description], defaults to False
:type add_sampler_if_possible: bool, optional
:return: a :class:utils.data.dataset dataloader
:rtype: torch.utils.data.dataset
:param add_sampler: add DistributedDataParallelSampler to the dataset
:param drop_last: drop the last incomplete batch of data
:param pin_memory: whether to pin memory address in CPU memory
:param num_workers: number of worker threads for this dataloader
:type dataset: :class:`torch.utils.data.Dataset`
:type shuffle: bool, optional. Default is False
:type seed: int, optional. Default is 1024
:type add_sampler: bool, optional. Default is True
:type drop_last: bool, optional. Default is False
:type pin_memory: bool, optional. Default is False
:type num_workers: int, optional. Default is 0
:return: an object of :class:`torch.utils.data.DataLoader`
:rtype: :class:`torch.utils.data.DataLoader`
'''
_kwargs = kwargs.copy()
......
......@@ -13,6 +13,20 @@ def accumulate_gradient(model: nn.Module,
accumulate_size: int,
gradient_handlers: List[BaseGradientHandler] = None,
lr_scheduler: _LRScheduler = None):
"""
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param dataloader: your dataloader object
:type dataloader: Iterable
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
:param gradient_handlers: list of gradient handler objects. Default is None
:type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
:param lr_scheduler: your lr scheduler object. Default is None
:type lr_scheduler: `torch.optim.lr_scheduler._LRScheduler`
"""
optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)
......
......@@ -14,6 +14,17 @@ from colossalai.engine import BaseGradientHandler
class GradAccumOptimizer(ColossalaiOptimizer):
"""A wrapper for the optimizer to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param optim: your optimizer object
:type optim: :class:`torch.optim.Optimizer`
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
:param model: your model object to check if it is DDP for special handling of no_sync() context
:type model: :class:`torch.nn.Module`
"""
def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
super().__init__(optim)
......@@ -64,6 +75,19 @@ class GradAccumOptimizer(ColossalaiOptimizer):
class GradAccumDataloader():
"""A wrapper for dataloader to enable gradient accumulation by dropping the last incomplete steps.
For example, if a dataloader has 10 batches of data and accumulate size is 4. The model parameters will
be updated only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
(e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.
:param dataloader: your dataloader object
:type dataloader: Iterable
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
"""
def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
self.dataloader = dataloader
......@@ -99,6 +123,15 @@ class GradAccumDataloader():
class GradAccumLrSchedulerByStep(_LRScheduler):
"""A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param lr_scheduler: your lr scheduler object
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
"""
def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
self.lr_scheduler = lr_scheduler
......@@ -137,6 +170,15 @@ class GradAccumLrSchedulerByStep(_LRScheduler):
class GradAccumGradientHandler():
"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param grad_handler: your gradient handler object
:type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
"""
def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None:
assert isinstance(grad_handler, BaseGradientHandler), \
......
......@@ -34,6 +34,10 @@ def report_memory_usage(message, logger=None, report_cpu=False):
:param message: a prefix message to add in the log
:type message: str
:param logger: an instance of :class:`colossalai.logging.DistributedLogger`
:type logger: :class:`colossalai.logging.DistributedLogger`
:param report_cpu: whether to report CPU memory
:type report_cpu: bool
:raises EnvironmentError: raise error if no distributed environment has been initialized
'''
if not gpc.is_initialized(ParallelMode.GLOBAL):
......
......@@ -2,6 +2,13 @@
class MultiTensorApply(object):
"""
Apply an operation to a list of tensors efficiently
:param chunk_size: size of a chunk
:type chunk_size: int
"""
available = False
warned = False
......
......@@ -74,6 +74,9 @@ class Timer:
class MultiTimer:
'''An object that contains multiple timers
:param on: whether the timer is enabled. Default is True
:type on: bool
'''
def __init__(self, on: bool = True):
......
......@@ -14,6 +14,21 @@ def convert_to_zero(model: nn.Module,
optimizer: Optimizer,
level: int,
zero_config):
"""
A helper function to integrate the model and optimizer with ZeRO optimizer and off-loading
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param level: optimizer level, can be 2 or 3
:type level: int
:param zero_config: configuration for zero
:type zero_config: dict
:return: (model, optimizer)
:rtype: Tuple
"""
assert level == 2 or level == 3, 'Only ZERO Optimizer Level 2 and 3 are provided'
if level == 2:
if is_no_pp_or_last_stage():
......
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
# Image size, batch size and epoch count used by this example config.
IMG_SIZE, BATCH_SIZE, NUM_EPOCHS = 224, 256, 100

# Model settings; every key below is passed through unchanged as a plain dict entry.
model = {
    'type': 'VanillaResNet',
    'block_type': 'ResNetBottleneck',
    'layers': [3, 4, 6, 3],
    'num_cls': 10,
}
# Training data: CIFAR10Dataset rooted at the DATA environment variable
# (a KeyError is raised at import time when DATA is not set).
train_data = {
    'dataset': {
        'type': 'CIFAR10Dataset',
        'root': os.environ['DATA'],
        'transform_pipeline': [
            {'type': 'Resize', 'size': IMG_SIZE},
            {'type': 'RandomCrop', 'size': IMG_SIZE, 'padding': 4},
            {'type': 'RandomHorizontalFlip'},
            {'type': 'ToTensor'},
            {
                'type': 'Normalize',
                'mean': [0.4914, 0.4822, 0.4465],
                'std': [0.2023, 0.1994, 0.2010],
            },
        ],
    },
    'dataloader': {
        'batch_size': BATCH_SIZE,
        'pin_memory': True,
        'shuffle': True,
    },
}

# Test data: same dataset type with train=False, no random augmentation and no shuffling.
test_data = {
    'dataset': {
        'type': 'CIFAR10Dataset',
        'root': os.environ['DATA'],
        'train': False,
        'transform_pipeline': [
            {'type': 'Resize', 'size': IMG_SIZE},
            {'type': 'ToTensor'},
            {
                'type': 'Normalize',
                'mean': [0.4914, 0.4822, 0.4465],
                'std': [0.2023, 0.1994, 0.2010],
            },
        ],
    },
    'dataloader': {
        'batch_size': BATCH_SIZE,
        'pin_memory': True,
    },
}
# Parallel layout: pipeline value 1 and tensor parallel size 1 with no mode selected.
parallelization = {
    'pipeline': 1,
    'tensor': {'size': 1, 'mode': None},
}

# Optimizer and loss are selected by their `type` name.
optimizer = {'type': 'Adam', 'lr': 0.01}
loss = {'type': 'CrossEntropyLoss'}
from colossalai.engine import AMP_TYPE
# Mixed-precision settings: AMP_TYPE.APEX mode with opt_level 'O2'.
fp16 = {
    'mode': AMP_TYPE.APEX,
    'opt_level': 'O2',
}
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# NOTE(review): NUM_EPOCH is bound to the built-in type `int`, not to a number;
# presumably this file is a template listing the recognised config fields — confirm.
NUM_EPOCH = int

# Every dict-valued section starts out empty.
model = {}
train_data = {}
test_data = {}
optimizer = {}
loss = {}
fp16 = {}
zero = {}
gradient_handler = list()
parallel = {}
hooks = list()

# cuDNN switches.
cudnn_benchmark = True
cudnn_deterministic = False

logging = {}
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path
# Data-related sizes.
BATCH_SIZE = 512
IMG_SIZE = 32
PATCH_SIZE = 4

# Model-related sizes.
DIM = 512
NUM_ATTENTION_HEADS = 2
SUMMA_DIM = 2
NUM_CLASSES = 10
DEPTH = 6

# Training length.
NUM_EPOCHS = 60

# Training data: CIFAR10Dataset rooted at the DATA environment variable
# (a KeyError is raised at import time when DATA is not set).
train_data = {
    'dataset': {
        'type': 'CIFAR10Dataset',
        'root': Path(os.environ['DATA']),
        'transform_pipeline': [
            {'type': 'Resize', 'size': IMG_SIZE},
            {'type': 'RandomCrop', 'size': IMG_SIZE, 'padding': 4},
            {'type': 'RandomHorizontalFlip'},
            {'type': 'ToTensor'},
            {
                'type': 'Normalize',
                'mean': [0.4914, 0.4822, 0.4465],
                'std': [0.2023, 0.1994, 0.2010],
            },
        ],
    },
    'dataloader': {
        'batch_size': BATCH_SIZE,
        'drop_last': True,
        'pin_memory': True,
        'shuffle': True,
    },
}

# Test data: same dataset type with train=False and no random augmentation.
test_data = {
    'dataset': {
        'type': 'CIFAR10Dataset',
        'root': Path(os.environ['DATA']),
        'train': False,
        'transform_pipeline': [
            {'type': 'Resize', 'size': IMG_SIZE},
            {'type': 'ToTensor'},
            {
                'type': 'Normalize',
                'mean': [0.4914, 0.4822, 0.4465],
                'std': [0.2023, 0.1994, 0.2010],
            },
        ],
    },
    'dataloader': {
        'batch_size': BATCH_SIZE,
        'pin_memory': True,
    },
}
# Optimizer and loss are selected by their `type` name; weight decay is 0.
optimizer = {
    'type': 'Adam',
    'lr': 0.001,
    'weight_decay': 0,
}
loss = {'type': 'CrossEntropyLoss2D'}
# Vision Transformer assembled from per-component sub-configs; each sub-config
# names its component through `type` and carries that component's arguments.
model = {
    'type': 'VisionTransformerFromConfig',
    'tensor_splitting_cfg': {
        'type': 'ViTInputSplitter2D',
    },
    'embedding_cfg': {
        'type': 'ViTPatchEmbedding2D',
        'img_size': IMG_SIZE,
        'patch_size': PATCH_SIZE,
        'embed_dim': DIM,
    },
    'token_fusion_cfg': {
        'type': 'ViTTokenFuser2D',
        'img_size': IMG_SIZE,
        'patch_size': PATCH_SIZE,
        'embed_dim': DIM,
        'drop_rate': 0.1,
    },
    'norm_cfg': {
        'type': 'LayerNorm2D',
        'normalized_shape': DIM,
        'eps': 1e-6,
    },
    'block_cfg': {
        'type': 'ViTBlock',
        'attention_cfg': {
            'type': 'ViTSelfAttention2D',
            'hidden_size': DIM,
            'num_attention_heads': NUM_ATTENTION_HEADS,
            'attention_dropout_prob': 0.,
            'hidden_dropout_prob': 0.1,
            'checkpoint': True,
        },
        'droppath_cfg': {
            'type': 'VanillaViTDropPath',
        },
        'mlp_cfg': {
            'type': 'ViTMLP2D',
            'in_features': DIM,
            'dropout_prob': 0.1,
            'mlp_ratio': 4,
            'checkpoint': True,
        },
        'norm_cfg': {
            'type': 'LayerNorm2D',
            'normalized_shape': DIM,
            'eps': 1e-6,
        },
    },
    'head_cfg': {
        'type': 'ViTHead2D',
        'hidden_size': DIM,
        'num_classes': NUM_CLASSES,
    },
    'embed_dim': DIM,
    'depth': DEPTH,
    'drop_path_rate': 0.,
}
# Hooks, listed in the order given here; the trailing commented entries are
# optional hooks left disabled.
hooks = [
    {'type': 'LogMetricByEpochHook'},
    {'type': 'Accuracy2DHook'},
    {'type': 'LossHook'},
    {
        'type': 'LRSchedulerHook',
        'by_epoch': True,
        'lr_scheduler_cfg': {
            'type': 'LinearWarmupLR',
            'warmup_steps': 5,
        },
    },
    # dict(type='TensorboardHook', log_dir='./tb_logs'),
    # dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
    # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]

# Parallel layout: pipeline size 1, tensor parallel size 4 in '2d' mode.
parallel = {
    'pipeline': {'size': 1},
    'tensor': {'size': 4, 'mode': '2d'},
}

# for fp16 training
# from colossalai.engine import AMP_TYPE
# fp16 = dict(
# mode=AMP_TYPE.PARALLEL,
# initial_scale=2 ** 8
# )

# only needed when pipeline parallel is used
# schedule = dict(
# num_microbatches=8
# )

logging = {'root_path': './logs'}
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path
from colossalai.context import ParallelMode
from colossalai.engine import AMP_TYPE
# model_zoo is looked up on PYTHONPATH; when it cannot be imported the config
# still loads and only prints a hint.
try:
    import model_zoo
except ImportError:
    # Only a failed import is reported here. Any other exception raised while
    # model_zoo is being imported (and Ctrl-C / SystemExit) propagates instead
    # of being hidden behind a misleading PYTHONPATH message.
    print('You need to set model_zoo to your PYTHONPATH to use the models in the collection')
# Batch size, image size and epoch count used by this example config.
BATCH_SIZE, IMG_SIZE, NUM_EPOCHS = 512, 32, 60

# Training data: CIFAR10Dataset rooted at the DATA environment variable
# (a KeyError is raised at import time when DATA is not set).
train_data = {
    'dataset': {
        'type': 'CIFAR10Dataset',
        'root': Path(os.environ['DATA']),
        'transform_pipeline': [
            {'type': 'RandomCrop', 'size': IMG_SIZE, 'padding': 4},
            {'type': 'RandomHorizontalFlip'},
            {'type': 'ToTensor'},
            {
                'type': 'Normalize',
                'mean': [0.4914, 0.4822, 0.4465],
                'std': [0.2023, 0.1994, 0.2010],
            },
        ],
    },
    'dataloader': {
        'batch_size': BATCH_SIZE,
        'pin_memory': True,
        'num_workers': 2,
        'shuffle': True,
    },
}

# Test data: same dataset type with train=False and no random augmentation.
test_data = {
    'dataset': {
        'type': 'CIFAR10Dataset',
        'root': Path(os.environ['DATA']),
        'train': False,
        'transform_pipeline': [
            {'type': 'ToTensor'},
            {
                'type': 'Normalize',
                'mean': [0.4914, 0.4822, 0.4465],
                'std': [0.2023, 0.1994, 0.2010],
            },
        ],
    },
    'dataloader': {
        'batch_size': BATCH_SIZE,
        'pin_memory': True,
        'num_workers': 2,
    },
}
optimizer = {'type': 'Adam', 'lr': 0.001}

# The 3D loss is given the parallel modes used for its input and its weight.
loss = {
    'type': 'CrossEntropyLoss3D',
    'input_parallel_mode': ParallelMode.PARALLEL_3D_OUTPUT,
    'weight_parallel_mode': ParallelMode.PARALLEL_3D_WEIGHT,
}

# Model selected by name; presumably provided by the model_zoo package — confirm.
model = {
    'type': 'vit_tiny_3d_patch4_32',
    'drop_rate': 0.1,
}
# Hooks, listed in the order given here; the trailing commented entries are
# optional checkpoint hooks left disabled.
hooks = [
    {'type': 'LogMetricByEpochHook'},
    {'type': 'LogTimingByEpochHook'},
    {'type': 'LogMemoryByEpochHook'},
    {
        'type': 'Accuracy3DHook',
        'input_parallel_mode': ParallelMode.PARALLEL_3D_OUTPUT,
        'weight_parallel_mode': ParallelMode.PARALLEL_3D_WEIGHT,
    },
    {'type': 'LossHook'},
    {'type': 'TensorboardHook', 'log_dir': './tfb_logs'},
    {
        'type': 'LRSchedulerHook',
        'by_epoch': True,
        'lr_scheduler_cfg': {
            'type': 'LinearWarmupLR',
            'warmup_steps': 5,
        },
    },
    # dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
    # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]
# Parallel layout: pipeline size 1, tensor parallel size 8 in '3d' mode.
parallel = {
    'pipeline': {'size': 1},
    'tensor': {'size': 8, 'mode': '3d'},
}

# Mixed-precision settings: AMP_TYPE.PARALLEL mode with an initial scale of 2 ** 8.
fp16 = {
    'mode': AMP_TYPE.PARALLEL,
    'initial_scale': 2 ** 8,
}

logging = {'root_path': './logs'}
......@@ -77,10 +77,10 @@ fp16 = dict(
)
```
## Tensor Parallel AMP
## Naive AMP
We leveraged the Megatron-LM implementation to achieve mixed precision training while maintaining compatibility with complex tensor
and pipeline parallelism.
and pipeline parallelism. This AMP mode will cast all operations into fp16.
The following code block shows a config file for this mode.
......
colossalai.amp.apex\_amp
==========================
.. automodule:: colossalai.amp.apex_amp
:members:
colossalai.amp.naive\_amp
==========================
.. automodule:: colossalai.amp.naive_amp
:members:
colossalai.amp
==================
.. toctree::
:maxdepth: 2
colossalai.amp.torch_amp
colossalai.amp.apex_amp
colossalai.amp.naive_amp
.. automodule:: colossalai.amp
:members:
colossalai.amp.torch\_amp
==========================
.. automodule:: colossalai.amp.torch_amp
:members:
colossalai.builder
==================
.. automodule:: colossalai.builder
:members:
.. toctree::
:maxdepth: 2
colossalai.builder.builder
colossalai.builder.pipeline
.. automodule:: colossalai.builder
:members:
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment