Unverified Commit 35813ed3 authored by Frank Lee, committed by GitHub

update examples and sphinx docs for the new API (#63)

parent 7d371105
...@@ -6,5 +6,11 @@ __all__ = ['get_dist_logger', 'DistributedLogger']
def get_dist_logger(name='root'):
"""Get logger instance based on name. The DistributedLogger will create singleton instances,
which means that only one logger instance is created per name.
:param name: name of the logger, name must be unique
:type name: str
:return: a distributed logger instance
:rtype: :class:`colossalai.logging.DistributedLogger`
""" """
return DistributedLogger.get_instance(name=name) return DistributedLogger.get_instance(name=name)
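A minimal usage sketch of the singleton behaviour documented above; the import path follows the `:rtype:` reference in the docstring, and the `.info()` call is an assumed standard logging method:

```python
# Hedged sketch: the same name always returns the same DistributedLogger instance.
from colossalai.logging import get_dist_logger

logger = get_dist_logger(name='root')        # creates the singleton for 'root'
same_logger = get_dist_logger(name='root')   # returns the same instance again
assert logger is same_logger
logger.info('training started')              # assumed standard logging method
```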
...@@ -47,9 +47,24 @@ class ViTBlock(nn.Module):
@LAYERS.register_module
class VanillaViTPatchEmbedding(nn.Module):
""" 2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: size of a patch
:type patch_size: int
:param in_chans: input channels
:type in_chans: int
:param embed_dim: embedding dimension
:type embed_dim: int
:param norm_layer: layer norm class, defaults to None
:type norm_layer: Callable
:param flatten: whether to flatten the output
:type flatten: bool
:param drop: dropout rate
:type drop: float
""" """
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True, drop=0.): def __init__(self, img_size, patch_size, in_chans, embed_dim, norm_layer=None, flatten=True, drop=0.):
super().__init__() super().__init__()
img_size = to_2tuple(img_size) img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size) patch_size = to_2tuple(patch_size)
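An illustrative call to the patch embedding layer with the new explicit arguments; the values below are common ViT settings rather than values from this commit, and the exact token count depends on details (such as a class token) not shown in this hunk:

```python
import torch

# Hypothetical usage sketch of VanillaViTPatchEmbedding as documented above.
patch_embed = VanillaViTPatchEmbedding(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
images = torch.randn(2, 3, 224, 224)   # (batch, channels, height, width)
tokens = patch_embed(images)           # (batch, num_tokens, embed_dim) when flatten=True
```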
...@@ -84,12 +99,22 @@ class VanillaViTPatchEmbedding(nn.Module):
@LAYERS.register_module
class VanillaViTMLP(nn.Module):
""" MLP as used in Vision Transformer, MLP-Mixer and related networks
:param in_features: input channels
:type in_features: int
:param hidden_features: channels of the output of the first dense layer
:type hidden_features: int
:param out_features: channels of the output of the second dense layer
:type out_features: int
:param act_layer: activation function
:type act_layer: Callable
:param drop: dropout rate
:type drop: float
""" """
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): def __init__(self, in_features, hidden_features, out_features, act_layer=nn.GELU, drop=0.):
super().__init__() super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
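A short sketch of calling the MLP block with the new API, where all three feature sizes are passed explicitly (the values below are illustrative only):

```python
import torch
from torch import nn

# Hypothetical usage of VanillaViTMLP: fc1 expands to hidden_features, fc2 projects
# back to out_features, with the activation and dropout applied in between.
mlp = VanillaViTMLP(in_features=768, hidden_features=3072, out_features=768,
                    act_layer=nn.GELU, drop=0.1)
out = mlp(torch.randn(2, 196, 768))    # output keeps shape (2, 196, 768)
```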
...@@ -113,6 +138,11 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument.
:param drop_prob: probability for dropout
:type drop_prob: float
:param training: whether the model is in training mode
:type training: bool
""" """
if drop_prob == 0. or not training: if drop_prob == 0. or not training:
return x return x
...@@ -129,6 +159,9 @@ def drop_path(x, drop_prob: float = 0., training: bool = False): ...@@ -129,6 +159,9 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):
@LAYERS.register_module @LAYERS.register_module
class VanillaViTDropPath(nn.Module): class VanillaViTDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
:param drop_prob: probability for dropout
:type drop_prob: float
""" """
def __init__(self, drop_prob=0.): def __init__(self, drop_prob=0.):
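A hedged sketch of the stochastic-depth behaviour described above: during training each sample's residual branch is zeroed with probability drop_prob, and surviving samples are rescaled so the expected value stays unchanged.

```python
import torch

# Illustration only: with drop_prob=0.2, roughly 20% of the samples in a batch are
# zeroed and the survivors are scaled by 1 / (1 - 0.2) = 1.25.
drop_path_layer = VanillaViTDropPath(drop_prob=0.2)
drop_path_layer.train()                         # a no-op in eval mode or when drop_prob == 0
out = drop_path_layer(torch.ones(8, 196, 768))
```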
...@@ -145,7 +178,7 @@ class VanillaViTAttention(nn.Module):
:param dim: dimension of input tensor
:type dim: int
:param num_heads: number of attention heads
:type num_heads: int
:param qkv_bias: enable bias for qkv if True, defaults to False
:type qkv_bias: bool, optional
...@@ -155,7 +188,7 @@ class VanillaViTAttention(nn.Module):
:type proj_drop: float, optional
"""
def __init__(self, dim, num_heads, qkv_bias=False, attn_drop=0., proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
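An illustrative call to the attention module documented above; dim must be divisible by num_heads since head_dim = dim // num_heads (the values below are arbitrary):

```python
import torch

# Hypothetical usage of VanillaViTAttention; qkv_bias, attn_drop and proj_drop keep
# the defaults from the signature above.
attn = VanillaViTAttention(dim=768, num_heads=8)
out = attn(torch.randn(2, 197, 768))   # output has the same (batch, tokens, dim) shape
```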
...
...@@ -109,15 +109,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
added functionality to handle model parallel parameters. Note that
the gradients are modified in place.
:param parameters: an iterable of Tensors or a single Tensor that will have gradients normalized
:type parameters: (Iterable[Tensor] or Tensor)
:param max_norm: max norm of the gradients
:type max_norm: float or int
:param norm_type: type of the used p-norm. Can be ``'inf'`` for infinity norm.
:type norm_type: float or int
:return: Total norm of the parameters (viewed as a single vector).
:rtype: float
"""
if isinstance(parameters, torch.Tensor):
...
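A hedged usage sketch of the gradient clipping utility; the import path is an assumption based on where the docstring lives, and in practice the call expects the Colossal-AI parallel context to be initialized (e.g. via colossalai.launch):

```python
import torch
from torch import nn
# Assumed import path; not verified against this commit.
from colossalai.utils import clip_grad_norm_fp32

model = nn.Linear(4, 2)
loss = model(torch.randn(8, 4)).sum()
loss.backward()                         # populate .grad before clipping
total_norm = clip_grad_norm_fp32(model.parameters(), max_norm=1.0, norm_type=2)
# total_norm is the overall gradient norm before clipping, per the docstring above
```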
...@@ -123,12 +123,23 @@ def get_dataloader(dataset,
stage and label on the last stage
:param dataset: a :class:`torch.utils.data.Dataset` object
:param shuffle: whether to shuffle the dataset
:param seed: random worker seed, defaults to 1024
:param add_sampler: add DistributedDataParallelSampler to the dataset
:param drop_last: drop the last incomplete batch of data
:param pin_memory: whether to pin memory address in CPU memory
:param num_workers: number of worker threads for this dataloader
:type dataset: :class:`torch.utils.data.Dataset`
:type shuffle: bool, optional. Default is False
:type seed: int, optional. Default is 1024
:type add_sampler: bool, optional. Default is True
:type drop_last: bool, optional. Default is False
:type pin_memory: bool, optional. Default is False
:type num_workers: int, optional. Default is 0
:return: an object of :class:`torch.utils.data.DataLoader`
:rtype: :class:`torch.utils.data.DataLoader`
'''
_kwargs = kwargs.copy()
...
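A hedged usage sketch of get_dataloader; the import path is an assumption, extra keyword arguments such as batch_size are forwarded to torch.utils.data.DataLoader, and add_sampler=True requires an initialized distributed context:

```python
from torchvision import datasets, transforms
# Assumed import path; not verified against this commit.
from colossalai.utils import get_dataloader

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True,
                                 transform=transforms.ToTensor())
train_dataloader = get_dataloader(train_dataset,
                                  shuffle=True,
                                  drop_last=True,
                                  pin_memory=True,
                                  num_workers=2,
                                  batch_size=256)   # forwarded to torch.utils.data.DataLoader
```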
...@@ -13,6 +13,20 @@ def accumulate_gradient(model: nn.Module,
accumulate_size: int,
gradient_handlers: List[BaseGradientHandler] = None,
lr_scheduler: _LRScheduler = None):
"""
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param dataloader: your dataloader object
:type dataloader: Iterable
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
:param gradient_handlers: list of gradient handler objects. Default is None
:type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
:param lr_scheduler: your lr scheduler object. Default is None
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
"""
optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)
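A hedged sketch of enabling gradient accumulation with this helper; the return order shown (optimizer, dataloader, gradient handlers, lr scheduler) is an assumption inferred from the wrappers created in this file, not a verified signature:

```python
import torch
from torch import nn
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader, TensorDataset

# Toy objects so the sketch is self-contained; real training would use your own model/data.
model = nn.Linear(32, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
lr_scheduler = CosineAnnealingLR(optimizer, T_max=100)
train_dataloader = DataLoader(TensorDataset(torch.randn(64, 32), torch.randint(0, 10, (64,))),
                              batch_size=8)

# With accumulate_size=4, optimizer.step() and lr_scheduler.step() only take effect
# every 4th call; earlier calls just accumulate gradients.
optimizer, train_dataloader, gradient_handlers, lr_scheduler = accumulate_gradient(
    model=model,
    optimizer=optimizer,
    dataloader=train_dataloader,
    accumulate_size=4,
    lr_scheduler=lr_scheduler)
```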
...
...@@ -14,6 +14,17 @@ from colossalai.engine import BaseGradientHandler
class GradAccumOptimizer(ColossalaiOptimizer):
"""A wrapper for the optimizer to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param optim: your optimizer object
:type optim: :class:`torch.optim.Optimizer`
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
:param model: your model object to check if it is DDP for special handling of no_sync() context
:type model: :class:`torch.nn.Module`
"""
def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
super().__init__(optim)
...@@ -64,6 +75,19 @@ class GradAccumOptimizer(ColossalaiOptimizer):
class GradAccumDataloader():
"""A wrapper for the dataloader to enable gradient accumulation by dropping the last incomplete steps.
For example, if a dataloader has 10 batches of data and the accumulate size is 4, the model parameters will
be updated only twice, at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle
and are therefore automatically skipped by this class. If the dataloader is not a standard PyTorch dataloader
(e.g. a DALI dataloader), this class will automatically consume (load and discard) the remaining 2 batches.
:param dataloader: your dataloader object
:type dataloader: Iterable
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
"""
def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
self.dataloader = dataloader
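A small illustration of the skipping behaviour described in the docstring above: with 10 batches and accumulate_size=4, only the first 8 batches are yielded (a minimal sketch using a plain list as a stand-in iterable):

```python
# Hedged sketch: the trailing 10 % 4 = 2 batches are dropped (or consumed without
# being yielded, for non-PyTorch dataloaders such as DALI).
batches = [list(range(i * 4, i * 4 + 4)) for i in range(10)]   # stand-in for 10 batches
wrapped = GradAccumDataloader(batches, accumulate_size=4)
for batch in wrapped:
    pass                                                       # iterates over 8 batches only
```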
...@@ -99,6 +123,15 @@ class GradAccumDataloader():
class GradAccumLrSchedulerByStep(_LRScheduler):
"""A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param lr_scheduler: your lr scheduler object
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
"""
def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
self.lr_scheduler = lr_scheduler
...@@ -137,6 +170,15 @@ class GradAccumLrSchedulerByStep(_LRScheduler):
class GradAccumGradientHandler():
"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param grad_handler: your gradient handler object
:type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
"""
def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None:
assert isinstance(grad_handler, BaseGradientHandler), \
...
...@@ -34,6 +34,10 @@ def report_memory_usage(message, logger=None, report_cpu=False):
:param message: a prefix message to add in the log
:type message: str
:param logger: an instance of :class:`colossalai.logging.DistributedLogger`
:type logger: :class:`colossalai.logging.DistributedLogger`
:param report_cpu: whether to report CPU memory
:type report_cpu: bool
:raises EnvironmentError: raise error if no distributed environment has been initialized
'''
if not gpc.is_initialized(ParallelMode.GLOBAL):
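A one-line usage sketch of the utility documented above; the import path is an assumption, and the call raises EnvironmentError if no distributed environment has been initialized:

```python
# Assumed import path; requires the distributed context (e.g. colossalai.launch) first.
from colossalai.utils import report_memory_usage

report_memory_usage('after forward pass', report_cpu=True)
```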
...
...@@ -2,6 +2,13 @@
class MultiTensorApply(object):
"""
Apply an operation to a list of tensors efficiently
:param chunk_size: size of a chunk
:type chunk_size: int
"""
available = False
warned = False
...
...@@ -74,6 +74,9 @@ class Timer:
class MultiTimer:
'''An object that contains multiple timers
:param on: whether the timer is enabled. Default is True
:type on: bool
'''
def __init__(self, on: bool = True):
...
...@@ -14,6 +14,21 @@ def convert_to_zero(model: nn.Module,
optimizer: Optimizer,
level: int,
zero_config):
"""
A helper function to integrate the model and optimizer with ZeRO optimizer and off-loading
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param level: optimizer level, can be 2 or 3
:type level: int
:param zero_config: configuration for zero
:type zero_config: dict
:return: (model, optimizer)
:rtype: Tuple
"""
assert level == 2 or level == 3, 'Only ZERO Optimizer Level 2 and 3 are provided'
if level == 2:
if is_no_pp_or_last_stage():
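A hedged usage sketch of convert_to_zero; the import path and the empty zero_config are illustrative assumptions, and the call expects an initialized Colossal-AI distributed context with CUDA available:

```python
import torch
from torch import nn
# Assumed import path; not verified against this commit.
from colossalai.zero import convert_to_zero

model = nn.Linear(16, 16).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# level must be 2 or 3 per the assertion above; zero_config keys are backend-specific.
model, optimizer = convert_to_zero(model=model, optimizer=optimizer,
                                   level=2, zero_config=dict())
```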
...
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
IMG_SIZE = 224
BATCH_SIZE = 256
NUM_EPOCHS = 100
model = dict(
type='VanillaResNet',
block_type='ResNetBottleneck',
layers=[3, 4, 6, 3],
num_cls=10
)
train_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=os.environ['DATA'],
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
shuffle=True,
)
)
test_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=os.environ['DATA'],
train=False,
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]
),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
)
)
parallelization = dict(
pipeline=1,
tensor=dict(size=1, mode=None),
)
optimizer = dict(
type='Adam',
lr=0.01
)
loss = dict(
type='CrossEntropyLoss'
)
from colossalai.engine import AMP_TYPE
fp16 = dict(
mode=AMP_TYPE.APEX,
opt_level='O2',
)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
NUM_EPOCH = int
model = dict()
train_data = dict()
test_data = dict()
optimizer = dict()
loss = dict()
fp16 = dict()
zero = dict()
gradient_handler = []
parallel = dict()
hooks = []
cudnn_benchmark = True
cudnn_deterministic = False
logging = dict()
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path
BATCH_SIZE = 512
IMG_SIZE = 32
PATCH_SIZE = 4
DIM = 512
NUM_ATTENTION_HEADS = 2
SUMMA_DIM = 2
NUM_CLASSES = 10
DEPTH = 6
NUM_EPOCHS = 60
train_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
drop_last=True,
pin_memory=True,
shuffle=True,
)
)
test_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
train=False,
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]
),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
)
)
optimizer = dict(
type='Adam',
lr=0.001,
weight_decay=0
)
loss = dict(
type='CrossEntropyLoss2D',
)
model = dict(
type='VisionTransformerFromConfig',
tensor_splitting_cfg=dict(
type='ViTInputSplitter2D',
),
embedding_cfg=dict(
type='ViTPatchEmbedding2D',
img_size=IMG_SIZE,
patch_size=PATCH_SIZE,
embed_dim=DIM,
),
token_fusion_cfg=dict(
type='ViTTokenFuser2D',
img_size=IMG_SIZE,
patch_size=PATCH_SIZE,
embed_dim=DIM,
drop_rate=0.1
),
norm_cfg=dict(
type='LayerNorm2D',
normalized_shape=DIM,
eps=1e-6,
),
block_cfg=dict(
type='ViTBlock',
attention_cfg=dict(
type='ViTSelfAttention2D',
hidden_size=DIM,
num_attention_heads=NUM_ATTENTION_HEADS,
attention_dropout_prob=0.,
hidden_dropout_prob=0.1,
checkpoint=True
),
droppath_cfg=dict(
type='VanillaViTDropPath',
),
mlp_cfg=dict(
type='ViTMLP2D',
in_features=DIM,
dropout_prob=0.1,
mlp_ratio=4,
checkpoint=True
),
norm_cfg=dict(
type='LayerNorm2D',
normalized_shape=DIM,
eps=1e-6,
),
),
head_cfg=dict(
type='ViTHead2D',
hidden_size=DIM,
num_classes=NUM_CLASSES,
),
embed_dim=DIM,
depth=DEPTH,
drop_path_rate=0.,
)
hooks = [
dict(type='LogMetricByEpochHook'),
dict(type='Accuracy2DHook'),
dict(type='LossHook'),
dict(
type='LRSchedulerHook',
by_epoch=True,
lr_scheduler_cfg=dict(
type='LinearWarmupLR',
warmup_steps=5
)
),
# dict(type='TensorboardHook', log_dir='./tb_logs'),
# dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
# dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]
parallel = dict(
pipeline=dict(size=1),
tensor=dict(size=4, mode='2d'),
)
# for fp16 training
# from colossalai.engine import AMP_TYPE
# fp16 = dict(
# mode=AMP_TYPE.PARALLEL,
# initial_scale=2 ** 8
# )
# only needed when pipeline parallel is used
# schedule = dict(
# num_microbatches=8
# )
logging = dict(
root_path='./logs'
)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path
from colossalai.context import ParallelMode
from colossalai.engine import AMP_TYPE
try:
import model_zoo
except:
print('You need to add model_zoo to your PYTHONPATH to use the models in the collection')
BATCH_SIZE = 512
IMG_SIZE = 32
NUM_EPOCHS = 60
train_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
transform_pipeline=[
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
num_workers=2,
shuffle=True,
)
)
test_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
train=False,
transform_pipeline=[
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]
),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
num_workers=2,
)
)
optimizer = dict(
type='Adam',
lr=0.001
)
loss = dict(
type='CrossEntropyLoss3D',
input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT,
weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT,
)
model = dict(
type='vit_tiny_3d_patch4_32',
drop_rate=0.1,
)
hooks = [
dict(type='LogMetricByEpochHook'),
dict(type='LogTimingByEpochHook'),
dict(type='LogMemoryByEpochHook'),
dict(
type='Accuracy3DHook',
input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT,
weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT,
),
dict(type='LossHook'),
dict(type='TensorboardHook', log_dir='./tfb_logs'),
dict(
type='LRSchedulerHook',
by_epoch=True,
lr_scheduler_cfg=dict(
type='LinearWarmupLR',
warmup_steps=5
)
),
# dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
# dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]
parallel = dict(
pipeline=dict(size=1),
tensor=dict(size=8, mode='3d'),
)
fp16 = dict(
mode=AMP_TYPE.PARALLEL,
initial_scale=2 ** 8
)
logging = dict(
root_path='./logs'
)
...@@ -77,10 +77,10 @@ fp16 = dict(
)
```
## Naive AMP
We leveraged the Megatron-LM implementation to achieve mixed precision training while maintaining compatibility with complex tensor
and pipeline parallelism. This AMP mode will cast all operations into fp16.
The following code block shows a config file for this mode.
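A sketch of such a config, mirroring the fp16 blocks in the example configs added by this commit; the AMP_TYPE value and import path for this mode in the new API are assumptions here, not taken from the updated documentation:

```python
from colossalai.engine import AMP_TYPE

fp16 = dict(
    mode=AMP_TYPE.PARALLEL,   # the mode used by the example ViT configs in this commit
    initial_scale=2 ** 8
)
```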
...
colossalai.amp.apex\_amp
==========================
.. automodule:: colossalai.amp.apex_amp
:members:
colossalai.amp.naive\_amp
==========================
.. automodule:: colossalai.amp.naive_amp
:members:
colossalai.amp
==================
.. toctree::
:maxdepth: 2
colossalai.amp.torch_amp
colossalai.amp.apex_amp
colossalai.amp.naive_amp
.. automodule:: colossalai.amp
:members:
colossalai.amp.torch\_amp
==========================
.. automodule:: colossalai.amp.torch_amp
:members:
colossalai.builder
==================
.. toctree::
:maxdepth: 2
colossalai.builder.builder
colossalai.builder.pipeline
.. automodule:: colossalai.builder
:members: