OpenDAS / ColossalAI · Commits

Unverified commit 35813ed3, authored Dec 13, 2021 by Frank Lee, committed by GitHub on Dec 13, 2021
parent 7d371105

update examples and sphnix docs for the new api (#63)

Showing 20 changed files with 189 additions and 400 deletions (+189 −400)
Changed files:
- colossalai/logging/__init__.py (+6 −0)
- colossalai/nn/layer/non_parallel_layers/_vit.py (+39 −6)
- colossalai/utils/common.py (+9 −9)
- colossalai/utils/data_sampler/data_parallel_sampler.py (+16 −5)
- colossalai/utils/gradient_accumulation/__init__.py (+14 −0)
- colossalai/utils/gradient_accumulation/_gradient_accumulation.py (+42 −0)
- colossalai/utils/memory.py (+4 −0)
- colossalai/utils/multi_tensor_apply/multi_tensor_apply.py (+7 −0)
- colossalai/utils/timer.py (+3 −0)
- colossalai/zero/__init__.py (+15 −0)
- configs/resnet/resnet50.py (+0 −76)
- configs/sample_config.py (+0 −22)
- configs/vit/vit_2d.py (+0 −165)
- configs/vit/vit_3d.py (+0 −111)
- docs/amp.md (+2 −2)
- docs/colossalai/colossalai.amp.apex_amp.rst (+5 −0)
- docs/colossalai/colossalai.amp.naive_amp.rst (+5 −0)
- docs/colossalai/colossalai.amp.rst (+13 −0)
- docs/colossalai/colossalai.amp.torch_amp.rst (+5 −0)
- docs/colossalai/colossalai.builder.rst (+4 −4)
colossalai/logging/__init__.py
@@ -6,5 +6,11 @@ __all__ = ['get_dist_logger', 'DistributedLogger']

def get_dist_logger(name='root'):
    """Get logger instance based on name. The DistributedLogger will create singleton instances,
    which means that only one logger instance is created per name.

    :param name: name of the logger, name must be unique
    :type name: str
    :return: a distributed logger instance
    :rtype: :class:`colossalai.logging.DistributedLogger`
    """
    return DistributedLogger.get_instance(name=name)
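For reference, a minimal usage sketch of the API documented above; the log message is illustrative and DistributedLogger is assumed to expose the usual info/warning methods:

    from colossalai.logging import get_dist_logger

    # Calls with the same name return the same singleton instance.
    logger = get_dist_logger(name='root')
    assert logger is get_dist_logger(name='root')
    logger.info('distributed logger initialized')  # assumed standard logging-style method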
colossalai/nn/layer/non_parallel_layers/_vit.py
@@ -47,9 +47,24 @@ class ViTBlock(nn.Module):

@LAYERS.register_module
class VanillaViTPatchEmbedding(nn.Module):
    """ 2D Image to Patch Embedding

    :param img_size: image size
    :type img_size: int
    :param patch_size: size of a patch
    :type patch_size: int
    :param in_chans: input channels
    :type in_chans: int
    :param embed_dim: embedding dimension
    :type embed_dim: int
    :param norm_layer: layer norm class, defaults to None
    :type norm_layer: Callable
    :param flatten: whether to flatten the output
    :type flatten: bool
    :param drop: dropout rate
    :type drop: float
    """

-    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True, drop=0.):
+    def __init__(self, img_size, patch_size, in_chans, embed_dim, norm_layer=None, flatten=True, drop=0.):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
@@ -84,12 +99,22 @@ class VanillaViTPatchEmbedding(nn.Module):

@LAYERS.register_module
class VanillaViTMLP(nn.Module):
    """ MLP as used in Vision Transformer, MLP-Mixer and related networks

    :param in_features: input channels
    :type in_features: int
    :param hidden_features: channels of the output of the first dense layer
    :type hidden_features: int
    :param out_features: channels of the output of the second dense layer
    :type out_features: int
    :param act_layer: activation function
    :type act_layer: Callable
    :param drop: dropout rate
    :type drop: float
    """

-    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+    def __init__(self, in_features, hidden_features, out_features, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
@@ -113,6 +138,11 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):
    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
    'survival rate' as the argument.

    :param drop_prob: probability for dropout
    :type drop_prob: float
    :param training: whether it is training mode
    :type training: bool
    """
    if drop_prob == 0. or not training:
        return x
@@ -129,6 +159,9 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):

@LAYERS.register_module
class VanillaViTDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    :param drop_prob: probability for dropout
    :type drop_prob: float
    """

    def __init__(self, drop_prob=0.):
@@ -145,7 +178,7 @@ class VanillaViTAttention(nn.Module):

    :param dim: dimension of input tensor
    :type dim: int
-    :param num_heads: number of attention heads, defaults to 8
+    :param num_heads: number of attention heads
    :type num_heads: int, optional
    :param qkv_bias: enable bias for qkv if True, defaults to False
    :type qkv_bias: bool, optional

@@ -155,7 +188,7 @@ class VanillaViTAttention(nn.Module):
    :type proj_drop: float, optional
    """

-    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
+    def __init__(self, dim, num_heads, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
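As a sanity check on the updated signatures (the core arguments no longer carry defaults and must be passed explicitly), a minimal construction sketch; the import path is inferred from the file location and the shapes are illustrative:

    import torch
    from colossalai.nn.layer.non_parallel_layers._vit import (VanillaViTMLP,
                                                               VanillaViTPatchEmbedding)

    # All core arguments are passed explicitly, matching the new __init__ signatures.
    embed = VanillaViTPatchEmbedding(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
    mlp = VanillaViTMLP(in_features=768, hidden_features=3072, out_features=768)

    x = torch.randn(2, 3, 224, 224)   # (batch, channels, height, width)
    tokens = embed(x)                 # expected (2, 196, 768) when flatten=True
    out = mlp(tokens)                 # same shape as tokens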
colossalai/utils/common.py
@@ -109,15 +109,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
    added functionality to handle model parallel parameters. Note that
    the gradients are modified in place.

-    Arguments:
-        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
-            single Tensor that will have gradients normalized
-        max_norm (float or int): max norm of the gradients
-        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
-            infinity norm.
-
-    Returns:
-        Total norm of the parameters (viewed as a single vector).
+    :param parameters: an iterable of Tensors or a single Tensor that will have gradients normalized
+    :type parameters: (Iterable[Tensor] or Tensor)
+    :param max_norm: max norm of the gradients
+    :type max_norm: float or int
+    :param norm_type: type of the used p-norm. Can be ``'inf'`` for
+        infinity norm.
+    :type norm_type: float or int
+    :return: Total norm of the parameters (viewed as a single vector).
+    :rtype: float
    """
    if isinstance(parameters, torch.Tensor):
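A short sketch of how the reworked docstring reads in practice; clip_grad_norm_fp32 is assumed to be re-exported from colossalai.utils, and the model and values are stand-ins:

    import torch
    from torch import nn
    from colossalai.utils import clip_grad_norm_fp32  # assumed re-export of colossalai/utils/common.py

    model = nn.Linear(8, 2)
    model(torch.randn(4, 8)).sum().backward()

    # Gradients are clipped in place; the returned value is the total norm of the parameters.
    total_norm = clip_grad_norm_fp32(model.parameters(), max_norm=1.0, norm_type=2)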
colossalai/utils/data_sampler/data_parallel_sampler.py
@@ -123,12 +123,23 @@ def get_dataloader(dataset,

    stage and label on the last stage

    :param dataset: a :class:utils.data.dataset dataset
    :param shuffle: whether to shuffle the dataset
-    :param seed: random worker seed, defaults to 1024
-    :type seed: int, optional
-    :param add_sampler_if_possible: [description], defaults to False
-    :type add_sampler_if_possible: bool, optional
-    :return: a :class:utils.data.dataset dataloader
-    :rtype: torch.utils.data.dataset
+    :param add_sampler: add DistributedDataParallelSampler to the dataset
+    :param drop_last: drop the last incomplete batch of data
+    :param pin_memory: whether to pin memory address in CPU memory
+    :param num_workers: number of worker threads for this dataloader
+    :type dataset: :class:`torch.utils.data.Dataset`
+    :type shuffle: bool, optional. Default is False
+    :type seed: int, optional. Default is 1024
+    :type add_sampler: bool, optional. Default is True
+    :type drop_last: bool, optional. Default is False
+    :type pin_memory: bool, optional. Default is False
+    :type num_workers: int, optional. Default is 0
+    :return: an object of :class:`torch.utils.data.DataLoader`
+    :rtype: :class:`torch.utils.data.DataLoader`
    '''
    _kwargs = kwargs.copy()
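A minimal sketch of the documented call; the dataset is a stand-in, get_dataloader is assumed to be exported from colossalai.utils, and extra keyword arguments (such as batch_size) are assumed to be forwarded to torch.utils.data.DataLoader, as the copied kwargs in the body suggest:

    import torch
    from torch.utils.data import TensorDataset
    from colossalai.utils import get_dataloader  # assumed export location

    dataset = TensorDataset(torch.randn(100, 3, 32, 32), torch.randint(0, 10, (100,)))

    # add_sampler defaults to True and attaches a data-parallel sampler; disable it in a
    # single-process run where no distributed group has been initialized.
    train_loader = get_dataloader(dataset,
                                  shuffle=True,
                                  add_sampler=False,
                                  drop_last=True,
                                  pin_memory=True,
                                  num_workers=0,
                                  batch_size=16)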
colossalai/utils/gradient_accumulation/__init__.py
@@ -13,6 +13,20 @@ def accumulate_gradient(model: nn.Module,
                        accumulate_size: int,
                        gradient_handlers: List[BaseGradientHandler] = None,
                        lr_scheduler: _LRScheduler = None):
    """
    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param dataloader: your dataloader object
    :type dataloader: Iterable
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int
    :param gradient_handlers: list of gradient handler objects. Default is None
    :type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
    :param lr_scheduler: your lr scheduler object. Default is None
    :type lr_scheduler: `torch.optim.lr_scheduler._LRScheduler`
    """
    optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
    dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)
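A hedged sketch of the documented helper; the objects are stand-ins, the import location is assumed, and the return value is assumed to be the wrapped (optimizer, dataloader, gradient_handlers, lr_scheduler) tuple implied by the wrapping done in the function body:

    import torch
    from torch import nn
    from colossalai.utils import accumulate_gradient  # assumed export location

    model = nn.Linear(8, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    dataloader = [(torch.randn(4, 8), torch.randint(0, 2, (4,))) for _ in range(10)]

    # With accumulate_size=4, only every 4th optimizer.step() applies an update.
    optimizer, dataloader, gradient_handlers, lr_scheduler = accumulate_gradient(
        model=model, optimizer=optimizer, dataloader=dataloader, accumulate_size=4)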
colossalai/utils/gradient_accumulation/_gradient_accumulation.py
@@ -14,6 +14,17 @@ from colossalai.engine import BaseGradientHandler

class GradAccumOptimizer(ColossalaiOptimizer):
    """A wrapper for the optimizer to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param optim: your optimizer object
    :type optim: :class:`torch.optim.Optimizer`
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int
    :param model: your model object to check if it is DDP for special handling of no_sync() context
    :type model: :class:`torch.nn.Module`
    """

    def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
        super().__init__(optim)

@@ -64,6 +75,19 @@ class GradAccumOptimizer(ColossalaiOptimizer):

class GradAccumDataloader():
    """A wrapper for the dataloader to enable gradient accumulation by dropping the last incomplete steps.
    For example, if a dataloader has 10 batches of data and the accumulate size is 4, the model parameters will
    be updated only twice, at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle,
    so they will be automatically skipped by this class. If the dataloader is not a standard PyTorch dataloader
    (e.g. a Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.

    :param dataloader: your dataloader object
    :type dataloader: Iterable
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int
    """

    def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
        self.dataloader = dataloader

@@ -99,6 +123,15 @@ class GradAccumDataloader():

class GradAccumLrSchedulerByStep(_LRScheduler):
    """A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param lr_scheduler: your lr scheduler object
    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int
    """

    def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
        self.lr_scheduler = lr_scheduler

@@ -137,6 +170,15 @@ class GradAccumLrSchedulerByStep(_LRScheduler):

class GradAccumGradientHandler():
    """A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
    before accumulation size is reached

    :param grad_handler: your gradient handler object
    :type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
    :param accumulate_size: the number of steps to accumulate gradients
    :type accumulate_size: int
    """

    def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None:
        assert isinstance(grad_handler, BaseGradientHandler), \
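To make the 10-batch example in the GradAccumDataloader docstring concrete, a hedged training-loop sketch; the import path and the step/zero_grad skipping behaviour follow the class descriptions above and are assumptions where not shown in the diff:

    import torch
    from torch import nn
    from colossalai.utils.gradient_accumulation import (GradAccumDataloader,
                                                         GradAccumOptimizer)  # assumed exports

    model = nn.Linear(8, 2)
    criterion = nn.CrossEntropyLoss()
    optimizer = GradAccumOptimizer(torch.optim.SGD(model.parameters(), lr=0.1),
                                   accumulate_size=4, model=model)
    batches = [(torch.randn(4, 8), torch.randint(0, 2, (4,))) for _ in range(10)]
    dataloader = GradAccumDataloader(batches, accumulate_size=4)

    for data, label in dataloader:  # yields 8 of the 10 batches; the incomplete cycle is dropped
        loss = criterion(model(data), label)
        loss.backward()             # gradients accumulate across the 4 micro-batches
        optimizer.step()            # assumed no-op except on the 4th and 8th calls
        optimizer.zero_grad()       # assumed to be deferred to the same accumulation boundary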
colossalai/utils/memory.py
@@ -34,6 +34,10 @@ def report_memory_usage(message, logger=None, report_cpu=False):

    :param message: a prefix message to add in the log
    :type message: str
    :param logger: an instance of :class:`colossalai.logging.DistributedLogger`
    :type logger: :class:`colossalai.logging.DistributedLogger`
    :param report_cpu: whether to report CPU memory
    :type report_cpu: bool
    :raises EnvironmentError: raise error if no distributed environment has been initialized
    '''
    if not gpc.is_initialized(ParallelMode.GLOBAL):
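A one-line usage sketch matching the docstring above; the message is illustrative, the export location is assumed, and a distributed environment must already be initialized or the documented EnvironmentError is raised:

    from colossalai.utils import report_memory_usage  # assumed export location

    # Logs the current GPU memory usage (and CPU usage when report_cpu=True) with the given prefix.
    report_memory_usage('after forward pass', report_cpu=True)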
colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
@@ -2,6 +2,13 @@

class MultiTensorApply(object):
    """
    Apply an operation to a list of tensors efficiently

    :param chunk_size: size of a chunk
    :type chunk_size: int
    """

    available = False
    warned = False
colossalai/utils/timer.py
@@ -74,6 +74,9 @@ class Timer:

class MultiTimer:
    '''An object that contains multiple timers

    :param on: whether the timer is enabled. Default is True
    :type on: bool
    '''

    def __init__(self, on: bool = True):
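A hedged sketch of the enabled timer; MultiTimer is assumed to be exported from colossalai.utils and to expose name-keyed start/stop methods as used by the trainer hooks:

    from colossalai.utils import MultiTimer  # assumed export location

    timer = MultiTimer(on=True)
    timer.start('forward')
    # ... run the forward pass here ...
    timer.stop('forward', keep_in_history=True)  # assumed signature: stop(name, keep_in_history)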
colossalai/zero/__init__.py
@@ -14,6 +14,21 @@ def convert_to_zero(model: nn.Module,
                    optimizer: Optimizer,
                    level: int,
                    zero_config):
    """
    A helper function to integrate the model and optimizer with ZeRO optimizer and off-loading

    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param level: optimizer level, can be 2 or 3
    :type level: int
    :param zero_config: configuration for zero
    :type zero_config: dict
    :return: (model, optimizer)
    :rtype: Tuple
    """
    assert level == 2 or level == 3, 'Only ZERO Optimizer Level 2 and 3 are provided'
    if level == 2:
        if is_no_pp_or_last_stage():
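A hedged sketch of the documented entry point; convert_to_zero lives in colossalai/zero/__init__.py, the model and optimizer are stand-ins, and the contents of zero_config are illustrative assumptions:

    import torch
    from torch import nn
    from colossalai.zero import convert_to_zero

    model = nn.Linear(8, 2)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # level must be 2 or 3; the accepted zero_config keys depend on the ZeRO optimizer wrappers.
    model, optimizer = convert_to_zero(model=model, optimizer=optimizer, level=2, zero_config=dict())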
configs/resnet/resnet50.py
deleted (100644 → 0)

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import os

IMG_SIZE = 224
BATCH_SIZE = 256
NUM_EPOCHS = 100

model = dict(
    type='VanillaResNet',
    block_type='ResNetBottleneck',
    layers=[3, 4, 6, 3],
    num_cls=10
)

train_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=os.environ['DATA'],
        transform_pipeline=[
            dict(type='Resize', size=IMG_SIZE),
            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
            dict(type='RandomHorizontalFlip'),
            dict(type='ToTensor'),
            dict(type='Normalize', mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        shuffle=True,
    )
)

test_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=os.environ['DATA'],
        train=False,
        transform_pipeline=[
            dict(type='Resize', size=IMG_SIZE),
            dict(type='ToTensor'),
            dict(type='Normalize', mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
    )
)

parallelization = dict(
    pipeline=1,
    tensor=dict(size=1, mode=None),
)

optimizer = dict(
    type='Adam',
    lr=0.01
)

loss = dict(
    type='CrossEntropyLoss'
)

from colossalai.engine import AMP_TYPE

fp16 = dict(
    mode=AMP_TYPE.APEX,
    opt_level='O2',
)
configs/sample_config.py
deleted (100644 → 0)

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

NUM_EPOCH = int

model = dict()
train_data = dict()
test_data = dict()
optimizer = dict()
loss = dict()
fp16 = dict()
zero = dict()
gradient_handler = []
parallel = dict()
hooks = []
cudnn_benchmark = True
cudnn_deterministic = False
logging = dict()
configs/vit/vit_2d.py
deleted (100644 → 0)

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import os
from pathlib import Path

BATCH_SIZE = 512
IMG_SIZE = 32
PATCH_SIZE = 4
DIM = 512
NUM_ATTENTION_HEADS = 2
SUMMA_DIM = 2
NUM_CLASSES = 10
DEPTH = 6
NUM_EPOCHS = 60

train_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        transform_pipeline=[
            dict(type='Resize', size=IMG_SIZE),
            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
            dict(type='RandomHorizontalFlip'),
            dict(type='ToTensor'),
            dict(type='Normalize', mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        drop_last=True,
        pin_memory=True,
        shuffle=True,
    )
)

test_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        train=False,
        transform_pipeline=[
            dict(type='Resize', size=IMG_SIZE),
            dict(type='ToTensor'),
            dict(type='Normalize', mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
    )
)

optimizer = dict(
    type='Adam',
    lr=0.001,
    weight_decay=0
)

loss = dict(
    type='CrossEntropyLoss2D',
)

model = dict(
    type='VisionTransformerFromConfig',
    tensor_splitting_cfg=dict(
        type='ViTInputSplitter2D',
    ),
    embedding_cfg=dict(
        type='ViTPatchEmbedding2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
    ),
    token_fusion_cfg=dict(
        type='ViTTokenFuser2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
        drop_rate=0.1
    ),
    norm_cfg=dict(
        type='LayerNorm2D',
        normalized_shape=DIM,
        eps=1e-6,
    ),
    block_cfg=dict(
        type='ViTBlock',
        attention_cfg=dict(
            type='ViTSelfAttention2D',
            hidden_size=DIM,
            num_attention_heads=NUM_ATTENTION_HEADS,
            attention_dropout_prob=0.,
            hidden_dropout_prob=0.1,
            checkpoint=True
        ),
        droppath_cfg=dict(
            type='VanillaViTDropPath',
        ),
        mlp_cfg=dict(
            type='ViTMLP2D',
            in_features=DIM,
            dropout_prob=0.1,
            mlp_ratio=4,
            checkpoint=True
        ),
        norm_cfg=dict(
            type='LayerNorm2D',
            normalized_shape=DIM,
            eps=1e-6,
        ),
    ),
    head_cfg=dict(
        type='ViTHead2D',
        hidden_size=DIM,
        num_classes=NUM_CLASSES,
    ),
    embed_dim=DIM,
    depth=DEPTH,
    drop_path_rate=0.,
)

hooks = [
    dict(type='LogMetricByEpochHook'),
    dict(type='Accuracy2DHook'),
    dict(type='LossHook'),
    dict(
        type='LRSchedulerHook',
        by_epoch=True,
        lr_scheduler_cfg=dict(
            type='LinearWarmupLR',
            warmup_steps=5
        )
    ),
    # dict(type='TensorboardHook', log_dir='./tb_logs'),
    # dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
    # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]

parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=4, mode='2d'),
)

# for fp16 training
# from colossalai.engine import AMP_TYPE
# fp16 = dict(
#     mode=AMP_TYPE.PARALLEL,
#     initial_scale=2 ** 8
# )

# only needed when pipeline parallel is used
# schedule = dict(
#     num_microbatches=8
# )

logging = dict(
    root_path='./logs'
)
configs/vit/vit_3d.py
deleted (100644 → 0)

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import os
from pathlib import Path

from colossalai.context import ParallelMode
from colossalai.engine import AMP_TYPE

try:
    import model_zoo
except:
    print('You need to set model_zoo to your PYTHONPATH to use the models in the collection')

BATCH_SIZE = 512
IMG_SIZE = 32
NUM_EPOCHS = 60

train_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        transform_pipeline=[
            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
            dict(type='RandomHorizontalFlip'),
            dict(type='ToTensor'),
            dict(type='Normalize', mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=2,
        shuffle=True,
    )
)

test_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        train=False,
        transform_pipeline=[
            dict(type='ToTensor'),
            dict(type='Normalize', mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=2,
    )
)

optimizer = dict(
    type='Adam',
    lr=0.001
)

loss = dict(
    type='CrossEntropyLoss3D',
    input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT,
    weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT,
)

model = dict(
    type='vit_tiny_3d_patch4_32',
    drop_rate=0.1,
)

hooks = [
    dict(type='LogMetricByEpochHook'),
    dict(type='LogTimingByEpochHook'),
    dict(type='LogMemoryByEpochHook'),
    dict(
        type='Accuracy3DHook',
        input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT,
        weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT,
    ),
    dict(type='LossHook'),
    dict(type='TensorboardHook', log_dir='./tfb_logs'),
    dict(
        type='LRSchedulerHook',
        by_epoch=True,
        lr_scheduler_cfg=dict(
            type='LinearWarmupLR',
            warmup_steps=5
        )
    ),
    # dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
    # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]

parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=8, mode='3d'),
)

fp16 = dict(
    mode=AMP_TYPE.PARALLEL,
    initial_scale=2 ** 8
)

logging = dict(
    root_path='./logs'
)
docs/amp.md
@@ -77,10 +77,10 @@ fp16 = dict(
)
```

-## Tensor Parallel AMP
+## Naive AMP

We leveraged the Megatron-LM implementation to achieve mixed precision training while maintaining compatibility with complex tensor
and pipeline parallelism.

This AMP mode will cast all operations into fp16.
The following code block shows a config file for this mode.
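The config block referred to above is collapsed in this diff view; a hedged sketch of what a naive-AMP config could look like under the new API (the AMP_TYPE import path and the tuning keys are assumptions):

    from colossalai.amp import AMP_TYPE  # assumed new-API location of AMP_TYPE

    fp16 = dict(
        mode=AMP_TYPE.NAIVE,
        initial_scale=2 ** 8,  # illustrative loss-scaling knob; the exact keys are defined by naive AMP
    )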
docs/colossalai/colossalai.amp.apex_amp.rst
new file (0 → 100644)

colossalai.amp.apex\_amp
==========================

.. automodule:: colossalai.amp.apex_amp
   :members:
docs/colossalai/colossalai.amp.naive_amp.rst
new file (0 → 100644)

colossalai.amp.naive\_amp
==========================

.. automodule:: colossalai.amp.naive_amp
   :members:
docs/colossalai/colossalai.amp.rst
new file (0 → 100644)

colossalai.amp
==================

.. toctree::
   :maxdepth: 2

   colossalai.amp.torch_amp
   colossalai.amp.apex_amp
   colossalai.amp.naive_amp

.. automodule:: colossalai.amp
   :members:
docs/colossalai/colossalai.amp.torch_amp.rst
new file (0 → 100644)

colossalai.amp.torch\_amp
==========================

.. automodule:: colossalai.amp.torch_amp
   :members:
docs/colossalai/colossalai.builder.rst
colossalai.builder
==================

-.. automodule:: colossalai.builder
-   :members:

.. toctree::
   :maxdepth: 2

   colossalai.builder.builder
   colossalai.builder.pipeline

+.. automodule:: colossalai.builder
+   :members: