Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
ec5086c4
Commit
ec5086c4
authored
Mar 25, 2022
by
Liang Bowen
Committed by
アマデウス
Mar 29, 2022
Browse files
Refactored docstring to google style
parent
53b1b6e3
Changes
94
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
482 additions
and
454 deletions
+482
-454
colossalai/nn/loss/loss_2p5d.py
colossalai/nn/loss/loss_2p5d.py
+24
-15
colossalai/nn/loss/loss_3d.py
colossalai/nn/loss/loss_3d.py
+25
-17
colossalai/nn/loss/loss_moe.py
colossalai/nn/loss/loss_moe.py
+42
-12
colossalai/nn/lr_scheduler/cosine.py
colossalai/nn/lr_scheduler/cosine.py
+27
-38
colossalai/nn/lr_scheduler/delayed.py
colossalai/nn/lr_scheduler/delayed.py
+28
-32
colossalai/nn/lr_scheduler/linear.py
colossalai/nn/lr_scheduler/linear.py
+7
-9
colossalai/nn/lr_scheduler/multistep.py
colossalai/nn/lr_scheduler/multistep.py
+18
-28
colossalai/nn/lr_scheduler/onecycle.py
colossalai/nn/lr_scheduler/onecycle.py
+35
-37
colossalai/nn/lr_scheduler/poly.py
colossalai/nn/lr_scheduler/poly.py
+15
-22
colossalai/nn/lr_scheduler/torch.py
colossalai/nn/lr_scheduler/torch.py
+29
-42
colossalai/nn/metric/accuracy_2d.py
colossalai/nn/metric/accuracy_2d.py
+6
-2
colossalai/nn/metric/accuracy_2p5d.py
colossalai/nn/metric/accuracy_2p5d.py
+6
-2
colossalai/nn/metric/accuracy_3d.py
colossalai/nn/metric/accuracy_3d.py
+6
-2
colossalai/registry/registry.py
colossalai/registry/registry.py
+22
-22
colossalai/trainer/_trainer.py
colossalai/trainer/_trainer.py
+69
-53
colossalai/trainer/hooks/_base_hook.py
colossalai/trainer/hooks/_base_hook.py
+14
-20
colossalai/trainer/hooks/_checkpoint_hook.py
colossalai/trainer/hooks/_checkpoint_hook.py
+18
-20
colossalai/trainer/hooks/_log_hook.py
colossalai/trainer/hooks/_log_hook.py
+43
-43
colossalai/trainer/hooks/_lr_scheduler_hook.py
colossalai/trainer/hooks/_lr_scheduler_hook.py
+10
-8
colossalai/trainer/hooks/_metric_hook.py
colossalai/trainer/hooks/_metric_hook.py
+38
-30
No files found.
colossalai/nn/loss/loss_2p5d.py
View file @
ec5086c4
...
...
@@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
@
LOSSES
.
register_module
class
CrossEntropyLoss2p5D
(
_Loss
):
"""
Cross entropy loss for 2.5D parallelism
r
"""Cross entropy loss for 2.5D parallelism
Args:
reduction (bool, optional): whether to average the loss, defaults to True.
:param reduction: whether to average the loss, defaults to True
:param args: Args for loss function
:param kwargs: Kwargs for loss function
The ``args`` and ``kwargs`` should include parameters below:
::
:type reduction: bool, optional
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
label_smoothing (float, optional)
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
"""
def
__init__
(
self
,
reduction
=
True
,
*
args
,
**
kwargs
):
super
().
__init__
()
...
...
@@ -30,10 +38,11 @@ class CrossEntropyLoss2p5D(_Loss):
self
.
loss_kwargs
=
kwargs
def
forward
(
self
,
logits
,
targets
):
"""Calculate loss between logits and targets
"""Calculate loss between logits and targets
.
:param logits: Output logits of model
:param targets: True targets from data
Args:
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
"""
targets
=
split_tensor_2p5d
(
targets
)
loss
=
cross_entropy
(
logits
,
targets
,
reduction
=
'none'
,
*
self
.
loss_args
,
**
self
.
loss_kwargs
)
...
...
@@ -115,19 +124,19 @@ class VocabParallelCrossEntropyLoss2p5D(_Loss):
"""
Vocab parallel cross entropy loss for 2.5D parallelism
:param reduction: whether to average the loss, defaults to True
:type reduction: bool, optional
Args:
reduction (bool, optional): whether to average the loss, defaults to True.
"""
def
__init__
(
self
,
reduction
=
True
):
super
().
__init__
()
self
.
reduction_mean
=
reduction
def
forward
(
self
,
logits
,
targets
):
"""Calculate loss between logits and targets
"""Calculate loss between logits and targets
.
:param logits: Output logits of model
:param targets: True targets from data
Args:
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
"""
targets
=
split_tensor_2p5d
(
targets
)
loss
=
_VocabParallelCrossEntropy2p5D
.
apply
(
logits
,
targets
)
...
...
colossalai/nn/loss/loss_3d.py
View file @
ec5086c4
...
...
@@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
@
LOSSES
.
register_module
class
CrossEntropyLoss3D
(
_Loss
):
"""
Cross entropy loss for 3D parallelism
r
"""Cross entropy loss for 3D parallelism.
Args:
reduction (bool, optional): whether to average the loss, defaults to True.
The ``args`` and ``kwargs`` should include parameters below:
::
:param reduction: whether to average the loss, defaults to True
:param args: Args for loss function
:param kwargs: Kwargs for loss function
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
label_smoothing (float, optional)
:type reduction: bool, optional
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
"""
def
__init__
(
self
,
reduction
=
True
,
*
args
,
**
kwargs
):
...
...
@@ -32,10 +40,11 @@ class CrossEntropyLoss3D(_Loss):
self
.
loss_kwargs
=
kwargs
def
forward
(
self
,
logits
,
targets
):
"""Calculate loss between logits and targets
"""Calculate loss between logits and targets
.
:param logits: Output logits of model
:param targets: True targets from data
Args:
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
"""
targets
=
split_tensor_3d
(
targets
,
0
,
self
.
weight_parallel_mode
)
targets
=
split_tensor_3d
(
targets
,
0
,
self
.
input_parallel_mode
)
...
...
@@ -109,12 +118,10 @@ class _VocabParallelCrossEntropy3D(torch.autograd.Function):
@
LOSSES
.
register_module
class
VocabParallelCrossEntropyLoss3D
(
_Loss
):
"""
Vocab parallel cross entropy loss for 2D parallelism
:param reduction: whether to average the loss, defaults to True
"""Vocab parallel cross entropy loss for 3D parallelism.
:type reduction: bool, optional
Args:
reduction (bool, optional): whether to average the loss, defaults to True.
"""
def
__init__
(
self
,
reduction
=
True
):
...
...
@@ -125,10 +132,11 @@ class VocabParallelCrossEntropyLoss3D(_Loss):
self
.
reduction_mean
=
reduction
def
forward
(
self
,
logits
,
targets
):
"""Calculate loss between logits and targets
"""Calculate loss between logits and targets
.
:param logits: Output logits of model
:param targets: True targets from data
Args:
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
"""
targets
=
split_tensor_3d
(
targets
,
0
,
self
.
weight_parallel_mode
)
targets
=
split_tensor_3d
(
targets
,
0
,
self
.
input_parallel_mode
)
...
...
colossalai/nn/loss/loss_moe.py
View file @
ec5086c4
...
...
@@ -6,13 +6,25 @@ from colossalai.context.moe_context import MOE_CONTEXT
@
LOSSES
.
register_module
class
MoeCrossEntropyLoss
(
_Loss
):
"""torch.nn.CrossEntropyLoss added with auxiliary loss.
r
"""torch.nn.CrossEntropyLoss added with auxiliary loss.
:param aux_weight: Weight of auxiliary loss in total loss
:param args: Args in CrossEntropyLoss
:param kwargs: Kwargs in CrossEntropyLoss
Args:
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
aux_weight (float, optional): Weight of auxiliary loss in total loss. Defaults to 0.01.
:type aux_weight: float, optional
The ``args`` and ``kwargs`` should include parameters below:
::
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
reduction (str, optional)
label_smoothing (float, optional)
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
"""
def
__init__
(
self
,
aux_weight
:
float
=
0.01
,
*
args
,
**
kwargs
):
...
...
@@ -21,6 +33,16 @@ class MoeCrossEntropyLoss(_Loss):
self
.
aux_weight
=
aux_weight
def
forward
(
self
,
*
args
):
"""
The ``args`` should at least include parameters below:
::
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
"""
main_loss
=
self
.
loss
(
*
args
)
aux_loss
=
MOE_CONTEXT
.
get_loss
()
return
main_loss
+
self
.
aux_weight
*
aux_loss
...
...
@@ -30,13 +52,11 @@ class MoeCrossEntropyLoss(_Loss):
class
MoeLoss
(
_Loss
):
"""A wrapper class for any loss module to add with auxiliary loss.
:param aux_weight: Weight of auxiliary loss in total loss
:param loss_fn: Loss function
:param args: Args in loss function
:param kwargs: Kwargs in loss function
:type aux_weight: float
:type loss_fn: Callable
Args:
aux_weight (float): Weight of auxiliary loss in total loss.
loss_fn (``Callable``): Loss function.
args (list): Args in loss function.
kwargs (dict): Kwargs in loss function.
"""
def
__init__
(
self
,
aux_weight
:
float
,
loss_fn
,
*
args
,
**
kwargs
):
...
...
@@ -45,6 +65,16 @@ class MoeLoss(_Loss):
self
.
aux_weight
=
aux_weight
def
forward
(
self
,
*
args
,
**
kwargs
):
"""
The ``args`` and ``kwargs`` should at least include parameters below:
::
input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
target (:class:`torch.tensor`): Ground truth class indices or class probabilities.
Note:
The ``args`` and ``kwargs`` may include different parameters varying with different loss function.
"""
main_loss
=
self
.
loss_fn
(
*
args
,
**
kwargs
)
aux_loss
=
MOE_CONTEXT
.
get_loss
()
return
main_loss
+
self
.
aux_weight
*
aux_loss
colossalai/nn/lr_scheduler/cosine.py
View file @
ec5086c4
...
...
@@ -36,14 +36,12 @@ class CosineAnnealingLR(_CosineAnnealingLR):
.. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
https://arxiv.org/abs/1608.03983
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param eta_min: Minimum learning rate, defaults to 0
:type eta_min: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
eta_min (int, optional): Minimum learning rate, defaults to 0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning or When last_epoch=-1, sets initial lr as lr.
"""
def
__init__
(
self
,
optimizer
,
total_steps
:
int
,
eta_min
:
int
=
0
,
last_epoch
:
int
=
-
1
,
**
kwargs
):
...
...
@@ -54,16 +52,13 @@ class CosineAnnealingLR(_CosineAnnealingLR):
class
CosineAnnealingWarmupLR
(
WarmupScheduler
):
"""Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param warmup_steps: Number of warmup steps, defaults to 0
:type warmup_steps: int, optional
:param eta_min: Minimum learning rate, defaults to 0
:type eta_min: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
eta_min (int, optional): Minimum learning rate, defaults to 0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning or When last_epoch=-1, sets initial lr as lr.
"""
def
__init__
(
self
,
optimizer
,
total_steps
:
int
,
warmup_steps
:
int
=
0
,
eta_min
:
float
=
0.
,
last_epoch
:
int
=
-
1
):
...
...
@@ -76,14 +71,12 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
class
FlatAnnealingLR
(
DelayerScheduler
):
"""Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param pct_start: Percent of steps before starting learning rate decay
:type pct_start: float
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
pct_start (float, optional): Percent of steps before starting learning rate decay, defaults to 0.72.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning or When last_epoch=-1, sets initial lr as lr.
"""
def
__init__
(
self
,
optimizer
,
total_steps
:
int
,
pct_start
:
float
=
0.72
,
last_epoch
:
int
=
-
1
,
**
kwargs
):
...
...
@@ -102,18 +95,14 @@ class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
"""Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
applied, and then the learning rate will be a fixed value before starting decay.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param warmup_steps: Number of warmup steps, defaults to 0
:type warmup_steps: int, optional
:param pct_start: Percent of steps before starting learning rate decay
:type pct_start: float
:param eta_min: Minimum learning rate, defaults to 0
:type eta_min: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
pct_start (float, optional): Percent of steps before starting learning rate decay, defaults to 0.72.
eta_min (int, optional): Minimum learning rate, defaults to 0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning or When last_epoch=-1, sets initial lr as lr.
"""
def
__init__
(
self
,
optimizer
,
total_steps
:
int
,
warmup_steps
:
int
=
0
,
pct_start
:
float
=
0.72
,
eta_min
:
int
=
0
,
...
...
colossalai/nn/lr_scheduler/delayed.py
View file @
ec5086c4
...
...
@@ -14,16 +14,15 @@ class _enable_get_lr_call:
class
DelayerScheduler
(
_LRScheduler
):
""" Starts with a flat lr schedule until it reaches N epochs the applies a scheduler
:param optimizer: Wrapped optimizer.
:type optimizer: torch.optim.Optimizer
:param delay_epochs: Number of epochs to keep the initial lr until starting aplying the scheduler
:type delay_epochs: int
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau)
:type after_scheduler: torch.optim.lr_scheduler
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
"""Starts with a flat lr schedule until it reaches N epochs then applies
the specific scheduler (For example: ReduceLROnPlateau)
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
delay_epochs (int): Number of epochs to keep the initial lr until starting applying the scheduler.
after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning or When last_epoch=-1, sets initial lr as lr.
"""
def
__init__
(
self
,
optimizer
,
delay_epochs
,
after_scheduler
,
last_epoch
=-
1
):
...
...
@@ -57,16 +56,15 @@ class DelayerScheduler(_LRScheduler):
class
WarmupScheduler
(
_LRScheduler
):
""" Starts with a linear warmup lr schedule until it reaches N epochs the applies a scheduler
:param optimizer: Wrapped optimizer.
:type optimizer: torch.optim.Optimizer
:param warmup_epochs: Number of epochs to linearly warmup lr until starting aplying the scheduler
:type warmup_epochs: int
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau)
:type after_scheduler: torch.optim.lr_scheduler
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
"""Starts with a linear warmup lr schedule until it reaches N epochs then applies
the specific scheduler (For example: ReduceLROnPlateau).
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
warmup_epochs (int): Number of epochs to linearly warmup lr until starting applying the scheduler.
after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning or When last_epoch=-1, sets initial lr as lr.
"""
def
__init__
(
self
,
optimizer
,
warmup_epochs
,
after_scheduler
,
last_epoch
=-
1
):
...
...
@@ -97,18 +95,16 @@ class WarmupScheduler(_LRScheduler):
class
WarmupDelayerScheduler
(
_LRScheduler
):
""" Starts with a linear warmup lr schedule until it reaches N epochs and a flat lr schedule until it reaches M epochs the applies a scheduler
:param optimizer: Wrapped optimizer.
:type optimizer: torch.optim.Optimizer
:param warmup_epochs: Number of epochs to linearly warmup lr until starting aplying the scheduler
:type warmup_epochs: int
:param delay_epochs: Number of epochs to keep the initial lr until starting aplying the scheduler
:type delay_epochs: int
:param after_scheduler: After target_epoch, use this scheduler(eg. ReduceLROnPlateau)
:type after_scheduler: torch.optim.lr_scheduler
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
"""Starts with a linear warmup lr schedule until it reaches N epochs and a flat lr schedule
until it reaches M epochs then applies the specific scheduler (For example: ReduceLROnPlateau).
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
warmup_epochs (int): Number of epochs to linearly warmup lr until starting applying the scheduler.
delay_epochs (int): Number of epochs to keep the initial lr until starting applying the scheduler.
after_scheduler (:class:`torch.optim.lr_scheduler`): After target_epoch, use this scheduler.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning or When last_epoch=-1, sets initial lr as lr.
"""
def
__init__
(
self
,
optimizer
,
warmup_epochs
,
delay_epochs
,
after_scheduler
,
last_epoch
=-
1
):
...
...
colossalai/nn/lr_scheduler/linear.py
View file @
ec5086c4
...
...
@@ -5,16 +5,14 @@ from colossalai.registry import LR_SCHEDULERS
@
LR_SCHEDULERS
.
register_module
class
LinearWarmupLR
(
_LRScheduler
):
"""Linearly warmup learning rate and then linearly decay
"""Linearly warmup learning rate and then linearly decay
.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param warmup_steps: Number of warmup steps, defaults to 0
:type warmup_steps: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning or When last_epoch=-1, sets initial lr as lr.
"""
def
__init__
(
self
,
optimizer
,
total_steps
:
int
,
warmup_steps
:
int
=
0
,
last_epoch
:
int
=
-
1
,
**
kwargs
):
...
...
colossalai/nn/lr_scheduler/multistep.py
View file @
ec5086c4
...
...
@@ -13,18 +13,13 @@ class MultiStepLR(_MultiStepLR):
happen simultaneously with other changes to the learning rate from outside
this scheduler. When last_epoch=-1, sets initial lr as lr.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param milestones: List of epoch indices. Must be increasing, defaults to None
:type milestones: List[int], optional
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
:type gamma: float, optional
:param num_steps_per_epoch: Number of steps per epoch, defaults to -1
:type num_steps_per_epoch: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
milestones (List[int], optional): List of epoch indices. Must be increasing, defaults to None.
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning or When last_epoch=-1, sets initial lr as lr.
"""
def
__init__
(
self
,
optimizer
,
total_steps
:
int
,
milestones
:
List
[
int
]
=
None
,
gamma
:
float
=
0.1
,
last_epoch
:
int
=
-
1
,
**
kwargs
):
...
...
@@ -33,22 +28,17 @@ class MultiStepLR(_MultiStepLR):
@
LR_SCHEDULERS
.
register_module
class
MultiStepWarmupLR
(
WarmupScheduler
):
"""Multi-step laerning rate scheduler with warmup.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param warmup_steps: Number of warmup steps, defaults to 0
:type warmup_steps: int, optional
:param milestones: List of epoch indices. Must be increasing, defaults to None
:type milestones: List[int], optional
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
:type gamma: float, optional
:param num_steps_per_epoch: Number of steps per epoch, defaults to -1
:type num_steps_per_epoch: int, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
"""Multistep learning rate scheduler with warmup.
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
milestones (List[int], optional): List of epoch indices. Must be increasing, defaults to None.
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
num_steps_per_epoch (int, optional): Number of steps per epoch, defaults to -1.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning or When last_epoch=-1, sets initial lr as lr.
"""
def
__init__
(
self
,
optimizer
,
total_steps
:
int
,
warmup_steps
:
int
=
0
,
milestones
:
List
[
int
]
=
None
,
...
...
colossalai/nn/lr_scheduler/onecycle.py
View file @
ec5086c4
...
...
@@ -28,43 +28,41 @@ class OneCycleLR(_OneCycleLR):
claims that "unpublished work has shown even better results by using only two phases". To
mimic the behaviour of the original paper instead, set ``three_phase=True``.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param pct_start: The percentage of the cycle (in number of steps) spent increasing the learning rate, defaults to 0.3
:type pct_start: float, optional
:param anneal_strategy: {'cos', 'linear'}
Specifies the annealing strategy: "cos" for cosine annealing, "linear" for
linear annealing, defaults to 'cos'
:type anneal_strategy: str, optional
:param cycle_momentum: If ``True``, momentum is cycled inversely
to learning rate between 'base_momentum' and 'max_momentum', defaults to True
:type cycle_momentum: bool, optional
:param base_momentum: Lower momentum boundaries in the cycle
for each parameter group. Note that momentum is cycled inversely
to learning rate; at the peak of a cycle, momentum is
'base_momentum' and learning rate is 'max_lr', defaults to 0.85
:type base_momentum: float, optional
:param max_momentum: Upper momentum boundaries in the cycle
for each parameter group. Functionally,
it defines the cycle amplitude (max_momentum - base_momentum).
Note that momentum is cycled inversely
to learning rate; at the start of a cycle, momentum is 'max_momentum'
and learning rate is 'base_lr', defaults to 0.95
:type max_momentum: float, optional
:param div_factor: Determines the initial learning rate via
initial_lr = max_lr/div_factor, defaults to 25.0
:type div_factor: float, optional
:param final_div_factor: Determines the minimum learning rate via
min_lr = initial_lr/final_div_factor, defaults to 10000.0
:type final_div_factor: float, optional
:param last_epoch: The index of the last batch. This parameter is used when
resuming a training job. Since `step()` should be invoked after each
batch instead of after each epoch, this number represents the total
number of *batches* computed, not the total number of epochs computed.
When last_epoch=-1, the schedule is started from the beginning, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
pct_start (float, optional):
The percentage of the cycle (in number of steps) spent increasing the learning rate, defaults to 0.3.
anneal_strategy (str, optional): {'cos', 'linear'}, Specifies the annealing strategy:
"cos" for cosine annealing, "linear" for linear annealing, defaults to 'cos'.
cycle_momentum (bool, optional): If ``True``, momentum is cycled inversely
to learning rate between 'base_momentum' and 'max_momentum', defaults to True.
base_momentum (float, optional): Lower momentum boundaries in the cycle for each parameter group.
Note that momentum is cycled inversely to learning rate; at the peak of a cycle, momentum is
'base_momentum' and learning rate is 'max_lr', defaults to 0.85.
max_momentum (float, optional): Upper momentum boundaries in the cycle for each parameter group.
Functionally, it defines the cycle amplitude (max_momentum - base_momentum).
Note that momentum is cycled inversely to learning rate; at the start of a cycle, momentum is 'max_momentum'
and learning rate is 'base_lr', defaults to 0.95.
div_factor (float, optional): Determines the initial learning rate via
initial_lr = max_lr/div_factor, defaults to 25.0.
final_div_factor (float, optional): Determines the minimum learning rate via
min_lr = initial_lr/final_div_factor, defaults to 10000.0.
last_epoch (int, optional): The index of the last batch. This parameter is used when resuming a training job.
Since `step()` should be invoked after each batch instead of after each epoch, this number represents
the total number of *batches* computed, not the total number of epochs computed.
When last_epoch=-1, the schedule is started from the beginning, defaults to -1
The ``kwargs`` for initializing torch.optim.lr_scheduler.OneCycleLR should include parameters below:
::
epochs (int, optional, default=None)
steps_per_epoch (int, optional, default=None)
three_phase (bool, optional, default=False)
verbose (bool, optional, default=False)
More details about kwargs could be found in
`OneCycleLR <https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.OneCycleLR.html#torch.optim.lr_scheduler.OneCycleLR>`_.
.. _Super-Convergence\: Very Fast Training of Neural Networks Using Large Learning Rates:
https://arxiv.org/abs/1708.07120
...
...
colossalai/nn/lr_scheduler/poly.py
View file @
ec5086c4
...
...
@@ -8,16 +8,13 @@ from .delayed import WarmupScheduler
class
PolynomialLR
(
_LRScheduler
):
"""Polynomial learning rate scheduler.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param end_lr: Minimum learning rate, defaults to 0.0001
:type end_lr: float, optional
:param power: The power of polynomial, defaults to 1.0
:type power: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
end_lr (float, optional): Minimum learning rate, defaults to 0.0001.
power (float, optional): The power of polynomial, defaults to 1.0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning or When last_epoch=-1, sets initial lr as lr.
"""
def
__init__
(
self
,
optimizer
,
total_steps
:
int
,
end_lr
:
float
=
0.0001
,
power
:
float
=
1.0
,
last_epoch
:
int
=
-
1
,
...
...
@@ -44,18 +41,14 @@ class PolynomialLR(_LRScheduler):
class
PolynomialWarmupLR
(
WarmupScheduler
):
"""Polynomial learning rate scheduler with warmup.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param warmup_steps: Number of warmup steps, defaults to 0
:type warmup_steps: int, optional
:param end_lr: Minimum learning rate, defaults to 0.0001
:type end_lr: float, optional
:param power: The power of polynomial, defaults to 1.0
:type power: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
warmup_steps (int, optional): Number of warmup steps, defaults to 0.
end_lr (float, optional): Minimum learning rate, defaults to 0.0001.
power (float, optional): The power of polynomial, defaults to 1.0.
last_epoch (int, optional): The index of last epoch, defaults to -1. When last_epoch=-1,
the schedule is started from the beginning or When last_epoch=-1, sets initial lr as lr.
"""
def
__init__
(
self
,
optimizer
,
total_steps
:
int
,
warmup_steps
:
int
=
0
,
end_lr
:
float
=
0.0001
,
power
:
float
=
1.0
,
...
...
colossalai/nn/lr_scheduler/torch.py
View file @
ec5086c4
...
...
@@ -11,16 +11,13 @@ class LambdaLR(_LambdaLR):
"""Sets the learning rate of each parameter group to the initial lr
times a given function. When last_epoch=-1, sets initial lr as lr.
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param lr_lambda: A function which computes a multiplicative
factor given an integer parameter epoch, or a list of such
functions, one for each group in optimizer.param_groups, defaults to None
:type lr_lambda: function or list, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
lr_lambda (Union[``function``, ``list[function]``]): A function which computes a multiplicative
factor given an integer parameter epoch, or a list of such functions,
one for each group in optimizer.param_groups, defaults to None.
last_epoch (int, optional): The index of last epoch, defaults to -1.
"""
def
__init__
(
self
,
optimizer
,
total_steps
,
lr_lambda
=
None
,
last_epoch
:
int
=
-
1
)
->
None
:
...
...
@@ -30,18 +27,15 @@ class LambdaLR(_LambdaLR):
@
LR_SCHEDULERS
.
register_module
class
MultiplicativeLR
(
_MultiplicativeLR
):
"""Multiply the learning rate of each parameter group by the factor given
in the specified function. When last_epoch=-1, sets initial lr as lr
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param lr_lambda: A function which computes a multiplicative
factor given an integer parameter epoch, or a list of such
functions, one for each group in optimizer.param_groups, defaults to None
:type lr_lambda: function or list, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
in the specified function. When last_epoch=-1, sets initial lr as lr.
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
lr_lambda (Union[``function``, ``list[function]``]): A function which computes a multiplicative
factor given an integer parameter epoch, or a list of such functions,
one for each group in optimizer.param_groups, defaults to None.
last_epoch (int, optional): The index of last epoch, defaults to -1.
"""
def
__init__
(
self
,
optimizer
,
total_steps
,
lr_lambda
=
None
,
last_epoch
:
int
=
-
1
)
->
None
:
...
...
@@ -53,18 +47,14 @@ class StepLR(_StepLR):
"""Decays the learning rate of each parameter group by gamma every
step_size epochs. Notice that such decay can happen simultaneously with
other changes to the learning rate from outside this scheduler. When
last_epoch=-1, sets initial lr as lr
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param step_size: Period of learning rate decay, defaults to 1
:type step_size: int, optional
:param gamma: Multiplicative factor of learning rate decay, defaults to 0.1
:type gamma: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
last_epoch=-1, sets initial lr as lr.
Args:
optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
total_steps (int): Number of total training steps.
step_size (int, optional): Period of learning rate decay, defaults to 1.
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 0.1.
last_epoch (int, optional): The index of last epoch, defaults to -1.
"""
def
__init__
(
self
,
optimizer
,
total_steps
,
step_size
:
int
=
1
,
gamma
:
float
=
0.1
,
last_epoch
:
int
=
-
1
)
->
None
:
...
...
@@ -77,14 +67,11 @@ class ExponentialLR(_ExponentialLR):
"""Decays the learning rate of each parameter group by gamma every epoch.
When last_epoch=-1, sets initial lr as lr
:param optimizer: Wrapped optimizer
:type optimizer: torch.optim.Optimizer
:param total_steps: Number of total training steps
:type total_steps: int
:param gamma: Multiplicative factor of learning rate decay, defaults to 1.0
:type gamma: float, optional
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
Args:
optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Wrapped optimizer.
total_steps (int): Number of total training steps.
gamma (float, optional): Multiplicative factor of learning rate decay, defaults to 1.0.
last_epoch (int, optional): The index of last epoch, defaults to -1.
"""
def
__init__
(
self
,
optimizer
,
total_steps
,
gamma
:
float
=
1.0
,
...
...
colossalai/nn/metric/accuracy_2d.py
View file @
ec5086c4
...
...
@@ -14,8 +14,12 @@ class Accuracy2D(nn.Module):
def
forward
(
self
,
logits
,
targets
):
"""Calculate the accuracy of predicted labels.
:param logits: Predicted labels
:param targets: True labels from data
Args:
logits (:class:`torch.tensor`): Predicted labels.
targets (:class:`torch.tensor`): True labels from data.
Returns:
float: the accuracy of prediction.
"""
with
torch
.
no_grad
():
targets
=
split_tensor_2d
(
targets
)
...
...
colossalai/nn/metric/accuracy_2p5d.py
View file @
ec5086c4
...
...
@@ -14,8 +14,12 @@ class Accuracy2p5D(nn.Module):
def
forward
(
self
,
logits
,
targets
):
"""Calculate the accuracy of predicted labels.
:param logits: Predicted labels
:param targets: True labels from data
Args:
logits (:class:`torch.tensor`): Predicted labels.
targets (:class:`torch.tensor`): True labels from data.
Returns:
float: the accuracy of prediction.
"""
with
torch
.
no_grad
():
targets
=
split_tensor_2p5d
(
targets
)
...
...
colossalai/nn/metric/accuracy_3d.py
View file @
ec5086c4
...
...
@@ -18,8 +18,12 @@ class Accuracy3D(nn.Module):
def
forward
(
self
,
logits
,
targets
):
"""Calculate the accuracy of predicted labels.
:param logits: Predicted labels
:param targets: True labels from data
Args:
logits (:class:`torch.tensor`): Predicted labels.
targets (:class:`torch.tensor`): True labels from data.
Returns:
float: the accuracy of prediction.
"""
with
torch
.
no_grad
():
targets
=
split_tensor_3d
(
targets
,
0
,
self
.
weight_parallel_mode
)
...
...
colossalai/registry/registry.py
View file @
ec5086c4
...
...
@@ -9,11 +9,10 @@ class Registry:
"""This is a registry class used to register classes and modules so that a universal
object builder can be enabled.
:param name: The name of the registry
:type name: str
:param third_party_library: List of third party libraries which are used in the
initialization of the register module
:type third_party_library: list, optional
Args:
name (str): The name of the registry.
third_party_library (list, optional):
List of third party libraries which are used in the initialization of the register module.
"""
def
__init__
(
self
,
name
:
str
,
third_party_library
:
List
[
ModuleType
]
=
None
):
...
...
@@ -28,12 +27,12 @@ class Registry:
def
register_module
(
self
,
module_class
):
"""Registers a module represented in `module_class`.
:param module_class: The module to be registered
:type
module_class
:
class
:raises AssertionError: Raises an AssertionError if the module has already been
registered before
:return: The module to be registered, so as to use it normally if via importing
:rtype: class
Args:
    module_class (class): The module to be registered.
Returns:
    class: The module to be registered, so that it can be used normally via importing.
Raises:
    AssertionError: Raises an AssertionError if the module has already been registered before.
"""
module_name
=
module_class
.
__name__
assert
module_name
not
in
self
.
_registry
...
...
@@ -46,12 +45,13 @@ class Registry:
"""Retrieves a module with name `module_name` and returns the module if it has
already been registered before.
:param module_name: The name of the module to be retrieved
:type module_name: str
:raises NameError: Raises a NameError if the module to be retrieved has neither been
registered directly nor as third party modules before
:return: The retrieved module or None
:rtype: :class:`object`
Args:
module_name (str): The name of the module to be retrieved.
Returns:
:class:`object`: The retrieved module or None.
Raises:
NameError: Raises a NameError if the module to be retrieved has neither been
registered directly nor as third party modules before.
"""
if
module_name
in
self
.
_registry
:
return
self
.
_registry
[
module_name
]
...
...
@@ -65,11 +65,11 @@ class Registry:
"""Searches for a module with name `module_name` and returns a boolean value indicating
whether the module has been registered directly or as third party modules before.
:param module_name: The name of the module to be searched for
:type
module_name
: str
:r
eturn:
A boolean value indicating whether the module has been registered directly or
as third party
modules be
fore
:rtype: bool
Args:
    module_name (str): The name of the module to be searched for.
Returns:
    bool: A boolean value indicating whether the module has been registered directly or
    as third party modules before.
"""
found_flag
=
module_name
in
self
.
_registry
...
...
colossalai/trainer/_trainer.py
View file @
ec5086c4
...
...
@@ -17,18 +17,46 @@ from colossalai.trainer.hooks import BaseHook
class
Trainer
:
"""This a class tending for easy deployments of users' training and evaluation instead of
r
"""This
is
a class tending for easy deployments of users' training and evaluation instead of
writing their own scripts. It is similar with ``ignite.engine`` and ``keras.engine``, but is
called `Trainer`.
:param engine: Engine responsible for the process function
:type engine: :class:`Engine`
:param schedule: Schedule responsible for forward and backward steps
:type schedule: :class:`BaseSchedule`, optional
:param timer: Timer used to monitor the whole training
:type timer: :class:`MultiTimer`, optional
:param logger: Logger used to record the whole training
:type logger: :class:`colossalai.logging.DistributedLogger`, optional
Args:
engine (:class:`Engine`): Engine responsible for the process function.
schedule (:class:`BaseSchedule`, optional): Schedule responsible for forward and backward steps.
timer (:class:`MultiTimer`, optional): Timer used to monitor the whole training.
logger (:class:`colossalai.logging.DistributedLogger`, optional): Logger used to record the whole training log.
Note:
When `schedule` is None, the ``NonPipelineSchedule`` would be used. If you would like to use pipeline,
you should choose ``PipelineSchedule`` or ``InterleavedPipelineSchedule`` for the `schedule`.
Examples:
>>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
>>> model = ...
>>> criterion = ...
>>> optimizer = ...
>>> train_dataloader = ...
>>> # Initialize your engine, train_dataloader, test_dataloader, lr_scheduler
>>> engine, train_dataloader, _, _ = colossalai.initialize(model, optimizer, criterion)
>>> # Beginning training progress
>>> timer = ...
>>> logger = ...
>>> trainer = Trainer(engine=engine, logger=logger, schedule=schedule, timer=timer)
>>> # add hooks you would like to use here.
>>> hook_list = []
>>> trainer.fit(
>>> train_dataloader=train_dataloader,
>>> epochs=gpc.config.NUM_EPOCHS,
>>> test_interval=1,
>>> hooks=hook_list,
>>> display_progress=True,
>>> return_output_label=False
>>> )
More examples and details could be found in
`Training with engine and trainer <https://www.colossalai.org/docs/basics/engine_trainer>`_
and `ColossalAI-Examples <https://github.com/hpcaitech/ColossalAI-Examples/tree/main>`_.
"""
def
__init__
(
self
,
...
...
@@ -108,20 +136,19 @@ class Trainer:
def
_set_current_step
(
self
,
epoch
:
int
):
"""Sets current step number.
:param epoch: Step number to be set
:type
epoch
: int
Args:
epoch
(int): Step number to be set.
"""
self
.
_cur_step
=
epoch
*
self
.
_steps_per_epoch
def
_call_timer
(
self
,
action
:
str
,
item
:
str
,
*
args
,
**
kwargs
)
->
None
:
"""Call timer funciton with a given timer name.
:param action: Function to be called on timer
:type action: str
:param item: Name of the timer
:type item: str
:param args: args used for action function
:param kwargs: kwargs used for action function
Args:
action (str): Function to be called on timer.
item (str): Name of the timer.
args (list): args used for action function.
kwargs (dict): kwargs used for action function.
"""
if
self
.
_timer
is
not
None
:
...
...
@@ -134,10 +161,9 @@ class Trainer:
def
_call_hooks
(
self
,
func
,
output
=
None
):
"""Calls specific hooks in the current time point.
:param func: A string represents the time point
:param output: Output of the model after running a iteration or None in any other time points
:type func: str
:type output: optional
Args:
func (str): A string represents the time point.
output (Any, optional): Output of the model after running an iteration or None in any other time points.
"""
# Only after iter hook will receive output
for
hook
in
self
.
hooks
:
...
...
@@ -273,25 +299,17 @@ class Trainer:
display_progress
:
bool
=
False
,
return_output_label
:
bool
=
True
,
):
"""Trains the model to fit training data.
:param train_dataloader: DataLoader in training
:param epochs: Maximum number of epoches
:param max_steps: Maximum number of running iterations
:param test_dataloader: DataLoader in testing
:param test_interval: Interval of testing
:param hooks: A list of hooks used in training
:param display_progress: If True, the training progress will be printed
:param return_output_label: If True, the output of model and the label will be returned
:type train_dataloader: DataLoader
:type epochs: int
:type max_steps: int, optional
:type test_dataloader: DataLoader, optional
:type test_interval: int, optional
:type hooks: list, optional
:type display_progress: bool, optional
:type return_output_label: bool, optional
r
"""Trains the model to fit training data.
Args:
train_dataloader (:class:`torch.utils.data.DataLoader`): DataLoader for training.
epochs (int): Maximum number of epochs.
max_steps (int, optional): Maximum number of running iterations.
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): DataLoader for validation.
test_interval (int, optional): Interval of validation.
hooks (list[`BaseHook <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/trainer/hooks>`_],
optional): A list of hooks used in training.
display_progress (bool, optional): If True, a progress bar will be displayed.
"""
# set epochs and steps, consider gradient accumulation
...
...
@@ -374,15 +392,12 @@ class Trainer:
):
"""Evaluates the model with testing data.
:param test_dataloader: DataLoader in testing
:param hooks: A list of hooks used in evaluation
:param display_progress: If True, the evaluation progress will be printed
:param return_output_label: If True, the output of model and the label will be returned
:type test_dataloader: DataLoader
:type hooks: list, optional
:type display_progress: bool, optional
:type return_output_label: bool
Args:
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
hooks (list, optional): A list of hooks used in evaluation. Defaults to None.
display_progress (bool, optional): If True, the evaluation progress will be printed. Defaults to False.
return_output_label (bool, optional): If True, the output of model and the label
will be returned. Defaults to True.
"""
# set display
display_progress
=
self
.
_should_display_progress
(
display_progress
)
...
...
@@ -418,10 +433,11 @@ class Trainer:
def
predict
(
self
,
data
:
Union
[
Tensor
,
List
[
Tensor
]]):
"""Uses trained model to make a prediction for a tensor or a tensor list.
:param data: Data as the input
:type data: Union[Tensor, List[Tensor]
:return: The output of model as the prediction
:rtype: Tensor
Args:
data (Union[:class:`torch.tensor`, List[:class:`torch.tensor`]]): Data as the input.
Returns:
:class:`torch.tensor`: The output of model as the prediction.
"""
# predict without labels
if
isinstance
(
data
,
(
list
,
tuple
)):
...
...
colossalai/trainer/hooks/_base_hook.py
View file @
ec5086c4
...
...
@@ -40,14 +40,11 @@ class BaseHook(ABC):
def
after_train_iter
(
self
,
trainer
,
output
:
Tensor
,
label
:
Tensor
,
loss
:
Tensor
):
"""Actions after running a training iteration.
:param trainer: Trainer which is using this hook
:type trainer: :class:`Trainer`
:param output: Output of the model
:type output: torch.Tensor
:param label: Labels of the input data
:type label: torch.Tensor
:param loss: Loss between the output and input data
:type loss: torch.Tensor
Args:
trainer (:class:`Trainer`): Trainer which is using this hook.
output (:class:`torch.Tensor`): Output of the model.
label (:class:`torch.Tensor`): Labels of the input data.
loss (:class:`torch.Tensor`): Loss between the output and input data.
"""
pass
...
...
@@ -89,24 +86,21 @@ class BaseHook(ABC):
def
after_test_iter
(
self
,
trainer
,
output
:
Tensor
,
label
:
Tensor
,
loss
:
Tensor
):
"""Actions after running a testing iteration.
:param trainer: Trainer which is using this hook
:type trainer: :class:`Trainer`
:param output: Output of the model
:type output: Tensor
:param label: Labels of the input data
:type label: Tensor
:param loss: Loss between the output and input data
:type loss: Tensor
Args:
trainer (:class:`Trainer`): Trainer which is using this hook.
output (:class:`torch.Tensor`): Output of the model.
label (:class:`torch.Tensor`): Labels of the input data.
loss (:class:`torch.Tensor`): Loss between the output and input data.
"""
pass
def
init_runner_states
(
self
,
trainer
,
key
,
val
):
"""Initializes trainer's state.
:param trainer: Trainer which is using this hook
:type
trainer
:
:class:`Trainer`
:param
key: Key of
reseting state
:param
val: Value of
reseting state
Args:
trainer
(
:class:`Trainer`
): Trainer which is using this hook
key: Key of
state to be reset
val: Value of
state to be reset
"""
if
key
not
in
trainer
.
states
:
trainer
.
states
[
key
]
=
val
colossalai/trainer/hooks/_checkpoint_hook.py
View file @
ec5086c4
...
...
@@ -16,14 +16,13 @@ from ._lr_scheduler_hook import LRSchedulerHook
class
SaveCheckpointHook
(
BaseHook
):
"""Saves the model by interval in training process.
:param interval: Saving interval, defaults to 1
:type interval: int, optional
:param checkpoint_dir: Directory of saving checkpoint, defaults to None
:type checkpoint_dir: str, optional
:param suffix: Saving suffix of the file, defaults to ''
:type suffix: str, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
Args:
interval (int, optional): Saving interval, defaults to 1.
checkpoint_dir (str, optional): Directory of saving checkpoint, defaults to None.
suffix (str, optional): Saving suffix of the file, defaults to ''.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def
__init__
(
self
,
...
...
@@ -71,18 +70,17 @@ class SaveCheckpointHook(BaseHook):
class
LoadCheckpointHook
(
BaseHook
):
"""Loads the model before training process.
:param checkpoint_dir: Directory of saving checkpoint, defaults to None
:type checkpoint_dir: str, optional
:param epoch: Epoch number to be set, defaults to -1
:type epoch: str, optional
:param finetune: Whether allows to load a part of the model, defaults to False
:type finetune: bool, optional
:param strict: Whether loads a model that has the same shape of parameters, defaults to False
:type strict: bool, optional
:param suffix: Suffic, defaults to ''
:type suffix: str, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
:type priority: int, optional
Args:
checkpoint_dir (str, optional): Directory of saving checkpoint, defaults to None.
epoch (int, optional): Epoch number of the checkpoint to be loaded, defaults to -1.
An epoch of -1 means loading the latest checkpoint.
finetune (bool, optional): Whether allows to load a part of the model, defaults to False.
strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict` of the checkpoint
match the names of parameters and buffers in model, defaults to False.
suffix (str, optional): Suffix of checkpoint file path, defaults to ''.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 0. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def
__init__
(
self
,
...
...
colossalai/trainer/hooks/_log_hook.py
View file @
ec5086c4
...
...
@@ -25,13 +25,14 @@ def _format_number(val, prec=5):
class
LogByEpochHook
(
BaseHook
):
"""Hook to log by epoch
:param logger: Logger for the log
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
:type priority: int, optional
"""Hook to log by epoch.
Args:
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
interval (int, optional): Interval of printing log information, defaults to 1.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 1. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def
__init__
(
self
,
...
...
@@ -48,10 +49,12 @@ class LogByEpochHook(BaseHook):
@
HOOKS
.
register_module
class
LogMetricByStepHook
(
BaseHook
):
"""Hook to log metric by step
"""Hook to log metric by step
.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
Args:
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def
__init__
(
self
,
priority
:
int
=
10
):
...
...
@@ -74,11 +77,12 @@ class LogMetricByStepHook(BaseHook):
class
LogMetricByEpochHook
(
LogByEpochHook
):
"""Specialized hook to record the metric to log.
:param logger: Logger for the log
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
Args:
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
interval (int, optional): Interval of printing log information, defaults to 1.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def
__init__
(
self
,
...
...
@@ -116,14 +120,14 @@ class LogMetricByEpochHook(LogByEpochHook):
class
TensorboardHook
(
BaseHook
):
"""Specialized hook to record the metric to Tensorboard.
:param log_dir: Directory of log
:type
log_dir
:
str
:param ranks
: Ranks of processors
:type ranks: typing.List
:param parallel_mode: Parallel mode,
defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL
:type parallel_mode: :class:`colossalai.context.parallel_mode.ParallelMode`, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
Args:
log_dir
(
str
): Directory of log.
ranks (list)
: Ranks of processors
.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode used in trainer,
defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL
.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def
__init__
(
self
,
...
...
@@ -200,18 +204,15 @@ class TensorboardHook(BaseHook):
class
LogTimingByEpochHook
(
LogByEpochHook
):
"""Specialized hook to write timing record to log.
:param timer: Timer for the hook
:type timer: :class:`colossalai.utils.MultiTimer`
:param logger: Logger for the log
:type logger: :class:`colossalai.logging.DistributedLogger`
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
:param log_eval: Whether writes in evaluation, defaults to True
:type log_eval: bool, optional
:param ignore_num_train_steps: Number of training steps to ignore, defaults to 0
:type ignore_num_train_steps: int, optional
Args:
timer (:class:`colossalai.utils.MultiTimer`): Timer for the hook.
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
interval (int, optional): Interval of printing log information, defaults to 1.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
log_eval (bool, optional): Whether writes in evaluation, defaults to True.
ignore_num_train_steps (int, optional): Number of training steps to ignore, defaults to 0.
"""
def
__init__
(
self
,
...
...
@@ -270,14 +271,13 @@ class LogTimingByEpochHook(LogByEpochHook):
class
LogMemoryByEpochHook
(
LogByEpochHook
):
"""Specialized Hook to write memory usage record to log.
:param logger: Logger for the log
:type logger: colossalai.logging.DistributedLogger
:param interval: Recording interval, defaults to 1
:type interval: int, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 10
:type priority: int, optional
:param log_eval: Whether writes in evaluation, defaults to True
:type log_eval: bool, optional
Args:
logger (:class:`colossalai.logging.DistributedLogger`): Logger for recording the log information.
interval (int, optional): Interval of printing log information, defaults to 1.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
log_eval (bool, optional): Whether writes in evaluation, defaults to True.
"""
def
__init__
(
self
,
...
...
colossalai/trainer/hooks/_lr_scheduler_hook.py
View file @
ec5086c4
...
...
@@ -6,15 +6,17 @@ from ._metric_hook import LearningRateMetric, MetricHook
@
HOOKS
.
register_module
class
LRSchedulerHook
(
MetricHook
):
"""Build LR scheduler
r
"""Build LR scheduler
for trainer.
:param lr_scheduler: LR scheduler
:param by_epoch: If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch
:type by_epoch: bool
:param store_lr_in_state: If `True`, store the learning rate in each state, defaults to `True`
:type store_lr_in_state: bool, optional
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 1
:type priority: int, optional
Args:
lr_scheduler (:class:`colossalai.nn.lr_scheduler`): The specific LR scheduler
in range of ``colossalai.nn.lr_scheduler``, more details about ``lr_scheduler`` could be found in
`lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_.
by_epoch (bool): If `True`, the LR will be scheduled every epoch. Else, the LR will be scheduled every batch.
store_lr_in_state (bool, optional): If `True`, store the learning rate in each state, defaults to `True`.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 1. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def
__init__
(
self
,
...
...
colossalai/trainer/hooks/_metric_hook.py
View file @
ec5086c4
...
...
@@ -17,13 +17,13 @@ from ._base_hook import BaseHook
class
Metric
(
ABC
):
"""A basic class of metric collectors. It collects a specific
metric during training or evaluation and
it's
always used with
metric during training or evaluation and
would
always
be
used with
:class:`MetricHook` to help it update its states and show the
metric. So please use corresponding hook class to make the metric
collector works.
:param epoch_only: Whether the metric only read for the full epoch
:type
epoch_only
: bool
Args:
    epoch_only (bool): Whether the metric is only read for the full epoch.
"""
def
__init__
(
self
,
epoch_only
:
bool
):
...
...
@@ -80,8 +80,8 @@ class Metric(ABC):
class
LossMetric
(
Metric
):
"""A metric collector for loss.
:param epoch_only: Whether the metric only read for the full epoch
:type
epoch_only
: bool
Args:
    epoch_only (bool): Whether the metric is only read for the full epoch.
"""
def
__init__
(
self
,
epoch_only
):
...
...
@@ -101,7 +101,8 @@ class LossMetric(Metric):
"""Updates :attr:`last_step_loss` and :attr:`accum_loss` with current loss.
It expects the output has loss.
:param loss: Current loss of the output
Args:
loss (:class:`torch.tensor`): Current loss of the output.
"""
# expect output to be logits, label and loss
loss_
=
loss
.
detach
()
...
...
@@ -132,10 +133,9 @@ class LossMetric(Metric):
class
LearningRateMetric
(
Metric
):
"""A metric collector for learning rate.
:param epoch_only: Whether the metric only read for the full epoch
:type epoch_only: bool
:param initial_lr: Initial learning rate, defaults to 0.0
:type initial_lr: float, optional
Args:
epoch_only (bool): Whether the metric is only read for the full epoch.
initial_lr (float, optional): Initial learning rate, defaults to 0.0.
"""
def
__init__
(
self
,
epoch_only
:
bool
,
initial_lr
:
float
=
0.
):
...
...
@@ -163,10 +163,9 @@ class AccuracyMetric(Metric):
"""A metric collector for accuracy. It only works for classification
tasks.
:param epoch_only: Whether the metric only read for the full epoch
:type epoch_only: bool
:param accuracy_func: Accuracy function for the classification task
:type accuracy_func: :class:`typing.Callable`
Args:
epoch_only (bool): Whether the metric is only read for the full epoch.
accuracy_func (:class:`typing.Callable`): Accuracy function for the classification task.
"""
def
__init__
(
self
,
epoch_only
:
bool
,
accuracy_func
:
Callable
):
...
...
@@ -187,9 +186,10 @@ class AccuracyMetric(Metric):
"""Updates last step accuracy and accumulated accuracy with current logits
and labels. It expects the output has logits and labels.
:param logits: The logits output of the model
:param targets: Real labels of the dataset
:param batch_size: Batch size of the task
Args:
logits (:class:`torch.tensor`): The logits output of the model.
targets (:class:`torch.tensor`): Real labels of the dataset.
batch_size (int): Batch size of the task.
"""
if
isinstance
(
logits
,
(
list
,
tuple
)):
logits
=
logits
[
0
]
...
...
@@ -224,8 +224,10 @@ class MetricHook(BaseHook):
update their states. Others are used to display and
record the metric.
:param priority: Priority in the printing, hooks with small priority will be printed in front
:type priority: int
Args:
priority (int): Priority in the printing, hooks with small priority will be printed in front,
defaults to 1. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def
__init__
(
...
...
@@ -244,8 +246,10 @@ class MetricHook(BaseHook):
class
LossHook
(
MetricHook
):
"""Specialized hook class for :class:`Loss`.
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
:type priority: int, optional
Args:
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 0. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def
__init__
(
self
,
priority
:
int
=
0
):
...
...
@@ -283,10 +287,11 @@ class LossHook(MetricHook):
class
AccuracyHook
(
MetricHook
):
"""Specialized hook class for :class:`Accuracy`.
:param accuracy_func: Accuracy function for the classification task
:type accuracy_func: typing.Callable
:param priority: Priority in the printing, hooks with small priority will be printed in front, defaults to 0
:type priority: int, optional
Args:
accuracy_func (:class:`typing.Callable`): Accuracy function for the classification task.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 0. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def
__init__
(
self
,
accuracy_func
:
Callable
,
priority
:
int
=
0
):
...
...
@@ -314,8 +319,8 @@ class AccuracyHook(MetricHook):
class
ThroughputMetric
(
Metric
):
"""Metric for :class:`Throughput`.
:param epoch_only: Whether the metric is only read for the full epoch
:type epoch_only: bool
Args:
    epoch_only (bool): Whether the metric is only read for the full epoch.
"""
def
__init__
(
self
,
epoch_only
:
bool
,
ignored_steps
:
int
=
0
):
super
().
__init__
(
epoch_only
=
epoch_only
)
...
...
@@ -360,10 +365,13 @@ class ThroughputMetric(Metric):
@
HOOKS
.
register_module
class
ThroughputHook
(
MetricHook
):
"""Specialized hook class for :class:`Throughput`.
"""Specialized hook class for :class:`Throughput`.
Hook to measure execution throughput (samples/sec).
:param priority: priority of throughput hook, defaults to 10
:type priority: int, optional
Args:
ignored_steps (int, optional): the number of initial training steps to ignore.
priority (int, optional): Priority in the printing, hooks with small priority will be printed in front,
defaults to 10. If different hooks share same priority, the order of printing would
depend on the hooks order in the hook list.
"""
def
__init__
(
self
,
ignored_steps
:
int
=
0
,
priority
:
int
=
10
):
super
().
__init__
(
priority
)
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment