Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
ec5086c4
Commit
ec5086c4
authored
Mar 25, 2022
by
Liang Bowen
Committed by
アマデウス
Mar 29, 2022
Browse files
Refactored docstring to google style
parent
53b1b6e3
Changes
91
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
240 additions
and
234 deletions
+240
-234
colossalai/utils/activation_checkpoint.py
colossalai/utils/activation_checkpoint.py
+7
-4
colossalai/utils/checkpointing.py
colossalai/utils/checkpointing.py
+62
-61
colossalai/utils/common.py
colossalai/utils/common.py
+25
-21
colossalai/utils/data_sampler/data_parallel_sampler.py
colossalai/utils/data_sampler/data_parallel_sampler.py
+32
-37
colossalai/utils/gradient_accumulation/__init__.py
colossalai/utils/gradient_accumulation/__init__.py
+19
-13
colossalai/utils/gradient_accumulation/_gradient_accumulation.py
...lai/utils/gradient_accumulation/_gradient_accumulation.py
+32
-32
colossalai/utils/memory_utils/memory_monitor.py
colossalai/utils/memory_utils/memory_monitor.py
+16
-15
colossalai/utils/moe.py
colossalai/utils/moe.py
+5
-5
colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
+3
-3
colossalai/utils/tensor_detector/tensor_detector.py
colossalai/utils/tensor_detector/tensor_detector.py
+9
-16
colossalai/utils/timer.py
colossalai/utils/timer.py
+30
-27
No files found.
colossalai/utils/activation_checkpoint.py
View file @
ec5086c4
...
...
@@ -114,10 +114,13 @@ class CheckpointFunction(torch.autograd.Function):
def
checkpoint
(
function
,
activation_offload
,
*
args
):
"""Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint
"""Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint
.
:param function: Describe the forward pass function. It should know how to handle the input tuples.
:param args: Tuple containing the parameters of the function
:return: Output of running function with provided args
Args:
function: Describe the forward pass function. It should know how to handle the input tuples.
args (tuple): Tuple containing the parameters of the function.
Returns:
Output of running function with provided args.
"""
return
CheckpointFunction
.
apply
(
function
,
activation_offload
,
*
args
)
colossalai/utils/checkpointing.py
View file @
ec5086c4
...
...
@@ -50,17 +50,17 @@ def _get_standard_checkpoint_filename(epoch: int, suffix: str = ''):
def
get_checkpoint_path
(
checkpoint_dir
:
str
,
epoch
:
int
,
suffix
:
str
=
''
):
"""This is a function to generate the checkpoint path from the (checkpoint_dir, epoch, suffix, gpu_parallel_rank) tuple.
"""This is a function to generate the checkpoint path from the tuple
(checkpoint_dir, epoch, suffix, gpu_parallel_rank).
This is useful during generation and recuperation of the checkpoint.
:param checkpoint_dir: Set up a directory for saving checkpoints
:type checkpoint_dir: str
:param epoch: Epoch number (indicate how many epochs have you trained this model)
:type epoch: int
:param suffix: Additional notation to specify the model or checkpoint, defaults to ''
:type suffix: str, optional
:return: Checkpoint path to be generated
:rtype: path
Args:
checkpoint_dir (str): Set up a directory for saving checkpoints.
epoch (int): Epoch number (indicate how many epochs have you trained this model).
suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''
Returns:
str: The checkpoint path to be generated.
"""
ckpt_filename
=
_get_standard_checkpoint_filename
(
epoch
,
suffix
)
return
os
.
path
.
join
(
checkpoint_dir
,
ckpt_filename
)
...
...
@@ -74,12 +74,13 @@ def _ensure_directory_exists(filename: str):
def
get_latest_checkpoint_pattern
(
suffix
:
str
=
''
):
"""Generate Regular expression of latest checkpoint's pattern
"""Generate Regular expression of the latest checkpoint's pattern.
Args:
suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''.
:param suffix: Additional notation to specify the model or checkpoint, defaults to ''
:type suffix: str, optional
:return: Checkpoint pattern
:rtype: regular expression
Returns:
str: The regular expression of checkpoint pattern.
"""
ranks_name
=
_get_ranks_name
()
pattern
=
r
'epoch(\d+)-{}{}\.pt'
.
format
(
ranks_name
,
suffix
)
...
...
@@ -88,16 +89,19 @@ def get_latest_checkpoint_pattern(suffix: str = ''):
def
get_latest_checkpoint_path
(
checkpoint_dir
:
str
,
suffix
:
str
=
''
):
"""This is a function to retrieve the latest checkpoint path from the (checkpoint_dir, suffix, gpu_parallel_rank) tuple.
"""This is a function to retrieve the latest checkpoint path from the tuple
(checkpoint_dir, suffix, gpu_parallel_rank).
This is useful during recuperation of the checkpoint, especially when you do not know the epoch number.
:param checkpoint_dir: Directory for saving checkpoints
:type checkpoint_dir: str
:param suffix: Additional notation to specify the model or checkpoint, defaults to ''
:type suffix: str, optional
:raises FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given
:return: The latest checkpoint path to be retrieved
:rtype: path
Args:
checkpoint_dir (str): Directory for saving checkpoints
suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''
Returns:
str: The latest retrieved checkpoint path.
Raises:
FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given.
"""
CKPT_NAME_PAT
=
get_latest_checkpoint_pattern
(
suffix
=
suffix
)
...
...
@@ -126,22 +130,19 @@ def save_checkpoint(checkpoint_path: str,
optimizer
:
torch
.
optim
.
Optimizer
,
lr_scheduler
:
torch
.
optim
.
lr_scheduler
.
_LRScheduler
=
None
,
**
kwargs
):
"""Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as model,
optimizer, lr_scheduler and etc. into a checkpoint dictionary.
This method can be used for both colosalai nn.BaseModel and normal pytorch nn.Module.
:param checkpoint_path: Set up a directory for saving checkpoints
:type checkpoint_path: str
:param epoch: Epoch number (indicate how many epochs have you trained this model)
:type epoch: int
:param model: Model to be registered
:type model: torch.nn.Module
:param optimizer: Optimizer to be registered
:type optimizer: torch.optim.Optimizer
:param lr_scheduler: lr_scheduler to be registered, defaults to None
:type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
"""Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as
model, optimizer, lr_scheduler etc. into a checkpoint dictionary.
This method can be used for both :class:`colossalai.nn.BaseModel` and normal :class:`torch.nn.Module`.
Args:
checkpoint_path (str): Set up a directory for saving checkpoints.
epoch (int): Epoch number (indicate how many epochs have you trained this model).
model (:class:`torch.nn.Module`): Model to be registered.
optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to be registered.
lr_scheduler (Union[:class:`torch.optim.lr_scheduler`,
:class:`colossalai.nn.lr_scheduler`], optional): lr_scheduler to be registered, defaults to None.
kwargs (dict): additional parameters to be saved.
"""
# for compatibility with normal pytorch nn.Module
if
hasattr
(
model
,
'state_dict_for_save_checkpoint'
):
...
...
@@ -165,31 +166,31 @@ def load_checkpoint(checkpoint_path: str,
finetune
:
bool
=
False
,
strict
:
bool
=
True
)
->
Tuple
:
"""Loads the checkpoint file.
If finetune is False, then we intend to continue/resume the training process from the checkpoint given.
So we copy parameters and buffers from state_dict into these modules(model, optimizer,lr_scheduler)
and its descendants.
If finetune is True, then only the weights and buffers of model should be reload.
If strict is True, then the keys of state_dict must exactly match the keys returned by this module’s
state_dict() function.
:param checkpoint_path: The exact and matched checkpoint_path directory to retrieve appropriate state_dict
:type checkpoint_path: str
:param model: Model to reload parameters and buffers
:type model: torch.nn.Module
:param optimizer: Optimizer to recuperate
:type optimizer: torch.optim.Optimizer
:param lr_scheduler: lr_scheduler to recuperate, defaults to None
:type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
:param finetune: Whether to finetune the model with new dataset or continue the pre-training, defaults to False
:type finetune: bool, optional
:param strict: Whether to strictly enforce that the keys in
:attr:`state_dict` of the checkpoint match the names of
parameters and buffers in model., defaults to True
:type strict: bool, optional
:raises ValueError: Raise error if the model/optimizer cannot successfully be recuperated
:return: (the epoch number of the checkpoint retrieved, the checkpoint retrieved)
:rtype: Tuple
If finetune is True, then only the weights and buffers of model should be reloaded.
If strict is True, then the keys of state_dict must exactly match the keys returned
by this module’s state_dict() function.
Args:
checkpoint_path (str): The exact and matched checkpoint_path directory to retrieve appropriate state_dict.
model (:class:`torch.nn.Module`): Model to reload parameters and buffers.
optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to recuperate.
lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`, optional):
lr_scheduler to recuperate, defaults to None.
finetune (bool, optional): Whether to finetune the model with new dataset or
continue the pre-training, defaults to False.
strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict`
of the checkpoint match the names of parameters and buffers in model, defaults to True.
Returns:
Tuple(int, ``checkpoint``): The tuple (the epoch number of the checkpoint retrieved, the checkpoint retrieved).
Raises:
ValueError: Raise error if the model/optimizer cannot successfully be recuperated
"""
# Load the checkpoint.
checkpoint
=
torch
.
load
(
checkpoint_path
,
map_location
=
'cpu'
)
...
...
colossalai/utils/common.py
View file @
ec5086c4
...
...
@@ -27,10 +27,10 @@ from .multi_tensor_apply import multi_tensor_applier
def
print_rank_0
(
msg
:
str
,
logger
=
None
):
"""Print messages and save logs(optional). This is executed only if you are the rank-0 gpu.
:param msg: A string message to output
:type msg: str
:param
logger
: Python logger object, defaults to None
:type logger: optional
Args:
msg (str): A string message to output.
logger
(:class:`colossalai.logging.DistributedLogger`, optional):
The logger to record the message, defaults to None.
"""
if
gpc
.
get_global_rank
()
==
0
:
if
logger
is
None
:
...
...
@@ -53,12 +53,15 @@ def free_port():
def
sync_model_param
(
model
,
parallel_mode
):
"""Make sure data parameters are consistent during Data Parallel Mode
r
"""Make sure data parameters are consistent during Data Parallel Mode
.
:param model: A pyTorch nn.model on whose parameters you check the consistency
:param parallel_mode: Parallel mode to be checked
:type model: torch.nn.Module
:type parallel_mode: colossalai.context.ParallelMode
Args:
model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel mode to be checked.
Note:
The parallel_mode should be a member of ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
if
gpc
.
is_initialized
(
parallel_mode
)
and
gpc
.
get_world_size
(
parallel_mode
)
>
1
:
for
param
in
model
.
parameters
():
...
...
@@ -146,18 +149,19 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
"""Clips gradient norm of an iterable of parameters whose gradients are in fp32.
This is adapted from :func:`torch.nn.utils.clip_grad.clip_grad_norm_` and
added functionality to handle model parallel parameters. Note that
added functionality to handle model parallel parameters.
Note:
the gradients are modified in place.
:param parameters: An iterable of Tensors or a single Tensor that will have gradients normalized
:type parameters: (Iterable[Tensor] or Tensor)
:param max_norm: Max norm of the gradients
:type max_norm: float or int
:param norm_type: Type of the used p-norm. Can be ``'inf'`` for infinity norm.
:type norm_type: float or int
Args:
parameters (Iterable[:class:`torch.tensor`] or :class:`torch.tensor`):
An iterable of Tensors or a single Tensor that will have gradients normalized.
max_norm (Union[float, int]): Max norm of the gradients.
norm_type (Union[float, int, 'inf']): Type of the used p-norm. Can be ``'inf'`` for infinity norm.
:r
eturn:
Total norm of the parameters (viewed as a single vector).
:rtype: float
R
eturn
s
:
float: Total norm of the parameters.
"""
if
isinstance
(
parameters
,
torch
.
Tensor
):
...
...
colossalai/utils/data_sampler/data_parallel_sampler.py
View file @
ec5086c4
...
...
@@ -19,18 +19,15 @@ T_co = TypeVar('T_co', covariant=True)
@
DATA_SAMPLERS
.
register_module
class
DataParallelSampler
(
Sampler
):
"""A data sampler for distributed data parallelism
:param dataset: A Dataset instance
:type dataset: torch.utils.data.Dataset
:param shuffle: Whether to shuffle data, defaults to False
:type shuffle: bool, optional
:param seed: The random seed, defaults to 0
:type seed: int, optional
:param drop_last: Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch
size. If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller,
defaults to False
:type drop_last: bool, optional
"""A data sampler for distributed data parallelism.
Args:
dataset (:class:`torch.utils.data.Dataset`): The Dataset for sampling.
shuffle (bool, optional): Whether to shuffle data, defaults to False.
seed (int, optional): The random seed used for sampling, defaults to 0.
drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
is not divisible by the batch size. If False and the size of dataset is not divisible by
the batch size, then the last batch will be smaller, defaults to False.
"""
def
__init__
(
self
,
...
...
@@ -104,8 +101,8 @@ class DataParallelSampler(Sampler):
use a different random ordering for each epoch. Otherwise, the next iteration of this
sampler will yield the same ordering.
:param epoch: Epoch number.
:type
epoch
:
int
Args:
epoch
(
int
): Epoch number.
"""
self
.
epoch
=
epoch
...
...
@@ -118,29 +115,27 @@ def get_dataloader(dataset,
pin_memory
=
False
,
num_workers
=
0
,
**
kwargs
):
"""Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)
.. note:: When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
on the 1st stage and label on the last stage
:param dataset: A :class:`torch.utils.data.Dataset` object
:param shuffle: Whether to shuffle the dataset
:param seed: Random worker seed, defaults to 1024
:param add_sampler: Add DistributedDataParallelSampelr to the dataset
:param drop_last: Drop the last incomplete batch of data
:param pin_memory: Whether to pin memory address in CPU memory
:param num_workers: Number of worker threads for this dataloader
:type dataset: :class:`torch.utils.data.Dataset`
:type shuffle: bool, optional. Default is False
:type seed: int, optional. Default is 1024
:type add_sampler: bool, optional. Default is True
:type drop_last: bool, optional. Default is False
:type pin_memory: bool, optional. Default is False
:type num_workers: int, optional. Default is 0
:return: A object of :class:`torch.utils.data.DataLoader`
:rtype: :class:`torch.utils.data.DataLoader`
r
"""Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)
Note:
When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
on the 1st stage and label on the last stage.
Args:
dataset (:class:`torch.utils.data.Dataset`): The dataset to be loaded.
shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
seed (int, optional): Random worker seed for sampling, defaults to 1024.
add_sampler (bool, optional): Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True.
drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
is not divisible by the batch size. If False and the size of dataset is not divisible by
the batch size, then the last batch will be smaller, defaults to False.
pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
`DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.
Returns:
:class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
"""
_kwargs
=
kwargs
.
copy
()
...
...
colossalai/utils/gradient_accumulation/__init__.py
View file @
ec5086c4
...
...
@@ -13,19 +13,25 @@ def accumulate_gradient(model: nn.Module,
accumulate_size
:
int
,
gradient_handlers
:
List
[
BaseGradientHandler
]
=
None
,
lr_scheduler
:
_LRScheduler
=
None
):
"""
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param dataloader: your dataloader object
:type dataloader: Iterable
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
:param gradient_handlers: list of gradient handler objects. Default is None
:type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
:param lr_scheduler: your lr scheduler object. Default is None
:type lr_scheduler: `torch.optim.lr_scheduler._LRScheduler`
r
"""Turning model, optimizer, dataloader into corresponding object for gradient accumulation.
Args:
model (:class:`torch.nn.Module`): your model object for gradient accumulation.
optimizer (:class:`torch.optim.Optimizer`): your optimizer object for gradient accumulation.
dataloader (:class:`torch.utils.data.DataLoader` or iterable objects):
your dataloader object, would be called like iter(dataloader)
accumulate_size (int): the number of steps to accumulate gradients
gradient_handlers (List[:class:`colossalai.engine.BaseGradientHandler`]):
list of gradient handler objects. Default is None.
lr_scheduler (`torch.optim.lr_scheduler` or `colossalai.nn.lr_scheduler`):
your ``lr_scheduler`` object for gradient accumulation. Defaults to None.
More details about `gradient_handlers` could be found in
`Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.
More details about `lr_scheduler` could be found in
`lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_ and
`how to adjust learning rate <https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate>`_.
"""
optimizer
=
GradAccumOptimizer
(
optimizer
,
accumulate_size
=
accumulate_size
,
model
=
model
)
dataloader
=
GradAccumDataloader
(
dataloader
,
accumulate_size
=
accumulate_size
)
...
...
colossalai/utils/gradient_accumulation/_gradient_accumulation.py
View file @
ec5086c4
...
...
@@ -15,15 +15,13 @@ from colossalai.engine import BaseGradientHandler
class
GradAccumOptimizer
(
ColossalaiOptimizer
):
"""A wrapper for the optimizer to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param optim: Your optimizer object
:type optim: :class:`torch.optim.Optimizer`
:param accumulate_size: The number of steps to accumulate gradients
:type accumulate_size: int
:param model: Your model object to check if it is DDP for special handling of no_sync() context
:type model: :class:`torch.nn.Module`
before accumulation size is reached.
Args:
optim (:class:`torch.optim.Optimizer`): Your optimizer object for gradient accumulation.
accumulate_size (int): The number of steps to accumulate gradients.
model (:class:`torch.nn.Module`):
Your model object to check if it is DistributedDataParallel for special handling of no_sync() context.
"""
def
__init__
(
self
,
optim
:
Optimizer
,
accumulate_size
:
int
,
model
:
nn
.
Module
=
None
):
...
...
@@ -76,18 +74,18 @@ class GradAccumOptimizer(ColossalaiOptimizer):
class
GradAccumDataloader
:
"""A wrapper for dataloder to enable gradient accumulation by dropping the last incomplete steps.
"""A wrapper for datalo
a
der to enable gradient accumulation by dropping the last incomplete steps.
For example, if a dataloader has 10 batches of data and accumulate size is 4. The model paramters will
be update only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
Note:
The dataloader would drop the last incomplete steps for gradient accumulation.
For example, if a dataloader has 10 batches of data and accumulate size is 4. The model parameters will
be updated only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
(e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.
:param dataloader: Your dataloader object
:type dataloader: Iterable
:param accumulate_size: The number of steps to accumulate gradients
:type accumulate_size: int
Args:
dataloader (``Iterable``): Your dataloader object for gradient accumulation.
accumulate_size (int): The number of steps to accumulate gradients.
"""
def
__init__
(
self
,
dataloader
:
Iterable
,
accumulate_size
:
int
)
->
None
:
...
...
@@ -125,13 +123,12 @@ class GradAccumDataloader:
class
GradAccumLrSchedulerByStep
(
_LRScheduler
):
"""A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param lr_scheduler: Your lr scheduler object
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
:param accumulate_size: The number of steps to accumulate gradients
:type accumulate_size: int
before accumulation size is reached.
Args:
lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`):
Your ``lr_scheduler`` object for gradient accumulation.
accumulate_size (int): The number of steps to accumulate gradients.
"""
def
__init__
(
self
,
lr_scheduler
:
_LRScheduler
,
accumulate_size
:
int
)
->
None
:
...
...
@@ -171,13 +168,16 @@ class GradAccumLrSchedulerByStep(_LRScheduler):
class
GradAccumGradientHandler
:
"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
before accumulation size is reached
r
"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
before accumulation size is reached.
Args:
grad_handler (:class:`colossalai.engine.BaseGradientHandler`):
Your ``gradient_handler`` object for gradient accumulation, would be called when achieving `accumulate_size`.
accumulate_size (int): The number of steps to accumulate gradients.
:param grad_handler: Your gradient handler object
:type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
:param accumulate_size: The number of steps to accumulate gradients
:type accumulate_size: int
More details about ``gradient_handlers`` could be found in
`Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.
"""
...
...
colossalai/utils/memory_utils/memory_monitor.py
View file @
ec5086c4
...
...
@@ -14,12 +14,13 @@ from typing import Optional
def
colo_cuda_memory_used
(
device
:
Optional
[
torch
.
device
]
=
None
)
->
int
:
"""
Get the free memory info of device.
:param device: a torch device instance or None
:type device: Optional[torch.device]
:return: current memory usage, sized by Byte
:rtype: int
"""Get the free memory info of device.
Args:
device (Optional[``torch.device``]): a torch device instance or None. Defaults to None.
Returns:
int: current memory usage, sized by Byte.
"""
if
device
:
assert
device
.
type
==
'cuda'
...
...
@@ -34,7 +35,7 @@ def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
def
bytes_to_GB
(
val
,
decimal
=
2
):
"""A byte-to-Gigabyte converter, default
ly
using binary notation.
"""A byte-to-Gigabyte converter, default using binary notation.
:param val: X bytes to convert
:return: X' GB
...
...
@@ -43,7 +44,7 @@ def bytes_to_GB(val, decimal=2):
def
bytes_to_MB
(
val
,
decimal
=
2
):
"""A byte-to-Megabyte converter, default
ly
using binary notation.
"""A byte-to-Megabyte converter, default using binary notation.
:param val: X bytes to convert
:return: X' MB
...
...
@@ -54,13 +55,13 @@ def bytes_to_MB(val, decimal=2):
def
report_memory_usage
(
message
,
logger
=
None
,
report_cpu
=
False
):
"""Calculate and print RAM usage (in GB)
:param message: A prefix message to add in the log
:type message: str
:param
logger
: An instance of
:class:`colossalai.logging.DistributedLogger`
:type logger: :class:`colossalai.logging.DistributedLogger`, optional
:param report_cpu: Whether to report CPU memory
:type report_cpu: bool, optional
:raises
EnvironmentError: Raise error if no distributed environment has been initialized
Args:
message (str): A prefix message to add in the log.
logger
(
:class:`colossalai.logging.DistributedLogger`
): The logger used to record memory information.
report_cpu (bool, optional): Whether to report CPU memory.
Raises:
EnvironmentError: Raise error if no distributed environment has been initialized
.
"""
if
not
gpc
.
is_initialized
(
ParallelMode
.
GLOBAL
):
raise
EnvironmentError
(
"No distributed environment is initialized"
)
...
...
colossalai/utils/moe.py
View file @
ec5086c4
...
...
@@ -12,8 +12,8 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]
size of every parameter. Since the parameters in data parallelism is replicated
in each GPU, we set their ep_size to 1.
:param model: A pyTorch nn.model from which we get dict
:type
model
:
torch.nn.Module
Args:
model
(:class:`
torch.nn.Module
`): A pyTorch `nn.Module` from which we get dict.
"""
epsize_param_dict
=
dict
()
for
param
in
model
.
parameters
():
...
...
@@ -29,10 +29,10 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]
def
sync_moe_model_param
(
model
:
nn
.
Module
):
"""Make sure model parameters are consistent in MoE parallel context
"""Make sure model parameters are consistent in MoE parallel context
.
:param model: A pyTorch nn.model on whose parameters you check the consistency
:type
model
:
torch.nn.Module
Args:
model
(:class:`
torch.nn.Module
`): A pyTorch model on whose parameters you check the consistency.
"""
if
is_using_ddp
():
...
...
colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
View file @
ec5086c4
...
...
@@ -3,10 +3,10 @@
class
MultiTensorApply
(
object
):
"""
Apply an operation to a list of tensors efficiently
Apply an operation to a list of tensors efficiently
.
:param chunk_size: Size of a chunk
:type
chunk_size
: int
Args:
chunk_size
(int): Size of a chunk.
"""
available
=
False
...
...
colossalai/utils/tensor_detector/tensor_detector.py
View file @
ec5086c4
...
...
@@ -9,6 +9,7 @@ from collections import defaultdict
LINE_WIDTH
=
108
LINE
=
'-'
*
LINE_WIDTH
+
'
\n
'
class
TensorDetector
():
def
__init__
(
self
,
show_info
:
bool
=
True
,
...
...
@@ -16,17 +17,14 @@ class TensorDetector():
include_cpu
:
bool
=
False
,
module
:
Optional
[
nn
.
Module
]
=
None
):
"""This class is an detector to detect tensor on different devices.
:param show_info: whether to print the info on screen, default True
:type show_info: bool
:param log: the file name to save the log
:type log: str
:param include_cpu: whether to detect tensor on cpu, default False
:type include_cpu: bool
:param module: when sending an `nn.Module` it, the detector can name the tensors detected better
:type module: Optional[nn.Module]
"""This class is a detector to detect tensor on different devices.
Args:
show_info (bool, optional): whether to print the info on screen, default True.
log (str, optional): the file name to save the log. Defaults to None.
include_cpu (bool, optional): whether to detect tensor on cpu, default False.
module (Optional[:class:`nn.Module`]): when sending an ``nn.Module`` object,
the detector can name the tensors detected better.
"""
self
.
show_info
=
show_info
self
.
log
=
log
...
...
@@ -49,7 +47,6 @@ class TensorDetector():
self
.
tensor_info
[
id
(
param
)].
append
(
param
.
dtype
)
self
.
tensor_info
[
id
(
param
)].
append
(
self
.
get_tensor_mem
(
param
))
def
get_tensor_mem
(
self
,
tensor
):
# calculate the memory occupied by a tensor
memory_size
=
tensor
.
element_size
()
*
tensor
.
storage
().
size
()
...
...
@@ -58,7 +55,6 @@ class TensorDetector():
memory_size
+=
grad_memory_size
return
self
.
mem_format
(
memory_size
)
def
mem_format
(
self
,
real_memory_size
):
# format the tensor memory into a reasonable magnitude
if
real_memory_size
>=
2
**
30
:
...
...
@@ -69,7 +65,6 @@ class TensorDetector():
return
str
(
real_memory_size
/
(
2
**
10
))
+
' KB'
return
str
(
real_memory_size
)
+
' B'
def
collect_tensors_state
(
self
):
for
obj
in
gc
.
get_objects
():
if
torch
.
is_tensor
(
obj
):
...
...
@@ -116,7 +111,6 @@ class TensorDetector():
if
obj
.
device
not
in
self
.
devices
:
self
.
devices
.
append
(
obj
.
device
)
def
print_tensors_state
(
self
):
template_format
=
'{:3s}{:<30s}{:>10s}{:>20s}{:>10s}{:>20s}{:>15s}'
self
.
info
+=
LINE
...
...
@@ -174,7 +168,6 @@ class TensorDetector():
with
open
(
self
.
log
+
'.log'
,
'a'
)
as
f
:
f
.
write
(
self
.
info
)
def
detect
(
self
,
include_cpu
=
False
):
self
.
include_cpu
=
include_cpu
self
.
collect_tensors_state
()
...
...
colossalai/utils/timer.py
View file @
ec5086c4
...
...
@@ -25,7 +25,7 @@ class Timer:
return
time
.
time
()
def
start
(
self
):
"""Fi
s
rtly synchronize cuda, reset the clock and then start the timer.
"""Fir
s
tly synchronize cuda, reset the clock and then start the timer.
"""
self
.
_elapsed
=
0
synchronize
()
...
...
@@ -40,10 +40,11 @@ class Timer:
def
stop
(
self
,
keep_in_history
:
bool
=
False
):
"""Stop the timer and record the start-stop time interval.
:param keep_in_history: Whether does it record into history each start-stop interval, defaults to False
:type keep_in_history: bool, optional
:return: Start-stop interval
:rtype: int
Args:
keep_in_history (bool, optional): Whether to record each start-stop interval
into history, defaults to False.
Returns:
int: Start-stop interval.
"""
synchronize
()
end_time
=
time
.
time
()
...
...
@@ -57,26 +58,27 @@ class Timer:
def
get_history_mean
(
self
):
"""Mean of all history start-stop time intervals.
:r
eturn
: Mean of time interval
s
:rtype: int
R
eturns
:
int: Mean of time intervals
"""
return
sum
(
self
.
_history
)
/
len
(
self
.
_history
)
def
get_history_sum
(
self
):
"""Add up all the start-stop time intervals.
:r
eturn
: Sum of time interval
s
:rtype: int
R
eturns
:
int: Sum of time intervals.
"""
return
sum
(
self
.
_history
)
def
get_elapsed_time
(
self
):
"""Return the last start-stop time interval.
.. note:: Use it only when timer is not in progress
Returns:
int: The last time interval.
:return: The last time interval
:rtype: int
Note:
Use it only when timer is not in progress
"""
assert
not
self
.
_started
,
'Timer is still in progress'
return
self
.
_elapsed
...
...
@@ -90,10 +92,10 @@ class Timer:
class
MultiTimer
:
"""An object contains multiple timers
"""An object contains multiple timers
.
:param on: Whether the timer is enabled. Default is True
:type
on
:
bool, optional
Args:
on
(
bool, optional
): Whether the timer is enabled. Default is True.
"""
def
__init__
(
self
,
on
:
bool
=
True
):
...
...
@@ -101,10 +103,10 @@ class MultiTimer:
self
.
_timers
=
dict
()
def
start
(
self
,
name
:
str
):
"""Start namely one of the timers
"""Start namely one of the timers
.
:param name: Timer's key
:type
name
:
str
Args:
name
(
str
): Timer's key.
"""
if
self
.
_on
:
if
name
not
in
self
.
_timers
:
...
...
@@ -114,10 +116,9 @@ class MultiTimer:
def
stop
(
self
,
name
:
str
,
keep_in_history
:
bool
):
"""Stop namely one of the timers.
:param name: Timer's key
:type name: str
:param keep_in_history: Whether does it record into history each start-stop interval
:type keep_in_history: bool
Args:
name (str): Timer's key.
keep_in_history (bool): Whether to record each start-stop interval into history.
"""
if
self
.
_on
:
return
self
.
_timers
[
name
].
stop
(
keep_in_history
)
...
...
@@ -127,17 +128,19 @@ class MultiTimer:
def
get_timer
(
self
,
name
):
"""Get timer by its name (from multitimer)
:param name: Timer's key
:return: Timer with the name you give correctly
:rtype: Timer
Args:
name (str): Timer's key.
Returns:
:class:`colossalai.utils.Timer`: The timer registered under the given name.
"""
return
self
.
_timers
[
name
]
def
reset
(
self
,
name
=
None
):
"""Reset timers.
:param name: If name is designated, the named timer will be reset and others will not, defaults to None
:type name: optional
Args:
name (str, optional): If name is designated, the named timer will be reset
and others will not, defaults to None.
"""
if
self
.
_on
:
if
name
is
not
None
:
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment