ModelZoo / InstructBLIP_pytorch / Commits / c04f261a

Commit c04f261a, authored Aug 22, 2024 by dongchy920

    InstruceBLIP

Pipeline #1594: canceled with stages
Changes: 421 | Pipelines: 1

Showing 20 changed files with 3495 additions and 0 deletions (+3495 / -0)
lavis/common/annotator/uniformer/mmcv/parallel/scatter_gather.py           +59  -0
lavis/common/annotator/uniformer/mmcv/parallel/utils.py                    +20  -0
lavis/common/annotator/uniformer/mmcv/runner/__init__.py                   +47  -0
lavis/common/annotator/uniformer/mmcv/runner/base_module.py               +195  -0
lavis/common/annotator/uniformer/mmcv/runner/base_runner.py               +542  -0
lavis/common/annotator/uniformer/mmcv/runner/builder.py                    +24  -0
lavis/common/annotator/uniformer/mmcv/runner/checkpoint.py                +707  -0
lavis/common/annotator/uniformer/mmcv/runner/default_constructor.py        +44  -0
lavis/common/annotator/uniformer/mmcv/runner/dist_utils.py                +164  -0
lavis/common/annotator/uniformer/mmcv/runner/epoch_based_runner.py        +187  -0
lavis/common/annotator/uniformer/mmcv/runner/fp16_utils.py                +410  -0
lavis/common/annotator/uniformer/mmcv/runner/hooks/__init__.py             +29  -0
lavis/common/annotator/uniformer/mmcv/runner/hooks/checkpoint.py          +167  -0
lavis/common/annotator/uniformer/mmcv/runner/hooks/closure.py              +11  -0
lavis/common/annotator/uniformer/mmcv/runner/hooks/ema.py                  +89  -0
lavis/common/annotator/uniformer/mmcv/runner/hooks/evaluation.py          +509  -0
lavis/common/annotator/uniformer/mmcv/runner/hooks/hook.py                 +92  -0
lavis/common/annotator/uniformer/mmcv/runner/hooks/iter_timer.py           +18  -0
lavis/common/annotator/uniformer/mmcv/runner/hooks/logger/__init__.py      +15  -0
lavis/common/annotator/uniformer/mmcv/runner/hooks/logger/base.py         +166  -0

Too many changes to show. To preserve performance only 421 of 421+ files are displayed.
lavis/common/annotator/uniformer/mmcv/parallel/scatter_gather.py  (new file, 0 → 100644)

# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch.nn.parallel._functions import Scatter as OrigScatter

from ._functions import Scatter
from .data_container import DataContainer


def scatter(inputs, target_gpus, dim=0):
    """Scatter inputs to target gpus.

    The only difference from original :func:`scatter` is to add support for
    :type:`~mmcv.parallel.DataContainer`.
    """

    def scatter_map(obj):
        if isinstance(obj, torch.Tensor):
            if target_gpus != [-1]:
                return OrigScatter.apply(target_gpus, None, dim, obj)
            else:
                # for CPU inference we use self-implemented scatter
                return Scatter.forward(target_gpus, obj)
        if isinstance(obj, DataContainer):
            if obj.cpu_only:
                return obj.data
            else:
                return Scatter.forward(target_gpus, obj.data)
        if isinstance(obj, tuple) and len(obj) > 0:
            return list(zip(*map(scatter_map, obj)))
        if isinstance(obj, list) and len(obj) > 0:
            out = list(map(list, zip(*map(scatter_map, obj))))
            return out
        if isinstance(obj, dict) and len(obj) > 0:
            out = list(map(type(obj), zip(*map(scatter_map, obj.items()))))
            return out
        return [obj for targets in target_gpus]

    # After scatter_map is called, a scatter_map cell will exist. This cell
    # has a reference to the actual function scatter_map, which has references
    # to a closure that has a reference to the scatter_map cell (because the
    # fn is recursive). To avoid this reference cycle, we set the function to
    # None, clearing the cell
    try:
        return scatter_map(inputs)
    finally:
        scatter_map = None


def scatter_kwargs(inputs, kwargs, target_gpus, dim=0):
    """Scatter with support for kwargs dictionary."""
    inputs = scatter(inputs, target_gpus, dim) if inputs else []
    kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
    if len(inputs) < len(kwargs):
        inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
    elif len(kwargs) < len(inputs):
        kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])
    inputs = tuple(inputs)
    kwargs = tuple(kwargs)
    return inputs, kwargs
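
A minimal usage sketch (not part of the commit) of scatter_kwargs for CPU inference; the import path and the presence of the vendored `_functions`/`data_container` modules are assumptions based on the tree added here.

# Hedged sketch: scatter model args without CUDA by targeting GPU id -1.
import torch

from lavis.common.annotator.uniformer.mmcv.parallel.scatter_gather import scatter_kwargs

inputs = (torch.randn(2, 3),)        # positional args for the model
kwargs = dict(return_loss=False)     # keyword args for the model
# target_gpus=[-1] routes tensors through the self-implemented Scatter,
# keeping them on CPU instead of calling the CUDA scatter.
scattered_inputs, scattered_kwargs = scatter_kwargs(
    inputs, kwargs, target_gpus=[-1], dim=0)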
lavis/common/annotator/uniformer/mmcv/parallel/utils.py  (new file, 0 → 100644)

# Copyright (c) OpenMMLab. All rights reserved.
from .registry import MODULE_WRAPPERS


def is_module_wrapper(module):
    """Check if a module is a module wrapper.

    The following 3 modules in MMCV (and their subclasses) are regarded as
    module wrappers: DataParallel, DistributedDataParallel,
    MMDistributedDataParallel (the deprecated version). You may add your own
    module wrapper by registering it to mmcv.parallel.MODULE_WRAPPERS.

    Args:
        module (nn.Module): The module to be checked.

    Returns:
        bool: True if the input module is a module wrapper.
    """
    module_wrappers = tuple(MODULE_WRAPPERS.module_dict.values())
    return isinstance(module, module_wrappers)
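
A short sketch (not part of the commit) of how is_module_wrapper behaves; it assumes the vendored registry pre-registers torch's DataParallel/DistributedDataParallel, as the docstring above states.

import torch.nn as nn

from lavis.common.annotator.uniformer.mmcv.parallel import is_module_wrapper

model = nn.Linear(8, 2)
wrapped = nn.DataParallel(model)
assert not is_module_wrapper(model)    # a bare nn.Module is not a wrapper
assert is_module_wrapper(wrapped)      # DataParallel is a registered wrapper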
lavis/common/annotator/uniformer/mmcv/runner/__init__.py  (new file, 0 → 100644)

# Copyright (c) OpenMMLab. All rights reserved.
from .base_module import BaseModule, ModuleList, Sequential
from .base_runner import BaseRunner
from .builder import RUNNERS, build_runner
from .checkpoint import (CheckpointLoader, _load_checkpoint,
                         _load_checkpoint_with_prefix, load_checkpoint,
                         load_state_dict, save_checkpoint, weights_to_cpu)
from .default_constructor import DefaultRunnerConstructor
from .dist_utils import (allreduce_grads, allreduce_params, get_dist_info,
                         init_dist, master_only)
from .epoch_based_runner import EpochBasedRunner, Runner
from .fp16_utils import LossScaler, auto_fp16, force_fp32, wrap_fp16_model
from .hooks import (HOOKS, CheckpointHook, ClosureHook, DistEvalHook,
                    DistSamplerSeedHook, DvcliveLoggerHook, EMAHook, EvalHook,
                    Fp16OptimizerHook, GradientCumulativeFp16OptimizerHook,
                    GradientCumulativeOptimizerHook, Hook, IterTimerHook,
                    LoggerHook, LrUpdaterHook, MlflowLoggerHook,
                    NeptuneLoggerHook, OptimizerHook, PaviLoggerHook,
                    SyncBuffersHook, TensorboardLoggerHook, TextLoggerHook,
                    WandbLoggerHook)
from .iter_based_runner import IterBasedRunner, IterLoader
from .log_buffer import LogBuffer
from .optimizer import (OPTIMIZER_BUILDERS, OPTIMIZERS,
                        DefaultOptimizerConstructor, build_optimizer,
                        build_optimizer_constructor)
from .priority import Priority, get_priority
from .utils import get_host_info, get_time_str, obj_from_dict, set_random_seed

__all__ = [
    'BaseRunner', 'Runner', 'EpochBasedRunner', 'IterBasedRunner', 'LogBuffer',
    'HOOKS', 'Hook', 'CheckpointHook', 'ClosureHook', 'LrUpdaterHook',
    'OptimizerHook', 'IterTimerHook', 'DistSamplerSeedHook', 'LoggerHook',
    'PaviLoggerHook', 'TextLoggerHook', 'TensorboardLoggerHook',
    'NeptuneLoggerHook', 'WandbLoggerHook', 'MlflowLoggerHook',
    'DvcliveLoggerHook', '_load_checkpoint', 'load_state_dict',
    'load_checkpoint', 'weights_to_cpu', 'save_checkpoint', 'Priority',
    'get_priority', 'get_host_info', 'get_time_str', 'obj_from_dict',
    'init_dist', 'get_dist_info', 'master_only', 'OPTIMIZER_BUILDERS',
    'OPTIMIZERS', 'DefaultOptimizerConstructor', 'build_optimizer',
    'build_optimizer_constructor', 'IterLoader', 'set_random_seed',
    'auto_fp16', 'force_fp32', 'wrap_fp16_model', 'Fp16OptimizerHook',
    'SyncBuffersHook', 'EMAHook', 'build_runner', 'RUNNERS', 'allreduce_grads',
    'allreduce_params', 'LossScaler', 'CheckpointLoader', 'BaseModule',
    '_load_checkpoint_with_prefix', 'EvalHook', 'DistEvalHook', 'Sequential',
    'ModuleList', 'GradientCumulativeOptimizerHook',
    'GradientCumulativeFp16OptimizerHook', 'DefaultRunnerConstructor'
]
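
For orientation only (not part of the diff): downstream code imports the whole training API from this one package. The import path assumes the vendored tree is importable as a Python package.

from lavis.common.annotator.uniformer.mmcv.runner import (EpochBasedRunner,
                                                          build_runner,
                                                          get_dist_info,
                                                          load_checkpoint)

rank, world_size = get_dist_info()   # (0, 1) when not launched distributed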
lavis/common/annotator/uniformer/mmcv/runner/base_module.py  (new file, 0 → 100644)

# Copyright (c) OpenMMLab. All rights reserved.
import copy
import warnings
from abc import ABCMeta
from collections import defaultdict
from logging import FileHandler

import torch.nn as nn

from annotator.uniformer.mmcv.runner.dist_utils import master_only
from annotator.uniformer.mmcv.utils.logging import (get_logger,
                                                    logger_initialized,
                                                    print_log)


class BaseModule(nn.Module, metaclass=ABCMeta):
    """Base module for all modules in openmmlab.

    ``BaseModule`` is a wrapper of ``torch.nn.Module`` with additional
    functionality of parameter initialization. Compared with
    ``torch.nn.Module``, ``BaseModule`` mainly adds three attributes.

    - ``init_cfg``: the config to control the initialization.
    - ``init_weights``: The function of parameter initialization and recording
      initialization information.
    - ``_params_init_info``: Used to track the parameter initialization
      information. This attribute only exists during executing the
      ``init_weights``.

    Args:
        init_cfg (dict, optional): Initialization config dict.
    """

    def __init__(self, init_cfg=None):
        """Initialize BaseModule, inherited from `torch.nn.Module`"""

        # NOTE init_cfg can be defined in different levels, but init_cfg
        # in low levels has a higher priority.

        super(BaseModule, self).__init__()
        # define default value of init_cfg instead of hard code
        # in init_weights() function
        self._is_init = False

        self.init_cfg = copy.deepcopy(init_cfg)

        # Backward compatibility in derived classes
        # if pretrained is not None:
        #     warnings.warn('DeprecationWarning: pretrained is a deprecated \
        #         key, please consider using init_cfg')
        #     self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)

    @property
    def is_init(self):
        return self._is_init

    def init_weights(self):
        """Initialize the weights."""

        is_top_level_module = False
        # check if it is top-level module
        if not hasattr(self, '_params_init_info'):
            # The `_params_init_info` is used to record the initialization
            # information of the parameters
            # the key should be the obj:`nn.Parameter` of model and the value
            # should be a dict containing
            # - init_info (str): The string that describes the initialization.
            # - tmp_mean_value (FloatTensor): The mean of the parameter,
            #       which indicates whether the parameter has been modified.
            # this attribute would be deleted after all parameters
            # is initialized.
            self._params_init_info = defaultdict(dict)
            is_top_level_module = True

            # Initialize the `_params_init_info`,
            # When detecting the `tmp_mean_value` of
            # the corresponding parameter is changed, update related
            # initialization information
            for name, param in self.named_parameters():
                self._params_init_info[param][
                    'init_info'] = f'The value is the same before and ' \
                                   f'after calling `init_weights` ' \
                                   f'of {self.__class__.__name__} '
                self._params_init_info[param][
                    'tmp_mean_value'] = param.data.mean()

            # pass `params_init_info` to all submodules
            # All submodules share the same `params_init_info`,
            # so it will be updated when parameters are
            # modified at any level of the model.
            for sub_module in self.modules():
                sub_module._params_init_info = self._params_init_info

        # Get the initialized logger, if not exist,
        # create a logger named `mmcv`
        logger_names = list(logger_initialized.keys())
        logger_name = logger_names[0] if logger_names else 'mmcv'

        from ..cnn import initialize
        from ..cnn.utils.weight_init import update_init_info
        module_name = self.__class__.__name__
        if not self._is_init:
            if self.init_cfg:
                print_log(
                    f'initialize {module_name} with init_cfg {self.init_cfg}',
                    logger=logger_name)
                initialize(self, self.init_cfg)
                if isinstance(self.init_cfg, dict):
                    # prevent the parameters of
                    # the pre-trained model
                    # from being overwritten by
                    # the `init_weights`
                    if self.init_cfg['type'] == 'Pretrained':
                        return

            for m in self.children():
                if hasattr(m, 'init_weights'):
                    m.init_weights()
                    # users may overload the `init_weights`
                    update_init_info(
                        m,
                        init_info=f'Initialized by '
                        f'user-defined `init_weights`'
                        f' in {m.__class__.__name__} ')

            self._is_init = True
        else:
            warnings.warn(f'init_weights of {self.__class__.__name__} has '
                          f'been called more than once.')

        if is_top_level_module:
            self._dump_init_info(logger_name)

            for sub_module in self.modules():
                del sub_module._params_init_info

    @master_only
    def _dump_init_info(self, logger_name):
        """Dump the initialization information to a file named
        `initialization.log.json` in workdir.

        Args:
            logger_name (str): The name of logger.
        """

        logger = get_logger(logger_name)

        with_file_handler = False
        # dump the information to the logger file if there is a `FileHandler`
        for handler in logger.handlers:
            if isinstance(handler, FileHandler):
                handler.stream.write(
                    'Name of parameter - Initialization information\n')
                for name, param in self.named_parameters():
                    handler.stream.write(
                        f'\n{name} - {param.shape}: '
                        f"\n{self._params_init_info[param]['init_info']} \n")
                handler.stream.flush()
                with_file_handler = True
        if not with_file_handler:
            for name, param in self.named_parameters():
                print_log(
                    f'\n{name} - {param.shape}: '
                    f"\n{self._params_init_info[param]['init_info']} \n ",
                    logger=logger_name)

    def __repr__(self):
        s = super().__repr__()
        if self.init_cfg:
            s += f'\ninit_cfg={self.init_cfg}'
        return s


class Sequential(BaseModule, nn.Sequential):
    """Sequential module in openmmlab.

    Args:
        init_cfg (dict, optional): Initialization config dict.
    """

    def __init__(self, *args, init_cfg=None):
        BaseModule.__init__(self, init_cfg)
        nn.Sequential.__init__(self, *args)


class ModuleList(BaseModule, nn.ModuleList):
    """ModuleList in openmmlab.

    Args:
        modules (iterable, optional): an iterable of modules to add.
        init_cfg (dict, optional): Initialization config dict.
    """

    def __init__(self, modules=None, init_cfg=None):
        BaseModule.__init__(self, init_cfg)
        nn.ModuleList.__init__(self, modules)
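
A hedged sketch (not part of the commit) of the init_cfg workflow: subclass BaseModule, describe the initialization in a config dict, and let init_weights apply it. It assumes the vendored mmcv.cnn initializers (e.g. the 'Normal' type) are present, since init_weights dispatches to them.

import torch.nn as nn

from lavis.common.annotator.uniformer.mmcv.runner import BaseModule


class TinyHead(BaseModule):

    def __init__(self, init_cfg=dict(type='Normal', layer='Linear', std=0.01)):
        super().__init__(init_cfg)
        self.fc = nn.Linear(16, 4)


head = TinyHead()
head.init_weights()   # applies the Normal init and records per-parameter info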
lavis/common/annotator/uniformer/mmcv/runner/base_runner.py  (new file, 0 → 100644)

# Copyright (c) OpenMMLab. All rights reserved.
import copy
import logging
import os.path as osp
import warnings
from abc import ABCMeta, abstractmethod

import torch
from torch.optim import Optimizer

import annotator.uniformer.mmcv as mmcv
from ..parallel import is_module_wrapper
from .checkpoint import load_checkpoint
from .dist_utils import get_dist_info
from .hooks import HOOKS, Hook
from .log_buffer import LogBuffer
from .priority import Priority, get_priority
from .utils import get_time_str


class BaseRunner(metaclass=ABCMeta):
    """The base class of Runner, a training helper for PyTorch.

    All subclasses should implement the following APIs:

    - ``run()``
    - ``train()``
    - ``val()``
    - ``save_checkpoint()``

    Args:
        model (:obj:`torch.nn.Module`): The model to be run.
        batch_processor (callable): A callable method that process a data
            batch. The interface of this method should be
            `batch_processor(model, data, train_mode) -> dict`
        optimizer (dict or :obj:`torch.optim.Optimizer`): It can be either an
            optimizer (in most cases) or a dict of optimizers (in models that
            requires more than one optimizer, e.g., GAN).
        work_dir (str, optional): The working directory to save checkpoints
            and logs. Defaults to None.
        logger (:obj:`logging.Logger`): Logger used during training.
            Defaults to None. (The default value is just for backward
            compatibility)
        meta (dict | None): A dict records some import information such as
            environment info and seed, which will be logged in logger hook.
            Defaults to None.
        max_epochs (int, optional): Total training epochs.
        max_iters (int, optional): Total training iterations.
    """

    def __init__(self,
                 model,
                 batch_processor=None,
                 optimizer=None,
                 work_dir=None,
                 logger=None,
                 meta=None,
                 max_iters=None,
                 max_epochs=None):
        if batch_processor is not None:
            if not callable(batch_processor):
                raise TypeError('batch_processor must be callable, '
                                f'but got {type(batch_processor)}')
            warnings.warn('batch_processor is deprecated, please implement '
                          'train_step() and val_step() in the model instead.')
            # raise an error is `batch_processor` is not None and
            # `model.train_step()` exists.
            if is_module_wrapper(model):
                _model = model.module
            else:
                _model = model
            if hasattr(_model, 'train_step') or hasattr(_model, 'val_step'):
                raise RuntimeError(
                    'batch_processor and model.train_step()/model.val_step() '
                    'cannot be both available.')
        else:
            assert hasattr(model, 'train_step')

        # check the type of `optimizer`
        if isinstance(optimizer, dict):
            for name, optim in optimizer.items():
                if not isinstance(optim, Optimizer):
                    raise TypeError(
                        f'optimizer must be a dict of torch.optim.Optimizers, '
                        f'but optimizer["{name}"] is a {type(optim)}')
        elif not isinstance(optimizer, Optimizer) and optimizer is not None:
            raise TypeError(
                f'optimizer must be a torch.optim.Optimizer object '
                f'or dict or None, but got {type(optimizer)}')

        # check the type of `logger`
        if not isinstance(logger, logging.Logger):
            raise TypeError(f'logger must be a logging.Logger object, '
                            f'but got {type(logger)}')

        # check the type of `meta`
        if meta is not None and not isinstance(meta, dict):
            raise TypeError(
                f'meta must be a dict or None, but got {type(meta)}')

        self.model = model
        self.batch_processor = batch_processor
        self.optimizer = optimizer
        self.logger = logger
        self.meta = meta
        # create work_dir
        if mmcv.is_str(work_dir):
            self.work_dir = osp.abspath(work_dir)
            mmcv.mkdir_or_exist(self.work_dir)
        elif work_dir is None:
            self.work_dir = None
        else:
            raise TypeError('"work_dir" must be a str or None')

        # get model name from the model class
        if hasattr(self.model, 'module'):
            self._model_name = self.model.module.__class__.__name__
        else:
            self._model_name = self.model.__class__.__name__

        self._rank, self._world_size = get_dist_info()
        self.timestamp = get_time_str()
        self.mode = None
        self._hooks = []
        self._epoch = 0
        self._iter = 0
        self._inner_iter = 0

        if max_epochs is not None and max_iters is not None:
            raise ValueError(
                'Only one of `max_epochs` or `max_iters` can be set.')

        self._max_epochs = max_epochs
        self._max_iters = max_iters
        # TODO: Redesign LogBuffer, it is not flexible and elegant enough
        self.log_buffer = LogBuffer()

    @property
    def model_name(self):
        """str: Name of the model, usually the module class name."""
        return self._model_name

    @property
    def rank(self):
        """int: Rank of current process. (distributed training)"""
        return self._rank

    @property
    def world_size(self):
        """int: Number of processes participating in the job.
        (distributed training)"""
        return self._world_size

    @property
    def hooks(self):
        """list[:obj:`Hook`]: A list of registered hooks."""
        return self._hooks

    @property
    def epoch(self):
        """int: Current epoch."""
        return self._epoch

    @property
    def iter(self):
        """int: Current iteration."""
        return self._iter

    @property
    def inner_iter(self):
        """int: Iteration in an epoch."""
        return self._inner_iter

    @property
    def max_epochs(self):
        """int: Maximum training epochs."""
        return self._max_epochs

    @property
    def max_iters(self):
        """int: Maximum training iterations."""
        return self._max_iters

    @abstractmethod
    def train(self):
        pass

    @abstractmethod
    def val(self):
        pass

    @abstractmethod
    def run(self, data_loaders, workflow, **kwargs):
        pass

    @abstractmethod
    def save_checkpoint(self,
                        out_dir,
                        filename_tmpl,
                        save_optimizer=True,
                        meta=None,
                        create_symlink=True):
        pass

    def current_lr(self):
        """Get current learning rates.

        Returns:
            list[float] | dict[str, list[float]]: Current learning rates of
                all param groups. If the runner has a dict of optimizers,
                this method will return a dict.
        """
        if isinstance(self.optimizer, torch.optim.Optimizer):
            lr = [group['lr'] for group in self.optimizer.param_groups]
        elif isinstance(self.optimizer, dict):
            lr = dict()
            for name, optim in self.optimizer.items():
                lr[name] = [group['lr'] for group in optim.param_groups]
        else:
            raise RuntimeError(
                'lr is not applicable because optimizer does not exist.')
        return lr

    def current_momentum(self):
        """Get current momentums.

        Returns:
            list[float] | dict[str, list[float]]: Current momentums of all
                param groups. If the runner has a dict of optimizers, this
                method will return a dict.
        """

        def _get_momentum(optimizer):
            momentums = []
            for group in optimizer.param_groups:
                if 'momentum' in group.keys():
                    momentums.append(group['momentum'])
                elif 'betas' in group.keys():
                    momentums.append(group['betas'][0])
                else:
                    momentums.append(0)
            return momentums

        if self.optimizer is None:
            raise RuntimeError(
                'momentum is not applicable because optimizer does not exist.')
        elif isinstance(self.optimizer, torch.optim.Optimizer):
            momentums = _get_momentum(self.optimizer)
        elif isinstance(self.optimizer, dict):
            momentums = dict()
            for name, optim in self.optimizer.items():
                momentums[name] = _get_momentum(optim)
        return momentums

    def register_hook(self, hook, priority='NORMAL'):
        """Register a hook into the hook list.

        The hook will be inserted into a priority queue, with the specified
        priority (See :class:`Priority` for details of priorities).
        For hooks with the same priority, they will be triggered in the same
        order as they are registered.

        Args:
            hook (:obj:`Hook`): The hook to be registered.
            priority (int or str or :obj:`Priority`): Hook priority.
                Lower value means higher priority.
        """
        assert isinstance(hook, Hook)
        if hasattr(hook, 'priority'):
            raise ValueError('"priority" is a reserved attribute for hooks')
        priority = get_priority(priority)
        hook.priority = priority
        # insert the hook to a sorted list
        inserted = False
        for i in range(len(self._hooks) - 1, -1, -1):
            if priority >= self._hooks[i].priority:
                self._hooks.insert(i + 1, hook)
                inserted = True
                break
        if not inserted:
            self._hooks.insert(0, hook)

    def register_hook_from_cfg(self, hook_cfg):
        """Register a hook from its cfg.

        Args:
            hook_cfg (dict): Hook config. It should have at least keys 'type'
              and 'priority' indicating its type and priority.

        Notes:
            The specific hook class to register should not use 'type' and
            'priority' arguments during initialization.
        """
        hook_cfg = hook_cfg.copy()
        priority = hook_cfg.pop('priority', 'NORMAL')
        hook = mmcv.build_from_cfg(hook_cfg, HOOKS)
        self.register_hook(hook, priority=priority)

    def call_hook(self, fn_name):
        """Call all hooks.

        Args:
            fn_name (str): The function name in each hook to be called, such as
                "before_train_epoch".
        """
        for hook in self._hooks:
            getattr(hook, fn_name)(self)

    def get_hook_info(self):
        # Get hooks info in each stage
        stage_hook_map = {stage: [] for stage in Hook.stages}
        for hook in self.hooks:
            try:
                priority = Priority(hook.priority).name
            except ValueError:
                priority = hook.priority
            classname = hook.__class__.__name__
            hook_info = f'({priority:<12}) {classname:<35}'
            for trigger_stage in hook.get_triggered_stages():
                stage_hook_map[trigger_stage].append(hook_info)

        stage_hook_infos = []
        for stage in Hook.stages:
            hook_infos = stage_hook_map[stage]
            if len(hook_infos) > 0:
                info = f'{stage}:\n'
                info += '\n'.join(hook_infos)
                info += '\n -------------------- '
                stage_hook_infos.append(info)
        return '\n'.join(stage_hook_infos)

    def load_checkpoint(self,
                        filename,
                        map_location='cpu',
                        strict=False,
                        revise_keys=[(r'^module.', '')]):
        return load_checkpoint(
            self.model,
            filename,
            map_location,
            strict,
            self.logger,
            revise_keys=revise_keys)

    def resume(self,
               checkpoint,
               resume_optimizer=True,
               map_location='default'):
        if map_location == 'default':
            if torch.cuda.is_available():
                device_id = torch.cuda.current_device()
                checkpoint = self.load_checkpoint(
                    checkpoint,
                    map_location=lambda storage, loc: storage.cuda(device_id))
            else:
                checkpoint = self.load_checkpoint(checkpoint)
        else:
            checkpoint = self.load_checkpoint(
                checkpoint, map_location=map_location)

        self._epoch = checkpoint['meta']['epoch']
        self._iter = checkpoint['meta']['iter']
        if self.meta is None:
            self.meta = {}
        self.meta.setdefault('hook_msgs', {})
        # load `last_ckpt`, `best_score`, `best_ckpt`, etc. for hook messages
        self.meta['hook_msgs'].update(checkpoint['meta'].get('hook_msgs', {}))

        # Re-calculate the number of iterations when resuming
        # models with different number of GPUs
        if 'config' in checkpoint['meta']:
            config = mmcv.Config.fromstring(
                checkpoint['meta']['config'], file_format='.py')
            previous_gpu_ids = config.get('gpu_ids', None)
            if previous_gpu_ids and len(previous_gpu_ids) > 0 and len(
                    previous_gpu_ids) != self.world_size:
                self._iter = int(self._iter * len(previous_gpu_ids) /
                                 self.world_size)
                self.logger.info('the iteration number is changed due to '
                                 'change of GPU number')

        # resume meta information meta
        self.meta = checkpoint['meta']

        if 'optimizer' in checkpoint and resume_optimizer:
            if isinstance(self.optimizer, Optimizer):
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            elif isinstance(self.optimizer, dict):
                for k in self.optimizer.keys():
                    self.optimizer[k].load_state_dict(
                        checkpoint['optimizer'][k])
            else:
                raise TypeError(
                    'Optimizer should be dict or torch.optim.Optimizer '
                    f'but got {type(self.optimizer)}')

        self.logger.info('resumed epoch %d, iter %d', self.epoch, self.iter)

    def register_lr_hook(self, lr_config):
        if lr_config is None:
            return
        elif isinstance(lr_config, dict):
            assert 'policy' in lr_config
            policy_type = lr_config.pop('policy')
            # If the type of policy is all in lower case, e.g., 'cyclic',
            # then its first letter will be capitalized, e.g., to be 'Cyclic'.
            # This is for the convenient usage of Lr updater.
            # Since this is not applicable for `
            # CosineAnnealingLrUpdater`,
            # the string will not be changed if it contains capital letters.
            if policy_type == policy_type.lower():
                policy_type = policy_type.title()
            hook_type = policy_type + 'LrUpdaterHook'
            lr_config['type'] = hook_type
            hook = mmcv.build_from_cfg(lr_config, HOOKS)
        else:
            hook = lr_config
        self.register_hook(hook, priority='VERY_HIGH')

    def register_momentum_hook(self, momentum_config):
        if momentum_config is None:
            return
        if isinstance(momentum_config, dict):
            assert 'policy' in momentum_config
            policy_type = momentum_config.pop('policy')
            # If the type of policy is all in lower case, e.g., 'cyclic',
            # then its first letter will be capitalized, e.g., to be 'Cyclic'.
            # This is for the convenient usage of momentum updater.
            # Since this is not applicable for
            # `CosineAnnealingMomentumUpdater`,
            # the string will not be changed if it contains capital letters.
            if policy_type == policy_type.lower():
                policy_type = policy_type.title()
            hook_type = policy_type + 'MomentumUpdaterHook'
            momentum_config['type'] = hook_type
            hook = mmcv.build_from_cfg(momentum_config, HOOKS)
        else:
            hook = momentum_config
        self.register_hook(hook, priority='HIGH')

    def register_optimizer_hook(self, optimizer_config):
        if optimizer_config is None:
            return
        if isinstance(optimizer_config, dict):
            optimizer_config.setdefault('type', 'OptimizerHook')
            hook = mmcv.build_from_cfg(optimizer_config, HOOKS)
        else:
            hook = optimizer_config
        self.register_hook(hook, priority='ABOVE_NORMAL')

    def register_checkpoint_hook(self, checkpoint_config):
        if checkpoint_config is None:
            return
        if isinstance(checkpoint_config, dict):
            checkpoint_config.setdefault('type', 'CheckpointHook')
            hook = mmcv.build_from_cfg(checkpoint_config, HOOKS)
        else:
            hook = checkpoint_config
        self.register_hook(hook, priority='NORMAL')

    def register_logger_hooks(self, log_config):
        if log_config is None:
            return
        log_interval = log_config['interval']
        for info in log_config['hooks']:
            logger_hook = mmcv.build_from_cfg(
                info, HOOKS, default_args=dict(interval=log_interval))
            self.register_hook(logger_hook, priority='VERY_LOW')

    def register_timer_hook(self, timer_config):
        if timer_config is None:
            return
        if isinstance(timer_config, dict):
            timer_config_ = copy.deepcopy(timer_config)
            hook = mmcv.build_from_cfg(timer_config_, HOOKS)
        else:
            hook = timer_config
        self.register_hook(hook, priority='LOW')

    def register_custom_hooks(self, custom_config):
        if custom_config is None:
            return
        if not isinstance(custom_config, list):
            custom_config = [custom_config]
        for item in custom_config:
            if isinstance(item, dict):
                self.register_hook_from_cfg(item)
            else:
                self.register_hook(item, priority='NORMAL')

    def register_profiler_hook(self, profiler_config):
        if profiler_config is None:
            return
        if isinstance(profiler_config, dict):
            profiler_config.setdefault('type', 'ProfilerHook')
            hook = mmcv.build_from_cfg(profiler_config, HOOKS)
        else:
            hook = profiler_config
        self.register_hook(hook)

    def register_training_hooks(self,
                                lr_config,
                                optimizer_config=None,
                                checkpoint_config=None,
                                log_config=None,
                                momentum_config=None,
                                timer_config=dict(type='IterTimerHook'),
                                custom_hooks_config=None):
        """Register default and custom hooks for training.

        Default and custom hooks include:

        +----------------------+-------------------------+
        | Hooks                | Priority                |
        +======================+=========================+
        | LrUpdaterHook        | VERY_HIGH (10)          |
        +----------------------+-------------------------+
        | MomentumUpdaterHook  | HIGH (30)               |
        +----------------------+-------------------------+
        | OptimizerStepperHook | ABOVE_NORMAL (40)       |
        +----------------------+-------------------------+
        | CheckpointSaverHook  | NORMAL (50)             |
        +----------------------+-------------------------+
        | IterTimerHook        | LOW (70)                |
        +----------------------+-------------------------+
        | LoggerHook(s)        | VERY_LOW (90)           |
        +----------------------+-------------------------+
        | CustomHook(s)        | defaults to NORMAL (50) |
        +----------------------+-------------------------+

        If custom hooks have same priority with default hooks, custom hooks
        will be triggered after default hooks.
        """
        self.register_lr_hook(lr_config)
        self.register_momentum_hook(momentum_config)
        self.register_optimizer_hook(optimizer_config)
        self.register_checkpoint_hook(checkpoint_config)
        self.register_timer_hook(timer_config)
        self.register_logger_hooks(log_config)
        self.register_custom_hooks(custom_hooks_config)
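
A hedged sketch (not part of the commit) of how hooks attach to a runner. It uses EpochBasedRunner (added elsewhere in this commit) as the concrete subclass; ToyModel, the SGD optimizer, and the work_dir path are stand-ins for illustration only.

import logging

import torch.nn as nn
import torch.optim as optim

from lavis.common.annotator.uniformer.mmcv.runner import (EpochBasedRunner,
                                                          IterTimerHook)


class ToyModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(2, 2)

    def train_step(self, data, optimizer):
        # BaseRunner requires a train_step() when no batch_processor is given
        return dict(loss=self.fc(data).sum())


model = ToyModel()
runner = EpochBasedRunner(
    model,
    optimizer=optim.SGD(model.parameters(), lr=0.01),
    work_dir='./work_dir',
    logger=logging.getLogger(),
    max_epochs=1)
runner.register_hook(IterTimerHook(), priority='LOW')
print(runner.get_hook_info())   # lists hooks per stage with their priorities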
lavis/common/annotator/uniformer/mmcv/runner/builder.py  (new file, 0 → 100644)

# Copyright (c) OpenMMLab. All rights reserved.
import copy

from ..utils import Registry

RUNNERS = Registry('runner')
RUNNER_BUILDERS = Registry('runner builder')


def build_runner_constructor(cfg):
    return RUNNER_BUILDERS.build(cfg)


def build_runner(cfg, default_args=None):
    runner_cfg = copy.deepcopy(cfg)
    constructor_type = runner_cfg.pop('constructor',
                                      'DefaultRunnerConstructor')
    runner_constructor = build_runner_constructor(
        dict(
            type=constructor_type,
            runner_cfg=runner_cfg,
            default_args=default_args))
    runner = runner_constructor()
    return runner
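
A hedged sketch (not part of the commit) of build_runner: the config names the runner type, the constructor defaults to DefaultRunnerConstructor, and `default_args` supplies whatever the runner's __init__ needs. The model and work_dir here are illustrative stand-ins.

import logging

import torch.nn as nn

from lavis.common.annotator.uniformer.mmcv.runner import build_runner

model = nn.Linear(2, 2)
# attach a train_step so BaseRunner's check passes in this toy setup
model.train_step = lambda data, optimizer: dict(loss=model(data).sum())

runner = build_runner(
    dict(type='EpochBasedRunner', max_epochs=10),
    default_args=dict(
        model=model,
        work_dir='./work_dir',
        logger=logging.getLogger()))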
lavis/common/annotator/uniformer/mmcv/runner/checkpoint.py  (new file, 0 → 100644)

# Copyright (c) OpenMMLab. All rights reserved.
import io
import os
import os.path as osp
import pkgutil
import re
import time
import warnings
from collections import OrderedDict
from importlib import import_module
from tempfile import TemporaryDirectory

import torch
import torchvision
from torch.optim import Optimizer
from torch.utils import model_zoo

import annotator.uniformer.mmcv as mmcv
from ..fileio import FileClient
from ..fileio import load as load_file
from ..parallel import is_module_wrapper
from ..utils import mkdir_or_exist
from .dist_utils import get_dist_info

ENV_MMCV_HOME = 'MMCV_HOME'
ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME'
DEFAULT_CACHE_DIR = '~/.cache'


def _get_mmcv_home():
    mmcv_home = os.path.expanduser(
        os.getenv(
            ENV_MMCV_HOME,
            os.path.join(
                os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv')))

    mkdir_or_exist(mmcv_home)
    return mmcv_home


def load_state_dict(module, state_dict, strict=False, logger=None):
    """Load state_dict to a module.

    This method is modified from :meth:`torch.nn.Module.load_state_dict`.
    Default value for ``strict`` is set to ``False`` and the message for
    param mismatch will be shown even if strict is False.

    Args:
        module (Module): Module that receives the state_dict.
        state_dict (OrderedDict): Weights.
        strict (bool): whether to strictly enforce that the keys
            in :attr:`state_dict` match the keys returned by this module's
            :meth:`~torch.nn.Module.state_dict` function. Default: ``False``.
        logger (:obj:`logging.Logger`, optional): Logger to log the error
            message. If not specified, print function will be used.
    """
    unexpected_keys = []
    all_missing_keys = []
    err_msg = []

    metadata = getattr(state_dict, '_metadata', None)
    state_dict = state_dict.copy()
    if metadata is not None:
        state_dict._metadata = metadata

    # use _load_from_state_dict to enable checkpoint version control
    def load(module, prefix=''):
        # recursively check parallel module in case that the model has a
        # complicated structure, e.g., nn.Module(nn.Module(DDP))
        if is_module_wrapper(module):
            module = module.module
        local_metadata = {} if metadata is None else metadata.get(
            prefix[:-1], {})
        module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                     all_missing_keys, unexpected_keys,
                                     err_msg)
        for name, child in module._modules.items():
            if child is not None:
                load(child, prefix + name + '.')

    load(module)
    load = None  # break load->load reference cycle

    # ignore "num_batches_tracked" of BN layers
    missing_keys = [
        key for key in all_missing_keys if 'num_batches_tracked' not in key
    ]

    if unexpected_keys:
        err_msg.append('unexpected key in source '
                       f'state_dict: {", ".join(unexpected_keys)}\n')
    if missing_keys:
        err_msg.append(
            f'missing keys in source state_dict: {", ".join(missing_keys)}\n')

    rank, _ = get_dist_info()
    if len(err_msg) > 0 and rank == 0:
        err_msg.insert(
            0, 'The model and loaded state dict do not match exactly\n')
        err_msg = '\n'.join(err_msg)
        if strict:
            raise RuntimeError(err_msg)
        elif logger is not None:
            logger.warning(err_msg)
        else:
            print(err_msg)


def get_torchvision_models():
    model_urls = dict()
    for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__):
        if ispkg:
            continue
        _zoo = import_module(f'torchvision.models.{name}')
        if hasattr(_zoo, 'model_urls'):
            _urls = getattr(_zoo, 'model_urls')
            model_urls.update(_urls)
    return model_urls


def get_external_models():
    mmcv_home = _get_mmcv_home()
    default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json')
    default_urls = load_file(default_json_path)
    assert isinstance(default_urls, dict)
    external_json_path = osp.join(mmcv_home, 'open_mmlab.json')
    if osp.exists(external_json_path):
        external_urls = load_file(external_json_path)
        assert isinstance(external_urls, dict)
        default_urls.update(external_urls)

    return default_urls


def get_mmcls_models():
    mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json')
    mmcls_urls = load_file(mmcls_json_path)

    return mmcls_urls


def get_deprecated_model_names():
    deprecate_json_path = osp.join(mmcv.__path__[0],
                                   'model_zoo/deprecated.json')
    deprecate_urls = load_file(deprecate_json_path)
    assert isinstance(deprecate_urls, dict)

    return deprecate_urls


def _process_mmcls_checkpoint(checkpoint):
    state_dict = checkpoint['state_dict']
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if k.startswith('backbone.'):
            new_state_dict[k[9:]] = v
    new_checkpoint = dict(state_dict=new_state_dict)

    return new_checkpoint


class CheckpointLoader:
    """A general checkpoint loader to manage all schemes."""

    _schemes = {}

    @classmethod
    def _register_scheme(cls, prefixes, loader, force=False):
        if isinstance(prefixes, str):
            prefixes = [prefixes]
        else:
            assert isinstance(prefixes, (list, tuple))
        for prefix in prefixes:
            if (prefix not in cls._schemes) or force:
                cls._schemes[prefix] = loader
            else:
                raise KeyError(
                    f'{prefix} is already registered as a loader backend, '
                    'add "force=True" if you want to override it')
        # sort, longer prefixes take priority
        cls._schemes = OrderedDict(
            sorted(cls._schemes.items(), key=lambda t: t[0], reverse=True))

    @classmethod
    def register_scheme(cls, prefixes, loader=None, force=False):
        """Register a loader to CheckpointLoader.

        This method can be used as a normal class method or a decorator.

        Args:
            prefixes (str or list[str] or tuple[str]):
                The prefix of the registered loader.
            loader (function, optional): The loader function to be registered.
                When this method is used as a decorator, loader is None.
                Defaults to None.
            force (bool, optional): Whether to override the loader
                if the prefix has already been registered. Defaults to False.
        """

        if loader is not None:
            cls._register_scheme(prefixes, loader, force=force)
            return

        def _register(loader_cls):
            cls._register_scheme(prefixes, loader_cls, force=force)
            return loader_cls

        return _register

    @classmethod
    def _get_checkpoint_loader(cls, path):
        """Finds a loader that supports the given path. Falls back to the local
        loader if no other loader is found.

        Args:
            path (str): checkpoint path

        Returns:
            loader (function): checkpoint loader
        """

        for p in cls._schemes:
            if path.startswith(p):
                return cls._schemes[p]

    @classmethod
    def load_checkpoint(cls, filename, map_location=None, logger=None):
        """load checkpoint through URL scheme path.

        Args:
            filename (str): checkpoint file name with given prefix
            map_location (str, optional): Same as :func:`torch.load`.
                Default: None
            logger (:mod:`logging.Logger`, optional): The logger for message.
                Default: None

        Returns:
            dict or OrderedDict: The loaded checkpoint.
        """

        checkpoint_loader = cls._get_checkpoint_loader(filename)
        class_name = checkpoint_loader.__name__
        mmcv.print_log(
            f'load checkpoint from {class_name[10:]} path: {filename}', logger)
        return checkpoint_loader(filename, map_location)


@CheckpointLoader.register_scheme(prefixes='')
def load_from_local(filename, map_location):
    """load checkpoint by local file path.

    Args:
        filename (str): local checkpoint file path
        map_location (str, optional): Same as :func:`torch.load`.

    Returns:
        dict or OrderedDict: The loaded checkpoint.
    """

    if not osp.isfile(filename):
        raise IOError(f'{filename} is not a checkpoint file')
    checkpoint = torch.load(filename, map_location=map_location)
    return checkpoint


@CheckpointLoader.register_scheme(prefixes=('http://', 'https://'))
def load_from_http(filename, map_location=None, model_dir=None):
    """load checkpoint through HTTP or HTTPS scheme path. In distributed
    setting, this function only download checkpoint at local rank 0.

    Args:
        filename (str): checkpoint file path with modelzoo or
            torchvision prefix
        map_location (str, optional): Same as :func:`torch.load`.
        model_dir (string, optional): directory in which to save the object,
            Default: None

    Returns:
        dict or OrderedDict: The loaded checkpoint.
    """
    rank, world_size = get_dist_info()
    rank = int(os.environ.get('LOCAL_RANK', rank))
    if rank == 0:
        checkpoint = model_zoo.load_url(
            filename, model_dir=model_dir, map_location=map_location)
    if world_size > 1:
        torch.distributed.barrier()
        if rank > 0:
            checkpoint = model_zoo.load_url(
                filename, model_dir=model_dir, map_location=map_location)
    return checkpoint


@CheckpointLoader.register_scheme(prefixes='pavi://')
def load_from_pavi(filename, map_location=None):
    """load checkpoint through the file path prefixed with pavi. In distributed
    setting, this function download ckpt at all ranks to different temporary
    directories.

    Args:
        filename (str): checkpoint file path with pavi prefix
        map_location (str, optional): Same as :func:`torch.load`.
            Default: None

    Returns:
        dict or OrderedDict: The loaded checkpoint.
    """
    assert filename.startswith('pavi://'), \
        f'Expected filename startswith `pavi://`, but get {filename}'
    model_path = filename[7:]

    try:
        from pavi import modelcloud
    except ImportError:
        raise ImportError(
            'Please install pavi to load checkpoint from modelcloud.')

    model = modelcloud.get(model_path)
    with TemporaryDirectory() as tmp_dir:
        downloaded_file = osp.join(tmp_dir, model.name)
        model.download(downloaded_file)
        checkpoint = torch.load(downloaded_file, map_location=map_location)
    return checkpoint


@CheckpointLoader.register_scheme(prefixes='s3://')
def load_from_ceph(filename, map_location=None, backend='petrel'):
    """load checkpoint through the file path prefixed with s3. In distributed
    setting, this function download ckpt at all ranks to different temporary
    directories.

    Args:
        filename (str): checkpoint file path with s3 prefix
        map_location (str, optional): Same as :func:`torch.load`.
        backend (str, optional): The storage backend type. Options are 'ceph',
            'petrel'. Default: 'petrel'.

    .. warning::
        :class:`mmcv.fileio.file_client.CephBackend` will be deprecated,
        please use :class:`mmcv.fileio.file_client.PetrelBackend` instead.

    Returns:
        dict or OrderedDict: The loaded checkpoint.
    """
    allowed_backends = ['ceph', 'petrel']
    if backend not in allowed_backends:
        raise ValueError(f'Load from Backend {backend} is not supported.')

    if backend == 'ceph':
        warnings.warn(
            'CephBackend will be deprecated, please use PetrelBackend instead')

    # CephClient and PetrelBackend have the same prefix 's3://' and the latter
    # will be chosen as default. If PetrelBackend can not be instantiated
    # successfully, the CephClient will be chosen.
    try:
        file_client = FileClient(backend=backend)
    except ImportError:
        allowed_backends.remove(backend)
        file_client = FileClient(backend=allowed_backends[0])

    with io.BytesIO(file_client.get(filename)) as buffer:
        checkpoint = torch.load(buffer, map_location=map_location)
    return checkpoint


@CheckpointLoader.register_scheme(prefixes=('modelzoo://', 'torchvision://'))
def load_from_torchvision(filename, map_location=None):
    """load checkpoint through the file path prefixed with modelzoo or
    torchvision.

    Args:
        filename (str): checkpoint file path with modelzoo or
            torchvision prefix
        map_location (str, optional): Same as :func:`torch.load`.

    Returns:
        dict or OrderedDict: The loaded checkpoint.
    """
    model_urls = get_torchvision_models()
    if filename.startswith('modelzoo://'):
        warnings.warn('The URL scheme of "modelzoo://" is deprecated, please '
                      'use "torchvision://" instead')
        model_name = filename[11:]
    else:
        model_name = filename[14:]
    return load_from_http(model_urls[model_name], map_location=map_location)


@CheckpointLoader.register_scheme(prefixes=('open-mmlab://', 'openmmlab://'))
def load_from_openmmlab(filename, map_location=None):
    """load checkpoint through the file path prefixed with open-mmlab or
    openmmlab.

    Args:
        filename (str): checkpoint file path with open-mmlab or
            openmmlab prefix
        map_location (str, optional): Same as :func:`torch.load`.
            Default: None

    Returns:
        dict or OrderedDict: The loaded checkpoint.
    """

    model_urls = get_external_models()
    prefix_str = 'open-mmlab://'
    if filename.startswith(prefix_str):
        model_name = filename[13:]
    else:
        model_name = filename[12:]
        prefix_str = 'openmmlab://'

    deprecated_urls = get_deprecated_model_names()
    if model_name in deprecated_urls:
        warnings.warn(f'{prefix_str}{model_name} is deprecated in favor '
                      f'of {prefix_str}{deprecated_urls[model_name]}')
        model_name = deprecated_urls[model_name]
    model_url = model_urls[model_name]
    # check if is url
    if model_url.startswith(('http://', 'https://')):
        checkpoint = load_from_http(model_url, map_location=map_location)
    else:
        filename = osp.join(_get_mmcv_home(), model_url)
        if not osp.isfile(filename):
            raise IOError(f'{filename} is not a checkpoint file')
        checkpoint = torch.load(filename, map_location=map_location)
    return checkpoint


@CheckpointLoader.register_scheme(prefixes='mmcls://')
def load_from_mmcls(filename, map_location=None):
    """load checkpoint through the file path prefixed with mmcls.

    Args:
        filename (str): checkpoint file path with mmcls prefix
        map_location (str, optional): Same as :func:`torch.load`.

    Returns:
        dict or OrderedDict: The loaded checkpoint.
    """

    model_urls = get_mmcls_models()
    model_name = filename[8:]
    checkpoint = load_from_http(
        model_urls[model_name], map_location=map_location)
    checkpoint = _process_mmcls_checkpoint(checkpoint)
    return checkpoint


def _load_checkpoint(filename, map_location=None, logger=None):
    """Load checkpoint from somewhere (modelzoo, file, url).

    Args:
        filename (str): Accept local filepath, URL, ``torchvision://xxx``,
            ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
            details.
        map_location (str, optional): Same as :func:`torch.load`.
            Default: None.
        logger (:mod:`logging.Logger`, optional): The logger for error message.
            Default: None

    Returns:
        dict or OrderedDict: The loaded checkpoint. It can be either an
            OrderedDict storing model weights or a dict containing other
            information, which depends on the checkpoint.
    """
    return CheckpointLoader.load_checkpoint(filename, map_location, logger)


def _load_checkpoint_with_prefix(prefix, filename, map_location=None):
    """Load partial pretrained model with specific prefix.

    Args:
        prefix (str): The prefix of sub-module.
        filename (str): Accept local filepath, URL, ``torchvision://xxx``,
            ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
            details.
        map_location (str | None): Same as :func:`torch.load`. Default: None.

    Returns:
        dict or OrderedDict: The loaded checkpoint.
    """

    checkpoint = _load_checkpoint(filename, map_location=map_location)

    if 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
    else:
        state_dict = checkpoint
    if not prefix.endswith('.'):
        prefix += '.'
    prefix_len = len(prefix)

    state_dict = {
        k[prefix_len:]: v
        for k, v in state_dict.items() if k.startswith(prefix)
    }

    assert state_dict, f'{prefix} is not in the pretrained model'
    return state_dict


def load_checkpoint(model,
                    filename,
                    map_location=None,
                    strict=False,
                    logger=None,
                    revise_keys=[(r'^module\.', '')]):
    """Load checkpoint from a file or URI.

    Args:
        model (Module): Module to load checkpoint.
        filename (str): Accept local filepath, URL, ``torchvision://xxx``,
            ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
            details.
        map_location (str): Same as :func:`torch.load`.
        strict (bool): Whether to allow different params for the model and
            checkpoint.
        logger (:mod:`logging.Logger` or None): The logger for error message.
        revise_keys (list): A list of customized keywords to modify the
            state_dict in checkpoint. Each item is a (pattern, replacement)
            pair of the regular expression operations. Default: strip
            the prefix 'module.' by [(r'^module\\.', '')].

    Returns:
        dict or OrderedDict: The loaded checkpoint.
    """
    checkpoint = _load_checkpoint(filename, map_location, logger)
    # OrderedDict is a subclass of dict
    if not isinstance(checkpoint, dict):
        raise RuntimeError(
            f'No state_dict found in checkpoint file {filename}')
    # get state_dict from checkpoint
    if 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
    else:
        state_dict = checkpoint

    # strip prefix of state_dict
    metadata = getattr(state_dict, '_metadata', OrderedDict())
    for p, r in revise_keys:
        state_dict = OrderedDict(
            {re.sub(p, r, k): v
             for k, v in state_dict.items()})
    # Keep metadata in state_dict
    state_dict._metadata = metadata

    # load state_dict
    load_state_dict(model, state_dict, strict, logger)
    return checkpoint


def weights_to_cpu(state_dict):
    """Copy a model state_dict to cpu.

    Args:
        state_dict (OrderedDict): Model weights on GPU.

    Returns:
        OrderedDict: Model weights on GPU.
    """
    state_dict_cpu = OrderedDict()
    for key, val in state_dict.items():
        state_dict_cpu[key] = val.cpu()
    # Keep metadata in state_dict
    state_dict_cpu._metadata = getattr(
        state_dict, '_metadata', OrderedDict())
    return state_dict_cpu


def _save_to_state_dict(module, destination, prefix, keep_vars):
    """Saves module state to `destination` dictionary.

    This method is modified from :meth:`torch.nn.Module._save_to_state_dict`.

    Args:
        module (nn.Module): The module to generate state_dict.
        destination (dict): A dict where state will be stored.
        prefix (str): The prefix for parameters and buffers used in this
            module.
    """
    for name, param in module._parameters.items():
        if param is not None:
            destination[prefix + name] = param if keep_vars else param.detach()
    for name, buf in module._buffers.items():
        # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d
        if buf is not None:
            destination[prefix + name] = buf if keep_vars else buf.detach()


def get_state_dict(module, destination=None, prefix='', keep_vars=False):
    """Returns a dictionary containing a whole state of the module.

    Both parameters and persistent buffers (e.g. running averages) are
    included. Keys are corresponding parameter and buffer names.

    This method is modified from :meth:`torch.nn.Module.state_dict` to
    recursively check parallel module in case that the model has a complicated
    structure, e.g., nn.Module(nn.Module(DDP)).

    Args:
        module (nn.Module): The module to generate state_dict.
        destination (OrderedDict): Returned dict for the state of the
            module.
        prefix (str): Prefix of the key.
        keep_vars (bool): Whether to keep the variable property of the
            parameters. Default: False.

    Returns:
        dict: A dictionary containing a whole state of the module.
    """
    # recursively check parallel module in case that the model has a
    # complicated structure, e.g., nn.Module(nn.Module(DDP))
    if is_module_wrapper(module):
        module = module.module

    # below is the same as torch.nn.Module.state_dict()
    if destination is None:
        destination = OrderedDict()
        destination._metadata = OrderedDict()
    destination._metadata[prefix[:-1]] = local_metadata = dict(
        version=module._version)
    _save_to_state_dict(module, destination, prefix, keep_vars)
    for name, child in module._modules.items():
        if child is not None:
            get_state_dict(
                child, destination, prefix + name + '.', keep_vars=keep_vars)
    for hook in module._state_dict_hooks.values():
        hook_result = hook(module, destination, prefix, local_metadata)
        if hook_result is not None:
            destination = hook_result
    return destination


def save_checkpoint(model,
                    filename,
                    optimizer=None,
                    meta=None,
                    file_client_args=None):
    """Save checkpoint to file.

    The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
    ``optimizer``. By default ``meta`` will contain version and time info.

    Args:
        model (Module): Module whose params are to be saved.
        filename (str): Checkpoint filename.
        optimizer (:obj:`Optimizer`, optional): Optimizer to be saved.
        meta (dict, optional): Metadata to be saved in checkpoint.
        file_client_args (dict, optional): Arguments to instantiate a
            FileClient. See :class:`mmcv.fileio.FileClient` for details.
            Default: None.
            `New in version 1.3.16.`
    """
    if meta is None:
        meta = {}
    elif not isinstance(meta, dict):
        raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
    meta.update(mmcv_version=mmcv.__version__, time=time.asctime())

    if is_module_wrapper(model):
        model = model.module

    if hasattr(model, 'CLASSES') and model.CLASSES is not None:
        # save class name to the meta
        meta.update(CLASSES=model.CLASSES)

    checkpoint = {
        'meta': meta,
        'state_dict': weights_to_cpu(get_state_dict(model))
    }
    # save optimizer state dict in the checkpoint
    if isinstance(optimizer, Optimizer):
        checkpoint['optimizer'] = optimizer.state_dict()
    elif isinstance(optimizer, dict):
        checkpoint['optimizer'] = {}
        for name, optim in optimizer.items():
            checkpoint['optimizer'][name] = optim.state_dict()

    if filename.startswith('pavi://'):
        if file_client_args is not None:
            raise ValueError(
                'file_client_args should be "None" if filename starts with'
                f'"pavi://", but got {file_client_args}')
        try:
            from pavi import modelcloud
            from pavi import exception
        except ImportError:
            raise ImportError(
                'Please install pavi to load checkpoint from modelcloud.')
        model_path = filename[7:]
        root = modelcloud.Folder()
        model_dir, model_name = osp.split(model_path)
        try:
            model = modelcloud.get(model_dir)
        except exception.NodeNotFoundError:
            model = root.create_training_model(model_dir)
        with TemporaryDirectory() as tmp_dir:
            checkpoint_file = osp.join(tmp_dir, model_name)
            with open(checkpoint_file, 'wb') as f:
                torch.save(checkpoint, f)
                f.flush()
            model.create_file(checkpoint_file, name=model_name)
    else:
        file_client = FileClient.infer_client(file_client_args, filename)
        with io.BytesIO() as f:
            torch.save(checkpoint, f)
            file_client.put(f.getvalue(), filename)
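
A hedged sketch (not part of the commit) of a local save/load round trip: save_checkpoint writes the meta dict plus a CPU state_dict, and load_checkpoint restores it into a model while stripping a leading 'module.' prefix. The filename 'demo.pth' and the toy model are illustrative stand-ins.

import torch.nn as nn

from lavis.common.annotator.uniformer.mmcv.runner import (load_checkpoint,
                                                          save_checkpoint)

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
save_checkpoint(model, 'demo.pth', meta=dict(epoch=1, iter=100))

restored = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
checkpoint = load_checkpoint(restored, 'demo.pth', map_location='cpu')
print(checkpoint['meta']['epoch'])   # -> 1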
lavis/common/annotator/uniformer/mmcv/runner/default_constructor.py  (new file, 0 → 100644)

from .builder import RUNNER_BUILDERS, RUNNERS


@RUNNER_BUILDERS.register_module()
class DefaultRunnerConstructor:
    """Default constructor for runners.

    Customize an existing `Runner` like `EpochBasedRunner` through a
    `RunnerConstructor`. For example, we can inject new properties and
    functions for `Runner`.

    Example:
        >>> from annotator.uniformer.mmcv.runner import RUNNER_BUILDERS, build_runner
        >>> # Define a new RunnerReconstructor
        >>> @RUNNER_BUILDERS.register_module()
        >>> class MyRunnerConstructor:
        ...     def __init__(self, runner_cfg, default_args=None):
        ...         if not isinstance(runner_cfg, dict):
        ...             raise TypeError('runner_cfg should be a dict',
        ...                             f'but got {type(runner_cfg)}')
        ...         self.runner_cfg = runner_cfg
        ...         self.default_args = default_args
        ...
        ...     def __call__(self):
        ...         runner = RUNNERS.build(self.runner_cfg,
        ...                                default_args=self.default_args)
        ...         # Add new properties for existing runner
        ...         runner.my_name = 'my_runner'
        ...         runner.my_function = lambda self: print(self.my_name)
        ...         ...
        >>> # build your runner
        >>> runner_cfg = dict(type='EpochBasedRunner', max_epochs=40,
        ...                   constructor='MyRunnerConstructor')
        >>> runner = build_runner(runner_cfg)
    """

    def __init__(self, runner_cfg, default_args=None):
        if not isinstance(runner_cfg, dict):
            raise TypeError('runner_cfg should be a dict',
                            f'but got {type(runner_cfg)}')
        self.runner_cfg = runner_cfg
        self.default_args = default_args

    def __call__(self):
        return RUNNERS.build(self.runner_cfg, default_args=self.default_args)
lavis/common/annotator/uniformer/mmcv/runner/dist_utils.py

# Copyright (c) OpenMMLab. All rights reserved.
import functools
import os
import subprocess
from collections import OrderedDict

import torch
import torch.multiprocessing as mp
from torch import distributed as dist
from torch._utils import (_flatten_dense_tensors, _take_tensors,
                          _unflatten_dense_tensors)


def init_dist(launcher, backend='nccl', **kwargs):
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    if launcher == 'pytorch':
        _init_dist_pytorch(backend, **kwargs)
    elif launcher == 'mpi':
        _init_dist_mpi(backend, **kwargs)
    elif launcher == 'slurm':
        _init_dist_slurm(backend, **kwargs)
    else:
        raise ValueError(f'Invalid launcher type: {launcher}')


def _init_dist_pytorch(backend, **kwargs):
    # TODO: use local_rank instead of rank % num_gpus
    rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs)


def _init_dist_mpi(backend, **kwargs):
    # TODO: use local_rank instead of rank % num_gpus
    rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs)


def _init_dist_slurm(backend, port=None):
    """Initialize slurm distributed training environment.

    If argument ``port`` is not specified, then the master port will be taken
    from the system environment variable ``MASTER_PORT``. If ``MASTER_PORT``
    is not set in the environment, the default port ``29500`` will be used.

    Args:
        backend (str): Backend of torch.distributed.
        port (int, optional): Master port. Defaults to None.
    """
    proc_id = int(os.environ['SLURM_PROCID'])
    ntasks = int(os.environ['SLURM_NTASKS'])
    node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(proc_id % num_gpus)
    addr = subprocess.getoutput(
        f'scontrol show hostname {node_list} | head -n1')
    # specify master port
    if port is not None:
        os.environ['MASTER_PORT'] = str(port)
    elif 'MASTER_PORT' in os.environ:
        pass  # use MASTER_PORT in the environment variable
    else:
        # 29500 is torch.distributed default port
        os.environ['MASTER_PORT'] = '29500'
    # use MASTER_ADDR in the environment variable if it already exists
    if 'MASTER_ADDR' not in os.environ:
        os.environ['MASTER_ADDR'] = addr
    os.environ['WORLD_SIZE'] = str(ntasks)
    os.environ['LOCAL_RANK'] = str(proc_id % num_gpus)
    os.environ['RANK'] = str(proc_id)
    dist.init_process_group(backend=backend)


def get_dist_info():
    if dist.is_available() and dist.is_initialized():
        rank = dist.get_rank()
        world_size = dist.get_world_size()
    else:
        rank = 0
        world_size = 1
    return rank, world_size


def master_only(func):

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        rank, _ = get_dist_info()
        if rank == 0:
            return func(*args, **kwargs)

    return wrapper


def allreduce_params(params, coalesce=True, bucket_size_mb=-1):
    """Allreduce parameters.

    Args:
        params (list[torch.Parameters]): List of parameters or buffers of a
            model.
        coalesce (bool, optional): Whether to allreduce parameters as a whole.
            Defaults to True.
        bucket_size_mb (int, optional): Size of bucket, the unit is MB.
            Defaults to -1.
    """
    _, world_size = get_dist_info()
    if world_size == 1:
        return
    params = [param.data for param in params]
    if coalesce:
        _allreduce_coalesced(params, world_size, bucket_size_mb)
    else:
        for tensor in params:
            dist.all_reduce(tensor.div_(world_size))


def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
    """Allreduce gradients.

    Args:
        params (list[torch.Parameters]): List of parameters of a model.
        coalesce (bool, optional): Whether to allreduce parameters as a whole.
            Defaults to True.
        bucket_size_mb (int, optional): Size of bucket, the unit is MB.
            Defaults to -1.
    """
    grads = [
        param.grad.data for param in params
        if param.requires_grad and param.grad is not None
    ]
    _, world_size = get_dist_info()
    if world_size == 1:
        return
    if coalesce:
        _allreduce_coalesced(grads, world_size, bucket_size_mb)
    else:
        for tensor in grads:
            dist.all_reduce(tensor.div_(world_size))


def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
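A note on the helpers above: get_dist_info() falls back to rank 0 and world size 1 when torch.distributed has not been initialized, so utilities built on top of it, such as the master_only decorator, also behave sensibly in single-process runs. A minimal sketch, assuming the vendored package is importable under the annotator.uniformer.mmcv path used throughout this commit (it only exercises functions defined in the file above):

# Minimal usage sketch; imports taken directly from the module shown above.
from annotator.uniformer.mmcv.runner.dist_utils import get_dist_info, master_only

rank, world_size = get_dist_info()  # (0, 1) if torch.distributed is uninitialized
print(f'rank={rank}, world_size={world_size}')

@master_only
def log_once(msg):
    # Executed only on rank 0; on other ranks the wrapper simply returns None.
    print(msg)

log_once('only the master process prints this')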
lavis/common/annotator/uniformer/mmcv/runner/epoch_based_runner.py

# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import platform
import shutil
import time
import warnings

import torch

import annotator.uniformer.mmcv as mmcv
from .base_runner import BaseRunner
from .builder import RUNNERS
from .checkpoint import save_checkpoint
from .utils import get_host_info


@RUNNERS.register_module()
class EpochBasedRunner(BaseRunner):
    """Epoch-based Runner.

    This runner trains models epoch by epoch.
    """

    def run_iter(self, data_batch, train_mode, **kwargs):
        if self.batch_processor is not None:
            outputs = self.batch_processor(
                self.model, data_batch, train_mode=train_mode, **kwargs)
        elif train_mode:
            outputs = self.model.train_step(data_batch, self.optimizer,
                                            **kwargs)
        else:
            outputs = self.model.val_step(data_batch, self.optimizer, **kwargs)
        if not isinstance(outputs, dict):
            raise TypeError('"batch_processor()" or "model.train_step()" '
                            'and "model.val_step()" must return a dict')
        if 'log_vars' in outputs:
            self.log_buffer.update(outputs['log_vars'], outputs['num_samples'])
        self.outputs = outputs

    def train(self, data_loader, **kwargs):
        self.model.train()
        self.mode = 'train'
        self.data_loader = data_loader
        self._max_iters = self._max_epochs * len(self.data_loader)
        self.call_hook('before_train_epoch')
        time.sleep(2)  # Prevent possible deadlock during epoch transition
        for i, data_batch in enumerate(self.data_loader):
            self._inner_iter = i
            self.call_hook('before_train_iter')
            self.run_iter(data_batch, train_mode=True, **kwargs)
            self.call_hook('after_train_iter')
            self._iter += 1

        self.call_hook('after_train_epoch')
        self._epoch += 1

    @torch.no_grad()
    def val(self, data_loader, **kwargs):
        self.model.eval()
        self.mode = 'val'
        self.data_loader = data_loader
        self.call_hook('before_val_epoch')
        time.sleep(2)  # Prevent possible deadlock during epoch transition
        for i, data_batch in enumerate(self.data_loader):
            self._inner_iter = i
            self.call_hook('before_val_iter')
            self.run_iter(data_batch, train_mode=False)
            self.call_hook('after_val_iter')

        self.call_hook('after_val_epoch')

    def run(self, data_loaders, workflow, max_epochs=None, **kwargs):
        """Start running.

        Args:
            data_loaders (list[:obj:`DataLoader`]): Dataloaders for training
                and validation.
            workflow (list[tuple]): A list of (phase, epochs) to specify the
                running order and epochs. E.g, [('train', 2), ('val', 1)] means
                running 2 epochs for training and 1 epoch for validation,
                iteratively.
        """
        assert isinstance(data_loaders, list)
        assert mmcv.is_list_of(workflow, tuple)
        assert len(data_loaders) == len(workflow)
        if max_epochs is not None:
            warnings.warn(
                'setting max_epochs in run is deprecated, '
                'please set max_epochs in runner_config', DeprecationWarning)
            self._max_epochs = max_epochs

        assert self._max_epochs is not None, (
            'max_epochs must be specified during instantiation')

        for i, flow in enumerate(workflow):
            mode, epochs = flow
            if mode == 'train':
                self._max_iters = self._max_epochs * len(data_loaders[i])
                break

        work_dir = self.work_dir if self.work_dir is not None else 'NONE'
        self.logger.info('Start running, host: %s, work_dir: %s',
                         get_host_info(), work_dir)
        self.logger.info('Hooks will be executed in the following order:\n%s',
                         self.get_hook_info())
        self.logger.info('workflow: %s, max: %d epochs', workflow,
                         self._max_epochs)
        self.call_hook('before_run')

        while self.epoch < self._max_epochs:
            for i, flow in enumerate(workflow):
                mode, epochs = flow
                if isinstance(mode, str):  # self.train()
                    if not hasattr(self, mode):
                        raise ValueError(
                            f'runner has no method named "{mode}" to run an '
                            'epoch')
                    epoch_runner = getattr(self, mode)
                else:
                    raise TypeError(
                        'mode in workflow must be a str, but got {}'.format(
                            type(mode)))

                for _ in range(epochs):
                    if mode == 'train' and self.epoch >= self._max_epochs:
                        break
                    epoch_runner(data_loaders[i], **kwargs)

        time.sleep(1)  # wait for some hooks like loggers to finish
        self.call_hook('after_run')

    def save_checkpoint(self,
                        out_dir,
                        filename_tmpl='epoch_{}.pth',
                        save_optimizer=True,
                        meta=None,
                        create_symlink=True):
        """Save the checkpoint.

        Args:
            out_dir (str): The directory that checkpoints are saved.
            filename_tmpl (str, optional): The checkpoint filename template,
                which contains a placeholder for the epoch number.
                Defaults to 'epoch_{}.pth'.
            save_optimizer (bool, optional): Whether to save the optimizer to
                the checkpoint. Defaults to True.
            meta (dict, optional): The meta information to be saved in the
                checkpoint. Defaults to None.
            create_symlink (bool, optional): Whether to create a symlink
                "latest.pth" to point to the latest checkpoint.
                Defaults to True.
        """
        if meta is None:
            meta = {}
        elif not isinstance(meta, dict):
            raise TypeError(
                f'meta should be a dict or None, but got {type(meta)}')
        if self.meta is not None:
            meta.update(self.meta)
            # Note: meta.update(self.meta) should be done before
            # meta.update(epoch=self.epoch + 1, iter=self.iter) otherwise
            # there will be problems with resumed checkpoints.
            # More details in https://github.com/open-mmlab/mmcv/pull/1108
        meta.update(epoch=self.epoch + 1, iter=self.iter)

        filename = filename_tmpl.format(self.epoch + 1)
        filepath = osp.join(out_dir, filename)
        optimizer = self.optimizer if save_optimizer else None
        save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta)
        # in some environments, `os.symlink` is not supported, you may need to
        # set `create_symlink` to False
        if create_symlink:
            dst_file = osp.join(out_dir, 'latest.pth')
            if platform.system() != 'Windows':
                mmcv.symlink(filename, dst_file)
            else:
                shutil.copy(filepath, dst_file)


@RUNNERS.register_module()
class Runner(EpochBasedRunner):
    """Deprecated name of EpochBasedRunner."""

    def __init__(self, *args, **kwargs):
        warnings.warn(
            'Runner was deprecated, please use EpochBasedRunner instead')
        super().__init__(*args, **kwargs)
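For orientation, a hedged end-to-end sketch of how run() consumes a workflow: it is not taken from this commit, and it assumes the usual BaseRunner constructor arguments from upstream mmcv (model, optimizer, work_dir, logger, max_epochs), which are defined in base_runner.py rather than in the file above. The toy model only has to implement the train_step()/val_step() contract that run_iter() relies on.

# Hedged sketch: assumes BaseRunner's upstream constructor signature.
import logging
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from annotator.uniformer.mmcv.runner.epoch_based_runner import EpochBasedRunner

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(2, 1)

    def train_step(self, data_batch, optimizer, **kwargs):
        x, y = data_batch
        loss = ((self.linear(x) - y) ** 2).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # run_iter() expects a dict; 'log_vars'/'num_samples' feed the log buffer
        return dict(loss=loss, log_vars=dict(loss=loss.item()), num_samples=len(x))

    def val_step(self, data_batch, optimizer, **kwargs):
        x, y = data_batch
        loss = ((self.linear(x) - y) ** 2).mean()
        return dict(log_vars=dict(val_loss=loss.item()), num_samples=len(x))

model = ToyModel()
loader = DataLoader(TensorDataset(torch.randn(8, 2), torch.randn(8, 1)), batch_size=4)
runner = EpochBasedRunner(
    model,
    optimizer=torch.optim.SGD(model.parameters(), lr=0.1),
    work_dir='./tmp_runner',                # assumed BaseRunner argument
    logger=logging.getLogger(__name__),     # assumed BaseRunner argument
    max_epochs=2)
# one train epoch then one val epoch per outer loop, until max_epochs is reached
runner.run([loader, loader], [('train', 1), ('val', 1)])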
lavis/common/annotator/uniformer/mmcv/runner/fp16_utils.py

# Copyright (c) OpenMMLab. All rights reserved.
import functools
import warnings
from collections import abc
from inspect import getfullargspec

import numpy as np
import torch
import torch.nn as nn

from annotator.uniformer.mmcv.utils import TORCH_VERSION, digit_version
from .dist_utils import allreduce_grads as _allreduce_grads

try:
    # If PyTorch version >= 1.6.0, torch.cuda.amp.autocast would be imported
    # and used; otherwise, auto fp16 will adopt mmcv's implementation.
    # Note that when PyTorch >= 1.6.0, we still cast tensor types to fp16
    # manually, so the behavior may not be consistent with real amp.
    from torch.cuda.amp import autocast
except ImportError:
    pass


def cast_tensor_type(inputs, src_type, dst_type):
    """Recursively convert Tensors in inputs from src_type to dst_type.

    Args:
        inputs: Inputs to be cast.
        src_type (torch.dtype): Source type.
        dst_type (torch.dtype): Destination type.

    Returns:
        The same type as inputs, but all contained Tensors have been cast.
    """
    if isinstance(inputs, nn.Module):
        return inputs
    elif isinstance(inputs, torch.Tensor):
        return inputs.to(dst_type)
    elif isinstance(inputs, str):
        return inputs
    elif isinstance(inputs, np.ndarray):
        return inputs
    elif isinstance(inputs, abc.Mapping):
        return type(inputs)({
            k: cast_tensor_type(v, src_type, dst_type)
            for k, v in inputs.items()
        })
    elif isinstance(inputs, abc.Iterable):
        return type(inputs)(
            cast_tensor_type(item, src_type, dst_type) for item in inputs)
    else:
        return inputs


def auto_fp16(apply_to=None, out_fp32=False):
    """Decorator to enable fp16 training automatically.

    This decorator is useful when you write custom modules and want to support
    mixed precision training. If input arguments are fp32 tensors, they will
    be converted to fp16 automatically. Arguments other than fp32 tensors are
    ignored. If you are using PyTorch >= 1.6, torch.cuda.amp is used as the
    backend; otherwise, the original mmcv implementation will be adopted.

    Args:
        apply_to (Iterable, optional): The argument names to be converted.
            `None` indicates all arguments.
        out_fp32 (bool): Whether to convert the output back to fp32.

    Example:

        >>> import torch.nn as nn
        >>> class MyModule1(nn.Module):
        >>>
        >>>     # Convert x and y to fp16
        >>>     @auto_fp16()
        >>>     def forward(self, x, y):
        >>>         pass

        >>> import torch.nn as nn
        >>> class MyModule2(nn.Module):
        >>>
        >>>     # convert pred to fp16
        >>>     @auto_fp16(apply_to=('pred', ))
        >>>     def do_something(self, pred, others):
        >>>         pass
    """

    def auto_fp16_wrapper(old_func):

        @functools.wraps(old_func)
        def new_func(*args, **kwargs):
            # check if the module has set the attribute `fp16_enabled`, if
            # not, just fallback to the original method.
            if not isinstance(args[0], torch.nn.Module):
                raise TypeError('@auto_fp16 can only be used to decorate the '
                                'method of nn.Module')
            if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled):
                return old_func(*args, **kwargs)

            # get the arg spec of the decorated method
            args_info = getfullargspec(old_func)
            # get the argument names to be casted
            args_to_cast = args_info.args if apply_to is None else apply_to
            # convert the args that need to be processed
            new_args = []
            # NOTE: default args are not taken into consideration
            if args:
                arg_names = args_info.args[:len(args)]
                for i, arg_name in enumerate(arg_names):
                    if arg_name in args_to_cast:
                        new_args.append(
                            cast_tensor_type(args[i], torch.float, torch.half))
                    else:
                        new_args.append(args[i])
            # convert the kwargs that need to be processed
            new_kwargs = {}
            if kwargs:
                for arg_name, arg_value in kwargs.items():
                    if arg_name in args_to_cast:
                        new_kwargs[arg_name] = cast_tensor_type(
                            arg_value, torch.float, torch.half)
                    else:
                        new_kwargs[arg_name] = arg_value
            # apply converted arguments to the decorated method
            if (TORCH_VERSION != 'parrots' and
                    digit_version(TORCH_VERSION) >= digit_version('1.6.0')):
                with autocast(enabled=True):
                    output = old_func(*new_args, **new_kwargs)
            else:
                output = old_func(*new_args, **new_kwargs)
            # cast the results back to fp32 if necessary
            if out_fp32:
                output = cast_tensor_type(output, torch.half, torch.float)
            return output

        return new_func

    return auto_fp16_wrapper


def force_fp32(apply_to=None, out_fp16=False):
    """Decorator to convert input arguments to fp32 in force.

    This decorator is useful when you write custom modules and want to support
    mixed precision training. If there are some inputs that must be processed
    in fp32 mode, then this decorator can handle it. If input arguments are
    fp16 tensors, they will be converted to fp32 automatically. Arguments
    other than fp16 tensors are ignored. If you are using PyTorch >= 1.6,
    torch.cuda.amp is used as the backend; otherwise, the original mmcv
    implementation will be adopted.

    Args:
        apply_to (Iterable, optional): The argument names to be converted.
            `None` indicates all arguments.
        out_fp16 (bool): Whether to convert the output back to fp16.

    Example:

        >>> import torch.nn as nn
        >>> class MyModule1(nn.Module):
        >>>
        >>>     # Convert x and y to fp32
        >>>     @force_fp32()
        >>>     def loss(self, x, y):
        >>>         pass

        >>> import torch.nn as nn
        >>> class MyModule2(nn.Module):
        >>>
        >>>     # convert pred to fp32
        >>>     @force_fp32(apply_to=('pred', ))
        >>>     def post_process(self, pred, others):
        >>>         pass
    """

    def force_fp32_wrapper(old_func):

        @functools.wraps(old_func)
        def new_func(*args, **kwargs):
            # check if the module has set the attribute `fp16_enabled`, if
            # not, just fallback to the original method.
            if not isinstance(args[0], torch.nn.Module):
                raise TypeError('@force_fp32 can only be used to decorate the '
                                'method of nn.Module')
            if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled):
                return old_func(*args, **kwargs)
            # get the arg spec of the decorated method
            args_info = getfullargspec(old_func)
            # get the argument names to be casted
            args_to_cast = args_info.args if apply_to is None else apply_to
            # convert the args that need to be processed
            new_args = []
            if args:
                arg_names = args_info.args[:len(args)]
                for i, arg_name in enumerate(arg_names):
                    if arg_name in args_to_cast:
                        new_args.append(
                            cast_tensor_type(args[i], torch.half, torch.float))
                    else:
                        new_args.append(args[i])
            # convert the kwargs that need to be processed
            new_kwargs = dict()
            if kwargs:
                for arg_name, arg_value in kwargs.items():
                    if arg_name in args_to_cast:
                        new_kwargs[arg_name] = cast_tensor_type(
                            arg_value, torch.half, torch.float)
                    else:
                        new_kwargs[arg_name] = arg_value
            # apply converted arguments to the decorated method
            if (TORCH_VERSION != 'parrots' and
                    digit_version(TORCH_VERSION) >= digit_version('1.6.0')):
                with autocast(enabled=False):
                    output = old_func(*new_args, **new_kwargs)
            else:
                output = old_func(*new_args, **new_kwargs)
            # cast the results back to fp32 if necessary
            if out_fp16:
                output = cast_tensor_type(output, torch.float, torch.half)
            return output

        return new_func

    return force_fp32_wrapper


def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
    warnings.warn(
        '"mmcv.runner.fp16_utils.allreduce_grads" is deprecated, and will be '
        'removed in v2.8. Please switch to "mmcv.runner.allreduce_grads')
    _allreduce_grads(params, coalesce=coalesce, bucket_size_mb=bucket_size_mb)


def wrap_fp16_model(model):
    """Wrap the FP32 model to FP16.

    If you are using PyTorch >= 1.6, torch.cuda.amp is used as the
    backend; otherwise, the original mmcv implementation will be adopted.

    For PyTorch >= 1.6, this function will
    1. Set the fp16 flag inside the model to True.

    Otherwise:
    1. Convert the FP32 model to FP16.
    2. Keep some necessary layers in FP32, e.g., normalization layers.
    3. Set the `fp16_enabled` flag inside the model to True.

    Args:
        model (nn.Module): Model in FP32.
    """
    if (TORCH_VERSION == 'parrots'
            or digit_version(TORCH_VERSION) < digit_version('1.6.0')):
        # convert model to fp16
        model.half()
        # patch the normalization layers to make it work in fp32 mode
        patch_norm_fp32(model)
    # set `fp16_enabled` flag
    for m in model.modules():
        if hasattr(m, 'fp16_enabled'):
            m.fp16_enabled = True


def patch_norm_fp32(module):
    """Recursively convert normalization layers from FP16 to FP32.

    Args:
        module (nn.Module): The modules to be converted in FP16.

    Returns:
        nn.Module: The converted module, the normalization layers have been
            converted to FP32.
    """
    if isinstance(module, (nn.modules.batchnorm._BatchNorm, nn.GroupNorm)):
        module.float()
        if isinstance(module, nn.GroupNorm) or torch.__version__ < '1.3':
            module.forward = patch_forward_method(module.forward, torch.half,
                                                  torch.float)
    for child in module.children():
        patch_norm_fp32(child)
    return module


def patch_forward_method(func, src_type, dst_type, convert_output=True):
    """Patch the forward method of a module.

    Args:
        func (callable): The original forward method.
        src_type (torch.dtype): Type of input arguments to be converted from.
        dst_type (torch.dtype): Type of input arguments to be converted to.
        convert_output (bool): Whether to convert the output back to src_type.

    Returns:
        callable: The patched forward method.
    """

    def new_forward(*args, **kwargs):
        output = func(*cast_tensor_type(args, src_type, dst_type),
                      **cast_tensor_type(kwargs, src_type, dst_type))
        if convert_output:
            output = cast_tensor_type(output, dst_type, src_type)
        return output

    return new_forward


class LossScaler:
    """Class that manages loss scaling in mixed precision training which
    supports both dynamic and static mode.

    The implementation refers to
    https://github.com/NVIDIA/apex/blob/master/apex/fp16_utils/loss_scaler.py.
    Indirectly, by supplying ``mode='dynamic'`` for dynamic loss scaling.
    It's important to understand how :class:`LossScaler` operates.
    Loss scaling is designed to combat the problem of underflowing
    gradients encountered at long times when training fp16 networks.
    Dynamic loss scaling begins by attempting a very high loss
    scale. Ironically, this may result in OVERflowing gradients.
    If overflowing gradients are encountered, :class:`FP16_Optimizer` then
    skips the update step for this particular iteration/minibatch,
    and :class:`LossScaler` adjusts the loss scale to a lower value.
    If a certain number of iterations occur without overflowing gradients
    detected, :class:`LossScaler` increases the loss scale once more.
    In this way :class:`LossScaler` attempts to "ride the edge" of always
    using the highest loss scale possible without incurring overflow.

    Args:
        init_scale (float): Initial loss scale value, default: 2**32.
        scale_factor (float): Factor used when adjusting the loss scale.
            Default: 2.
        mode (str): Loss scaling mode. 'dynamic' or 'static'.
        scale_window (int): Number of consecutive iterations without an
            overflow to wait before increasing the loss scale. Default: 1000.
    """

    def __init__(self,
                 init_scale=2**32,
                 mode='dynamic',
                 scale_factor=2.,
                 scale_window=1000):
        self.cur_scale = init_scale
        self.cur_iter = 0
        assert mode in ('dynamic',
                        'static'), 'mode can only be dynamic or static'
        self.mode = mode
        self.last_overflow_iter = -1
        self.scale_factor = scale_factor
        self.scale_window = scale_window

    def has_overflow(self, params):
        """Check if params contain overflow."""
        if self.mode != 'dynamic':
            return False
        for p in params:
            if p.grad is not None and LossScaler._has_inf_or_nan(p.grad.data):
                return True
        return False

    def _has_inf_or_nan(x):
        """Check if params contain NaN."""
        try:
            cpu_sum = float(x.float().sum())
        except RuntimeError as instance:
            if 'value cannot be converted' not in instance.args[0]:
                raise
            return True
        else:
            if cpu_sum == float('inf') or cpu_sum == -float('inf') \
                    or cpu_sum != cpu_sum:
                return True
            return False

    def update_scale(self, overflow):
        """Update the current loss scale value when overflow happens."""
        if self.mode != 'dynamic':
            return
        if overflow:
            self.cur_scale = max(self.cur_scale / self.scale_factor, 1)
            self.last_overflow_iter = self.cur_iter
        else:
            if (self.cur_iter - self.last_overflow_iter) % \
                    self.scale_window == 0:
                self.cur_scale *= self.scale_factor
        self.cur_iter += 1

    def state_dict(self):
        """Returns the state of the scaler as a :class:`dict`."""
        return dict(
            cur_scale=self.cur_scale,
            cur_iter=self.cur_iter,
            mode=self.mode,
            last_overflow_iter=self.last_overflow_iter,
            scale_factor=self.scale_factor,
            scale_window=self.scale_window)

    def load_state_dict(self, state_dict):
        """Loads the loss_scaler state dict.

        Args:
            state_dict (dict): scaler state.
        """
        self.cur_scale = state_dict['cur_scale']
        self.cur_iter = state_dict['cur_iter']
        self.mode = state_dict['mode']
        self.last_overflow_iter = state_dict['last_overflow_iter']
        self.scale_factor = state_dict['scale_factor']
        self.scale_window = state_dict['scale_window']

    @property
    def loss_scale(self):
        return self.cur_scale
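The two decorators above are no-ops until a module exposes an `fp16_enabled` attribute and wrap_fp16_model() flips it to True. A minimal wiring sketch, assuming the module path used in this commit (it only defines the module and sets the flag; the actual casting would happen on a CUDA forward/loss call):

# Hedged sketch of wiring @auto_fp16 / @force_fp32 with wrap_fp16_model.
import torch.nn as nn
from annotator.uniformer.mmcv.runner.fp16_utils import (auto_fp16, force_fp32,
                                                        wrap_fp16_model)

class ToyHead(nn.Module):
    def __init__(self):
        super().__init__()
        self.fp16_enabled = False  # checked by both decorators before casting
        self.fc = nn.Linear(4, 2)

    @auto_fp16(apply_to=('x', ))          # cast `x` to fp16 when enabled
    def forward(self, x):
        return self.fc(x)

    @force_fp32(apply_to=('logits', ))    # always compute the loss in fp32
    def loss(self, logits, target):
        return nn.functional.cross_entropy(logits, target)

model = ToyHead()
wrap_fp16_model(model)  # sets fp16_enabled = True (and halves weights on PyTorch < 1.6)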
lavis/common/annotator/uniformer/mmcv/runner/hooks/__init__.py

# Copyright (c) OpenMMLab. All rights reserved.
from .checkpoint import CheckpointHook
from .closure import ClosureHook
from .ema import EMAHook
from .evaluation import DistEvalHook, EvalHook
from .hook import HOOKS, Hook
from .iter_timer import IterTimerHook
from .logger import (DvcliveLoggerHook, LoggerHook, MlflowLoggerHook,
                     NeptuneLoggerHook, PaviLoggerHook, TensorboardLoggerHook,
                     TextLoggerHook, WandbLoggerHook)
from .lr_updater import LrUpdaterHook
from .memory import EmptyCacheHook
from .momentum_updater import MomentumUpdaterHook
from .optimizer import (Fp16OptimizerHook, GradientCumulativeFp16OptimizerHook,
                        GradientCumulativeOptimizerHook, OptimizerHook)
from .profiler import ProfilerHook
from .sampler_seed import DistSamplerSeedHook
from .sync_buffer import SyncBuffersHook

__all__ = [
    'HOOKS', 'Hook', 'CheckpointHook', 'ClosureHook', 'LrUpdaterHook',
    'OptimizerHook', 'Fp16OptimizerHook', 'IterTimerHook',
    'DistSamplerSeedHook', 'EmptyCacheHook', 'LoggerHook', 'MlflowLoggerHook',
    'PaviLoggerHook', 'TextLoggerHook', 'TensorboardLoggerHook',
    'NeptuneLoggerHook', 'WandbLoggerHook', 'DvcliveLoggerHook',
    'MomentumUpdaterHook', 'SyncBuffersHook', 'EMAHook', 'EvalHook',
    'DistEvalHook', 'ProfilerHook', 'GradientCumulativeOptimizerHook',
    'GradientCumulativeFp16OptimizerHook'
]
lavis/common/annotator/uniformer/mmcv/runner/hooks/checkpoint.py

# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import warnings

from annotator.uniformer.mmcv.fileio import FileClient
from ..dist_utils import allreduce_params, master_only
from .hook import HOOKS, Hook


@HOOKS.register_module()
class CheckpointHook(Hook):
    """Save checkpoints periodically.

    Args:
        interval (int): The saving period. If ``by_epoch=True``, interval
            indicates epochs, otherwise it indicates iterations.
            Default: -1, which means "never".
        by_epoch (bool): Saving checkpoints by epoch or by iteration.
            Default: True.
        save_optimizer (bool): Whether to save optimizer state_dict in the
            checkpoint. It is usually used for resuming experiments.
            Default: True.
        out_dir (str, optional): The root directory to save checkpoints. If
            not specified, ``runner.work_dir`` will be used by default. If
            specified, the ``out_dir`` will be the concatenation of
            ``out_dir`` and the last level directory of ``runner.work_dir``.
            `Changed in version 1.3.16.`
        max_keep_ckpts (int, optional): The maximum checkpoints to keep.
            In some cases we want only the latest few checkpoints and would
            like to delete old ones to save disk space.
            Default: -1, which means unlimited.
        save_last (bool, optional): Whether to force the last checkpoint to be
            saved regardless of interval. Default: True.
        sync_buffer (bool, optional): Whether to synchronize buffers in
            different gpus. Default: False.
        file_client_args (dict, optional): Arguments to instantiate a
            FileClient. See :class:`mmcv.fileio.FileClient` for details.
            Default: None.
            `New in version 1.3.16.`

    .. warning::
        Before v1.3.16, the ``out_dir`` argument indicates the path where the
        checkpoint is stored. However, since v1.3.16, ``out_dir`` indicates the
        root directory and the final path to save checkpoint is the
        concatenation of ``out_dir`` and the last level directory of
        ``runner.work_dir``. Suppose the value of ``out_dir`` is "/path/of/A"
        and the value of ``runner.work_dir`` is "/path/of/B", then the final
        path will be "/path/of/A/B".
    """

    def __init__(self,
                 interval=-1,
                 by_epoch=True,
                 save_optimizer=True,
                 out_dir=None,
                 max_keep_ckpts=-1,
                 save_last=True,
                 sync_buffer=False,
                 file_client_args=None,
                 **kwargs):
        self.interval = interval
        self.by_epoch = by_epoch
        self.save_optimizer = save_optimizer
        self.out_dir = out_dir
        self.max_keep_ckpts = max_keep_ckpts
        self.save_last = save_last
        self.args = kwargs
        self.sync_buffer = sync_buffer
        self.file_client_args = file_client_args

    def before_run(self, runner):
        if not self.out_dir:
            self.out_dir = runner.work_dir

        self.file_client = FileClient.infer_client(self.file_client_args,
                                                   self.out_dir)

        # if `self.out_dir` is not equal to `runner.work_dir`, it means that
        # `self.out_dir` is set so the final `self.out_dir` is the
        # concatenation of `self.out_dir` and the last level directory of
        # `runner.work_dir`
        if self.out_dir != runner.work_dir:
            basename = osp.basename(runner.work_dir.rstrip(osp.sep))
            self.out_dir = self.file_client.join_path(self.out_dir, basename)

        runner.logger.info((f'Checkpoints will be saved to {self.out_dir} by '
                            f'{self.file_client.name}.'))

        # disable the create_symlink option because some file backends do not
        # allow to create a symlink
        if 'create_symlink' in self.args:
            if self.args[
                    'create_symlink'] and not self.file_client.allow_symlink:
                self.args['create_symlink'] = False
                warnings.warn(
                    ('create_symlink is set as True by the user but is '
                     'changed to be False because creating symbolic link is '
                     f'not allowed in {self.file_client.name}'))
        else:
            self.args['create_symlink'] = self.file_client.allow_symlink

    def after_train_epoch(self, runner):
        if not self.by_epoch:
            return

        # save checkpoint for following cases:
        # 1. every ``self.interval`` epochs
        # 2. reach the last epoch of training
        if self.every_n_epochs(
                runner, self.interval) or (self.save_last
                                           and self.is_last_epoch(runner)):
            runner.logger.info(
                f'Saving checkpoint at {runner.epoch + 1} epochs')
            if self.sync_buffer:
                allreduce_params(runner.model.buffers())
            self._save_checkpoint(runner)

    @master_only
    def _save_checkpoint(self, runner):
        """Save the current checkpoint and delete unwanted checkpoints."""
        runner.save_checkpoint(
            self.out_dir, save_optimizer=self.save_optimizer, **self.args)
        if runner.meta is not None:
            if self.by_epoch:
                cur_ckpt_filename = self.args.get(
                    'filename_tmpl', 'epoch_{}.pth').format(runner.epoch + 1)
            else:
                cur_ckpt_filename = self.args.get(
                    'filename_tmpl', 'iter_{}.pth').format(runner.iter + 1)
            runner.meta.setdefault('hook_msgs', dict())
            runner.meta['hook_msgs']['last_ckpt'] = self.file_client.join_path(
                self.out_dir, cur_ckpt_filename)
        # remove other checkpoints
        if self.max_keep_ckpts > 0:
            if self.by_epoch:
                name = 'epoch_{}.pth'
                current_ckpt = runner.epoch + 1
            else:
                name = 'iter_{}.pth'
                current_ckpt = runner.iter + 1
            redundant_ckpts = range(
                current_ckpt - self.max_keep_ckpts * self.interval, 0,
                -self.interval)
            filename_tmpl = self.args.get('filename_tmpl', name)
            for _step in redundant_ckpts:
                ckpt_path = self.file_client.join_path(
                    self.out_dir, filename_tmpl.format(_step))
                if self.file_client.isfile(ckpt_path):
                    self.file_client.remove(ckpt_path)
                else:
                    break

    def after_train_iter(self, runner):
        if self.by_epoch:
            return

        # save checkpoint for following cases:
        # 1. every ``self.interval`` iterations
        # 2. reach the last iteration of training
        if self.every_n_iters(
                runner, self.interval) or (self.save_last
                                           and self.is_last_iter(runner)):
            runner.logger.info(
                f'Saving checkpoint at {runner.iter + 1} iterations')
            if self.sync_buffer:
                allreduce_params(runner.model.buffers())
            self._save_checkpoint(runner)
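A hedged instantiation sketch for the hook above; the commented-out registration line assumes BaseRunner's usual register_hook() API (defined in base_runner.py, not in this file) and a runner built elsewhere:

from annotator.uniformer.mmcv.runner.hooks import CheckpointHook

ckpt_hook = CheckpointHook(
    interval=1,           # save every epoch (by_epoch=True is the default)
    max_keep_ckpts=3,     # keep only the three most recent epoch_*.pth files
    save_optimizer=True)  # store optimizer state so training can resume
# runner.register_hook(ckpt_hook)  # assumed BaseRunner API; runner built elsewhere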
lavis/common/annotator/uniformer/mmcv/runner/hooks/closure.py

# Copyright (c) OpenMMLab. All rights reserved.
from .hook import HOOKS, Hook


@HOOKS.register_module()
class ClosureHook(Hook):

    def __init__(self, fn_name, fn):
        assert hasattr(self, fn_name)
        assert callable(fn)
        setattr(self, fn_name, fn)
lavis/common/annotator/uniformer/mmcv/runner/hooks/ema.py

# Copyright (c) OpenMMLab. All rights reserved.
from ...parallel import is_module_wrapper
from ..hooks.hook import HOOKS, Hook


@HOOKS.register_module()
class EMAHook(Hook):
    r"""Exponential Moving Average Hook.

    Use Exponential Moving Average on all parameters of the model in the
    training process. All parameters have an ema backup, which is updated by
    the formula below. EMAHook takes priority over EvalHook and
    CheckpointSaverHook.

        .. math::

            \text{Xema\_{t+1}} = (1 - \text{momentum}) \times
            \text{Xema\_{t}} + \text{momentum} \times X_t

    Args:
        momentum (float): The momentum used for updating ema parameter.
            Defaults to 0.0002.
        interval (int): Update ema parameter every interval iteration.
            Defaults to 1.
        warm_up (int): During first warm_up steps, we may use smaller momentum
            to update ema parameters more slowly. Defaults to 100.
        resume_from (str): The checkpoint path. Defaults to None.
    """

    def __init__(self,
                 momentum=0.0002,
                 interval=1,
                 warm_up=100,
                 resume_from=None):
        assert isinstance(interval, int) and interval > 0
        self.warm_up = warm_up
        self.interval = interval
        assert momentum > 0 and momentum < 1
        self.momentum = momentum**interval
        self.checkpoint = resume_from

    def before_run(self, runner):
        """To resume the model with its ema parameters more friendly.

        Register ema parameters as ``named_buffer`` of the model.
        """
        model = runner.model
        if is_module_wrapper(model):
            model = model.module
        self.param_ema_buffer = {}
        self.model_parameters = dict(model.named_parameters(recurse=True))
        for name, value in self.model_parameters.items():
            # "." is not allowed in module's buffer name
            buffer_name = f"ema_{name.replace('.', '_')}"
            self.param_ema_buffer[name] = buffer_name
            model.register_buffer(buffer_name, value.data.clone())
        self.model_buffers = dict(model.named_buffers(recurse=True))
        if self.checkpoint is not None:
            runner.resume(self.checkpoint)

    def after_train_iter(self, runner):
        """Update ema parameter every self.interval iterations."""
        curr_step = runner.iter
        # We warm up the momentum considering the instability at beginning
        momentum = min(self.momentum,
                       (1 + curr_step) / (self.warm_up + curr_step))
        if curr_step % self.interval != 0:
            return
        for name, parameter in self.model_parameters.items():
            buffer_name = self.param_ema_buffer[name]
            buffer_parameter = self.model_buffers[buffer_name]
            buffer_parameter.mul_(1 - momentum).add_(momentum, parameter.data)

    def after_train_epoch(self, runner):
        """We load parameter values from the ema backup into the model before
        the EvalHook."""
        self._swap_ema_parameters()

    def before_train_epoch(self, runner):
        """We recover the model's parameters from the ema backup after the
        last epoch's EvalHook."""
        self._swap_ema_parameters()

    def _swap_ema_parameters(self):
        """Swap the parameters of the model with the parameters in
        ema_buffer."""
        for name, value in self.model_parameters.items():
            temp = value.data.clone()
            ema_buffer = self.model_buffers[self.param_ema_buffer[name]]
            value.data.copy_(ema_buffer.data)
            ema_buffer.data.copy_(temp)
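A hedged sketch of enabling the hook above; the hook itself only needs its constructor arguments, while the commented registration line assumes the register_hook()/priority API from BaseRunner, which is not shown in this file:

from annotator.uniformer.mmcv.runner.hooks import EMAHook

ema_hook = EMAHook(momentum=0.0002, interval=1, warm_up=100)
# runner.register_hook(ema_hook, priority='HIGH')  # assumed BaseRunner API; the
# docstring above notes EMAHook must run before EvalHook/CheckpointSaverHook.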
lavis/common/annotator/uniformer/mmcv/runner/hooks/evaluation.py

# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import warnings
from math import inf

import torch.distributed as dist
from torch.nn.modules.batchnorm import _BatchNorm
from torch.utils.data import DataLoader

from annotator.uniformer.mmcv.fileio import FileClient
from annotator.uniformer.mmcv.utils import is_seq_of
from .hook import Hook
from .logger import LoggerHook


class EvalHook(Hook):
    """Non-Distributed evaluation hook.

    This hook will regularly perform evaluation in a given interval when
    performing in a non-distributed environment.

    Args:
        dataloader (DataLoader): A PyTorch dataloader, whose dataset has
            implemented an ``evaluate`` function.
        start (int | None, optional): Evaluation starting epoch. It enables
            evaluation before the training starts if ``start`` <= the resuming
            epoch. If None, whether to evaluate is merely decided by
            ``interval``. Default: None.
        interval (int): Evaluation interval. Default: 1.
        by_epoch (bool): Whether to perform evaluation by epoch or by
            iteration. If set to True, it will perform by epoch. Otherwise,
            by iteration. Default: True.
        save_best (str, optional): If a metric is specified, it would measure
            the best checkpoint during evaluation. The information about the
            best checkpoint would be saved in ``runner.meta['hook_msgs']`` to
            keep the best score value and best checkpoint path, which will
            also be loaded when resuming a checkpoint. Options are the
            evaluation metrics on the test dataset, e.g., ``bbox_mAP``,
            ``segm_mAP`` for bbox detection and instance segmentation,
            ``AR@100`` for proposal recall. If ``save_best`` is ``auto``, the
            first key of the returned ``OrderedDict`` result will be used.
            Default: None.
        rule (str | None, optional): Comparison rule for the best score. If
            set to None, it will infer a reasonable rule. Keys such as 'acc',
            'top', etc. will be inferred by the 'greater' rule. Keys
            containing 'loss' will be inferred by the 'less' rule. Options are
            'greater', 'less', None. Default: None.
        test_fn (callable, optional): test a model with samples from a
            dataloader, and return the test results. If ``None``, the default
            test function ``mmcv.engine.single_gpu_test`` will be used.
            (default: ``None``)
        greater_keys (List[str] | None, optional): Metric keys that will be
            inferred by the 'greater' comparison rule. If ``None``,
            _default_greater_keys will be used. (default: ``None``)
        less_keys (List[str] | None, optional): Metric keys that will be
            inferred by the 'less' comparison rule. If ``None``,
            _default_less_keys will be used. (default: ``None``)
        out_dir (str, optional): The root directory to save checkpoints. If
            not specified, `runner.work_dir` will be used by default. If
            specified, the `out_dir` will be the concatenation of `out_dir`
            and the last level directory of `runner.work_dir`.
            `New in version 1.3.16.`
        file_client_args (dict): Arguments to instantiate a FileClient.
            See :class:`mmcv.fileio.FileClient` for details. Default: None.
            `New in version 1.3.16.`
        **eval_kwargs: Evaluation arguments fed into the evaluate function of
            the dataset.

    Notes:
        If new arguments are added for EvalHook, tools/test.py,
        tools/eval_metric.py may be affected.
    """

    # Since the key for determining greater or less is related to the
    # downstream tasks, downstream repos may need to overwrite the following
    # inner variables accordingly.

    rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y}
    init_value_map = {'greater': -inf, 'less': inf}
    _default_greater_keys = [
        'acc', 'top', 'AR@', 'auc', 'precision', 'mAP', 'mDice', 'mIoU',
        'mAcc', 'aAcc'
    ]
    _default_less_keys = ['loss']

    def __init__(self,
                 dataloader,
                 start=None,
                 interval=1,
                 by_epoch=True,
                 save_best=None,
                 rule=None,
                 test_fn=None,
                 greater_keys=None,
                 less_keys=None,
                 out_dir=None,
                 file_client_args=None,
                 **eval_kwargs):
        if not isinstance(dataloader, DataLoader):
            raise TypeError(f'dataloader must be a pytorch DataLoader, '
                            f'but got {type(dataloader)}')

        if interval <= 0:
            raise ValueError(f'interval must be a positive number, '
                             f'but got {interval}')

        assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean'

        if start is not None and start < 0:
            raise ValueError(f'The evaluation start epoch {start} is smaller '
                             f'than 0')

        self.dataloader = dataloader
        self.interval = interval
        self.start = start
        self.by_epoch = by_epoch

        assert isinstance(save_best, str) or save_best is None, \
            '"save_best" should be a str or None ' \
            f'rather than {type(save_best)}'
        self.save_best = save_best
        self.eval_kwargs = eval_kwargs
        self.initial_flag = True

        if test_fn is None:
            from annotator.uniformer.mmcv.engine import single_gpu_test
            self.test_fn = single_gpu_test
        else:
            self.test_fn = test_fn

        if greater_keys is None:
            self.greater_keys = self._default_greater_keys
        else:
            if not isinstance(greater_keys, (list, tuple)):
                greater_keys = (greater_keys, )
            assert is_seq_of(greater_keys, str)
            self.greater_keys = greater_keys

        if less_keys is None:
            self.less_keys = self._default_less_keys
        else:
            if not isinstance(less_keys, (list, tuple)):
                less_keys = (less_keys, )
            assert is_seq_of(less_keys, str)
            self.less_keys = less_keys

        if self.save_best is not None:
            self.best_ckpt_path = None
            self._init_rule(rule, self.save_best)

        self.out_dir = out_dir
        self.file_client_args = file_client_args

    def _init_rule(self, rule, key_indicator):
        """Initialize rule, key_indicator, comparison_func, and best score.

        Here is the rule to determine which rule is used for the key indicator
        when the rule is not specific (note that the key indicator matching
        is case-insensitive):
        1. If the key indicator is in ``self.greater_keys``, the rule will be
           specified as 'greater'.
        2. Or if the key indicator is in ``self.less_keys``, the rule will be
           specified as 'less'.
        3. Or if the key indicator is equal to the substring in any one item
           in ``self.greater_keys``, the rule will be specified as 'greater'.
        4. Or if the key indicator is equal to the substring in any one item
           in ``self.less_keys``, the rule will be specified as 'less'.

        Args:
            rule (str | None): Comparison rule for best score.
            key_indicator (str | None): Key indicator to determine the
                comparison rule.
        """
        if rule not in self.rule_map and rule is not None:
            raise KeyError(f'rule must be greater, less or None, '
                           f'but got {rule}.')

        if rule is None:
            if key_indicator != 'auto':
                # `_lc` here means we use the lower case of keys for
                # case-insensitive matching
                key_indicator_lc = key_indicator.lower()
                greater_keys = [key.lower() for key in self.greater_keys]
                less_keys = [key.lower() for key in self.less_keys]

                if key_indicator_lc in greater_keys:
                    rule = 'greater'
                elif key_indicator_lc in less_keys:
                    rule = 'less'
                elif any(key in key_indicator_lc for key in greater_keys):
                    rule = 'greater'
                elif any(key in key_indicator_lc for key in less_keys):
                    rule = 'less'
                else:
                    raise ValueError(f'Cannot infer the rule for key '
                                     f'{key_indicator}, thus a specific rule '
                                     f'must be specified.')
        self.rule = rule
        self.key_indicator = key_indicator
        if self.rule is not None:
            self.compare_func = self.rule_map[self.rule]

    def before_run(self, runner):
        if not self.out_dir:
            self.out_dir = runner.work_dir

        self.file_client = FileClient.infer_client(self.file_client_args,
                                                   self.out_dir)

        # if `self.out_dir` is not equal to `runner.work_dir`, it means that
        # `self.out_dir` is set so the final `self.out_dir` is the
        # concatenation of `self.out_dir` and the last level directory of
        # `runner.work_dir`
        if self.out_dir != runner.work_dir:
            basename = osp.basename(runner.work_dir.rstrip(osp.sep))
            self.out_dir = self.file_client.join_path(self.out_dir, basename)
            runner.logger.info(
                (f'The best checkpoint will be saved to {self.out_dir} by '
                 f'{self.file_client.name}'))

        if self.save_best is not None:
            if runner.meta is None:
                warnings.warn('runner.meta is None. Creating an empty one.')
                runner.meta = dict()
            runner.meta.setdefault('hook_msgs', dict())
            self.best_ckpt_path = runner.meta['hook_msgs'].get(
                'best_ckpt', None)

    def before_train_iter(self, runner):
        """Evaluate the model only at the start of training by iteration."""
        if self.by_epoch or not self.initial_flag:
            return
        if self.start is not None and runner.iter >= self.start:
            self.after_train_iter(runner)
        self.initial_flag = False

    def before_train_epoch(self, runner):
        """Evaluate the model only at the start of training by epoch."""
        if not (self.by_epoch and self.initial_flag):
            return
        if self.start is not None and runner.epoch >= self.start:
            self.after_train_epoch(runner)
        self.initial_flag = False

    def after_train_iter(self, runner):
        """Called after every training iter to evaluate the results."""
        if not self.by_epoch and self._should_evaluate(runner):
            # Because the priority of EvalHook is higher than LoggerHook, the
            # training log and the evaluating log are mixed. Therefore,
            # we need to dump the training log and clear it before evaluating
            # log is generated. In addition, this problem will only appear in
            # `IterBasedRunner` whose `self.by_epoch` is False, because
            # `EpochBasedRunner` whose `self.by_epoch` is True calls
            # `_do_evaluate` in `after_train_epoch` stage, and at this stage
            # the training log has been printed, so it will not cause any
            # problem. more details at
            # https://github.com/open-mmlab/mmsegmentation/issues/694
            for hook in runner._hooks:
                if isinstance(hook, LoggerHook):
                    hook.after_train_iter(runner)
            runner.log_buffer.clear()

            self._do_evaluate(runner)

    def after_train_epoch(self, runner):
        """Called after every training epoch to evaluate the results."""
        if self.by_epoch and self._should_evaluate(runner):
            self._do_evaluate(runner)

    def _do_evaluate(self, runner):
        """Perform evaluation and save ckpt."""
        results = self.test_fn(runner.model, self.dataloader)
        runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
        key_score = self.evaluate(runner, results)
        # the key_score may be `None` so it needs to skip the action to save
        # the best checkpoint
        if self.save_best and key_score:
            self._save_ckpt(runner, key_score)

    def _should_evaluate(self, runner):
        """Judge whether to perform evaluation.

        Here is the rule to judge whether to perform evaluation:
        1. It will not perform evaluation during the epoch/iteration interval,
           which is determined by ``self.interval``.
        2. It will not perform evaluation if the start time is larger than
           the current time.
        3. It will not perform evaluation when the current time is larger than
           the start time but during the epoch/iteration interval.

        Returns:
            bool: The flag indicating whether to perform evaluation.
        """
        if self.by_epoch:
            current = runner.epoch
            check_time = self.every_n_epochs
        else:
            current = runner.iter
            check_time = self.every_n_iters

        if self.start is None:
            if not check_time(runner, self.interval):
                # No evaluation during the interval.
                return False
        elif (current + 1) < self.start:
            # No evaluation if start is larger than the current time.
            return False
        else:
            # Evaluation only at epochs/iters 3, 5, 7...
            # if start==3 and interval==2
            if (current + 1 - self.start) % self.interval:
                return False
        return True

    def _save_ckpt(self, runner, key_score):
        """Save the best checkpoint.

        It will compare the score according to the compare function, write
        related information (best score, best checkpoint path) and save the
        best checkpoint into ``work_dir``.
        """
        if self.by_epoch:
            current = f'epoch_{runner.epoch + 1}'
            cur_type, cur_time = 'epoch', runner.epoch + 1
        else:
            current = f'iter_{runner.iter + 1}'
            cur_type, cur_time = 'iter', runner.iter + 1

        best_score = runner.meta['hook_msgs'].get(
            'best_score', self.init_value_map[self.rule])
        if self.compare_func(key_score, best_score):
            best_score = key_score
            runner.meta['hook_msgs']['best_score'] = best_score

            if self.best_ckpt_path and self.file_client.isfile(
                    self.best_ckpt_path):
                self.file_client.remove(self.best_ckpt_path)
                runner.logger.info(
                    (f'The previous best checkpoint {self.best_ckpt_path} was '
                     'removed'))

            best_ckpt_name = f'best_{self.key_indicator}_{current}.pth'
            self.best_ckpt_path = self.file_client.join_path(
                self.out_dir, best_ckpt_name)
            runner.meta['hook_msgs']['best_ckpt'] = self.best_ckpt_path

            runner.save_checkpoint(
                self.out_dir, best_ckpt_name, create_symlink=False)
            runner.logger.info(
                f'Now best checkpoint is saved as {best_ckpt_name}.')
            runner.logger.info(
                f'Best {self.key_indicator} is {best_score:0.4f} '
                f'at {cur_time} {cur_type}.')

    def evaluate(self, runner, results):
        """Evaluate the results.

        Args:
            runner (:obj:`mmcv.Runner`): The underlying training runner.
            results (list): Output results.
        """
        eval_res = self.dataloader.dataset.evaluate(
            results, logger=runner.logger, **self.eval_kwargs)

        for name, val in eval_res.items():
            runner.log_buffer.output[name] = val
        runner.log_buffer.ready = True

        if self.save_best is not None:
            # If the performance of the model is poor, the `eval_res` may be
            # an empty dict and it will raise an exception when
            # `self.save_best` is not None. More details at
            # https://github.com/open-mmlab/mmdetection/issues/6265.
            if not eval_res:
                warnings.warn(
                    'Since `eval_res` is an empty dict, the behavior to save '
                    'the best checkpoint will be skipped in this evaluation.')
                return None

            if self.key_indicator == 'auto':
                # infer from eval_results
                self._init_rule(self.rule, list(eval_res.keys())[0])
            return eval_res[self.key_indicator]

        return None


class DistEvalHook(EvalHook):
    """Distributed evaluation hook.

    This hook will regularly perform evaluation in a given interval when
    performing in a distributed environment.

    Args:
        dataloader (DataLoader): A PyTorch dataloader, whose dataset has
            implemented an ``evaluate`` function.
        start (int | None, optional): Evaluation starting epoch. It enables
            evaluation before the training starts if ``start`` <= the resuming
            epoch. If None, whether to evaluate is merely decided by
            ``interval``. Default: None.
        interval (int): Evaluation interval. Default: 1.
        by_epoch (bool): Whether to perform evaluation by epoch or by
            iteration. If set to True, it will perform by epoch. Otherwise,
            by iteration. Default: True.
        save_best (str, optional): If a metric is specified, it would measure
            the best checkpoint during evaluation. The information about the
            best checkpoint would be saved in ``runner.meta['hook_msgs']`` to
            keep the best score value and best checkpoint path, which will
            also be loaded when resuming a checkpoint. Options are the
            evaluation metrics on the test dataset, e.g., ``bbox_mAP``,
            ``segm_mAP`` for bbox detection and instance segmentation,
            ``AR@100`` for proposal recall. If ``save_best`` is ``auto``, the
            first key of the returned ``OrderedDict`` result will be used.
            Default: None.
        rule (str | None, optional): Comparison rule for the best score. If
            set to None, it will infer a reasonable rule. Keys such as 'acc',
            'top', etc. will be inferred by the 'greater' rule. Keys
            containing 'loss' will be inferred by the 'less' rule. Options are
            'greater', 'less', None. Default: None.
        test_fn (callable, optional): test a model with samples from a
            dataloader in a multi-gpu manner, and return the test results. If
            ``None``, the default test function ``mmcv.engine.multi_gpu_test``
            will be used. (default: ``None``)
        tmpdir (str | None): Temporary directory to save the results of all
            processes. Default: None.
        gpu_collect (bool): Whether to use gpu or cpu to collect results.
            Default: False.
        broadcast_bn_buffer (bool): Whether to broadcast the
            buffer(running_mean and running_var) of rank 0 to other ranks
            before evaluation. Default: True.
        out_dir (str, optional): The root directory to save checkpoints. If
            not specified, `runner.work_dir` will be used by default. If
            specified, the `out_dir` will be the concatenation of `out_dir`
            and the last level directory of `runner.work_dir`.
        file_client_args (dict): Arguments to instantiate a FileClient.
            See :class:`mmcv.fileio.FileClient` for details. Default: None.
        **eval_kwargs: Evaluation arguments fed into the evaluate function of
            the dataset.
    """

    def __init__(self,
                 dataloader,
                 start=None,
                 interval=1,
                 by_epoch=True,
                 save_best=None,
                 rule=None,
                 test_fn=None,
                 greater_keys=None,
                 less_keys=None,
                 broadcast_bn_buffer=True,
                 tmpdir=None,
                 gpu_collect=False,
                 out_dir=None,
                 file_client_args=None,
                 **eval_kwargs):

        if test_fn is None:
            from annotator.uniformer.mmcv.engine import multi_gpu_test
            test_fn = multi_gpu_test

        super().__init__(
            dataloader,
            start=start,
            interval=interval,
            by_epoch=by_epoch,
            save_best=save_best,
            rule=rule,
            test_fn=test_fn,
            greater_keys=greater_keys,
            less_keys=less_keys,
            out_dir=out_dir,
            file_client_args=file_client_args,
            **eval_kwargs)

        self.broadcast_bn_buffer = broadcast_bn_buffer
        self.tmpdir = tmpdir
        self.gpu_collect = gpu_collect

    def _do_evaluate(self, runner):
        """Perform evaluation and save ckpt."""
        # Synchronization of BatchNorm's buffer (running_mean
        # and running_var) is not supported in the DDP of pytorch,
        # which may cause the inconsistent performance of models in
        # different ranks, so we broadcast BatchNorm's buffers
        # of rank 0 to other ranks to avoid this.
        if self.broadcast_bn_buffer:
            model = runner.model
            for name, module in model.named_modules():
                if isinstance(module,
                              _BatchNorm) and module.track_running_stats:
                    dist.broadcast(module.running_var, 0)
                    dist.broadcast(module.running_mean, 0)

        tmpdir = self.tmpdir
        if tmpdir is None:
            tmpdir = osp.join(runner.work_dir, '.eval_hook')

        results = self.test_fn(
            runner.model,
            self.dataloader,
            tmpdir=tmpdir,
            gpu_collect=self.gpu_collect)
        if runner.rank == 0:
            print('\n')
            runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
            key_score = self.evaluate(runner, results)
            # the key_score may be `None` so it needs to skip the action to
            # save the best checkpoint
            if self.save_best and key_score:
                self._save_ckpt(runner, key_score)
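A hedged, self-contained sketch of constructing the EvalHook above: the only hard requirements visible in the file are a real torch DataLoader and a dataset that implements evaluate(); the commented registration line again assumes BaseRunner's register_hook() API, which lives outside this file. Constructing the hook also triggers the import of annotator.uniformer.mmcv.engine.single_gpu_test, as in the __init__ above.

import torch
from torch.utils.data import DataLoader, Dataset
from annotator.uniformer.mmcv.runner.hooks import EvalHook

class ToyEvalDataset(Dataset):  # hypothetical dataset, only for illustration
    def __len__(self):
        return 4

    def __getitem__(self, idx):
        return torch.zeros(2)

    def evaluate(self, results, logger=None, **kwargs):
        # EvalHook.evaluate() writes the returned dict into runner.log_buffer
        return dict(mIoU=0.5)

eval_hook = EvalHook(
    DataLoader(ToyEvalDataset(), batch_size=2),
    interval=1,          # evaluate every epoch (by_epoch=True is the default)
    save_best='mIoU')    # 'mIoU' is in _default_greater_keys, so rule='greater'
# runner.register_hook(eval_hook)  # assumed BaseRunner API; runner built elsewhere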
lavis/common/annotator/uniformer/mmcv/runner/hooks/hook.py

# Copyright (c) OpenMMLab. All rights reserved.
from annotator.uniformer.mmcv.utils import Registry, is_method_overridden

HOOKS = Registry('hook')


class Hook:
    stages = ('before_run', 'before_train_epoch', 'before_train_iter',
              'after_train_iter', 'after_train_epoch', 'before_val_epoch',
              'before_val_iter', 'after_val_iter', 'after_val_epoch',
              'after_run')

    def before_run(self, runner):
        pass

    def after_run(self, runner):
        pass

    def before_epoch(self, runner):
        pass

    def after_epoch(self, runner):
        pass

    def before_iter(self, runner):
        pass

    def after_iter(self, runner):
        pass

    def before_train_epoch(self, runner):
        self.before_epoch(runner)

    def before_val_epoch(self, runner):
        self.before_epoch(runner)

    def after_train_epoch(self, runner):
        self.after_epoch(runner)

    def after_val_epoch(self, runner):
        self.after_epoch(runner)

    def before_train_iter(self, runner):
        self.before_iter(runner)

    def before_val_iter(self, runner):
        self.before_iter(runner)

    def after_train_iter(self, runner):
        self.after_iter(runner)

    def after_val_iter(self, runner):
        self.after_iter(runner)

    def every_n_epochs(self, runner, n):
        return (runner.epoch + 1) % n == 0 if n > 0 else False

    def every_n_inner_iters(self, runner, n):
        return (runner.inner_iter + 1) % n == 0 if n > 0 else False

    def every_n_iters(self, runner, n):
        return (runner.iter + 1) % n == 0 if n > 0 else False

    def end_of_epoch(self, runner):
        return runner.inner_iter + 1 == len(runner.data_loader)

    def is_last_epoch(self, runner):
        return runner.epoch + 1 == runner._max_epochs

    def is_last_iter(self, runner):
        return runner.iter + 1 == runner._max_iters

    def get_triggered_stages(self):
        trigger_stages = set()
        for stage in Hook.stages:
            if is_method_overridden(stage, Hook, self):
                trigger_stages.add(stage)

        # some methods will be triggered in multi stages
        # use this dict to map method to stages.
        method_stages_map = {
            'before_epoch': ['before_train_epoch', 'before_val_epoch'],
            'after_epoch': ['after_train_epoch', 'after_val_epoch'],
            'before_iter': ['before_train_iter', 'before_val_iter'],
            'after_iter': ['after_train_iter', 'after_val_iter'],
        }

        for method, map_stages in method_stages_map.items():
            if is_method_overridden(method, Hook, self):
                trigger_stages.update(map_stages)

        return [stage for stage in Hook.stages if stage in trigger_stages]
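A minimal sketch of extending the Hook base class above and registering it in the HOOKS registry, mirroring how the built-in hooks in this directory are defined; LossWatchHook is a hypothetical name, not part of this commit:

from annotator.uniformer.mmcv.runner.hooks import HOOKS, Hook

@HOOKS.register_module()
class LossWatchHook(Hook):  # hypothetical hook, for illustration only

    def after_train_iter(self, runner):
        # runs every iteration; throttle it with the helpers defined on Hook
        if self.every_n_iters(runner, 50):
            loss = runner.outputs.get('loss', None)  # set by run_iter()
            runner.logger.info(f'iter {runner.iter + 1}: loss={loss}')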
lavis/common/annotator/uniformer/mmcv/runner/hooks/iter_timer.py
0 → 100644

# Copyright (c) OpenMMLab. All rights reserved.
import time

from .hook import HOOKS, Hook


@HOOKS.register_module()
class IterTimerHook(Hook):

    def before_epoch(self, runner):
        self.t = time.time()

    def before_iter(self, runner):
        runner.log_buffer.update({'data_time': time.time() - self.t})

    def after_iter(self, runner):
        runner.log_buffer.update({'time': time.time() - self.t})
        self.t = time.time()
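The shared timer gives the two logged values their meaning: `data_time` is the gap between the end of the previous iteration (or the epoch start) and `before_iter`, which in mmcv's epoch-based loop is essentially the dataloader wait, while `time` is the full wall-clock duration of the iteration including that wait. A standalone sketch of the same pattern, with hypothetical `fetch_batch`/`train_step` helpers, makes the split explicit:

import time

t = time.time()                    # before_epoch: start the timer
for _ in range(num_iters):         # num_iters, fetch_batch, train_step are placeholders
    batch = fetch_batch()
    data_time = time.time() - t    # what before_iter logs as 'data_time'
    train_step(batch)
    iter_time = time.time() - t    # what after_iter logs as 'time'
    t = time.time()                # reset for the next iteration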
lavis/common/annotator/uniformer/mmcv/runner/hooks/logger/__init__.py
0 → 100644

# Copyright (c) OpenMMLab. All rights reserved.
from .base import LoggerHook
from .dvclive import DvcliveLoggerHook
from .mlflow import MlflowLoggerHook
from .neptune import NeptuneLoggerHook
from .pavi import PaviLoggerHook
from .tensorboard import TensorboardLoggerHook
from .text import TextLoggerHook
from .wandb import WandbLoggerHook

__all__ = [
    'LoggerHook', 'MlflowLoggerHook', 'PaviLoggerHook',
    'TensorboardLoggerHook', 'TextLoggerHook', 'WandbLoggerHook',
    'NeptuneLoggerHook', 'DvcliveLoggerHook'
]
lavis/common/annotator/uniformer/mmcv/runner/hooks/logger/base.py
0 → 100644

# Copyright (c) OpenMMLab. All rights reserved.
import numbers
from abc import ABCMeta, abstractmethod

import numpy as np
import torch

from ..hook import Hook


class LoggerHook(Hook):
    """Base class for logger hooks.

    Args:
        interval (int): Logging interval (every k iterations).
        ignore_last (bool): Ignore the log of last iterations in each epoch
            if less than `interval`.
        reset_flag (bool): Whether to clear the output buffer after logging.
        by_epoch (bool): Whether EpochBasedRunner is used.
    """

    __metaclass__ = ABCMeta

    def __init__(self,
                 interval=10,
                 ignore_last=True,
                 reset_flag=False,
                 by_epoch=True):
        self.interval = interval
        self.ignore_last = ignore_last
        self.reset_flag = reset_flag
        self.by_epoch = by_epoch

    @abstractmethod
    def log(self, runner):
        pass

    @staticmethod
    def is_scalar(val, include_np=True, include_torch=True):
        """Tell the input variable is a scalar or not.

        Args:
            val: Input variable.
            include_np (bool): Whether include 0-d np.ndarray as a scalar.
            include_torch (bool): Whether include 0-d torch.Tensor as a scalar.

        Returns:
            bool: True or False.
        """
        if isinstance(val, numbers.Number):
            return True
        elif include_np and isinstance(val, np.ndarray) and val.ndim == 0:
            return True
        elif include_torch and isinstance(val, torch.Tensor) and len(val) == 1:
            return True
        else:
            return False

    def get_mode(self, runner):
        if runner.mode == 'train':
            if 'time' in runner.log_buffer.output:
                mode = 'train'
            else:
                mode = 'val'
        elif runner.mode == 'val':
            mode = 'val'
        else:
            raise ValueError(f"runner mode should be 'train' or 'val', "
                             f'but got {runner.mode}')
        return mode

    def get_epoch(self, runner):
        if runner.mode == 'train':
            epoch = runner.epoch + 1
        elif runner.mode == 'val':
            # normal val mode
            # runner.epoch += 1 has been done before val workflow
            epoch = runner.epoch
        else:
            raise ValueError(f"runner mode should be 'train' or 'val', "
                             f'but got {runner.mode}')
        return epoch

    def get_iter(self, runner, inner_iter=False):
        """Get the current training iteration step."""
        if self.by_epoch and inner_iter:
            current_iter = runner.inner_iter + 1
        else:
            current_iter = runner.iter + 1
        return current_iter

    def get_lr_tags(self, runner):
        tags = {}
        lrs = runner.current_lr()
        if isinstance(lrs, dict):
            for name, value in lrs.items():
                tags[f'learning_rate/{name}'] = value[0]
        else:
            tags['learning_rate'] = lrs[0]
        return tags

    def get_momentum_tags(self, runner):
        tags = {}
        momentums = runner.current_momentum()
        if isinstance(momentums, dict):
            for name, value in momentums.items():
                tags[f'momentum/{name}'] = value[0]
        else:
            tags['momentum'] = momentums[0]
        return tags

    def get_loggable_tags(self,
                          runner,
                          allow_scalar=True,
                          allow_text=False,
                          add_mode=True,
                          tags_to_skip=('time', 'data_time')):
        tags = {}
        for var, val in runner.log_buffer.output.items():
            if var in tags_to_skip:
                continue
            if self.is_scalar(val) and not allow_scalar:
                continue
            if isinstance(val, str) and not allow_text:
                continue
            if add_mode:
                var = f'{self.get_mode(runner)}/{var}'
            tags[var] = val
        tags.update(self.get_lr_tags(runner))
        tags.update(self.get_momentum_tags(runner))
        return tags

    def before_run(self, runner):
        for hook in runner.hooks[::-1]:
            if isinstance(hook, LoggerHook):
                hook.reset_flag = True
                break

    def before_epoch(self, runner):
        runner.log_buffer.clear()  # clear logs of last epoch

    def after_train_iter(self, runner):
        if self.by_epoch and self.every_n_inner_iters(runner, self.interval):
            runner.log_buffer.average(self.interval)
        elif not self.by_epoch and self.every_n_iters(runner, self.interval):
            runner.log_buffer.average(self.interval)
        elif self.end_of_epoch(runner) and not self.ignore_last:
            # not precise but more stable
            runner.log_buffer.average(self.interval)

        if runner.log_buffer.ready:
            self.log(runner)
            if self.reset_flag:
                runner.log_buffer.clear_output()

    def after_train_epoch(self, runner):
        if runner.log_buffer.ready:
            self.log(runner)
            if self.reset_flag:
                runner.log_buffer.clear_output()

    def after_val_epoch(self, runner):
        runner.log_buffer.average()
        self.log(runner)
        if self.reset_flag:
            runner.log_buffer.clear_output()
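`LoggerHook` leaves only `log()` abstract; the shipped backends (text, TensorBoard, W&B, ...) pull their values through `get_loggable_tags`, which filters the averaged log buffer and prefixes each key with the current mode. A minimal illustrative subclass, not one of the bundled backends, could be written as follows (it assumes `HOOKS` is imported from `..hook`, as the real backends do):

from ..hook import HOOKS  # same import the bundled logger backends use


@HOOKS.register_module()
class StdoutLoggerHook(LoggerHook):
    """Toy logger for illustration: print loggable tags to stdout."""

    def log(self, runner):
        # e.g. {'train/loss': 0.42, 'learning_rate': 0.0001, 'momentum': 0.9}
        tags = self.get_loggable_tags(runner)
        if tags:
            print(f'[{self.get_mode(runner)}] iter {self.get_iter(runner)}: '
                  + ', '.join(f'{k}={v}' for k, v in tags.items()))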