ModelZoo / SOLOv2-pytorch · Commits · c76a95ec

Commit c76a95ec, authored Oct 12, 2018 by Kai Chen
Parent: 2a43cc7d

    reorganize the training api

Showing 7 changed files with 161 additions and 97 deletions (+161 / -97):

    mmdet/api/__init__.py                    +5    -1
    mmdet/api/env.py                         +57   -0
    mmdet/api/train.py                       +55   -49
    mmdet/core/utils/__init__.py             +3    -3
    mmdet/core/utils/dist_utils.py           +0    -32
    mmdet/datasets/loader/build_loader.py    +1    -1
    tools/train.py                           +40   -11

mmdet/api/__init__.py  (+5 / -1)

+from .env import init_dist, get_root_logger, set_random_seed
 from .train import train_detector
 from .inference import inference_detector

-__all__ = ['train_detector', 'inference_detector']
+__all__ = [
+    'init_dist', 'get_root_logger', 'set_random_seed', 'train_detector',
+    'inference_detector'
+]

mmdet/api/env.py  (new file, mode 100644, +57 / -0)

import logging
import os
import random

import numpy as np
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from mmcv.runner import get_dist_info


def init_dist(launcher, backend='nccl', **kwargs):
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    if launcher == 'pytorch':
        _init_dist_pytorch(backend, **kwargs)
    elif launcher == 'mpi':
        _init_dist_mpi(backend, **kwargs)
    elif launcher == 'slurm':
        _init_dist_slurm(backend, **kwargs)
    else:
        raise ValueError('Invalid launcher type: {}'.format(launcher))


def _init_dist_pytorch(backend, **kwargs):
    # TODO: use local_rank instead of rank % num_gpus
    rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs)


def _init_dist_mpi(backend, **kwargs):
    raise NotImplementedError


def _init_dist_slurm(backend, **kwargs):
    raise NotImplementedError


def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def get_root_logger(log_level=logging.INFO):
    logger = logging.getLogger()
    if not logger.hasHandlers():
        logging.basicConfig(
            format='%(asctime)s - %(levelname)s - %(message)s',
            level=log_level)
    rank, _ = get_dist_info()
    if rank != 0:
        logger.setLevel('ERROR')
    return logger
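
The new env.py collects the process-level helpers (distributed init, root logger, RNG seeding) in one place under mmdet.api. A minimal sketch of how they could be used together in a standalone script, assuming an args object from argparse and a cfg from mmcv.Config with dist_params, log_level and seed fields, as in tools/train.py below:

from mmdet.api import init_dist, get_root_logger, set_random_seed

if args.launcher != 'none':
    # sets up torch.distributed; the 'pytorch' launcher reads RANK from the
    # environment, while 'mpi' and 'slurm' still raise NotImplementedError
    init_dist(args.launcher, **cfg.dist_params)

# rank 0 logs at cfg.log_level, every other rank is silenced to ERROR
logger = get_root_logger(cfg.log_level)

if cfg.seed is not None:
    # seeds Python's random, NumPy, and torch (all CUDA devices)
    set_random_seed(cfg.seed)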

mmdet/api/train.py  (+55 / -49)

 from __future__ import division

-import logging
 import random
 from collections import OrderedDict
 ...
@@ -9,11 +8,11 @@ import torch
 from mmcv.runner import Runner, DistSamplerSeedHook
 from mmcv.parallel import MMDataParallel, MMDistributedDataParallel

-from mmdet import __version__
-from mmdet.core import (init_dist, DistOptimizerHook, CocoDistEvalRecallHook,
+from mmdet.core import (DistOptimizerHook, CocoDistEvalRecallHook,
                         CocoDistEvalmAPHook)
 from mmdet.datasets import build_dataloader
 from mmdet.models import RPN
+from .env import get_root_logger


 def parse_losses(losses):
 ...
@@ -46,13 +45,6 @@ def batch_processor(model, data, train_mode):
     return outputs


-def get_logger(log_level):
-    logging.basicConfig(
-        format='%(asctime)s - %(levelname)s - %(message)s', level=log_level)
-    logger = logging.getLogger()
-    return logger
-
-
 def set_random_seed(seed):
     random.seed(seed)
     np.random.seed(seed)
 ...
@@ -60,58 +52,72 @@ def set_random_seed(seed):
     torch.cuda.manual_seed_all(seed)


-def train_detector(model, dataset, cfg):
-    # save mmdet version in checkpoint as meta data
-    cfg.checkpoint_config.meta = dict(
-        mmdet_version=__version__, config=cfg.text)
-
-    logger = get_logger(cfg.log_level)
-
-    # set random seed if specified
-    if cfg.seed is not None:
-        logger.info('Set random seed to {}'.format(cfg.seed))
-        set_random_seed(cfg.seed)
-
-    # init distributed environment if necessary
-    if cfg.launcher == 'none':
-        dist = False
-        logger.info('Non-distributed training.')
-    else:
-        dist = True
-        init_dist(cfg.launcher, **cfg.dist_params)
-        if torch.distributed.get_rank() != 0:
-            logger.setLevel('ERROR')
-        logger.info('Distributed training.')
-
+def train_detector(model,
+                   dataset,
+                   cfg,
+                   distributed=False,
+                   validate=False,
+                   logger=None):
+    if logger is None:
+        logger = get_root_logger(cfg.log_level)
+
+    # start training
+    if distributed:
+        _dist_train(model, dataset, cfg, validate=validate)
+    else:
+        _non_dist_train(model, dataset, cfg, validate=validate)
+
+
+def _dist_train(model, dataset, cfg, validate=False):
     # prepare data loaders
     data_loaders = [
-        build_dataloader(dataset, cfg.data.imgs_per_gpu,
-                         cfg.data.workers_per_gpu, cfg.gpus, dist)
+        build_dataloader(
+            dataset,
+            cfg.data.imgs_per_gpu,
+            cfg.data.workers_per_gpu,
+            dist=True)
     ]
     # put model on gpus
-    if dist:
-        model = MMDistributedDataParallel(model.cuda())
-    else:
-        model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
+    model = MMDistributedDataParallel(model.cuda())
     # build runner
     runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                     cfg.log_level)
     # register hooks
-    optimizer_config = DistOptimizerHook(
-        **cfg.optimizer_config) if dist else cfg.optimizer_config
+    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
     runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                    cfg.checkpoint_config, cfg.log_config)
-    if dist:
-        runner.register_hook(DistSamplerSeedHook())
+    runner.register_hook(DistSamplerSeedHook())
     # register eval hooks
-    if cfg.validate:
+    if validate:
         if isinstance(model.module, RPN):
             runner.register_hook(CocoDistEvalRecallHook(cfg.data.val))
         elif cfg.data.val.type == 'CocoDataset':
             runner.register_hook(CocoDistEvalmAPHook(cfg.data.val))
+    if cfg.resume_from:
+        runner.resume(cfg.resume_from)
+    elif cfg.load_from:
+        runner.load_checkpoint(cfg.load_from)
+    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
+
+
+def _non_dist_train(model, dataset, cfg, validate=False):
+    # prepare data loaders
+    data_loaders = [
+        build_dataloader(
+            dataset,
+            cfg.data.imgs_per_gpu,
+            cfg.data.workers_per_gpu,
+            cfg.gpus,
+            dist=False)
+    ]
+    # put model on gpus
+    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
+    # build runner
+    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
+                    cfg.log_level)
+    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
+                                   cfg.checkpoint_config, cfg.log_config)
     if cfg.resume_from:
         runner.resume(cfg.resume_from)
 ...
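
After this change train_detector no longer inspects cfg.launcher or calls init_dist itself; the caller picks the _dist_train or _non_dist_train path through the new distributed argument. A rough example of the new call, assuming model, train_dataset, cfg and logger are built as in tools/train.py below:

from mmdet.api import train_detector

# single-machine run on cfg.gpus GPUs (no init_dist needed beforehand);
# note that in this commit the COCO eval hooks are only registered by
# _dist_train, so validate only has an effect when distributed=True
train_detector(model, train_dataset, cfg,
               distributed=False, validate=False, logger=logger)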

mmdet/core/utils/__init__.py  (+3 / -3)

-from .dist_utils import init_dist, allreduce_grads, DistOptimizerHook
+from .dist_utils import allreduce_grads, DistOptimizerHook
 from .misc import tensor2imgs, unmap, multi_apply

 __all__ = [
-    'init_dist', 'allreduce_grads', 'DistOptimizerHook', 'tensor2imgs',
-    'unmap', 'multi_apply'
+    'allreduce_grads', 'DistOptimizerHook', 'tensor2imgs', 'unmap',
+    'multi_apply'
 ]

mmdet/core/utils/dist_utils.py  (+0 / -32)

-import os
 from collections import OrderedDict

-import torch
-import torch.multiprocessing as mp
 import torch.distributed as dist
 from torch._utils import (_flatten_dense_tensors, _unflatten_dense_tensors,
                           _take_tensors)
 from mmcv.runner import OptimizerHook


-def init_dist(launcher, backend='nccl', **kwargs):
-    if mp.get_start_method(allow_none=True) is None:
-        mp.set_start_method('spawn')
-    if launcher == 'pytorch':
-        _init_dist_pytorch(backend, **kwargs)
-    elif launcher == 'mpi':
-        _init_dist_mpi(backend, **kwargs)
-    elif launcher == 'slurm':
-        _init_dist_slurm(backend, **kwargs)
-    else:
-        raise ValueError('Invalid launcher type: {}'.format(launcher))
-
-
-def _init_dist_pytorch(backend, **kwargs):
-    # TODO: use local_rank instead of rank % num_gpus
-    rank = int(os.environ['RANK'])
-    num_gpus = torch.cuda.device_count()
-    torch.cuda.set_device(rank % num_gpus)
-    dist.init_process_group(backend=backend, **kwargs)
-
-
-def _init_dist_mpi(backend, **kwargs):
-    raise NotImplementedError
-
-
-def _init_dist_slurm(backend, **kwargs):
-    raise NotImplementedError
-
-
 def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
     if bucket_size_mb > 0:
         bucket_size_bytes = bucket_size_mb * 1024 * 1024
 ...

mmdet/datasets/loader/build_loader.py  (+1 / -1)

 ...
@@ -15,7 +15,7 @@ resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1]))
 def build_dataloader(dataset,
                      imgs_per_gpu,
                      workers_per_gpu,
-                     num_gpus,
+                     num_gpus=1,
                      dist=True,
                      **kwargs):
     if dist:
 ...
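
With num_gpus now defaulting to 1, the distributed caller added above (_dist_train) can omit it and pass dist=True only, while the non-distributed path keeps passing cfg.gpus explicitly. A small sketch of both call styles, assuming dataset and cfg as elsewhere in this commit:

# distributed: one process per GPU, so the default num_gpus=1 is used
loader = build_dataloader(dataset, cfg.data.imgs_per_gpu,
                          cfg.data.workers_per_gpu, dist=True)

# non-distributed: a single process driving cfg.gpus GPUs
loader = build_dataloader(dataset, cfg.data.imgs_per_gpu,
                          cfg.data.workers_per_gpu, cfg.gpus, dist=False)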

tools/train.py  (+40 / -11)

 ...
@@ -4,8 +4,9 @@ import argparse
 from mmcv import Config
 from mmcv.runner import obj_from_dict

-from mmdet import datasets
-from mmdet.api import train_detector
+from mmdet import datasets, __version__
+from mmdet.api import (train_detector, init_dist, get_root_logger,
+                       set_random_seed)
 from mmdet.models import build_detector
 ...
@@ -16,10 +17,14 @@ def parse_args():
     parser.add_argument(
         '--validate',
         action='store_true',
-        help='whether to add a validate phase')
+        help='whether to evaluate the checkpoint during training')
     parser.add_argument(
-        '--gpus', type=int, default=1, help='number of gpus to use')
-    parser.add_argument('--seed', type=int, help='random seed')
+        '--gpus',
+        type=int,
+        default=1,
+        help='number of gpus to use '
+        '(only applicable to non-distributed training)')
+    parser.add_argument('--seed', type=int, default=None, help='random seed')
     parser.add_argument(
         '--launcher',
         choices=['none', 'pytorch', 'slurm', 'mpi'],
 ...
@@ -33,19 +38,43 @@ def parse_args():
 def main():
     args = parse_args()

     cfg = Config.fromfile(args.config)
+    # update configs according to CLI args
     if args.work_dir is not None:
         cfg.work_dir = args.work_dir
-    cfg.validate = args.validate
     cfg.gpus = args.gpus
-    cfg.seed = args.seed
-    cfg.launcher = args.launcher
-    cfg.local_rank = args.local_rank
-
-    # build model
+    if cfg.checkpoint_config is not None:
+        # save mmdet version in checkpoints as meta data
+        cfg.checkpoint_config.meta = dict(
+            mmdet_version=__version__, config=cfg.text)

+    # init distributed env first, since logger depends on the dist info.
+    if args.launcher == 'none':
+        distributed = False
+    else:
+        distributed = True
+        init_dist(args.launcher, **cfg.dist_params)
+
+    # init logger before other steps
+    logger = get_root_logger(cfg.log_level)
+    logger.info('Distributed training: {}'.format(distributed))
+
+    # set random seeds
+    if args.seed is not None:
+        logger.info('Set random seed to {}'.format(args.seed))
+        set_random_seed(args.seed)
+
     model = build_detector(
         cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
     train_dataset = obj_from_dict(cfg.data.train, datasets)
-    train_detector(model, train_dataset, cfg)
+    train_detector(
+        model,
+        train_dataset,
+        cfg,
+        distributed=distributed,
+        validate=args.validate,
+        logger=logger)

 if __name__ == '__main__':
 ...
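
With this reorganization tools/train.py owns the whole bootstrap: it initializes the distributed environment (if any) before creating the logger, seeds the RNGs, and only then hands distributed, validate and logger to train_detector. Assuming the 'pytorch' launcher, one plausible way to start a multi-GPU job is through torch.distributed.launch, e.g. python -m torch.distributed.launch --nproc_per_node=8 tools/train.py CONFIG_FILE --launcher pytorch, since _init_dist_pytorch expects the RANK environment variable that this launcher sets; a plain single-GPU run stays python tools/train.py CONFIG_FILE --gpus 1 --validate. (CONFIG_FILE is a placeholder for an actual config path, not a file in this repository.)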