ModelZoo / MobileNetV3_pytorch

Commit 78e8e038, authored Jun 07, 2023 by Sugon_ldc

add new model
Showing 4 changed files with 536 additions and 0 deletions
run_pretraining.sh   +4    -0
statistics.py        +69   -0
train.py             +462  -0
train.sh             +1    -0
run_pretraining.sh  0 → 100644 (new file)
#!/usr/bin/env bash
export HIP_VISIBLE_DEVICES=0,1,2,3
python -m torch.distributed.run --nproc_per_node 4 train.py --batch-size=128 --mode=small --print-freq=1 --dataset=CIFAR10 --ema-decay=0 --label-smoothing=0 --lr=0.2 --save-epoch-freq=10 --lr-decay=cos --lr-min=0 --warmup-epochs=5 --weight-decay=6e-5 --num-epochs=400 --num-workers=2 --width-multiplier=1 --data-dir /data/ --save-path /data/mobilenetv3out/ $1 $2
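For reference, python -m torch.distributed.run --nproc_per_node 4 starts four copies of train.py on this node and describes the process layout to each of them through environment variables; train.py below reads WORLD_SIZE and LOCAL_RANK to bind one GPU per worker, and the trailing $1 $2 simply forward two extra command-line arguments to train.py. A minimal sketch (illustrative, not part of the commit) of what each spawned worker sees with this launch:

# Sketch only: environment that torch.distributed.run exports to every worker it spawns.
import os

world_size = int(os.environ["WORLD_SIZE"])   # 4 with --nproc_per_node 4 on one node
rank = int(os.environ["RANK"])               # global rank, 0..3 here
local_rank = int(os.environ["LOCAL_RANK"])   # device index on this node, 0..3
print("worker %d/%d binds device %d" % (rank, world_size, local_rank))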
statistics.py  0 → 100644 (new file)
# -*- coding: UTF-8 -*-
'''
statistical information and display
Ref: https://github.com/pytorch/examples/blob/master/imagenet/main.py
'''
import torch


def accuracy(output, target, topk=(1,)):
    '''
    Computes the accuracy over the k top predictions for the specified values of k
    '''
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


class AverageMeter(object):
    '''
    Computes and stores the average and current value
    '''
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        # fmtstr = '{name}:{val' + self.fmt + '}(avg:{avg' + self.fmt + '})'
        fmtstr = 'Avg {name}:{avg' + self.fmt + '}'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print(' '.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
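These helpers are consumed by train.py below. As a quick orientation, here is a minimal usage sketch (illustrative, not part of the commit) with made-up tensor shapes; note that the import refers to the local statistics.py above, which shadows the standard-library statistics module when run from this directory.

# Minimal usage sketch for accuracy / AverageMeter / ProgressMeter (illustrative only).
import torch
from statistics import accuracy, AverageMeter, ProgressMeter  # local statistics.py above

outputs = torch.randn(8, 10)             # logits: batch of 8 samples, 10 classes
labels = torch.randint(0, 10, (8,))      # ground-truth class indices

acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))   # one-element tensors, in percent

loss_meter = AverageMeter('Loss', ':.4e')
top1_meter = AverageMeter('Acc@1', ':6.2f')
loss_meter.update(0.73, n=8)             # n weights the running average by batch size
top1_meter.update(acc1[0], n=8)

progress = ProgressMeter(100, [loss_meter, top1_meter], prefix='Epoch: [0] ')
progress.display(0)                      # e.g. "Epoch: [0] [  0/100] Avg Loss:... Avg Acc@1:..."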
train.py  0 → 100644 (new file)
# -*- coding: UTF-8 -*-
'''
Train the model
Ref: https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html
'''
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import time
import datetime
import os
from mobileNetV3 import MobileNetV3
import argparse
import copy
from math import cos, pi
from statistics import *
from EMA import EMA
from LabelSmoothing import LabelSmoothingLoss
# from DataLoader import dataloaders
from ResultWriter import ResultWriter
from CosineLR import *
from Mixup import mixup_data, mixup_criterion
from collections import OrderedDict
import torch.distributed as dist


def train(args, model, dataloader, loader_len, criterion, optimizer, scheduler, use_gpu, epoch, ema=None, save_file_name='train.csv'):
    '''
    train the model
    '''
    # save result every epoch
    resultWriter = ResultWriter(args.save_path, save_file_name)
    if epoch == 0:
        resultWriter.create_csv(['epoch', 'loss', 'top-1', 'top-5', 'lr'])
    # use gpu or not
    device = torch.device('cuda' if use_gpu else 'cpu')

    # statistical information
    batch_time = AverageMeter('Time', ':6.3f')
    sample_time = AverageMeter('samples/sec', ':6.2f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        loader_len,
        [batch_time, data_time, sample_time, losses, top1, top5],
        prefix="{} Epoch: [{}] rank {} ".format(datetime.datetime.fromtimestamp(time.time()), epoch, local_rank))

    # update lr here if using stepLR
    if args.lr_decay == 'step':
        scheduler.step(epoch)

    # Set model to training mode
    model.train()

    end = time.time()
    # Iterate over data
    for i, (inputs, labels) in enumerate(dataloader):
        # measure data loading time
        data_time.update(time.time() - end)

        inputs = inputs.to(device)
        labels = labels.to(device)

        if args.mixup:
            # using mixup
            inputs, labels_a, labels_b, lam = mixup_data(inputs, labels, args.mixup_alpha)
            outputs = model(inputs)
            loss = mixup_criterion(criterion, outputs, labels_a, labels_b, lam)
            acc1_a, acc5_a = accuracy(outputs, labels_a, topk=(1, 5))
            acc1_b, acc5_b = accuracy(outputs, labels_b, topk=(1, 5))
            # measure accuracy and record loss
            acc1 = lam * acc1_a + (1 - lam) * acc1_b
            acc5 = lam * acc5_a + (1 - lam) * acc5_b
        else:
            # normal forward
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # measure accuracy and record loss
            acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))

        # zero the parameter gradients
        optimizer.zero_grad()

        losses.update(loss.item(), inputs.size(0))
        top1.update(acc1[0], inputs.size(0))
        top5.update(acc5[0], inputs.size(0))

        # backward + optimize
        loss.backward()
        if args.lr_decay == 'cos':
            # update lr here if using cosine lr decay
            scheduler.step(epoch * loader_len + i)
        elif args.lr_decay == 'sgdr':
            # update lr here if using sgdr
            scheduler.step(epoch + i / loader_len)
        optimizer.step()

        if args.ema_decay > 0:
            # EMA update after training (every iteration)
            ema.update()

        batch_time.update(time.time() - end)
        sample_time.update(args.batch_size / batch_time.avg)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)

    # write training result to file
    resultWriter.write_csv([epoch, losses.avg, top1.avg.item(), top5.avg.item(),
                            scheduler.optimizer.param_groups[0]['lr']])

    print()
    # there is a bug in get_lr() if using pytorch 1.1.0, see https://github.com/pytorch/pytorch/issues/22107
    # so here we don't use get_lr()
    # print('lr:%.6f' % scheduler.get_lr()[0])
    print('lr:%.6f' % scheduler.optimizer.param_groups[0]['lr'])
    print('{} Train *** rank:{} Loss:{losses.avg:.2e} Acc@1:{top1.avg:.2f} Acc@5:{top5.avg:.2f}'.format(
        datetime.datetime.fromtimestamp(time.time()), local_rank, losses=losses, top1=top1, top5=top5))

    if epoch % args.save_epoch_freq == 0 and epoch != 0 and local_rank == 0:
        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)
        torch.save(model.state_dict(), os.path.join(args.save_path, "epoch_" + str(epoch) + ".pth"))

    return sample_time.avg


def validate(args, model, dataloader, loader_len, criterion, use_gpu, epoch, ema=None, save_file_name='val.csv'):
    '''
    validate the model
    '''
    # save result every epoch
    resultWriter = ResultWriter(args.save_path, save_file_name)
    if epoch == 0:
        resultWriter.create_csv(['epoch', 'loss', 'top-1', 'top-5'])

    device = torch.device('cuda' if use_gpu else 'cpu')

    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        loader_len,
        [batch_time, data_time, losses, top1, top5],
        prefix="{} Epoch: [{}] rank ".format(datetime.datetime.fromtimestamp(time.time()), epoch, local_rank))

    if args.ema_decay > 0:
        # apply EMA at validation stage
        ema.apply_shadow()

    # Set model to evaluate mode
    model.eval()

    end = time.time()
    # Iterate over data
    for i, (inputs, labels) in enumerate(dataloader):
        # measure data loading time
        data_time.update(time.time() - end)

        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(acc1[0], inputs.size(0))
        top5.update(acc5[0], inputs.size(0))

        batch_time.update(time.time() - end)
        end = time.time()

    if args.ema_decay > 0:
        # restore the origin parameters after val
        ema.restore()

    # write val result to file
    resultWriter.write_csv([epoch, losses.avg, top1.avg.item(), top5.avg.item()])

    print('{} Val *** rank:{} Loss:{losses.avg:.2e} Acc@1:{top1.avg:.2f} Acc@5:{top5.avg:.2f}'.format(
        datetime.datetime.fromtimestamp(time.time()), local_rank, losses=losses, top1=top1, top5=top5))

    if epoch % args.save_epoch_freq == 0 and epoch != 0 and local_rank == 0:
        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)
        torch.save(model.state_dict(), os.path.join(args.save_path, "epoch_" + str(epoch) + ".pth"))

    top1_acc = top1.avg.item()
    top5_acc = top5.avg.item()
    return top1_acc, top5_acc


def train_model(args, model, dataloader, loaders_len, criterion, optimizer, scheduler, use_gpu):
    '''
    train the model
    '''
    since = time.time()

    ema = None
    # exponential moving average
    if args.ema_decay > 0:
        ema = EMA(model, decay=args.ema_decay)
        ema.register()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    correspond_top5 = 0.0
    sample_time = 0.0

    for epoch in range(args.start_epoch, args.num_epochs):
        epoch_time = time.time()
        sample_time = train(args, model, dataloader['train'], loaders_len['train'], criterion, optimizer, scheduler, use_gpu, epoch, ema)
        top1_acc, top5_acc = validate(args, model, dataloader['val'], loaders_len['val'], criterion, use_gpu, epoch, ema)
        epoch_time = time.time() - epoch_time
        print('Time of epoch-[{:d}/{:d}] : {:.0f}h {:.0f}m {:.0f}s\n'.format(
            epoch, args.num_epochs, epoch_time // 3600, (epoch_time % 3600) // 60, epoch_time % 60))

        # deep copy the model if it has higher top-1 accuracy
        if top1_acc > best_acc:
            best_acc = top1_acc
            correspond_top5 = top5_acc
            if args.ema_decay > 0:
                ema.apply_shadow()
            best_model_wts = copy.deepcopy(model.state_dict())
            if args.ema_decay > 0:
                ema.restore()

    print(os.path.split(args.save_path)[-1])
    print('{} Best val top-1 Accuracy: {:4f}'.format(datetime.datetime.fromtimestamp(time.time()), best_acc))
    print('{} Corresponding top-5 Accuracy: {:4f}'.format(datetime.datetime.fromtimestamp(time.time()), correspond_top5))

    time_elapsed = time.time() - since
    print('{} Training complete in {:.0f}h {:.0f}m {:.0f}s, samples/sec {:.2f}'.format(
        datetime.datetime.fromtimestamp(time.time()),
        time_elapsed // 3600, (time_elapsed % 3600) // 60, time_elapsed % 60, sample_time))

    # load best model weights
    model.load_state_dict(best_model_wts)
    # save best model weights
    if args.save and local_rank == 0:
        torch.save(model.state_dict(), os.path.join(args.save_path, 'best_model_wts-' + '{:.2f}'.format(best_acc) + '.pth'))
    return model


def write_pid_file(pid_file_path):
    '''Write pid file for watching the process later.
    In each round of a case, we write the current pid to the same path.
    '''
    if os.path.exists(pid_file_path):
        os.remove(pid_file_path)
    file_d = open(pid_file_path, "w")
    file_d.write("%s\n" % os.getpid())
    file_d.close()


if __name__ == '__main__':
    import warnings
    warnings.filterwarnings('ignore')

    parser = argparse.ArgumentParser(description='PyTorch implementation of MobileNetV3')
    # Root directory of images
    parser.add_argument('--data-dir', type=str, default='/media/data2/chenjiarong/ImageData')
    parser.add_argument('--batch-size', type=int, default=256)
    parser.add_argument('--num-epochs', type=int, default=150)
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--num-workers', type=int, default=4)
    # parser.add_argument('--gpus', type=str, default='0')
    parser.add_argument('--print-freq', type=int, default=1000)
    parser.add_argument('--save-epoch-freq', type=int, default=1)
    parser.add_argument('--save-path', type=str, default='/media/data2/chenjiarong/saved-model/MobileNetV3')
    parser.add_argument('-save', default=False, action='store_true', help='save model or not')
    parser.add_argument('--resume', type=str, default='', help='For training from one checkpoint')
    parser.add_argument('--start-epoch', type=int, default=0, help='Corresponding to the epoch of resume')
    parser.add_argument('--ema-decay', type=float, default=0.9999, help='The decay of exponential moving average')
    parser.add_argument('--dataset', type=str, default='ImageNet', help='The dataset to be trained')
    parser.add_argument('-dali', default=False, action='store_true', help='Using DALI or not')
    parser.add_argument('--mode', type=str, default='large', help='large or small MobileNetV3')
    # parser.add_argument('--num-class', type=int, default=1000)
    parser.add_argument('--width-multiplier', type=float, default=1.0, help='width multiplier')
    parser.add_argument('--dropout', type=float, default=0.2, help='dropout rate')
    parser.add_argument('--label-smoothing', type=float, default=0.1, help='label smoothing')
    parser.add_argument('--lr-decay', type=str, default='step', help='learning rate decay method, step, cos or sgdr')
    parser.add_argument('--step-size', type=int, default=3, help='step size in stepLR()')
    parser.add_argument('--gamma', type=float, default=0.99, help='gamma in stepLR()')
    parser.add_argument('--lr-min', type=float, default=0, help='minimum lr used in CosineWarmupLR')
    parser.add_argument('--warmup-epochs', type=int, default=0, help='warmup epochs used in CosineWarmupLR')
    parser.add_argument('--T-0', type=int, default=10, help='T_0 in CosineAnnealingWarmRestarts')
    parser.add_argument('--T-mult', type=int, default=2, help='T_mult in CosineAnnealingWarmRestarts')
    parser.add_argument('--decay-rate', type=float, default=1, help='decay rate in CosineAnnealingWarmRestarts')
    parser.add_argument('--optimizer', type=str, default='sgd', help='optimizer')
    parser.add_argument('--weight-decay', type=float, default=1e-5, help='weight decay')
    parser.add_argument('--bn-momentum', type=float, default=0.1, help='momentum in BatchNorm2d')
    parser.add_argument('-use-seed', default=False, action='store_true', help='using fixed random seed or not')
    parser.add_argument('--seed', type=int, default=1, help='random seed')
    parser.add_argument('-deterministic', default=False, action='store_true', help='torch.backends.cudnn.deterministic')
    parser.add_argument('-nbd', default=False, action='store_true', help='no bias decay')
    parser.add_argument('-zero-gamma', default=False, action='store_true', help='zero gamma in BatchNorm2d when init')
    parser.add_argument('-mixup', default=False, action='store_true', help='mixup or not')
    parser.add_argument('--mixup-alpha', type=float, default=0.2, help='alpha used in mixup')
    parser.add_argument("--log_dir", type=str, default="/data/flagperf/training/result/", help="Log directory in container.")
    args = parser.parse_args()

    write_pid_file(args.log_dir)

    args.lr_decay = args.lr_decay.lower()
    args.dataset = args.dataset.lower()
    args.optimizer = args.optimizer.lower()

    # folder to save what we need in this type: MobileNetV3-mode-dataset-width_multiplier-dropout-lr-batch_size-ema_decay-label_smoothing
    folder_name = ['MobileNetV3', args.mode, args.dataset,
                   'wm' + str(args.width_multiplier),
                   'dp' + str(args.dropout),
                   'lr' + str(args.lr),
                   'bs' + str(args.batch_size),
                   'ed' + str(args.ema_decay),
                   'ls' + str(args.label_smoothing),
                   args.optimizer + str(args.weight_decay),
                   'bn' + str(args.bn_momentum),
                   'epochs' + str(args.num_epochs),
                   'seed' + (str(args.seed) if args.use_seed else 'None'),
                   'determin' + str(args.deterministic),
                   'NoBiasDecay' + str(args.nbd),
                   'zeroGamma' + str(args.zero_gamma),
                   'mixup' + (str(args.mixup_alpha) if args.mixup else 'False')]
    if args.lr_decay == 'step':
        folder_name.append(args.lr_decay + str(args.step_size) + '&' + str(args.gamma))
    elif args.lr_decay == 'cos':
        folder_name.append(args.lr_decay + str(args.warmup_epochs) + '&' + str(args.lr_min))
    elif args.lr_decay == 'sgdr':
        folder_name.append(args.lr_decay + str(args.T_0) + '&' + str(args.T_mult) + '&' + str(args.warmup_epochs) + '&' + str(args.decay_rate))
    folder_name = '-'.join(folder_name)
    args.save_path = os.path.join(args.save_path, folder_name)
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    world_size = int(os.environ["WORLD_SIZE"])
    local_rank = int(os.environ['LOCAL_RANK'])
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(local_rank)

    # use gpu or not
    use_gpu = torch.cuda.is_available()
    print("use_gpu:{}".format(use_gpu))

    # set random seed
    if args.use_seed:
        print('Using fixed random seed')
        torch.manual_seed(args.seed)
    else:
        print('do not use fixed random seed')
    if use_gpu:
        if args.use_seed:
            torch.cuda.manual_seed(args.seed)
            if torch.cuda.device_count() > 1:
                torch.cuda.manual_seed_all(args.seed)
        if args.deterministic:
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
        else:
            torch.backends.cudnn.deterministic = False
            torch.backends.cudnn.benchmark = True
    print('torch.backends.cudnn.deterministic:' + str(args.deterministic))

    # read data
    # dataloaders = dataloaders(args)
    if args.dali and (args.dataset == 'tinyimagenet' or args.dataset == 'imagenet'):
        if args.dataset == 'imagenet':
            from DALIDataLoader import get_dali_imageNet_train_loader, get_dali_imageNet_val_loader
            train_loader, train_loader_len = get_dali_imageNet_train_loader(data_path=args.data_dir, batch_size=args.batch_size, seed=args.seed, num_threads=args.num_workers)
            val_loader, val_loader_len = get_dali_imageNet_val_loader(data_path=args.data_dir, batch_size=args.batch_size, seed=args.seed, num_threads=args.num_workers)
            dataloaders = {'train': train_loader, 'val': val_loader}
            loaders_len = {'train': train_loader_len, 'val': val_loader_len}
        elif args.dataset == 'tinyimagenet':
            from DALIDataLoader import get_dali_tinyImageNet_train_loader, get_dali_tinyImageNet_val_loader
            train_loader, train_loader_len = get_dali_tinyImageNet_train_loader(data_path=args.data_dir, batch_size=args.batch_size, seed=args.seed, num_threads=args.num_workers)
            val_loader, val_loader_len = get_dali_tinyImageNet_val_loader(data_path=args.data_dir, batch_size=args.batch_size, seed=args.seed, num_threads=args.num_workers)
            dataloaders = {'train': train_loader, 'val': val_loader}
            loaders_len = {'train': train_loader_len, 'val': val_loader_len}
    else:
        from DataLoader import dataloaders
        loaders = dataloaders(args)
        train_loader = loaders['train']
        train_loader_len = len(train_loader)
        val_loader = loaders['val']
        val_loader_len = len(val_loader)
        dataloaders = {'train': train_loader, 'val': val_loader}
        loaders_len = {'train': train_loader_len, 'val': val_loader_len}

    # different input size and number of classes for different datasets
    if args.dataset == 'imagenet':
        input_size = 224
        num_class = 1000
    elif args.dataset == 'tinyimagenet':
        input_size = 56
        num_class = 200
    if args.dataset == 'cifar100':
        input_size = 32
        num_class = 100
    elif args.dataset == 'cifar10' or args.dataset == 'svhn':
        input_size = 32
        num_class = 10

    # get model
    model = MobileNetV3(mode=args.mode, classes_num=num_class, input_size=input_size,
                        width_multiplier=args.width_multiplier, dropout=args.dropout,
                        BN_momentum=args.bn_momentum, zero_gamma=args.zero_gamma)
    if use_gpu:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        model.to(torch.device('cpu'))

    if args.resume:
        if os.path.isfile(args.resume):
            print(("=> loading checkpoint '{}'".format(args.resume)))
            state_dict = torch.load(args.resume)
            new_state_dict = OrderedDict()
            for k, v in state_dict.items():
                if 'classifier' in k:
                    continue
                name = "module." + k
                if 'featureList.1.conv2.1' in k:
                    name = name.replace('featureList.1.conv2.1', 'featureList.1.conv2.1.lastBN')
                new_state_dict[name] = v
            model.load_state_dict(new_state_dict, strict=False)
        else:
            print(("=> no checkpoint found at '{}'".format(args.resume)))
            exit()

    if args.label_smoothing > 0:
        # using Label Smoothing
        criterion = LabelSmoothingLoss(num_class, label_smoothing=args.label_smoothing)
    else:
        criterion = nn.CrossEntropyLoss()

    if args.optimizer == 'sgd':
        if args.nbd:
            from NoBiasDecay import noBiasDecay
            # no bias decay
            optimizer_ft = optim.SGD(noBiasDecay(model, args.lr, args.weight_decay), momentum=0.9)
        else:
            optimizer_ft = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.weight_decay)
    elif args.optimizer == 'rmsprop':
        optimizer_ft = optim.RMSprop(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.weight_decay)
    elif args.optimizer == 'adam':
        optimizer_ft = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    if args.lr_decay == 'step':
        # Decay LR by a factor of 0.99 every 3 epochs
        lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=args.step_size, gamma=args.gamma)
    elif args.lr_decay == 'cos':
        lr_scheduler = CosineWarmupLR(optimizer=optimizer_ft, epochs=args.num_epochs, iter_in_one_epoch=loaders_len['train'], lr_min=args.lr_min, warmup_epochs=args.warmup_epochs)
    elif args.lr_decay == 'sgdr':
        lr_scheduler = CosineAnnealingWarmRestarts(optimizer=optimizer_ft, T_0=args.T_0, T_mult=args.T_mult, warmup_epochs=args.warmup_epochs, decay_rate=args.decay_rate)

    model = train_model(args=args, model=model, dataloader=dataloaders, loaders_len=loaders_len,
                        criterion=criterion, optimizer=optimizer_ft, scheduler=lr_scheduler, use_gpu=use_gpu)
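train.py drives an exponential-moving-average helper through four calls: register() once before training, update() after every optimizer step, apply_shadow() before validation, and restore() afterwards. EMA.py itself is not part of this commit, so the class below is only a sketch of an implementation consistent with that calling pattern, not the repository's actual code.

# Illustrative only: a class shape consistent with how train.py calls EMA.
class EMA:
    def __init__(self, model, decay):
        self.model = model
        self.decay = decay
        self.shadow = {}     # EMA copies of trainable parameters
        self.backup = {}     # original weights saved while the shadow is applied

    def register(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def update(self):
        # shadow <- decay * shadow + (1 - decay) * current weights
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name].mul_(self.decay).add_(param.data, alpha=1 - self.decay)

    def apply_shadow(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.backup[name] = param.data.clone()
                param.data.copy_(self.shadow[name])

    def restore(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                param.data.copy_(self.backup[name])
        self.backup = {}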
train.sh  0 → 100644 (new file)
python -m torch.distributed.run --nproc_per_node 4 train.py --batch-size=128 --mode=small --print-freq=1 --dataset=CIFAR10 --ema-decay=0 --label-smoothing=0 --lr=0.2 --save-epoch-freq=10 --lr-decay=cos --lr-min=0 --warmup-epochs=5 --weight-decay=6e-5 --num-epochs=400 --num-workers=2 --width-multiplier=1 --data-dir /data/ --save-path /data/mobilenetv3out/ --log_dir /data/pid.txt 2>&1 | tee mobilenetv3_dcu_`date +%Y%m%d%H%M%S`.log
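Note that train.sh passes --log_dir /data/pid.txt and train.py immediately calls write_pid_file(args.log_dir), so despite the option's name the value is used as the PID file path here, not as a directory. A minimal sketch of how an external watcher (hypothetical, not part of the commit) could use that file:

# Hypothetical watcher sketch: read the PID written by write_pid_file() and check
# whether the training process is still alive; the path matches --log_dir in train.sh.
import os

with open("/data/pid.txt") as f:
    pid = int(f.read().strip())

try:
    os.kill(pid, 0)          # signal 0 only checks that the process exists
    print("training process %d is running" % pid)
except OSError:
    print("training process %d has exited" % pid)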