dcuai / dlexamples · Commit 0fc002df
Authored Apr 14, 2022 by huchen
init the dlexamples new
Parent: 0e04b692 · Changes: 375
Showing 20 changed files with 3522 additions and 0 deletions (+3522, -0)
Keras/NLP/bert4keras/pretraining/README.md (+19, -0)
Keras/NLP/bert4keras/pretraining/data_utils.py (+408, -0)
Keras/NLP/bert4keras/pretraining/pretraining.py (+325, -0)
Keras/NLP/bert4keras/setup.py (+16, -0)
PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/README.md (+15, -0)
PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/main_acc.py (+510, -0)
PyTorch/Compute-Vision/Classification/README.md (+33, -0)
PyTorch/Compute-Vision/Classification/main_bench.py (+489, -0)
PyTorch/Compute-Vision/Classification/mpi_slurm.sbatch (+47, -0)
PyTorch/Compute-Vision/Classification/single_process.sh (+49, -0)
PyTorch/Compute-Vision/Objection/Faster-rcnn/README.md (+20, -0)
PyTorch/Compute-Vision/Objection/Faster-rcnn/coco_eval.py (+349, -0)
PyTorch/Compute-Vision/Objection/Faster-rcnn/coco_utils.py (+397, -0)
PyTorch/Compute-Vision/Objection/Faster-rcnn/engine.py (+107, -0)
PyTorch/Compute-Vision/Objection/Faster-rcnn/group_by_aspect_ratio.py (+187, -0)
PyTorch/Compute-Vision/Objection/Faster-rcnn/mpi_slurm.sbatch (+35, -0)
PyTorch/Compute-Vision/Objection/Faster-rcnn/single_process.sh (+42, -0)
PyTorch/Compute-Vision/Objection/Faster-rcnn/test.py (+208, -0)
PyTorch/Compute-Vision/Objection/Faster-rcnn/train.py (+216, -0)
PyTorch/Compute-Vision/Objection/Faster-rcnn/transforms.py (+50, -0)
Keras/NLP/bert4keras/pretraining/README.md (new file, mode 100644)
# Pretraining code

Currently supports RoBERTa- and GPT-style pretraining. Please run under TensorFlow 1.14 or 1.15.

## Usage

```
python data_utils.py    # generate the tfrecord files
python pretraining.py   # start the pretraining process
```

Please read `data_utils.py` and `pretraining.py` and adjust the configuration and parameters to fit your own corpus and hardware (a minimal example is sketched at the end of this README).

## Background

Keras is a friendly framework. We usually use it on top of the TensorFlow backend, and there is also tf.keras, whose interface is essentially the same as Keras 2.3.x. This consistency means that using Keras is almost the same as using TensorFlow: Keras has all of TensorFlow's advantages, plus advantages that TensorFlow lacks (such as ease of use). The author therefore implemented this Keras-based pretraining script by following the original training procedure, and thanks to the consistency above, the Keras version migrates easily to multi-GPU training and to TPU training.
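For a concrete starting point to the usage above, here is a minimal sketch of the tfrecord-generation step. It mirrors the `__main__` block of `data_utils.py`; the vocabulary path and the `my_texts` generator are placeholders you must replace with your own corpus.

```python
# Minimal sketch of tfrecord generation (adapted from data_utils.py).
import jieba_fast as jieba
from bert4keras.tokenizers import Tokenizer
from data_utils import TrainingDatasetRoBERTa

tokenizer = Tokenizer('/path/to/vocab.txt', do_lower_case=True)  # placeholder path

def word_segment(text):
    return jieba.lcut(text)  # any word-segmentation function works

def my_texts():
    # Placeholder corpus: yield one list of sentences per document batch.
    yield ['第一句话。', '第二句话。']

TD = TrainingDatasetRoBERTa(tokenizer, word_segment, sequence_length=512)
TD.process(corpus=my_texts(), record_name='corpus.tfrecord')
```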
Keras/NLP/bert4keras/pretraining/data_utils.py (new file, mode 100644)
```python
#! -*- coding: utf-8 -*-
# Pretraining corpus construction

import os
os.environ['TF_KERAS'] = '1'  # tf.keras must be used

import numpy as np
import tensorflow as tf
from bert4keras.snippets import parallel_apply
from bert4keras.backend import K


class TrainingDataset(object):
    """Pretraining dataset generator.
    """
    def __init__(self, tokenizer, sequence_length=512):
        """Arguments:
        tokenizer must be a tokenizer class shipped with bert4keras.
        """
        self.tokenizer = tokenizer
        self.sequence_length = sequence_length
        self.token_pad_id = tokenizer._token_pad_id
        self.token_cls_id = tokenizer._token_start_id
        self.token_sep_id = tokenizer._token_end_id
        self.token_mask_id = tokenizer._token_mask_id
        self.vocab_size = tokenizer._vocab_size

    def padding(self, sequence, padding_value=None):
        """Pad a single sequence to sequence_length.
        """
        if padding_value is None:
            padding_value = self.token_pad_id

        sequence = sequence[:self.sequence_length]
        padding_length = self.sequence_length - len(sequence)
        return sequence + [padding_value] * padding_length

    def sentence_process(self, text):
        """Process a single text and return the resulting instance.
        """
        raise NotImplementedError

    def paragraph_process(self, texts, starts, ends, paddings):
        """Process a single paragraph (multiple texts).
        Notes: texts is a list of single sentences; starts are the start ids
        of each instance; ends are the end ids of each instance; paddings are
        the padding ids of each instance.
        Approach: keep appending sentences until the length gets as close as
        possible to sequence_length, then pad.
        """
        instances, instance = [], [[start] for start in starts]

        for text in texts:
            # Process a single sentence
            sub_instance = self.sentence_process(text)
            sub_instance = [i[:self.sequence_length - 2] for i in sub_instance]
            new_length = len(instance[0]) + len(sub_instance[0])

            # If the length is about to overflow
            if new_length > self.sequence_length - 1:
                # Append the end token and pad
                complete_instance = []
                for item, end, pad in zip(instance, ends, paddings):
                    item.append(end)
                    item = self.padding(item, pad)
                    complete_instance.append(item)
                # Store the result and start a new sample
                instances.append(complete_instance)
                instance = [[start] for start in starts]

            # Extend the current sample
            for item, sub_item in zip(instance, sub_instance):
                item.extend(sub_item)

        # Append the end token and pad the final sample
        complete_instance = []
        for item, end, pad in zip(instance, ends, paddings):
            item.append(end)
            item = self.padding(item, pad)
            complete_instance.append(item)

        # Store the final instance
        instances.append(complete_instance)

        return instances

    def tfrecord_serialize(self, instances, instance_keys):
        """Serialize instances to tfrecord strings, ready to be written to file.
        """
        def create_feature(x):
            return tf.train.Feature(int64_list=tf.train.Int64List(value=x))

        serialized_instances = []
        for instance in instances:
            features = {
                k: create_feature(v)
                for k, v in zip(instance_keys, instance)
            }
            tf_features = tf.train.Features(feature=features)
            tf_example = tf.train.Example(features=tf_features)
            serialized_instance = tf_example.SerializeToString()
            serialized_instances.append(serialized_instance)

        return serialized_instances

    def process(self, corpus, record_name, workers=8, max_queue_size=2000):
        """Process the input corpus and write it out in tfrecord format
        (record_name). Multiprocessing support is built in; if you have many
        CPU cores, increase workers and max_queue_size.
        """
        writer = tf.io.TFRecordWriter(record_name)
        globals()['count'] = 0

        def write_to_tfrecord(serialized_instances):
            globals()['count'] += len(serialized_instances)
            for serialized_instance in serialized_instances:
                writer.write(serialized_instance)

        def paragraph_process(texts):
            instances = self.paragraph_process(texts)
            serialized_instances = self.tfrecord_serialize(instances)
            return serialized_instances

        parallel_apply(
            func=paragraph_process,
            iterable=corpus,
            workers=workers,
            max_queue_size=max_queue_size,
            callback=write_to_tfrecord,
        )

        writer.close()
        print('write %s examples into %s' % (count, record_name))

    @staticmethod
    def load_tfrecord(record_names, batch_size, parse_function):
        """Load a corpus that was converted to tfrecord format.
        """
        if not isinstance(record_names, list):
            record_names = [record_names]

        dataset = tf.data.TFRecordDataset(record_names)  # load
        dataset = dataset.map(parse_function)  # parse
        dataset = dataset.repeat()  # loop forever
        dataset = dataset.shuffle(batch_size * 1000)  # shuffle
        dataset = dataset.batch(batch_size)  # batch

        return dataset


class TrainingDatasetRoBERTa(TrainingDataset):
    """Pretraining dataset generator (RoBERTa mode).
    """
    def __init__(
        self, tokenizer, word_segment, mask_rate=0.15, sequence_length=512
    ):
        """Arguments:
        tokenizer must be a tokenizer class shipped with bert4keras;
        word_segment is any word-segmentation function.
        """
        super(TrainingDatasetRoBERTa, self).__init__(tokenizer, sequence_length)
        self.word_segment = word_segment
        self.mask_rate = mask_rate

    def token_process(self, token_id):
        """Replace with [MASK] with 80% probability, keep unchanged with 10%
        probability, and replace with a random token with 10% probability.
        """
        rand = np.random.random()
        if rand <= 0.8:
            return self.token_mask_id
        elif rand <= 0.9:
            return token_id
        else:
            return np.random.randint(0, self.vocab_size)

    def sentence_process(self, text):
        """Process a single text.
        Flow: segment into words, convert to ids, and build a whole-word-mask
        sequence according to mask_rate that marks which tokens are masked.
        """
        words = self.word_segment(text)
        rands = np.random.random(len(words))

        token_ids, mask_ids = [], []
        for rand, word in zip(rands, words):
            word_tokens = self.tokenizer.tokenize(text=word)[1:-1]
            word_token_ids = self.tokenizer.tokens_to_ids(word_tokens)
            token_ids.extend(word_token_ids)

            if rand < self.mask_rate:
                word_mask_ids = [
                    self.token_process(i) + 1 for i in word_token_ids
                ]
            else:
                word_mask_ids = [0] * len(word_tokens)

            mask_ids.extend(word_mask_ids)

        return [token_ids, mask_ids]

    def paragraph_process(self, texts):
        """Supply starts, ends and paddings to the base method.
        """
        starts = [self.token_cls_id, 0]
        ends = [self.token_sep_id, 0]
        paddings = [self.token_pad_id, 0]
        return super(TrainingDatasetRoBERTa,
                     self).paragraph_process(texts, starts, ends, paddings)

    def tfrecord_serialize(self, instances):
        """Supply instance_keys to the base method.
        """
        instance_keys = ['token_ids', 'mask_ids']
        return super(TrainingDatasetRoBERTa,
                     self).tfrecord_serialize(instances, instance_keys)

    @staticmethod
    def load_tfrecord(record_names, sequence_length, batch_size):
        """Supply the parse_function to the base method.
        """
        def parse_function(serialized):
            features = {
                'token_ids': tf.io.FixedLenFeature([sequence_length], tf.int64),
                'mask_ids': tf.io.FixedLenFeature([sequence_length], tf.int64),
            }
            features = tf.io.parse_single_example(serialized, features)

            token_ids = features['token_ids']
            mask_ids = features['mask_ids']
            segment_ids = K.zeros_like(token_ids, dtype='int64')
            is_masked = K.not_equal(mask_ids, 0)
            masked_token_ids = K.switch(is_masked, mask_ids - 1, token_ids)

            x = {
                'Input-Token': masked_token_ids,
                'Input-Segment': segment_ids,
                'token_ids': token_ids,
                'is_masked': K.cast(is_masked, K.floatx()),
            }
            y = {
                'mlm_loss': K.zeros([1]),
                'mlm_acc': K.zeros([1]),
            }

            return x, y

        return TrainingDataset.load_tfrecord(
            record_names, batch_size, parse_function
        )


class TrainingDatasetGPT(TrainingDataset):
    """Pretraining dataset generator (GPT mode, unidirectional language model).
    """
    def sentence_process(self, text):
        """Process a single text.
        Flow: tokenize, then convert to ids.
        """
        tokens = self.tokenizer.tokenize(text=text)[1:-1]
        token_ids = self.tokenizer.tokens_to_ids(tokens)
        return [token_ids]

    def paragraph_process(self, texts):
        """Supply starts, ends and paddings to the base method.
        """
        starts = [self.token_cls_id]
        ends = [self.token_sep_id]
        paddings = [self.token_pad_id]
        return super(TrainingDatasetGPT,
                     self).paragraph_process(texts, starts, ends, paddings)

    def tfrecord_serialize(self, instances):
        """Supply instance_keys to the base method.
        """
        instance_keys = ['token_ids']
        return super(TrainingDatasetGPT,
                     self).tfrecord_serialize(instances, instance_keys)

    @staticmethod
    def load_tfrecord(record_names, sequence_length, batch_size):
        """Supply the parse_function to the base method.
        """
        def parse_function(serialized):
            features = {
                'token_ids': tf.io.FixedLenFeature([sequence_length], tf.int64),
            }
            features = tf.io.parse_single_example(serialized, features)
            token_ids = features['token_ids']
            segment_ids = K.zeros_like(token_ids, dtype='int64')

            x = {
                'Input-Token': token_ids,
                'Input-Segment': segment_ids,
            }
            y = {
                'lm_loss': K.zeros([1]),
                'lm_acc': K.zeros([1]),
            }

            return x, y

        return TrainingDataset.load_tfrecord(
            record_names, batch_size, parse_function
        )


class TrainingDatasetUniLM(TrainingDatasetGPT):
    """Pretraining dataset generator (UniLM mode, seq2seq model).
    """
    @staticmethod
    def load_tfrecord(record_names, sequence_length, batch_size, token_sep_id):
        """Supply the parse_function to the base method.
        """
        def parse_function(serialized):
            features = {
                'token_ids': tf.io.FixedLenFeature([sequence_length], tf.int64),
            }
            features = tf.io.parse_single_example(serialized, features)
            token_ids = features['token_ids']
            segment = K.random_uniform(
                shape=[1], minval=1, maxval=sequence_length - 1, dtype='int64'
            )[0]
            segment_ids = K.one_hot(segment + 1, sequence_length)
            segment_ids = K.cast(K.cumsum(segment_ids), 'int64')
            token_ids_1 = token_ids[:segment]
            token_ids_2 = K.zeros([1], dtype='int64') + token_sep_id
            token_ids_3 = token_ids[segment:-1]
            token_ids = K.concatenate([token_ids_1, token_ids_2, token_ids_3])

            x = {
                'Input-Token': token_ids,
                'Input-Segment': segment_ids,
            }
            y = {
                'unilm_loss': K.zeros([1]),
                'unilm_acc': K.zeros([1]),
            }

            return x, y

        return TrainingDataset.load_tfrecord(
            record_names, batch_size, parse_function
        )


if __name__ == '__main__':

    from bert4keras.tokenizers import Tokenizer
    import json, glob, re
    from tqdm import tqdm

    model = 'roberta'
    sequence_length = 512
    workers = 40
    max_queue_size = 4000
    dict_path = '/home/spaces_ac_cn/chinese_L-12_H-768_A-12/vocab.txt'
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

    def some_texts():
        filenames = glob.glob('/home/spaces_ac_cn/corpus/*/*/*')
        np.random.shuffle(filenames)
        count, texts = 0, []
        for filename in filenames:
            with open(filename) as f:
                for l in f:
                    l = json.loads(l)['text'].strip()
                    texts.extend(re.findall(u'.*?[\n。]+', l))
                    count += 1
                    if count == 10:  # merge 10 articles before processing
                        yield texts
                        count, texts = 0, []
        if texts:
            yield texts

    assert model in ['roberta', 'gpt', 'unilm']  # check the model type is supported

    if model == 'roberta':

        import jieba_fast as jieba
        jieba.initialize()

        def word_segment(text):
            return jieba.lcut(text)

        TD = TrainingDatasetRoBERTa(
            tokenizer, word_segment, sequence_length=sequence_length
        )

        for i in range(10):  # repeat the data 10 times
            TD.process(
                corpus=tqdm(some_texts()),
                record_name='../corpus_tfrecord/corpus.%s.tfrecord' % i,
                workers=workers,
                max_queue_size=max_queue_size,
            )

    elif model == 'gpt':

        TD = TrainingDatasetGPT(tokenizer, sequence_length=sequence_length)

        TD.process(
            corpus=tqdm(some_texts()),
            record_name='../corpus_tfrecord/corpus.tfrecord',
            workers=workers,
            max_queue_size=max_queue_size,
        )

    elif model == 'unilm':

        TD = TrainingDatasetUniLM(tokenizer, sequence_length=sequence_length)

        TD.process(
            corpus=tqdm(some_texts()),
            record_name='../corpus_tfrecord/corpus.tfrecord',
            workers=workers,
            max_queue_size=max_queue_size,
        )
```
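One subtlety in the file above deserves spelling out: for masked positions, `sentence_process` stores `token_process(token_id) + 1` in `mask_ids`, and `0` for unmasked positions, so that zero unambiguously means "not masked"; `load_tfrecord` then undoes the shift with `K.switch(is_masked, mask_ids - 1, token_ids)`. A small NumPy illustration with hypothetical ids (in the standard BERT vocabulary, [MASK] is id 103):

```python
# Illustration of the mask_ids encoding; the ids here are hypothetical.
import numpy as np

token_ids = np.array([101, 2769, 4263, 102])  # original token ids
mask_ids  = np.array([0,   104,  0,    0])    # 0 = not masked; k + 1 = replace with id k

is_masked = mask_ids != 0
masked_token_ids = np.where(is_masked, mask_ids - 1, token_ids)
print(masked_token_ids)  # [ 101  103 4263  102]: position 1 becomes [MASK] (id 103)
```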
Keras/NLP/bert4keras/pretraining/pretraining.py (new file, mode 100644)
```python
#! -*- coding: utf-8 -*-
# Pretraining script, multi-GPU / TPU version

import os, re
os.environ['TF_KERAS'] = '1'  # tf.keras must be used

import tensorflow as tf
from data_utils import *
from bert4keras.models import build_transformer_model
from bert4keras.backend import keras, K
from bert4keras.optimizers import Adam
from bert4keras.optimizers import extend_with_weight_decay
from bert4keras.optimizers import extend_with_layer_adaptation
from bert4keras.optimizers import extend_with_piecewise_linear_lr
from bert4keras.optimizers import extend_with_gradient_accumulation
from keras.layers import Input, Lambda
from keras.models import Model
from keras.callbacks import Callback, CSVLogger

model = 'roberta'

# Corpus path and model save path.
# For TPU training the corpus must live on Google Cloud Storage and the path
# must start with gs://; for GPU training use ordinary paths.
model_saved_path = 'gs://xxxx/bert4keras/saved_model/bert_model.ckpt'
corpus_paths = [
    'gs://xxxx/bert4keras/corpus/corpus.%s.tfrecord' % i for i in range(10)
]

# Other settings
sequence_length = 512
batch_size = 4096
config_path = '/home/spaces_ac_cn/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/home/spaces_ac_cn/chinese_L-12_H-768_A-12/bert_model.ckpt'  # set to None to train from scratch
learning_rate = 0.00176
weight_decay_rate = 0.01
num_warmup_steps = 3125
num_train_steps = 125000
steps_per_epoch = 10000
grad_accum_steps = 16  # values greater than 1 enable gradient accumulation
epochs = num_train_steps * grad_accum_steps // steps_per_epoch
exclude_from_weight_decay = ['Norm', 'bias']
exclude_from_layer_adaptation = ['Norm', 'bias']
tpu_address = 'grpc://xxx.xxx.xxx.xxx:8470'  # set to None for multi-GPU training
which_optimizer = 'lamb'  # adam or lamb, both with built-in weight decay
lr_schedule = {
    num_warmup_steps * grad_accum_steps: 1.0,
    num_train_steps * grad_accum_steps: 0.0,
}
floatx = K.floatx()

# Read the dataset and build the data tensors
if model == 'roberta':
    dataset = TrainingDatasetRoBERTa.load_tfrecord(
        record_names=corpus_paths,
        sequence_length=sequence_length,
        batch_size=batch_size // grad_accum_steps,
    )
elif model == 'gpt':
    dataset = TrainingDatasetGPT.load_tfrecord(
        record_names=corpus_paths,
        sequence_length=sequence_length,
        batch_size=batch_size // grad_accum_steps,
    )
elif model == 'unilm':
    dataset = TrainingDatasetUniLM.load_tfrecord(
        record_names=corpus_paths,
        sequence_length=sequence_length,
        batch_size=batch_size // grad_accum_steps,
        token_sep_id=3,  # the [SEP] id must be specified manually here
    )


def build_transformer_model_with_mlm():
    """BERT model with an MLM head.
    """
    bert = build_transformer_model(
        config_path, with_mlm='linear', return_keras_model=False
    )
    proba = bert.model.output

    # Auxiliary inputs
    token_ids = Input(shape=(None,), dtype='int64', name='token_ids')  # target ids
    is_masked = Input(shape=(None,), dtype=floatx, name='is_masked')  # mask flags

    def mlm_loss(inputs):
        """Loss computation; must be wrapped as a layer.
        """
        y_true, y_pred, mask = inputs
        loss = K.sparse_categorical_crossentropy(
            y_true, y_pred, from_logits=True
        )
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def mlm_acc(inputs):
        """Accuracy computation; must be wrapped as a layer.
        """
        y_true, y_pred, mask = inputs
        y_true = K.cast(y_true, floatx)
        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
    mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])

    train_model = Model(
        bert.model.inputs + [token_ids, is_masked], [mlm_loss, mlm_acc]
    )

    loss = {
        'mlm_loss': lambda y_true, y_pred: y_pred,
        'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
    }

    return bert, train_model, loss


def build_transformer_model_with_lm():
    """BERT model with an LM head.
    """
    bert = build_transformer_model(
        config_path,
        with_mlm='linear',
        application='lm',
        return_keras_model=False
    )
    token_ids = bert.model.inputs[0]
    proba = bert.model.output

    def lm_loss(inputs, mask=None):
        """Loss computation; must be wrapped as a layer.
        """
        y_true, y_pred = inputs
        y_true, y_pred = y_true[:, 1:], y_pred[:, :-1]

        if mask is None:
            mask = 1.0
        else:
            mask = K.cast(mask[1][:, 1:], floatx)

        loss = K.sparse_categorical_crossentropy(
            y_true, y_pred, from_logits=True
        )
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def lm_acc(inputs, mask=None):
        """Accuracy computation; must be wrapped as a layer.
        """
        y_true, y_pred = inputs
        y_true, y_pred = K.cast(y_true[:, 1:], floatx), y_pred[:, :-1]

        if mask is None:
            mask = 1.0
        else:
            mask = K.cast(mask[1][:, 1:], floatx)

        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    lm_loss = Lambda(lm_loss, name='lm_loss')([token_ids, proba])
    lm_acc = Lambda(lm_acc, name='lm_acc')([token_ids, proba])

    train_model = Model(bert.model.inputs, [lm_loss, lm_acc])

    loss = {
        'lm_loss': lambda y_true, y_pred: y_pred,
        'lm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
    }

    return bert, train_model, loss


def build_transformer_model_with_unilm():
    """BERT model with a UniLM head.
    """
    bert = build_transformer_model(
        config_path,
        with_mlm='linear',
        application='unilm',
        return_keras_model=False
    )
    token_ids = bert.model.inputs[0]
    segment_ids = bert.model.inputs[1]
    proba = bert.model.output

    def unilm_loss(inputs, mask=None):
        """Loss computation; must be wrapped as a layer.
        """
        y_true, y_pred, segment_ids = inputs
        y_true, y_pred = y_true[:, 1:], y_pred[:, :-1]

        if mask is None:
            mask = 1.0
        else:
            mask = K.cast(mask[1][:, 1:], floatx)

        segment_ids = K.cast(segment_ids, floatx)
        mask = mask * segment_ids[:, 1:]

        loss = K.sparse_categorical_crossentropy(
            y_true, y_pred, from_logits=True
        )
        loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
        return loss

    def unilm_acc(inputs, mask=None):
        """Accuracy computation; must be wrapped as a layer.
        """
        y_true, y_pred, segment_ids = inputs
        y_true, y_pred = K.cast(y_true[:, 1:], floatx), y_pred[:, :-1]

        if mask is None:
            mask = 1.0
        else:
            mask = K.cast(mask[1][:, 1:], floatx)

        segment_ids = K.cast(segment_ids, floatx)
        mask = mask * segment_ids[:, 1:]

        acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
        return acc

    token_proba_segment = [token_ids, proba, segment_ids]
    unilm_loss = Lambda(unilm_loss, name='unilm_loss')(token_proba_segment)
    unilm_acc = Lambda(unilm_acc, name='unilm_acc')(token_proba_segment)

    train_model = Model(bert.model.inputs, [unilm_loss, unilm_acc])

    loss = {
        'unilm_loss': lambda y_true, y_pred: y_pred,
        'unilm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
    }

    return bert, train_model, loss


def build_transformer_model_for_pretraining():
    """Build the training model; works on both TPU and GPU.
    Note that everything must be written with standard Keras layers; more
    flexible "cut-and-paste" styles may fail to train on TPU. Also note that
    the TPU does not support every tensorflow op, in particular dynamic
    (variable-length) ops, so write the corresponding computations carefully.
    """
    if model == 'roberta':
        bert, train_model, loss = build_transformer_model_with_mlm()
    elif model == 'gpt':
        bert, train_model, loss = build_transformer_model_with_lm()
    elif model == 'unilm':
        bert, train_model, loss = build_transformer_model_with_unilm()

    # Optimizer
    optimizer = extend_with_weight_decay(Adam)
    if which_optimizer == 'lamb':
        optimizer = extend_with_layer_adaptation(optimizer)
    optimizer = extend_with_piecewise_linear_lr(optimizer)
    optimizer_params = {
        'learning_rate': learning_rate,
        'lr_schedule': lr_schedule,
        'weight_decay_rate': weight_decay_rate,
        'exclude_from_weight_decay': exclude_from_weight_decay,
        'exclude_from_layer_adaptation': exclude_from_layer_adaptation,
        'bias_correction': False,
    }
    if grad_accum_steps > 1:
        optimizer = extend_with_gradient_accumulation(optimizer)
        optimizer_params['grad_accum_steps'] = grad_accum_steps
    optimizer = optimizer(**optimizer_params)

    # Compile the model
    train_model.compile(loss=loss, optimizer=optimizer)

    # If weights are passed in, load them. Note: they must be loaded here
    # to guarantee no errors.
    if checkpoint_path is not None:
        bert.load_weights_from_checkpoint(checkpoint_path)

    return train_model


if tpu_address is None:
    # Single-machine multi-GPU mode (multi-machine multi-GPU is similar but
    # needs matching hardware and software; see https://tf.wiki)
    strategy = tf.distribute.MirroredStrategy()
else:
    # TPU mode
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
    tf.config.experimental_connect_to_host(resolver.master())
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

with strategy.scope():
    train_model = build_transformer_model_for_pretraining()
    train_model.summary()


class ModelCheckpoint(keras.callbacks.Callback):
    """Automatically save the latest model.
    """
    def on_epoch_end(self, epoch, logs=None):
        self.model.save_weights(model_saved_path, overwrite=True)


# Save the model
checkpoint = ModelCheckpoint()
# Log training to csv
csv_logger = keras.callbacks.CSVLogger('training.log')

# Train the model
train_model.fit(
    dataset,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    callbacks=[checkpoint, csv_logger],
)
```
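The step bookkeeping in this script is easy to misread, so here is the arithmetic written out: the tf.data pipeline is batched at `batch_size // grad_accum_steps`, the optimizer folds `grad_accum_steps` of those micro-batches into one effective update, and that is why `epochs` and the `lr_schedule` boundaries are multiplied by `grad_accum_steps`. A quick check with the default values above:

```python
# Sanity check of the schedule arithmetic (default values from pretraining.py).
batch_size = 4096
grad_accum_steps = 16
num_train_steps = 125000
steps_per_epoch = 10000

micro_batch = batch_size // grad_accum_steps          # 256 samples per fit() step
total_fit_steps = num_train_steps * grad_accum_steps  # 2,000,000 micro-steps
epochs = total_fit_steps // steps_per_epoch           # 200 epochs of 10,000 steps each
print(micro_batch, total_fit_steps, epochs)           # 256 2000000 200
```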
Keras/NLP/bert4keras/setup.py (new file, mode 100644)
```python
#! -*- coding: utf-8 -*-

from setuptools import setup, find_packages

setup(
    name='bert4keras',
    version='0.10.5',
    description='an elegant bert4keras',
    long_description='bert4keras: https://github.com/bojone/bert4keras',
    license='Apache License 2.0',
    url='https://github.com/bojone/bert4keras',
    author='bojone',
    author_email='bojone@spaces.ac.cn',
    install_requires=['keras<=2.3.1'],
    packages=find_packages()
)
```
PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/README.md (new file, mode 100644)
# Introduction

This test case is used for ResNet50 accuracy validation. Single-GPU run commands are shown below.

# Run examples

## fp32

python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/

## fp16

python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --amp --opt-level O1 --loss-scale=dynamic --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/

# Reference

[https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet)
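For reference, the `--amp` path of `main_acc.py` (shown in full below) follows the standard apex pattern: wrap the model and optimizer once with `amp.initialize`, then scale the loss around `backward()`. A minimal sketch, assuming apex is installed and a GPU is available:

```python
# Minimal apex AMP sketch (the same pattern main_acc.py uses when --amp is set).
import torch
import torchvision.models as models
from apex import amp

model = models.resnet50().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
model, optimizer = amp.initialize(model, optimizer,
                                  opt_level='O1', loss_scale='dynamic')

images = torch.randn(4, 3, 224, 224).cuda()
target = torch.randint(0, 1000, (4,)).cuda()

loss = torch.nn.functional.cross_entropy(model(images), target)
optimizer.zero_grad()
with amp.scale_loss(loss, optimizer) as scaled_loss:  # scale to avoid fp16 underflow
    scaled_loss.backward()
optimizer.step()
```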
PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/main_acc.py (new file, mode 100644)
```python
import argparse
import os
import random
import shutil
import time
import warnings

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

use_cuda = torch.cuda.is_available()
if use_cuda:
    try:
        from apex.parallel import DistributedDataParallel as DDP
        from apex.fp16_utils import *
        from apex import amp, optimizers
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this example.")

model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))

parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('data', metavar='DIR',
                    help='path to dataset')
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
                    choices=model_names,
                    help='model architecture: ' +
                        ' | '.join(model_names) +
                        ' (default: resnet18)')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=90, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument("--save-path", default='/root', type=str,
                    help='path to save checkpoint')
# aiss add
parser.add_argument("--amp", action="store_true",
                    help="Run model AMP (automatic mixed precision) mode.")
parser.add_argument('--opt-level', type=str)
parser.add_argument('--loss-scale', type=str, default=None)
parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)

best_acc1 = 0


def main():
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)


local_rank = 0


def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        #args.gpu = args.rank % torch.cuda.device_count()
        local_rank = args.rank % torch.cuda.device_count()
        print("device count is ", torch.cuda.device_count())
        torch.cuda.set_device(local_rank)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    model.cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # add for amp: wrap model and optimizer before any DDP wrapping
    if args.amp:
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.opt_level,
                                          keep_batchnorm_fp32=args.keep_batchnorm_fp32,
                                          loss_scale=args.loss_scale)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            if args.amp:
                model = DDP(model)
                print("using apex DistributedDataParallel")
            else:
                model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        #if args.rank == 0:
        if not os.path.isdir(args.save_path):
            os.mkdir(args.save_path)
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, epoch, is_best, args.rank, args.save_path)


def train(train_loader, model, criterion, optimizer, epoch, args):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)
        ### aiss add
        if use_cuda:
            images = images.cuda()
            target = target.cuda()

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        ## aiss add amp: scale the loss when running in mixed precision
        if args.amp:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)


def validate(val_loader, model, criterion, args):
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
                target = target.cuda(args.gpu, non_blocking=True)
            if use_cuda:
                images = images.cuda()
                target = target.cuda()

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

        # TODO: this should also be done with the ProgressMeter
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

    return top1.avg


def save_checkpoint(state, epoch, is_best, rank, filename):
    # one subdirectory per rank under --save-path
    rank_path = filename + '/' + str(rank)
    if not os.path.isdir(rank_path):
        os.mkdir(rank_path)
    filename = rank_path + '/checkpoint_' + str(epoch) + '.pth.tar'
    torch.save(state, filename)
    if is_best:
        best_dir = rank_path + '/model_best.pth.tar'
        shutil.copyfile(filename, best_dir)


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'


def adjust_learning_rate(optimizer, epoch, args):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


if __name__ == '__main__':
    main()
```
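`save_checkpoint` above writes one directory per rank under `--save-path`, i.e. `{save_path}/{rank}/checkpoint_{epoch}.pth.tar` plus a `model_best.pth.tar` copy of the best epoch. A short sketch of inspecting and reloading such a file (the path is a placeholder; the `module.` prefix handling assumes the checkpoint was saved from a DDP/DataParallel-wrapped model):

```python
# Sketch: inspect and reload a checkpoint written by save_checkpoint above.
import torch
import torchvision.models as models

ckpt = torch.load('/path/to/{save_model_dir}/0/model_best.pth.tar',  # placeholder
                  map_location='cpu')
print(ckpt['arch'], ckpt['epoch'], ckpt['best_acc1'])

model = models.__dict__[ckpt['arch']]()
# state_dict keys carry a 'module.' prefix when saved from a wrapped model
state = {k.replace('module.', '', 1): v for k, v in ckpt['state_dict'].items()}
model.load_state_dict(state)
```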
PyTorch/Compute-Vision/Classification/README.md (new file, mode 100644)
# Overview

This test case is used for PyTorch classification-model performance benchmarking.

* The script supports PyTorch's nccl and gloo distributed communication backends.

# Run

## Single GPU

python3 `pwd`/main_bench.py --batch-size=64 --arch=resnet50 -j 24 --epochs=1 --synthetic /path/to/any/existing/folder

## Single node, multiple GPUs

mpirun -np 4 --bind-to none `pwd`/single_process.sh localhost inception_v3 64

## Distributed, multiple GPUs

mpirun -np $np --hostfile hostfile --bind-to none `pwd`/single_process.sh $dist_url resnet50 64

Example hostfile format:

node1 slots=4
node2 slots=4

# Reference

[https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet)
PyTorch/Compute-Vision/Classification/main_bench.py (new file, mode 100644)
###########################
#add inception model
#with bench data v1
#20191106:add synthetic data v2
#used for large-scale test, change the iter of each epoch based on the total dataset
###########################
import
argparse
import
os
import
random
import
shutil
import
time
import
warnings
import
torch
import
torch.nn
as
nn
import
torch.nn.parallel
import
torch.backends.cudnn
as
cudnn
import
torch.distributed
as
dist
import
torch.optim
import
torch.multiprocessing
as
mp
import
torch.utils.data
import
torch.utils.data.distributed
import
torchvision.transforms
as
transforms
import
torchvision.datasets
as
datasets
import
torchvision.models
as
models
model_names
=
sorted
(
name
for
name
in
models
.
__dict__
if
name
.
islower
()
and
not
name
.
startswith
(
"__"
)
and
callable
(
models
.
__dict__
[
name
]))
parser
=
argparse
.
ArgumentParser
(
description
=
'PyTorch ImageNet Training'
)
parser
.
add_argument
(
'data'
,
metavar
=
'DIR'
,
help
=
'path to dataset'
)
parser
.
add_argument
(
'--synthetic'
,
action
=
'store_true'
,
help
=
'use snthetic data'
)
parser
.
add_argument
(
'-a'
,
'--arch'
,
metavar
=
'ARCH'
,
default
=
'resnet18'
,
choices
=
model_names
,
help
=
'model architecture: '
+
' | '
.
join
(
model_names
)
+
' (default: resnet18)'
)
parser
.
add_argument
(
'-j'
,
'--workers'
,
default
=
4
,
type
=
int
,
metavar
=
'N'
,
help
=
'number of data loading workers (default: 4)'
)
parser
.
add_argument
(
'--epochs'
,
default
=
90
,
type
=
int
,
metavar
=
'N'
,
help
=
'number of total epochs to run'
)
parser
.
add_argument
(
'--start-epoch'
,
default
=
0
,
type
=
int
,
metavar
=
'N'
,
help
=
'manual epoch number (useful on restarts)'
)
parser
.
add_argument
(
'-b'
,
'--batch-size'
,
default
=
256
,
type
=
int
,
metavar
=
'N'
,
help
=
'mini-batch size (default: 256), this is the total '
'batch size of all GPUs on the current node when '
'using Data Parallel or Distributed Data Parallel'
)
parser
.
add_argument
(
'--lr'
,
'--learning-rate'
,
default
=
0.1
,
type
=
float
,
metavar
=
'LR'
,
help
=
'initial learning rate'
,
dest
=
'lr'
)
parser
.
add_argument
(
'--momentum'
,
default
=
0.9
,
type
=
float
,
metavar
=
'M'
,
help
=
'momentum'
)
parser
.
add_argument
(
'--wd'
,
'--weight-decay'
,
default
=
1e-4
,
type
=
float
,
metavar
=
'W'
,
help
=
'weight decay (default: 1e-4)'
,
dest
=
'weight_decay'
)
parser
.
add_argument
(
'-p'
,
'--print-freq'
,
default
=
10
,
type
=
int
,
metavar
=
'N'
,
help
=
'print frequency (default: 10)'
)
parser
.
add_argument
(
'--resume'
,
default
=
''
,
type
=
str
,
metavar
=
'PATH'
,
help
=
'path to latest checkpoint (default: none)'
)
parser
.
add_argument
(
'-e'
,
'--evaluate'
,
dest
=
'evaluate'
,
action
=
'store_true'
,
help
=
'evaluate model on validation set'
)
parser
.
add_argument
(
'--pretrained'
,
dest
=
'pretrained'
,
action
=
'store_true'
,
help
=
'use pre-trained model'
)
parser
.
add_argument
(
'--world-size'
,
default
=-
1
,
type
=
int
,
help
=
'number of nodes for distributed training'
)
parser
.
add_argument
(
'--rank'
,
default
=-
1
,
type
=
int
,
help
=
'node rank for distributed training'
)
parser
.
add_argument
(
'--dist-url'
,
default
=
'tcp://224.66.41.62:23456'
,
type
=
str
,
help
=
'url used to set up distributed training'
)
parser
.
add_argument
(
'--dist-backend'
,
default
=
'nccl'
,
type
=
str
,
help
=
'distributed backend'
)
parser
.
add_argument
(
'--seed'
,
default
=
None
,
type
=
int
,
help
=
'seed for initializing training. '
)
parser
.
add_argument
(
'--gpu'
,
default
=
None
,
type
=
int
,
help
=
'GPU id to use.'
)
parser
.
add_argument
(
'--multiprocessing-distributed'
,
action
=
'store_true'
,
help
=
'Use multi-processing distributed training to launch '
'N processes per node, which has N GPUs. This is the '
'fastest way to use PyTorch for either single node or '
'multi node data parallel training'
)
best_acc1
=
0
def
main
():
args
=
parser
.
parse_args
()
if
args
.
seed
is
not
None
:
random
.
seed
(
args
.
seed
)
torch
.
manual_seed
(
args
.
seed
)
cudnn
.
deterministic
=
True
warnings
.
warn
(
'You have chosen to seed training. '
'This will turn on the CUDNN deterministic setting, '
'which can slow down your training considerably! '
'You may see unexpected behavior when restarting '
'from checkpoints.'
)
if
args
.
gpu
is
not
None
:
warnings
.
warn
(
'You have chosen a specific GPU. This will completely '
'disable data parallelism.'
)
if
args
.
dist_url
==
"env://"
and
args
.
world_size
==
-
1
:
args
.
world_size
=
int
(
os
.
environ
[
"WORLD_SIZE"
])
args
.
distributed
=
args
.
world_size
>
1
or
args
.
multiprocessing_distributed
ngpus_per_node
=
torch
.
cuda
.
device_count
()
if
args
.
multiprocessing_distributed
:
# Since we have ngpus_per_node processes per node, the total world_size
# needs to be adjusted accordingly
args
.
world_size
=
ngpus_per_node
*
args
.
world_size
# Use torch.multiprocessing.spawn to launch distributed processes: the
# main_worker process function
mp
.
spawn
(
main_worker
,
nprocs
=
ngpus_per_node
,
args
=
(
ngpus_per_node
,
args
))
else
:
# Simply call main_worker function
main_worker
(
args
.
gpu
,
ngpus_per_node
,
args
)
def
main_worker
(
gpu
,
ngpus_per_node
,
args
):
global
best_acc1
args
.
gpu
=
gpu
if
args
.
gpu
is
not
None
:
print
(
"Use GPU: {} for training"
.
format
(
args
.
gpu
))
if
args
.
distributed
:
if
args
.
dist_url
==
"env://"
and
args
.
rank
==
-
1
:
args
.
rank
=
int
(
os
.
environ
[
"RANK"
])
if
args
.
multiprocessing_distributed
:
# For multiprocessing distributed training, rank needs to be the
# global rank among all the processes
args
.
rank
=
args
.
rank
*
ngpus_per_node
+
gpu
dist
.
init_process_group
(
backend
=
args
.
dist_backend
,
init_method
=
args
.
dist_url
,
world_size
=
args
.
world_size
,
rank
=
args
.
rank
)
# create model
if
args
.
pretrained
:
print
(
"=> using pre-trained model '{}'"
.
format
(
args
.
arch
))
model
=
models
.
__dict__
[
args
.
arch
](
pretrained
=
True
)
# else:
# print("=> creating model '{}'".format(args.arch))
# model = models.__dict__[args.arch]()
else
:
if
(
args
.
arch
==
"inception_v3"
):
print
(
"=> creating model '{}'"
.
format
(
args
.
arch
))
model
=
models
.
__dict__
[
args
.
arch
](
aux_logits
=
False
)
else
:
print
(
"=> creating model '{}'"
.
format
(
args
.
arch
))
model
=
models
.
__dict__
[
args
.
arch
]()
if
args
.
distributed
:
# For multiprocessing distributed, DistributedDataParallel constructor
# should always set the single device scope, otherwise,
# DistributedDataParallel will use all available devices.
if
args
.
gpu
is
not
None
:
torch
.
cuda
.
set_device
(
args
.
gpu
)
model
.
cuda
(
args
.
gpu
)
# When using a single GPU per process and per
# DistributedDataParallel, we need to divide the batch size
# ourselves based on the total number of GPUs we have
args
.
batch_size
=
int
(
args
.
batch_size
/
ngpus_per_node
)
args
.
workers
=
int
(
args
.
workers
/
ngpus_per_node
)
model
=
torch
.
nn
.
parallel
.
DistributedDataParallel
(
model
,
device_ids
=
[
args
.
gpu
])
else
:
model
.
cuda
()
# DistributedDataParallel will divide and allocate batch_size to all
# available GPUs if device_ids are not set
model
=
torch
.
nn
.
parallel
.
DistributedDataParallel
(
model
)
elif
args
.
gpu
is
not
None
:
torch
.
cuda
.
set_device
(
args
.
gpu
)
model
=
model
.
cuda
(
args
.
gpu
)
else
:
# DataParallel will divide and allocate batch_size to all available GPUs
if
args
.
arch
.
startswith
(
'alexnet'
)
or
args
.
arch
.
startswith
(
'vgg'
):
model
.
features
=
torch
.
nn
.
DataParallel
(
model
.
features
)
model
.
cuda
()
else
:
model
=
torch
.
nn
.
DataParallel
(
model
).
cuda
()
# define loss function (criterion) and optimizer
criterion
=
nn
.
CrossEntropyLoss
().
cuda
(
args
.
gpu
)
optimizer
=
torch
.
optim
.
SGD
(
model
.
parameters
(),
args
.
lr
,
momentum
=
args
.
momentum
,
weight_decay
=
args
.
weight_decay
)
# optionally resume from a checkpoint
if
args
.
resume
:
if
os
.
path
.
isfile
(
args
.
resume
):
print
(
"=> loading checkpoint '{}'"
.
format
(
args
.
resume
))
checkpoint
=
torch
.
load
(
args
.
resume
)
args
.
start_epoch
=
checkpoint
[
'epoch'
]
best_acc1
=
checkpoint
[
'best_acc1'
]
if
args
.
gpu
is
not
None
:
# best_acc1 may be from a checkpoint from a different GPU
best_acc1
=
best_acc1
.
to
(
args
.
gpu
)
model
.
load_state_dict
(
checkpoint
[
'state_dict'
])
optimizer
.
load_state_dict
(
checkpoint
[
'optimizer'
])
print
(
"=> loaded checkpoint '{}' (epoch {})"
.
format
(
args
.
resume
,
checkpoint
[
'epoch'
]))
else
:
print
(
"=> no checkpoint found at '{}'"
.
format
(
args
.
resume
))
cudnn
.
benchmark
=
True
# Data loading code
traindir
=
os
.
path
.
join
(
args
.
data
,
'train'
)
valdir
=
os
.
path
.
join
(
args
.
data
,
'val'
)
normalize
=
transforms
.
Normalize
(
mean
=
[
0.485
,
0.456
,
0.406
],
std
=
[
0.229
,
0.224
,
0.225
])
#######add inceptionby aiss 0710
if
args
.
synthetic
:
print
(
"use syn date!"
)
train_dataset
=
datasets
.
ImageFolder
(
traindir
,
transforms
.
Compose
([
transforms
.
RandomResizedCrop
(
224
),
transforms
.
RandomHorizontalFlip
(),
transforms
.
ToTensor
(),
normalize
,
]))
if
not
args
.
synthetic
else
\
datasets
.
FakeData
(
1281280
,
num_classes
=
1000
,
transform
=
transforms
.
ToTensor
())
if
args
.
distributed
:
train_sampler
=
torch
.
utils
.
data
.
distributed
.
DistributedSampler
(
train_dataset
)
else
:
train_sampler
=
None
train_loader
=
torch
.
utils
.
data
.
DataLoader
(
train_dataset
,
batch_size
=
args
.
batch_size
,
shuffle
=
(
train_sampler
is
None
),
num_workers
=
args
.
workers
,
pin_memory
=
False
,
sampler
=
train_sampler
)
#global len_train_loader
#len_train_loader=len(train_loader)
#print("iters per epoch: ",len(train_loader))
val_loader
=
torch
.
utils
.
data
.
DataLoader
(
datasets
.
ImageFolder
(
valdir
,
transforms
.
Compose
([
transforms
.
Resize
(
256
),
transforms
.
CenterCrop
(
224
),
transforms
.
ToTensor
(),
normalize
,
]))
if
not
args
.
synthetic
else
datasets
.
FakeData
(
20000
,
num_classes
=
1000
,
transform
=
transforms
.
ToTensor
()),
batch_size
=
args
.
batch_size
,
shuffle
=
False
,
num_workers
=
args
.
workers
,
pin_memory
=
False
)
# if args.evaluate:
# validate(val_loader, model, criterion, args)
# return
#####add by aiss
global
is_stop
is_stop
=
0
# HC debug
print
(
"Begin training ..."
);
for
epoch
in
range
(
args
.
start_epoch
,
args
.
epochs
):
if
args
.
distributed
:
train_sampler
.
set_epoch
(
epoch
)
adjust_learning_rate
(
optimizer
,
epoch
,
args
)
# train for one epoch
train
(
train_loader
,
model
,
criterion
,
optimizer
,
epoch
,
args
)
print
(
"is_stop is "
,
is_stop
)
if
is_stop
==
1
:
break
#acc1 = validate(val_loader, model, criterion, args)
# #remember best acc@1 and save checkpoint
#is_best = acc1 > best_acc1
#best_acc1 = max(acc1, best_acc1)
#if not args.multiprocessing_distributed or (args.multiprocessing_distributed
# and args.rank % ngpus_per_node == 0):
# save_checkpoint({
# 'epoch': epoch + 1,
# 'arch': args.arch,
# 'state_dict': model.state_dict(),
# 'best_acc1': best_acc1,
# 'optimizer' : optimizer.state_dict(),
# }, is_best)
def train(train_loader, model, criterion, optimizer, epoch, args):
    # declare the early-stop flag as global, otherwise the assignment below
    # would create a local and the stop signal would never reach main_worker
    global is_stop
    batch_time = AverageMeter('Time', ':6.3f')
    ###### add by aiss 0708
    # images_per_sec=AverageMeter('Perf', ':6.4f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    ########## add by aiss ##############
    len_train_loader = len(train_loader)
    print("iters per epoch: ", len(train_loader))
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # add by aiss: start the benchmark clock at step 10 to skip warm-up iterations
        if i == 10:
            time_start = time.time()
            torch.cuda.synchronize()
            gpu_start_time = time.time()

        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)
        target = target.cuda(args.gpu, non_blocking=True)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        #### add by aiss
        #print batch_time.val, batch_time.count, batch_time.sum
        #images_per_sec= args.batch_size / batch_time.val
        if i % args.print_freq == 0:
            progress.display(i)
            # print("Performance: {} ", images_per_sec)
            #print("Performance: {} samples/sec".format(images_per_sec))
            # print("step time cost: {} sec".format(batch_time.val))

        # stop after 1000 timed steps (iterations 10..1010)
        #if i == len_train_loader-10:
        if i == 1010:
            is_stop = 1
            time_end = end
            time_print_step = time_end - time_start
            ##### wait for gpu op finished ########
            torch.cuda.synchronize()
            gpu_end_time = time.time()
            gpu_time_print_step = gpu_end_time - gpu_start_time
            # HC debug
            print("batch_size is ", args.batch_size)
            print("world_size is ", args.world_size)
            ######## aiss debug ####
            print("time_start:%s time_end:%s gap:%s" % (time_start, time_end, time_print_step))
            print("gpu_start_time:%s gpu_end_time:%s gap:%s" % (gpu_start_time, gpu_end_time, gpu_time_print_step))
            if args.world_size == -1:
                images_total_per_sec = args.batch_size * 1000 / time_print_step
            else:
                images_total_per_sec = args.batch_size * 1000 * args.world_size / time_print_step
            print("Total Performance: {} samples/sec".format(images_total_per_sec))
            break
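The benchmark times exactly 1000 steps (iteration 10 through 1010, skipping warm-up), so the reported figure is just total images processed divided by wall time. A standalone restatement of that arithmetic, with made-up timings for illustration only:

```python
# Illustration only: the throughput formula used in train() above.
batch_size = 64
world_size = 8          # number of processes; -1 means single-process above
steps = 1000            # iterations 10..1010 are timed
time_print_step = 95.0  # hypothetical elapsed seconds for those steps

images_total_per_sec = batch_size * steps * max(world_size, 1) / time_print_step
print("{:.1f} samples/sec".format(images_total_per_sec))  # -> 5389.5
```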
def validate(val_loader, model, criterion, args):
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

        # TODO: this should also be done with the ProgressMeter
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

    return top1.avg
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)
class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
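For reference, a quick REPL-style check of how the two meters interact (illustration only; the printed line follows directly from the format strings above):

```python
# Exercising AverageMeter/ProgressMeter as defined above.
loss_meter = AverageMeter('Loss', ':.4e')
loss_meter.update(0.5)
loss_meter.update(0.3)

progress = ProgressMeter(100, [loss_meter], prefix="Epoch: [0]")
progress.display(42)
# -> Epoch: [0][ 42/100]	Loss 3.0000e-01 (4.0000e-01)
#    (current value 0.3, running average 0.4, fields tab-separated)
```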
def adjust_learning_rate(optimizer, epoch, args):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
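The decay is a plain step schedule; as a sanity check, assuming the common default of lr=0.1 (illustration only):

```python
# epoch // 30 increments every 30 epochs, so the LR drops by 10x at epochs 30, 60, ...
for epoch in (0, 29, 30, 60, 90):
    print(epoch, 0.1 * (0.1 ** (epoch // 30)))
# -> 0.1, 0.1, 0.01, 0.001, 0.0001 (up to float rounding)
```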
def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
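A toy batch makes the top-k semantics concrete (illustration only):

```python
import torch

logits = torch.tensor([[0.1, 0.7, 0.2],     # top-1 = class 1 (matches target)
                       [0.8, 0.15, 0.05]])  # top-1 = class 0, top-2 = {0, 1}
target = torch.tensor([1, 2])

top1, top2 = accuracy(logits, target, topk=(1, 2))
print(top1.item(), top2.item())  # 50.0 50.0 -- sample 1's class 2 is outside its top-2
```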
if __name__ == '__main__':
    main()
PyTorch/Compute-Vision/Classification/mpi_slurm.sbatch
0 → 100755
#!/bin/bash
#SBATCH -p caspra
#SBATCH -N 2
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
#SBATCH -J syn_nccl_torch
#SBATCH -o ./log/output.%j
#SBATCH -e ./log/output.%j
#SBATCH -x e01r1n07

#for rocm3.3
module rm compiler/rocm/2.9
module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
which mpirun
which python3

hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
# NOTE: this rm cleans the old fixed name "hostfile-dl", but the loop below
# appends to the per-job name "hostfile-dl-$SLURM_JOB_ID"; the two do not match.
rm `pwd`/hostfile-dl -f
for i in `cat $hostfile`
do
    echo ${i} slots=4 >> `pwd`/hostfile-dl-$SLURM_JOB_ID
done
np=$(cat $hostfile |sort|uniq |wc -l)
np=$(($np*4))
nodename=$(cat $hostfile |sed -n "1p")
echo $nodename
dist_url=`echo $nodename | awk '{print $1}'`

#for single card
#echo mpirun -np 1 --allow-run-as-root --bind-to none `pwd`/single_process.sh $dist_url inception_v3 64
#mpirun -np 1 --allow-run-as-root --bind-to none `pwd`/single_process.sh $dist_url inception_v3 64
#for one node
#echo mpirun -np $np --allow-run-as-root --bind-to none `pwd`/single_process.sh $dist_url inception_v3 64
#mpirun -np $np --allow-run-as-root --bind-to none `pwd`/single_process.sh $dist_url inception_v3 64
#for multi-gpu
echo mpirun -np $np --allow-run-as-root --hostfile hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/single_process.sh $dist_url resnet50 64
mpirun -np $np --allow-run-as-root --hostfile hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/single_process.sh $dist_url resnet50 64
PyTorch/Compute-Vision/Classification/single_process.sh
0 → 100755
#!/bin/bash
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=ib0
export HSA_USERPTR_FOR_PAGED_MEM=0
#source /public/software/apps/DeepLearning/PyTorch/pytorch-env.sh
module rm compiler/rocm/2.9
module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
#source /public/home/aiss/Pytorch/env_rocm3.3_torch1.5.sh

# Script arguments: $1 = master address, $2 = model arch, $3 = batch size.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

APP="python3 `pwd`/main_bench.py --batch-size=${3} --a=${2} -j 24 --epochs=1 --dist-url tcp://${1}:34567 --dist-backend nccl --world-size=${comm_size} --rank=${comm_rank} --synthetic /public/software/apps/DeepLearning/Data/ImageNet-pytorch/"

# Bind each local rank to its own DCU, InfiniBand NIC, and NUMA node.
case ${lrank} in
[0])
  export HIP_VISIBLE_DEVICES=0
  export UCX_NET_DEVICES=mlx5_0:1
  export UCX_IB_PCI_BW=mlx5_0:50Gbs
  echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
  #echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
  #GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[1])
  export HIP_VISIBLE_DEVICES=1
  export UCX_NET_DEVICES=mlx5_1:1
  export UCX_IB_PCI_BW=mlx5_1:50Gbs
  echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
  ;;
[2])
  export HIP_VISIBLE_DEVICES=2
  export UCX_NET_DEVICES=mlx5_2:1
  export UCX_IB_PCI_BW=mlx5_2:50Gbs
  echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
  ;;
[3])
  export HIP_VISIBLE_DEVICES=3
  export UCX_NET_DEVICES=mlx5_3:1
  export UCX_IB_PCI_BW=mlx5_3:50Gbs
  echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
  ;;
esac
PyTorch/Compute-Vision/Objection/Faster-rcnn/README.md
0 → 100644
# Introduction

This example is used to test the PyTorch object-detection model Faster R-CNN.

# Usage

* The get_dataset function in train.py must be adapted to your dataset: set the locations of the annotation JSON files, the number of classes, and so on (see the sketch after this section).

## Single card

python3 train.py --batch-size=2 -j 8 --epochs=26 --data-path=/path/to/datasets/folder --output-dir=/path/to/result/save/folder

## Single node, multiple cards

mpirun -np 4 --hostfile hostfile --bind-to none `pwd`/single_process.sh localhost

## Multiple nodes, multiple cards

mpirun -np $np --hostfile hostfile --bind-to none `pwd`/single_process.sh ${master_ip}

# Reference

[https://github.com/pytorch/vision/tree/master/references/detection](https://github.com/pytorch/vision/tree/master/references/detection)
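As a sketch of what that adaptation looks like: get_dataset in train.py builds a dict mapping split names to image folders and annotation files, plus the class count. The paths below are hypothetical placeholders; 66 is the class count this repo uses for its Tiny Objects365 split and must be changed for other datasets.

```python
import os.path as osp

def get_dataset(name, image_set, transform, data_path):
    # Replace the JSON file names and the class count with your dataset's values.
    train_dataset = {
        'train': {'img_dir': osp.join(data_path, "train"),
                  'ann_file': osp.join(data_path, "my_train_annotations.json")},
    }
    val_dataset = {
        'val': {'img_dir': osp.join(data_path, "val"),
                'ann_file': osp.join(data_path, "my_val_annotations.json")},
    }
    paths = {
        "train": (train_dataset, get_coco, 66),  # (config, loader fn, num_classes)
        "val": (val_dataset, get_coco, 66),
    }
    p, ds_fn, num_classes = paths[image_set]
    ds = ds_fn(p, image_set=image_set, transforms=transform)
    return ds, num_classes
```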
PyTorch/Compute-Vision/Objection/Faster-rcnn/coco_eval.py
0 → 100644
import json
import tempfile

import numpy as np
import copy
import time
import torch
import torch._six

from pycocotools.cocoeval import COCOeval
from pycocotools.coco import COCO
import pycocotools.mask as mask_util

from collections import defaultdict

import utils


class CocoEvaluator(object):
    def __init__(self, coco_gt, iou_types):
        assert isinstance(iou_types, (list, tuple))
        coco_gt = copy.deepcopy(coco_gt)
        self.coco_gt = coco_gt

        self.iou_types = iou_types
        self.coco_eval = {}
        for iou_type in iou_types:
            self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)

        self.img_ids = []
        self.eval_imgs = {k: [] for k in iou_types}

    def update(self, predictions):
        img_ids = list(np.unique(list(predictions.keys())))
        self.img_ids.extend(img_ids)

        for iou_type in self.iou_types:
            results = self.prepare(predictions, iou_type)
            coco_dt = loadRes(self.coco_gt, results) if results else COCO()
            coco_eval = self.coco_eval[iou_type]

            coco_eval.cocoDt = coco_dt
            coco_eval.params.imgIds = list(img_ids)
            img_ids, eval_imgs = evaluate(coco_eval)

            self.eval_imgs[iou_type].append(eval_imgs)

    def synchronize_between_processes(self):
        for iou_type in self.iou_types:
            self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
            create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])

    def accumulate(self):
        for coco_eval in self.coco_eval.values():
            coco_eval.accumulate()

    def summarize(self):
        for iou_type, coco_eval in self.coco_eval.items():
            print("IoU metric: {}".format(iou_type))
            coco_eval.summarize()

    def prepare(self, predictions, iou_type):
        if iou_type == "bbox":
            return self.prepare_for_coco_detection(predictions)
        elif iou_type == "segm":
            return self.prepare_for_coco_segmentation(predictions)
        elif iou_type == "keypoints":
            return self.prepare_for_coco_keypoint(predictions)
        else:
            raise ValueError("Unknown iou type {}".format(iou_type))

    def prepare_for_coco_detection(self, predictions):
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        "bbox": box,
                        "score": scores[k],
                    }
                    for k, box in enumerate(boxes)
                ]
            )
        return coco_results

    def prepare_for_coco_segmentation(self, predictions):
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            scores = prediction["scores"]
            labels = prediction["labels"]
            masks = prediction["masks"]

            masks = masks > 0.5

            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()

            rles = [
                mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0]
                for mask in masks
            ]
            for rle in rles:
                rle["counts"] = rle["counts"].decode("utf-8")

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        "segmentation": rle,
                        "score": scores[k],
                    }
                    for k, rle in enumerate(rles)
                ]
            )
        return coco_results

    def prepare_for_coco_keypoint(self, predictions):
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            keypoints = prediction["keypoints"]
            keypoints = keypoints.flatten(start_dim=1).tolist()

            coco_results.extend(
                [
                    {
                        "image_id": original_id,
                        "category_id": labels[k],
                        'keypoints': keypoint,
                        "score": scores[k],
                    }
                    for k, keypoint in enumerate(keypoints)
                ]
            )
        return coco_results


def convert_to_xywh(boxes):
    xmin, ymin, xmax, ymax = boxes.unbind(1)
    return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
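convert_to_xywh translates the (xmin, ymin, xmax, ymax) boxes that torchvision detection models emit into COCO's (x, y, width, height) convention. A one-box illustration:

```python
import torch

boxes = torch.tensor([[10., 20., 50., 80.]])  # xmin, ymin, xmax, ymax
print(convert_to_xywh(boxes))                 # tensor([[10., 20., 40., 60.]])
```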
def merge(img_ids, eval_imgs):
    all_img_ids = utils.all_gather(img_ids)
    all_eval_imgs = utils.all_gather(eval_imgs)

    merged_img_ids = []
    for p in all_img_ids:
        merged_img_ids.extend(p)

    merged_eval_imgs = []
    for p in all_eval_imgs:
        merged_eval_imgs.append(p)

    merged_img_ids = np.array(merged_img_ids)
    merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)

    # keep only unique (and in sorted order) images
    merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
    merged_eval_imgs = merged_eval_imgs[..., idx]

    return merged_img_ids, merged_eval_imgs


def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
    img_ids, eval_imgs = merge(img_ids, eval_imgs)
    img_ids = list(img_ids)
    eval_imgs = list(eval_imgs.flatten())

    coco_eval.evalImgs = eval_imgs
    coco_eval.params.imgIds = img_ids
    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)


#################################################################
# From pycocotools, just removed the prints and fixed
# a Python3 bug about unicode not defined
#################################################################

# Ideally, pycocotools wouldn't have hard-coded prints
# so that we could avoid copy-pasting those two functions


def createIndex(self):
    # create index
    # print('creating index...')
    anns, cats, imgs = {}, {}, {}
    imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
    if 'annotations' in self.dataset:
        for ann in self.dataset['annotations']:
            imgToAnns[ann['image_id']].append(ann)
            anns[ann['id']] = ann

    if 'images' in self.dataset:
        for img in self.dataset['images']:
            imgs[img['id']] = img

    if 'categories' in self.dataset:
        for cat in self.dataset['categories']:
            cats[cat['id']] = cat

    if 'annotations' in self.dataset and 'categories' in self.dataset:
        for ann in self.dataset['annotations']:
            catToImgs[ann['category_id']].append(ann['image_id'])

    # print('index created!')

    # create class members
    self.anns = anns
    self.imgToAnns = imgToAnns
    self.catToImgs = catToImgs
    self.imgs = imgs
    self.cats = cats


maskUtils = mask_util


def loadRes(self, resFile):
    """
    Load result file and return a result api object.
    :param   resFile (str)     : file name of result file
    :return: res (obj)         : result api object
    """
    res = COCO()
    res.dataset['images'] = [img for img in self.dataset['images']]

    # print('Loading and preparing results...')
    # tic = time.time()
    if isinstance(resFile, torch._six.string_classes):
        anns = json.load(open(resFile))
    elif type(resFile) == np.ndarray:
        anns = self.loadNumpyAnnotations(resFile)
    else:
        anns = resFile
    assert type(anns) == list, 'results is not an array of objects'
    annsImgIds = [ann['image_id'] for ann in anns]
    assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \
        'Results do not correspond to current coco set'
    if 'caption' in anns[0]:
        imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns])
        res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds]
        for id, ann in enumerate(anns):
            ann['id'] = id + 1
    elif 'bbox' in anns[0] and not anns[0]['bbox'] == []:
        res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
        for id, ann in enumerate(anns):
            bb = ann['bbox']
            x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
            if 'segmentation' not in ann:
                ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
            ann['area'] = bb[2] * bb[3]
            ann['id'] = id + 1
            ann['iscrowd'] = 0
    elif 'segmentation' in anns[0]:
        res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
        for id, ann in enumerate(anns):
            # now only support compressed RLE format as segmentation results
            ann['area'] = maskUtils.area(ann['segmentation'])
            if 'bbox' not in ann:
                ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
            ann['id'] = id + 1
            ann['iscrowd'] = 0
    elif 'keypoints' in anns[0]:
        res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
        for id, ann in enumerate(anns):
            s = ann['keypoints']
            x = s[0::3]
            y = s[1::3]
            x1, x2, y1, y2 = np.min(x), np.max(x), np.min(y), np.max(y)
            ann['area'] = (x2 - x1) * (y2 - y1)
            ann['id'] = id + 1
            ann['bbox'] = [x1, y1, x2 - x1, y2 - y1]
    # print('DONE (t={:0.2f}s)'.format(time.time()- tic))

    res.dataset['annotations'] = anns
    createIndex(res)
    return res


def evaluate(self):
    '''
    Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
    :return: None
    '''
    # tic = time.time()
    # print('Running per image evaluation...')
    p = self.params
    # add backward compatibility if useSegm is specified in params
    if p.useSegm is not None:
        p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
        print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
    # print('Evaluate annotation type *{}*'.format(p.iouType))
    p.imgIds = list(np.unique(p.imgIds))
    if p.useCats:
        p.catIds = list(np.unique(p.catIds))
    p.maxDets = sorted(p.maxDets)
    self.params = p

    self._prepare()
    # loop through images, area range, max detection number
    catIds = p.catIds if p.useCats else [-1]

    if p.iouType == 'segm' or p.iouType == 'bbox':
        computeIoU = self.computeIoU
    elif p.iouType == 'keypoints':
        computeIoU = self.computeOks
    self.ious = {
        (imgId, catId): computeIoU(imgId, catId)
        for imgId in p.imgIds
        for catId in catIds}

    evaluateImg = self.evaluateImg
    maxDet = p.maxDets[-1]
    evalImgs = [
        evaluateImg(imgId, catId, areaRng, maxDet)
        for catId in catIds
        for areaRng in p.areaRng
        for imgId in p.imgIds
    ]
    # this is NOT in the pycocotools code, but could be done outside
    evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
    self._paramsEval = copy.deepcopy(self.params)
    # toc = time.time()
    # print('DONE (t={:0.2f}s).'.format(toc-tic))
    return p.imgIds, evalImgs

#################################################################
# end of straight copy from pycocotools, just removing the prints
#################################################################
PyTorch/Compute-Vision/Objection/Faster-rcnn/coco_utils.py
0 → 100644
import copy
import os
import json

from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import io
#import boto3
#import brainpp

import torch
import torch.utils.data
import torchvision
from pycocotools import mask as coco_mask
from pycocotools.coco import COCO

import transforms as T
from torchvision.datasets.vision import VisionDataset


class FilterAndRemapCocoCategories(object):
    def __init__(self, categories, remap=True):
        self.categories = categories
        self.remap = remap

    def __call__(self, image, target):
        anno = target["annotations"]
        anno = [obj for obj in anno if obj["category_id"] in self.categories]
        if not self.remap:
            target["annotations"] = anno
            return image, target
        anno = copy.deepcopy(anno)
        for obj in anno:
            obj["category_id"] = self.categories.index(obj["category_id"])
        target["annotations"] = anno
        return image, target


def convert_coco_poly_to_mask(segmentations, height, width):
    masks = []
    for polygons in segmentations:
        rles = coco_mask.frPyObjects(polygons, height, width)
        mask = coco_mask.decode(rles)
        if len(mask.shape) < 3:
            mask = mask[..., None]
        mask = torch.as_tensor(mask, dtype=torch.uint8)
        mask = mask.any(dim=2)
        masks.append(mask)
    if masks:
        masks = torch.stack(masks, dim=0)
    else:
        masks = torch.zeros((0, height, width), dtype=torch.uint8)
    return masks
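As a quick illustration of the function above (assumes pycocotools is installed; the square below is a made-up polygon):

```python
# One object with one polygon: a square with corners (1,1), (5,1), (5,5), (1,5).
segmentations = [[[1, 1, 5, 1, 5, 5, 1, 5]]]
masks = convert_coco_poly_to_mask(segmentations, height=8, width=8)
print(masks.shape)     # torch.Size([1, 8, 8])
print(masks[0].sum())  # number of pixels covered by the polygon
```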
class ConvertCocoPolysToMask(object):
    def __call__(self, image, target):
        w, h = image.size

        image_id = target["image_id"]
        image_id = torch.tensor([image_id])

        anno = target["annotations"]

        anno = [obj for obj in anno if obj['iscrowd'] == 0]

        boxes = [obj["bbox"] for obj in anno]
        # guard against no boxes via resizing
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        boxes[:, 2:] += boxes[:, :2]
        boxes[:, 0::2].clamp_(min=0, max=w)
        boxes[:, 1::2].clamp_(min=0, max=h)

        classes = [obj["category_id"] for obj in anno]
        classes = torch.tensor(classes, dtype=torch.int64)

        keypoints = None
        if anno and "keypoints" in anno[0]:
            keypoints = [obj["keypoints"] for obj in anno]
            keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
            num_keypoints = keypoints.shape[0]
            if num_keypoints:
                keypoints = keypoints.view(num_keypoints, -1, 3)

        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
        boxes = boxes[keep]
        classes = classes[keep]
        if keypoints is not None:
            keypoints = keypoints[keep]

        target = {}
        target["boxes"] = boxes
        target["labels"] = classes
        target["image_id"] = image_id
        if keypoints is not None:
            target["keypoints"] = keypoints

        # for conversion to coco api
        area = torch.tensor([obj["area"] for obj in anno])
        iscrowd = torch.tensor([obj["iscrowd"] for obj in anno])
        target["area"] = area
        target["iscrowd"] = iscrowd

        return image, target


min_keypoints_per_image = 10


def _has_only_empty_bbox(anno):
    return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno)


def _count_visible_keypoints(anno):
    return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno)


def _has_valid_annotation(anno):
    # if it's empty, there is no annotation
    if len(anno) == 0:
        return False
    # if all boxes have close to zero area, there is no annotation
    if _has_only_empty_bbox(anno):
        return False
    # keypoints task has a slightly different criteria for considering
    # if an annotation is valid
    if "keypoints" not in anno[0]:
        return True
    # for keypoint detection tasks, only consider valid images those
    # containing at least min_keypoints_per_image
    if _count_visible_keypoints(anno) >= min_keypoints_per_image:
        return True
    return False


def _coco_remove_images_without_annotations(dataset, cat_list=None):
    assert isinstance(dataset, torchvision.datasets.CocoDetection) or isinstance(dataset, CocoDetection)
    ids = []
    empty = 0
    for ds_idx, img_id in enumerate(dataset.ids):
        ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None)
        anno = dataset.coco.loadAnns(ann_ids)
        if cat_list:
            anno = [obj for obj in anno if obj["category_id"] in cat_list]
        if _has_valid_annotation(anno):
            ids.append(ds_idx)
        else:
            empty += 1
    print("remove {} empty imgs without annos".format(empty))
    dataset = torch.utils.data.Subset(dataset, ids)
    return dataset


def convert_to_coco_api(ds):
    coco_ds = COCO()
    ann_id = 0
    dataset = {'images': [], 'categories': [], 'annotations': []}
    categories = set()
    for img_idx in range(len(ds)):
        # find better way to get target
        # targets = ds.get_annotations(img_idx)
        img, targets = ds[img_idx]
        image_id = targets["image_id"].item()
        img_dict = {}
        img_dict['id'] = image_id
        img_dict['height'] = img.shape[-2]
        img_dict['width'] = img.shape[-1]
        dataset['images'].append(img_dict)
        bboxes = targets["boxes"]
        bboxes[:, 2:] -= bboxes[:, :2]
        bboxes = bboxes.tolist()
        labels = targets['labels'].tolist()
        areas = targets['area'].tolist()
        iscrowd = targets['iscrowd'].tolist()
        if 'masks' in targets:
            masks = targets['masks']
            # make masks Fortran contiguous for coco_mask
            masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1)
        if 'keypoints' in targets:
            keypoints = targets['keypoints']
            keypoints = keypoints.reshape(keypoints.shape[0], -1).tolist()
        num_objs = len(bboxes)
        for i in range(num_objs):
            ann = {}
            ann['image_id'] = image_id
            ann['bbox'] = bboxes[i]
            ann['category_id'] = labels[i]
            categories.add(labels[i])
            ann['area'] = areas[i]
            ann['iscrowd'] = iscrowd[i]
            ann['id'] = ann_id
            if 'keypoints' in targets:
                ann['keypoints'] = keypoints[i]
                ann['num_keypoints'] = sum(k != 0 for k in keypoints[i][2::3])
            dataset['annotations'].append(ann)
            ann_id += 1
    dataset['categories'] = [{'id': i} for i in sorted(categories)]
    coco_ds.dataset = dataset
    coco_ds.createIndex()
    return coco_ds


def get_coco_api_from_dataset(dataset):
    for _ in range(10):
        if isinstance(dataset, torchvision.datasets.CocoDetection):
            break
        if isinstance(dataset, torch.utils.data.Subset):
            dataset = dataset.dataset
    if isinstance(dataset, torchvision.datasets.CocoDetection):
        return dataset.coco
    return convert_to_coco_api(dataset)


class CocoDetection(VisionDataset):
    def __init__(self, root, annFile, transforms):
        super(CocoDetection, self).__init__(root, transforms=None, transform=None, target_transform=None)
        from pycocotools.coco import COCO
        self.coco = COCO(annFile)
        self.ids = list(sorted(self.coco.imgs.keys()))
        self._transforms = transforms

        with open(annFile, "r") as f:
            result = json.load(f)
        catids = [k['id'] for k in result['categories']]
        self.catid_inf = min(catids)

        ids_to_remove = []
        ids = []
        for img_id in self.ids:
            ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None)
            anno = self.coco.loadAnns(ann_ids)
            if all(
                any(o <= 1 for o in obj["bbox"][2:])
                for obj in anno
                if obj["iscrowd"] == 0
            ):
                ids_to_remove.append(img_id)
            if _has_valid_annotation(anno):
                ids.append(img_id)
        print("remove {} illegal image".format(len(ids_to_remove)))
        self.ids = [img_id for img_id in ids if img_id not in ids_to_remove]

    def __getitem__(self, idx):
        coco = self.coco
        img_id = self.ids[idx]
        ann_ids = coco.getAnnIds(imgIds=img_id)
        target = coco.loadAnns(ann_ids)
        target = dict(image_id=img_id, annotations=target)
        path = coco.loadImgs(img_id)[0]['file_name']
        img = Image.open(os.path.join(self.root, path)).convert('RGB')
        #img, target = self.remapper(img, target)
        if self._transforms is not None:
            img, target = self._transforms(img, target)
        target['labels'] = (target['labels'] - self.catid_inf + 1).long()
        return img, target

    def __len__(self):
        return len(self.ids)
'''
class CocoDetection(torchvision.datasets.CocoDetection):
def __init__(self, img_folder, ann_file, transforms):
super(CocoDetection, self).__init__(img_folder, ann_file)
self._transforms = transforms
with open(ann_file, "r") as f:
result = json.load(f)
catids = [k['id'] for k in result['categories']]
self.catid_inf = min(catids)
self.num_classes = len(catids)
print(self.num_classes)
ids_to_remove = []
ids = []
for img_id in self.ids:
ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None)
anno = self.coco.loadAnns(ann_ids)
if all(
any(o <= 1 for o in obj["bbox"][2:])
for obj in anno
if obj["iscrowd"] == 0
):
ids_to_remove.append(img_id)
if _has_valid_annotation(anno):
ids.append(img_id)
print("remove {} illegal image".format(len(ids_to_remove)))
self.ids = [img_id for img_id in ids if img_id not in ids_to_remove]
def __getitem__(self, idx):
img, target = super(CocoDetection, self).__getitem__(idx)
image_id = self.ids[idx]
target = dict(image_id=image_id, annotations=target)
if self._transforms is not None:
img, target = self._transforms(img, target)
target['labels'] = (target['labels'] - self.catid_inf + 1).long()
return img, target
class OssCocoDetection(VisionDataset):
def __init__(self, root, annFile, transforms,
host='http://oss.{}.brainpp.cn'.format(brainpp.current_vm.site)):
super(OssCocoDetection, self).__init__(root, transforms=None, transform=None, target_transform=None)
from pycocotools.coco import COCO
self.coco = COCO(annFile)
self.ids = list(sorted(self.coco.imgs.keys()))
self.s3_client = boto3.client('s3', endpoint_url=host)
self._transforms = transforms
with open(annFile, "r") as f:
result = json.load(f)
catids = [k['id'] for k in result['categories']]
self.catid_inf = min(catids)
ids_to_remove = []
ids = []
for img_id in self.ids:
ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None)
anno = self.coco.loadAnns(ann_ids)
if all(
any(o <= 1 for o in obj["bbox"][2:])
for obj in anno
if obj["iscrowd"] == 0
):
ids_to_remove.append(img_id)
if _has_valid_annotation(anno):
ids.append(img_id)
print("remove {} illegal image".format(len(ids_to_remove)))
self.ids = [img_id for img_id in ids if img_id not in ids_to_remove]
def __getitem__(self, idx):
coco = self.coco
img_id = self.ids[idx]
ann_ids = coco.getAnnIds(imgIds=img_id)
target = coco.loadAnns(ann_ids)
target = dict(image_id=img_id, annotations=target)
path = coco.loadImgs(img_id)[0]['file_name']
img_obj = self.s3_client.get_object(
Bucket="generalDetection", Key=os.path.join(self.root, path))
img = Image.open(io.BytesIO(img_obj['Body'].read())).convert('RGB')
#img, target = self.remapper(img, target)
if self._transforms is not None:
img, target = self._transforms(img, target)
target['labels'] = (target['labels'] - self.catid_inf + 1).long()
return img, target
def __len__(self):
return len(self.ids)
def get_oss_coco(root, image_set, transforms, mode='instances'):
t = [ConvertCocoPolysToMask()]
if transforms is not None:
t.append(transforms)
transforms = T.Compose(t)
datasets = list()
for i_key, i_val in root.items():
dataset = OssCocoDetection(
i_val['img_dir'], i_val['ann_file'],
transforms=transforms)
if image_set == "train":
dataset = _coco_remove_images_without_annotations(dataset)
datasets.append(dataset)
dataset = datasets[0] # if len(datasets) == 1 else ConcatDataset(datasets)
# dataset = torch.utils.data.Subset(dataset, [i for i in range(500)])
return dataset
'''
def get_coco(root, image_set, transforms, mode='instances'):
    t = [ConvertCocoPolysToMask()]

    if transforms is not None:
        t.append(transforms)
    transforms = T.Compose(t)

    img_folder = root[image_set]['img_dir']
    ann_file = root[image_set]['ann_file']

    dataset = CocoDetection(img_folder, ann_file, transforms=transforms)

    if image_set == "train":
        dataset = _coco_remove_images_without_annotations(dataset)

    # dataset = torch.utils.data.Subset(dataset, [i for i in range(500)])

    return dataset


def get_coco_kp(root, image_set, transforms):
    return get_coco(root, image_set, transforms, mode="person_keypoints")
PyTorch/Compute-Vision/Objection/Faster-rcnn/engine.py
0 → 100644
import math
import sys
import time
import torch

import torchvision.models.detection.mask_rcnn

from coco_utils import get_coco_api_from_dataset
from coco_eval import CocoEvaluator
import utils


def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
def _get_iou_types(model):
    model_without_ddp = model
    if isinstance(model, torch.nn.parallel.DistributedDataParallel):
        model_without_ddp = model.module
    iou_types = ["bbox"]
    if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN):
        iou_types.append("segm")
    if isinstance(model_without_ddp, torchvision.models.detection.KeypointRCNN):
        iou_types.append("keypoints")
    return iou_types


@torch.no_grad()
def evaluate(model, data_loader, device):
    n_threads = torch.get_num_threads()
    # FIXME remove this and make paste_masks_in_image run on the GPU
    torch.set_num_threads(1)
    cpu_device = torch.device("cpu")
    model.eval()
    metric_logger = utils.MetricLogger(delimiter="  ")
    header = 'Test:'

    coco = get_coco_api_from_dataset(data_loader.dataset)
    iou_types = _get_iou_types(model)
    coco_evaluator = CocoEvaluator(coco, iou_types)

    for image, targets in metric_logger.log_every(data_loader, 100, header):
        image = list(img.to(device) for img in image)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        torch.cuda.synchronize()
        model_time = time.time()
        outputs = model(image)

        outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
        model_time = time.time() - model_time

        res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
        evaluator_time = time.time()
        coco_evaluator.update(res)
        evaluator_time = time.time() - evaluator_time
        metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()
    torch.set_num_threads(n_threads)
    return coco_evaluator
PyTorch/Compute-Vision/Objection/Faster-rcnn/group_by_aspect_ratio.py
0 → 100644
import bisect
from collections import defaultdict
import copy
import numpy as np

import torch
import torch.utils.data
from torch.utils.data.sampler import BatchSampler, Sampler
from torch.utils.model_zoo import tqdm
import torchvision

from PIL import Image


class GroupedBatchSampler(BatchSampler):
    """
    Wraps another sampler to yield a mini-batch of indices.
    It enforces that the batch only contain elements from the same group.
    It also tries to provide mini-batches which follows an ordering which is
    as close as possible to the ordering from the original sampler.
    Arguments:
        sampler (Sampler): Base sampler.
        group_ids (list[int]): If the sampler produces indices in range [0, N),
            `group_ids` must be a list of `N` ints which contains the group id of each sample.
            The group ids must be a continuous set of integers starting from
            0, i.e. they must be in the range [0, num_groups).
        batch_size (int): Size of mini-batch.
    """
    def __init__(self, sampler, group_ids, batch_size):
        if not isinstance(sampler, Sampler):
            raise ValueError(
                "sampler should be an instance of "
                "torch.utils.data.Sampler, but got sampler={}".format(sampler)
            )
        self.sampler = sampler
        self.group_ids = group_ids
        self.batch_size = batch_size

    def __iter__(self):
        buffer_per_group = defaultdict(list)
        samples_per_group = defaultdict(list)

        num_batches = 0
        for idx in self.sampler:
            group_id = self.group_ids[idx]
            buffer_per_group[group_id].append(idx)
            samples_per_group[group_id].append(idx)
            if len(buffer_per_group[group_id]) == self.batch_size:
                yield buffer_per_group[group_id]
                num_batches += 1
                del buffer_per_group[group_id]
            assert len(buffer_per_group[group_id]) < self.batch_size

        # now we have run out of elements that satisfy
        # the group criteria, let's return the remaining
        # elements so that the size of the sampler is
        # deterministic
        expected_num_batches = len(self)
        num_remaining = expected_num_batches - num_batches
        if num_remaining > 0:
            # for the remaining batches, take first the buffers with largest number
            # of elements
            for group_id, _ in sorted(buffer_per_group.items(),
                                      key=lambda x: len(x[1]), reverse=True):
                remaining = self.batch_size - len(buffer_per_group[group_id])
                buffer_per_group[group_id].extend(samples_per_group[group_id][:remaining])
                assert len(buffer_per_group[group_id]) == self.batch_size
                yield buffer_per_group[group_id]
                num_remaining -= 1
                if num_remaining == 0:
                    break
        assert num_remaining == 0

    def __len__(self):
        return len(self.sampler) // self.batch_size


def _compute_aspect_ratios_slow(dataset, indices=None):
    print("Your dataset doesn't support the fast path for "
          "computing the aspect ratios, so will iterate over "
          "the full dataset and load every image instead. "
          "This might take some time...")
    if indices is None:
        indices = range(len(dataset))

    class SubsetSampler(Sampler):
        def __init__(self, indices):
            self.indices = indices

        def __iter__(self):
            return iter(self.indices)

        def __len__(self):
            return len(self.indices)

    sampler = SubsetSampler(indices)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=1, sampler=sampler,
        num_workers=14,  # you might want to increase it for faster processing
        collate_fn=lambda x: x[0])
    aspect_ratios = []
    with tqdm(total=len(dataset)) as pbar:
        for _i, (img, _) in enumerate(data_loader):
            pbar.update(1)
            height, width = img.shape[-2:]
            aspect_ratio = float(width) / float(height)
            aspect_ratios.append(aspect_ratio)
    return aspect_ratios


def _compute_aspect_ratios_custom_dataset(dataset, indices=None):
    if indices is None:
        indices = range(len(dataset))
    aspect_ratios = []
    for i in indices:
        height, width = dataset.get_height_and_width(i)
        aspect_ratio = float(width) / float(height)
        aspect_ratios.append(aspect_ratio)
    return aspect_ratios


def _compute_aspect_ratios_coco_dataset(dataset, indices=None):
    if indices is None:
        indices = range(len(dataset))
    aspect_ratios = []
    for i in indices:
        img_info = dataset.coco.imgs[dataset.ids[i]]
        aspect_ratio = float(img_info["width"]) / float(img_info["height"])
        aspect_ratios.append(aspect_ratio)
    return aspect_ratios


def _compute_aspect_ratios_voc_dataset(dataset, indices=None):
    if indices is None:
        indices = range(len(dataset))
    aspect_ratios = []
    for i in indices:
        # this doesn't load the data into memory, because PIL loads it lazily
        width, height = Image.open(dataset.images[i]).size
        aspect_ratio = float(width) / float(height)
        aspect_ratios.append(aspect_ratio)
    return aspect_ratios


def _compute_aspect_ratios_subset_dataset(dataset, indices=None):
    if indices is None:
        indices = range(len(dataset))
    ds_indices = [dataset.indices[i] for i in indices]
    return compute_aspect_ratios(dataset.dataset, ds_indices)


def compute_aspect_ratios(dataset, indices=None):
    if hasattr(dataset, "get_height_and_width"):
        return _compute_aspect_ratios_custom_dataset(dataset, indices)

    if isinstance(dataset, torchvision.datasets.CocoDetection):
        return _compute_aspect_ratios_coco_dataset(dataset, indices)

    if isinstance(dataset, torchvision.datasets.VOCDetection):
        return _compute_aspect_ratios_voc_dataset(dataset, indices)

    if isinstance(dataset, torch.utils.data.Subset):
        return _compute_aspect_ratios_subset_dataset(dataset, indices)

    # slow path
    return _compute_aspect_ratios_slow(dataset, indices)


def _quantize(x, bins):
    bins = copy.deepcopy(bins)
    bins = sorted(bins)
    quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
    return quantized


def create_aspect_ratio_groups(dataset, k=0):
    aspect_ratios = compute_aspect_ratios(dataset)
    bins = (2 ** np.linspace(-1, 1, 2 * k + 1)).tolist() if k > 0 else [1.0]
    groups = _quantize(aspect_ratios, bins)
    # count number of elements per group
    counts = np.unique(groups, return_counts=True)[1]
    fbins = [0] + bins + [np.inf]
    print("Using {} as bins for aspect ratio quantization".format(fbins))
    print("Count of instances per bin: {}".format(counts))
    return groups
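train.py wires these pieces together; condensed, the aspect-ratio grouping path looks like this (the dataset and the numeric arguments are placeholders for whatever train.py constructed from args):

```python
# Condensed from train.py: bucket images into 2*k+1 aspect-ratio bins so each
# mini-batch contains similarly shaped images (less padding, better throughput).
train_sampler = torch.utils.data.RandomSampler(dataset)
group_ids = create_aspect_ratio_groups(dataset, k=3)  # args.aspect_ratio_group_factor
train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, batch_size=2)
data_loader = torch.utils.data.DataLoader(
    dataset, batch_sampler=train_batch_sampler, num_workers=4,
    collate_fn=utils.collate_fn)
```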
PyTorch/Compute-Vision/Objection/Faster-rcnn/mpi_slurm.sbatch
0 → 100644
#!/bin/bash
#SBATCH -p normal
#SBATCH -N 1
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
#SBATCH -J detection
#SBATCH -o ./log/output.%j
#SBATCH -e ./log/output.%j

module rm compiler/rocm/2.9
module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
which mpirun
which python3

hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
rm `pwd`/hostfile-dl -f
#hostfile=./node_list
for i in `cat $hostfile`
do
    echo ${i} slots=4 >> `pwd`/hostfile-dl
done
np=$(cat $hostfile |sort|uniq |wc -l)
np=$(($np*4))
nodename=$(cat $hostfile |sed -n "1p")
echo $nodename
dist_url=`echo $nodename | awk '{print $1}'`
echo mpirun -np $np --allow-run-as-root --hostfile hostfile-dl --bind-to none `pwd`/single_process.sh $dist_url
mpirun -np $np --allow-run-as-root --hostfile hostfile-dl --bind-to none `pwd`/single_process.sh $dist_url
PyTorch/Compute-Vision/Objection/Faster-rcnn/single_process.sh
0 → 100755
#!/bin/bash
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export OMP_NUM_THREADS=6
export NCCL_SOCKET_IFNAME=ib0
export HSA_USERPTR_FOR_PAGED_MEM=0

lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

module rm compiler/rocm/2.9
module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3

APP="python3 `pwd`/train.py --batch-size=2 -j 8 --epochs=26 --dist-url tcp://${1}:34568 --world-size=${comm_size} --rank=${comm_rank} --output-dir `pwd`/output"

case ${lrank} in
[0])
  export HIP_VISIBLE_DEVICES=0
  export UCX_NET_DEVICES=mlx5_0:1
  export UCX_IB_PCI_BW=mlx5_0:50Gbs
  echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[1])
  export HIP_VISIBLE_DEVICES=1
  export UCX_NET_DEVICES=mlx5_1:1
  export UCX_IB_PCI_BW=mlx5_1:50Gbs
  echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
  ;;
[2])
  export HIP_VISIBLE_DEVICES=2
  export UCX_NET_DEVICES=mlx5_2:1
  export UCX_IB_PCI_BW=mlx5_2:50Gbs
  echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
  ;;
[3])
  export HIP_VISIBLE_DEVICES=3
  export UCX_NET_DEVICES=mlx5_3:1
  export UCX_IB_PCI_BW=mlx5_3:50Gbs
  echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
  ;;
esac
PyTorch/Compute-Vision/Objection/Faster-rcnn/test.py
0 → 100644
r"""PyTorch Detection Training.

To run in a multi-gpu environment, use the distributed launcher::

    python -m torch.distributed.launch --nproc_per_node=$NGPU --use_env \
        train.py ... --world-size $NGPU

The default hyperparameters are tuned for training on 8 gpus and 2 images per gpu.
    --lr 0.02 --batch-size 2 --world-size 8
If you use a different number of gpus, the learning rate should be changed to 0.02/8*$NGPU.
"""
import datetime
import os
import os.path as osp
import time

import torch
import torch.utils.data
from torch import nn
import torchvision
import torchvision.models.detection
import torchvision.models.detection.mask_rcnn

# NOTE: get_oss_coco lives in a commented-out section of coco_utils.py in this
# commit, so this import fails unless that section is restored.
from coco_utils import get_coco, get_oss_coco, get_coco_kp

from group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups
from engine import train_one_epoch, evaluate

import utils
import transforms as T


def get_dataset(name, image_set, transform, data_path):
    data_root_dir = data_path
    train_dataset = {
        'coco_2014_train': {'img_dir': osp.join("objects365_raw_data/objects365/train"),
                            "ann_file": osp.join(data_root_dir, "obj_anno/objects365_train_20190423.json")},
    }
    val_dataset = {
        'coco_2014_train': {'img_dir': osp.join("objects365_raw_data/objects365/val"),
                            "ann_file": osp.join(data_root_dir, "obj_anno/objects365_val_20190423.json")},
    }
    paths = {
        "cocotrain": (train_dataset, get_oss_coco, 365),
        "cocoval": (val_dataset, get_oss_coco, 365),
        "coco_kp": (data_path, get_coco_kp, 2)
    }
    p, ds_fn, num_classes = paths[name + image_set]

    ds = ds_fn(p, image_set=image_set, transforms=transform)
    return ds, num_classes


def get_transform(train):
    transforms = []
    transforms.append(T.ToTensor())
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)


def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Data loading code
    print("Loading data")

    #dataset, num_classes = get_dataset(args.dataset, "train", get_transform(train=True), args.data_path)
    dataset_test, num_classes = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path)

    print("Creating data loaders")
    if args.distributed:
        #train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        #train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    '''
    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)
    '''

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1,
        sampler=test_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)

    print("Creating model")
    model = torchvision.models.detection.__dict__[args.model](num_classes=num_classes,
                                                              pretrained=args.pretrained)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])

    if args.test_only:
        print("evaluating...")
        evaluate(model, data_loader_test, device=device)
        return

    # NOTE: the training path below still references train_sampler and
    # data_loader, which are commented out above; as it stands this script
    # is only usable with --test-only.
    print("Start training")
    start_time = time.time()
    for epoch in range(args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            utils.save_on_master({
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args},
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument('--data-path', default='/data/code/vision_ori/dataset', help='dataset')
    parser.add_argument('--dataset', default='coco', help='dataset')
    parser.add_argument('--model', default='fasterrcnn_resnet50_fpn', help='model')
    parser.add_argument('--device', default='cuda', help='device')
    parser.add_argument('-b', '--batch-size', default=2, type=int,
                        help='images per gpu, the total batch size is $NGPU x batch_size')
    parser.add_argument('--epochs', default=13, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                        help='number of data loading workers (default: 4)')
    parser.add_argument('--lr', default=0.02, type=float,
                        help='initial learning rate, 0.02 is the default value for training '
                             'on 8 gpus and 2 images_per_gpu')
    parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                        help='momentum')
    parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                        metavar='W', help='weight decay (default: 1e-4)',
                        dest='weight_decay')
    parser.add_argument('--lr-step-size', default=8, type=int, help='decrease lr every step-size epochs')
    parser.add_argument('--lr-steps', default=[8, 11], nargs='+', type=int, help='decrease lr every step-size epochs')
    parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma')
    parser.add_argument('--print-freq', default=20, type=int, help='print frequency')
    parser.add_argument('--output-dir', default='.', help='path where to save')
    parser.add_argument('--resume', default='', help='resume from checkpoint')
    parser.add_argument('--aspect-ratio-group-factor', default=3, type=int)
    parser.add_argument("--test-only", dest="test_only",
                        help="Only test the model", action="store_true")
    parser.add_argument("--pretrained", dest="pretrained",
                        help="Use pre-trained models from the modelzoo", action="store_true")

    # distributed training parameters
    parser.add_argument('--world-size', default=1, type=int, help='number of distributed processes')
    parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')

    args = parser.parse_args()

    if args.output_dir:
        utils.mkdir(args.output_dir)

    main(args)
PyTorch/Compute-Vision/Objection/Faster-rcnn/train.py
0 → 100644
r"""PyTorch Detection Training.

To run in a multi-gpu environment, use the distributed launcher::

    python -m torch.distributed.launch --nproc_per_node=$NGPU --use_env \
        train.py ... --world-size $NGPU

The default hyperparameters are tuned for training on 8 gpus and 2 images per gpu.
    --lr 0.02 --batch-size 2 --world-size 8
If you use a different number of gpus, the learning rate should be changed to 0.02/8*$NGPU.
"""
import datetime
import os
import os.path as osp
import time

import torch
import torch.utils.data
from torch import nn
import torchvision
import torchvision.models.detection
import torchvision.models.detection.mask_rcnn

from coco_utils import get_coco, get_coco_kp

from group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups
from engine import train_one_epoch, evaluate

import utils
import transforms as T

#### aiss debug: add inter-op threads, default 16
#print (torch.get_num_interop_threads())
#torch.set_num_interop_threads(24)
#print (torch.get_num_interop_threads())


def get_dataset(name, image_set, transform, data_path):
    train_dataset = {
        'train': {'img_dir': osp.join(data_path, "train"),
                  "ann_file": osp.join(data_path, "objects365_Tiny_train.json")},
    }
    val_dataset = {
        'val': {'img_dir': osp.join(data_path, "val"),
                "ann_file": osp.join(data_path, "objects365_Tiny_val.json")},
    }
    paths = {
        "train": (train_dataset, get_coco, 66),
        "val": (val_dataset, get_coco, 66),
    }
    p, ds_fn, num_classes = paths[image_set]

    ds = ds_fn(p, image_set=image_set, transforms=transform)
    return ds, num_classes


def get_transform(train):
    transforms = []
    transforms.append(T.ToTensor())
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)


def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Data loading code
    print("Loading data")

    dataset, num_classes = get_dataset(args.dataset, "train", get_transform(train=True), args.data_path)
    # dataset_test, _ = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path)
    dataset_test, _ = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path)
    print("num classes : {}".format(num_classes))

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)
    '''
    train_batch_sampler = torch.utils.data.BatchSampler(
        train_sampler, args.batch_size, drop_last=True)
    '''

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1,
        sampler=test_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)

    print("Creating model")
    model = torchvision.models.detection.__dict__[args.model](num_classes=num_classes,
                                                              pretrained=args.pretrained)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        ###### aiss debug: resume from the epoch stored in the checkpoint ###
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    # aiss debug ###
    #for epoch in range(args.epochs):
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            utils.save_on_master({
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch},
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument('--data-path', default='/IO_data/data/object365/', help='dataset')
    parser.add_argument('--dataset', default='objects365', help='dataset')
    parser.add_argument('--model', default='fasterrcnn_resnet50_fpn', help='model')
    parser.add_argument('--device', default='cuda', help='device')
    parser.add_argument('-b', '--batch-size', default=2, type=int,
                        help='images per gpu, the total batch size is $NGPU x batch_size')
    parser.add_argument('--epochs', default=26, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                        help='number of data loading workers (default: 4)')
    parser.add_argument('--lr', default=0.02, type=float,
                        help='initial learning rate, 0.02 is the default value for training '
                             'on 8 gpus and 2 images_per_gpu')
    parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                        help='momentum')
    parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                        metavar='W', help='weight decay (default: 1e-4)',
                        dest='weight_decay')
    parser.add_argument('--lr-step-size', default=8, type=int, help='decrease lr every step-size epochs')
    parser.add_argument('--lr-steps', default=[16, 22], nargs='+', type=int, help='decrease lr every step-size epochs')
    parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma')
    parser.add_argument('--print-freq', default=20, type=int, help='print frequency')
    parser.add_argument('--output-dir', default='.', help='path where to save')
    parser.add_argument('--resume', default='', help='resume from checkpoint')
    #### aiss debug
    parser.add_argument('--start_epoch', default=0, type=int, help='start epoch')
    parser.add_argument('--aspect-ratio-group-factor', default=3, type=int)
    parser.add_argument("--test-only", dest="test_only",
                        help="Only test the model", action="store_true")
    parser.add_argument("--pretrained", dest="pretrained",
                        help="Use pre-trained models from the modelzoo",
                        action="store_true",
,
)
# distributed training parameters
parser
.
add_argument
(
'--world-size'
,
default
=
1
,
type
=
int
,
help
=
'number of distributed processes'
)
parser
.
add_argument
(
'--dist-url'
,
default
=
'env://'
,
help
=
'url used to set up distributed training'
)
parser
.
add_argument
(
'--rank'
,
default
=-
1
,
type
=
int
,
help
=
'node rank for distributed training'
)
args
=
parser
.
parse_args
()
if
args
.
output_dir
:
utils
.
mkdir
(
args
.
output_dir
)
main
(
args
)
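A note on the model lookup in `main()`: the `--model` string is resolved by name against `torchvision.models.detection.__dict__`, so any detection constructor exported by that module can be selected from the command line. A minimal sketch of the same lookup (the `num_classes=66` value mirrors `get_dataset` above; `pretrained=False` is used here purely for illustration):

```
import torchvision

# --model is looked up by name, so e.g. 'fasterrcnn_resnet50_fpn' or
# 'maskrcnn_resnet50_fpn' can both be passed without changing the script.
model_fn = torchvision.models.detection.__dict__['fasterrcnn_resnet50_fpn']
model = model_fn(num_classes=66, pretrained=False)  # 66 classes, as in get_dataset
```

The script's `--pretrained` flag forwards to the same constructor argument.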
PyTorch/Compute-Vision/Objection/Faster-rcnn/transforms.py
0 → 100644
View file @
0fc002df
import random
import torch

from torchvision.transforms import functional as F


def _flip_coco_person_keypoints(kps, width):
    flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
    flipped_data = kps[:, flip_inds]
    flipped_data[..., 0] = width - flipped_data[..., 0]
    # Maintain COCO convention that if visibility == 0, then x, y = 0
    inds = flipped_data[..., 2] == 0
    flipped_data[inds] = 0
    return flipped_data


class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for t in self.transforms:
            image, target = t(image, target)
        return image, target


class RandomHorizontalFlip(object):
    def __init__(self, prob):
        self.prob = prob

    def __call__(self, image, target):
        if random.random() < self.prob:
            height, width = image.shape[-2:]
            image = image.flip(-1)
            bbox = target["boxes"]
            bbox[:, [0, 2]] = width - bbox[:, [2, 0]]
            target["boxes"] = bbox
            if "masks" in target:
                target["masks"] = target["masks"].flip(-1)
            if "keypoints" in target:
                keypoints = target["keypoints"]
                keypoints = _flip_coco_person_keypoints(keypoints, width)
                target["keypoints"] = keypoints
        return image, target


class ToTensor(object):
    def __call__(self, image, target):
        image = F.to_tensor(image)
        return image, target
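Unlike the stock `torchvision.transforms`, these classes operate on `(image, target)` pairs, so boxes, masks, and keypoints stay consistent with the image when a flip fires. A minimal usage sketch under assumed inputs (the dummy image and box below are hypothetical; `ToTensor` must come first, as in `get_transform`, so the image has a tensor `.shape` and `.flip`):

```
import torch
from PIL import Image

import transforms as T  # the module above

tfm = T.Compose([T.ToTensor(), T.RandomHorizontalFlip(prob=0.5)])

img = Image.new("RGB", (640, 480))  # dummy 640x480 image
target = {
    "boxes": torch.tensor([[10.0, 20.0, 110.0, 220.0]]),  # x1, y1, x2, y2
    "labels": torch.tensor([1]),
}
img, target = tfm(img, target)
# When the flip fires, boxes come back mirrored: x1' = W - x2, x2' = W - x1.
```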