Commit 0fc002df authored by huchen

init the dlexamples new

parent 0e04b692
# Pretraining code
Pretraining in RoBERTa and GPT modes is currently supported. Please run under tensorflow 1.14 or 1.15.
## Usage
```
python data_utils.py # generate the tfrecords
python pretraining.py # start the pretraining run
```
Please read `data_utils.py` and `pretraining.py` and modify the corresponding configuration and parameters to fit your own corpus and hardware.
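For example, a minimal sketch of generating a RoBERTa-style tfrecord by hand (the vocab path and the toy corpus are placeholders to adapt):
```
# Minimal sketch; '/path/to/vocab.txt' and the toy corpus are placeholders.
from bert4keras.tokenizers import Tokenizer
import jieba_fast as jieba
from data_utils import TrainingDatasetRoBERTa

tokenizer = Tokenizer('/path/to/vocab.txt', do_lower_case=True)
TD = TrainingDatasetRoBERTa(tokenizer, jieba.lcut, sequence_length=512)
TD.process(
    corpus=[['今天天气不错。', '适合出去玩。']],  # iterable of paragraphs (lists of sentences)
    record_name='corpus.tfrecord',
)
```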
## Background
keras is a friendly framework. We usually use it on top of the tf backend, and there is also tf.keras, whose interface is essentially identical to that of keras 2.3.x.
This consistency means that using keras is almost the same as using tf: every advantage of tf is also an advantage of keras, while keras additionally has advantages that tf lacks (such as ease of use).
The author therefore implemented keras-based pretraining scripts that follow the original training procedure. Thanks to the consistency described above, this keras version can be migrated to multi-GPU training with very little effort, and just as easily to TPU training.
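A minimal sketch of that migration (here `build_model` is a placeholder for any tf.keras model-building code; the TPU variant appears in `pretraining.py` below):
```
import tensorflow as tf

# Multi-GPU: build and compile the model inside the strategy scope;
# the rest of the training code stays unchanged.
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    model = build_model()  # placeholder for your tf.keras model builder
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
```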
#! -*- coding: utf-8 -*-
# Pretraining corpus construction
import os
os.environ['TF_KERAS'] = '1' # tf.keras is required
import numpy as np
import tensorflow as tf
from bert4keras.snippets import parallel_apply
from bert4keras.backend import K
class TrainingDataset(object):
"""Pretraining dataset generator
"""
def __init__(self, tokenizer, sequence_length=512):
"""Args:
tokenizer must be a tokenizer class shipped with bert4keras.
"""
self.tokenizer = tokenizer
self.sequence_length = sequence_length
self.token_pad_id = tokenizer._token_pad_id
self.token_cls_id = tokenizer._token_start_id
self.token_sep_id = tokenizer._token_end_id
self.token_mask_id = tokenizer._token_mask_id
self.vocab_size = tokenizer._vocab_size
def padding(self, sequence, padding_value=None):
"""Truncate and right-pad a single sequence to sequence_length
"""
if padding_value is None:
padding_value = self.token_pad_id
sequence = sequence[:self.sequence_length]
padding_length = self.sequence_length - len(sequence)
return sequence + [padding_value] * padding_length
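# e.g. with sequence_length = 8 and token_pad_id = 0:
# padding([5, 6, 7]) -> [5, 6, 7, 0, 0, 0, 0, 0]
# padding(list(range(10))) -> [0, 1, 2, 3, 4, 5, 6, 7] (truncated, no padding)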
def sentence_process(self, text):
"""Process a single text; returns the processed instance
"""
raise NotImplementedError
def paragraph_process(self, texts, starts, ends, paddings):
"""Process a single paragraph (a list of texts).
Notes: texts is a list of single sentences; starts are each instance's
start ids; ends are each instance's end ids; paddings are each
instance's padding ids.
Method: keep appending sentences until the length is as close as
possible to sequence_length, then pad.
"""
instances, instance = [], [[start] for start in starts]
for text in texts:
# process a single sentence
sub_instance = self.sentence_process(text)
sub_instance = [i[:self.sequence_length - 2] for i in sub_instance]
new_length = len(instance[0]) + len(sub_instance[0])
# if the length is about to overflow
if new_length > self.sequence_length - 1:
# append the end token and pad
complete_instance = []
for item, end, pad in zip(instance, ends, paddings):
item.append(end)
item = self.padding(item, pad)
complete_instance.append(item)
# store the result and start a new sample
instances.append(complete_instance)
instance = [[start] for start in starts]
# keep filling the current sample
for item, sub_item in zip(instance, sub_instance):
item.extend(sub_item)
# append the end token and pad
complete_instance = []
for item, end, pad in zip(instance, ends, paddings):
item.append(end)
item = self.padding(item, pad)
complete_instance.append(item)
# store the final instance
instances.append(complete_instance)
return instances
def tfrecord_serialize(self, instances, instance_keys):
"""Serialize instances to tfrecord strings, ready to be written to file
"""
def create_feature(x):
return tf.train.Feature(int64_list=tf.train.Int64List(value=x))
serialized_instances = []
for instance in instances:
features = {
k: create_feature(v)
for k, v in zip(instance_keys, instance)
}
tf_features = tf.train.Features(feature=features)
tf_example = tf.train.Example(features=tf_features)
serialized_instance = tf_example.SerializeToString()
serialized_instances.append(serialized_instance)
return serialized_instances
def process(self, corpus, record_name, workers=8, max_queue_size=2000):
"""Process the input corpus and write it out in tfrecord format (record_name).
Multiprocessing is built in; if you have many CPU cores, increase workers and max_queue_size.
"""
writer = tf.io.TFRecordWriter(record_name)
globals()['count'] = 0
def write_to_tfrecord(serialized_instances):
globals()['count'] += len(serialized_instances)
for serialized_instance in serialized_instances:
writer.write(serialized_instance)
def paragraph_process(texts):
instances = self.paragraph_process(texts)
serialized_instances = self.tfrecord_serialize(instances)
return serialized_instances
parallel_apply(
func=paragraph_process,
iterable=corpus,
workers=workers,
max_queue_size=max_queue_size,
callback=write_to_tfrecord,
)
writer.close()
print('write %s examples into %s' % (count, record_name))
@staticmethod
def load_tfrecord(record_names, batch_size, parse_function):
"""Load a corpus that has been converted to tfrecord format
"""
if not isinstance(record_names, list):
record_names = [record_names]
dataset = tf.data.TFRecordDataset(record_names) # load
dataset = dataset.map(parse_function) # parse
dataset = dataset.repeat() # repeat indefinitely
dataset = dataset.shuffle(batch_size * 1000) # shuffle
dataset = dataset.batch(batch_size) # batch
return dataset
class TrainingDatasetRoBERTa(TrainingDataset):
"""Pretraining dataset generator (RoBERTa mode)
"""
def __init__(
self, tokenizer, word_segment, mask_rate=0.15, sequence_length=512
):
"""Args:
tokenizer must be a tokenizer class shipped with bert4keras;
word_segment is an arbitrary word segmentation function.
"""
super(TrainingDatasetRoBERTa, self).__init__(tokenizer, sequence_length)
self.word_segment = word_segment
self.mask_rate = mask_rate
def token_process(self, token_id):
"""Replace with [MASK] with probability 80%, keep unchanged with
probability 10%, and replace with a random token with probability 10%.
"""
rand = np.random.random()
if rand <= 0.8:
return self.token_mask_id
elif rand <= 0.9:
return token_id
else:
return np.random.randint(0, self.vocab_size)
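# e.g. rand = 0.50 -> [MASK]; rand = 0.85 -> keep the original id;
# rand = 0.95 -> a random id from the vocabulary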
def sentence_process(self, text):
"""Process a single text.
Pipeline: segment into words, convert to ids, then build a whole-word
mask sequence at mask_rate marking which tokens are to be masked
(mask_ids stores 0 for "keep as is" and token_process(id) + 1 otherwise,
so load_tfrecord can recover the masked input as mask_ids - 1).
"""
words = self.word_segment(text)
rands = np.random.random(len(words))
token_ids, mask_ids = [], []
for rand, word in zip(rands, words):
word_tokens = self.tokenizer.tokenize(text=word)[1:-1]
word_token_ids = self.tokenizer.tokens_to_ids(word_tokens)
token_ids.extend(word_token_ids)
if rand < self.mask_rate:
word_mask_ids = [
self.token_process(i) + 1 for i in word_token_ids
]
else:
word_mask_ids = [0] * len(word_tokens)
mask_ids.extend(word_mask_ids)
return [token_ids, mask_ids]
def paragraph_process(self, texts):
"""Supply starts, ends and paddings to the parent method
"""
starts = [self.token_cls_id, 0]
ends = [self.token_sep_id, 0]
paddings = [self.token_pad_id, 0]
return super(TrainingDatasetRoBERTa,
self).paragraph_process(texts, starts, ends, paddings)
def tfrecord_serialize(self, instances):
"""Supply instance_keys to the parent method
"""
instance_keys = ['token_ids', 'mask_ids']
return super(TrainingDatasetRoBERTa,
self).tfrecord_serialize(instances, instance_keys)
@staticmethod
def load_tfrecord(record_names, sequence_length, batch_size):
"""Supply parse_function to the parent method
"""
def parse_function(serialized):
features = {
'token_ids': tf.io.FixedLenFeature([sequence_length], tf.int64),
'mask_ids': tf.io.FixedLenFeature([sequence_length], tf.int64),
}
features = tf.io.parse_single_example(serialized, features)
token_ids = features['token_ids']
mask_ids = features['mask_ids']
segment_ids = K.zeros_like(token_ids, dtype='int64')
is_masked = K.not_equal(mask_ids, 0)
masked_token_ids = K.switch(is_masked, mask_ids - 1, token_ids)
x = {
'Input-Token': masked_token_ids,
'Input-Segment': segment_ids,
'token_ids': token_ids,
'is_masked': K.cast(is_masked, K.floatx()),
}
y = {
'mlm_loss': K.zeros([1]),
'mlm_acc': K.zeros([1]),
}
return x, y
return TrainingDataset.load_tfrecord(
record_names, batch_size, parse_function
)
class TrainingDatasetGPT(TrainingDataset):
"""Pretraining dataset generator (GPT mode, unidirectional language model)
"""
def sentence_process(self, text):
"""Process a single text.
Pipeline: tokenize, then convert to ids.
"""
tokens = self.tokenizer.tokenize(text=text)[1:-1]
token_ids = self.tokenizer.tokens_to_ids(tokens)
return [token_ids]
def paragraph_process(self, texts):
"""Supply starts, ends and paddings to the parent method
"""
starts = [self.token_cls_id]
ends = [self.token_sep_id]
paddings = [self.token_pad_id]
return super(TrainingDatasetGPT,
self).paragraph_process(texts, starts, ends, paddings)
def tfrecord_serialize(self, instances):
"""Supply instance_keys to the parent method
"""
instance_keys = ['token_ids']
return super(TrainingDatasetGPT,
self).tfrecord_serialize(instances, instance_keys)
@staticmethod
def load_tfrecord(record_names, sequence_length, batch_size):
"""Supply parse_function to the parent method
"""
def parse_function(serialized):
features = {
'token_ids': tf.io.FixedLenFeature([sequence_length], tf.int64),
}
features = tf.io.parse_single_example(serialized, features)
token_ids = features['token_ids']
segment_ids = K.zeros_like(token_ids, dtype='int64')
x = {
'Input-Token': token_ids,
'Input-Segment': segment_ids,
}
y = {
'lm_loss': K.zeros([1]),
'lm_acc': K.zeros([1]),
}
return x, y
return TrainingDataset.load_tfrecord(
record_names, batch_size, parse_function
)
class TrainingDatasetUniLM(TrainingDatasetGPT):
"""Pretraining dataset generator (UniLM mode, seq2seq model)
"""
@staticmethod
def load_tfrecord(record_names, sequence_length, batch_size, token_sep_id):
"""Supply parse_function to the parent method
"""
def parse_function(serialized):
features = {
'token_ids': tf.io.FixedLenFeature([sequence_length], tf.int64),
}
features = tf.io.parse_single_example(serialized, features)
token_ids = features['token_ids']
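# The lines below randomly split each sequence for UniLM seq2seq training:
# pick a split point, build segment_ids as 0...0 1...1 (the cumsum of a
# one-hot vector), and splice a [SEP] in at the split, dropping the last
# token so the sequence length stays unchanged.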
segment = K.random_uniform(
shape=[1], minval=1, maxval=sequence_length - 1, dtype='int64'
)[0]
segment_ids = K.one_hot(segment + 1, sequence_length)
segment_ids = K.cast(K.cumsum(segment_ids), 'int64')
token_ids_1 = token_ids[:segment]
token_ids_2 = K.zeros([1], dtype='int64') + token_sep_id
token_ids_3 = token_ids[segment:-1]
token_ids = K.concatenate([token_ids_1, token_ids_2, token_ids_3])
x = {
'Input-Token': token_ids,
'Input-Segment': segment_ids,
}
y = {
'unilm_loss': K.zeros([1]),
'unilm_acc': K.zeros([1]),
}
return x, y
return TrainingDataset.load_tfrecord(
record_names, batch_size, parse_function
)
if __name__ == '__main__':
from bert4keras.tokenizers import Tokenizer
import json, glob, re
from tqdm import tqdm
model = 'roberta'
sequence_length = 512
workers = 40
max_queue_size = 4000
dict_path = '/home/spaces_ac_cn/chinese_L-12_H-768_A-12/vocab.txt'
tokenizer = Tokenizer(dict_path, do_lower_case=True)
def some_texts():
filenames = glob.glob('/home/spaces_ac_cn/corpus/*/*/*')
np.random.shuffle(filenames)
count, texts = 0, []
for filename in filenames:
with open(filename) as f:
for l in f:
l = json.loads(l)['text'].strip()
texts.extend(re.findall(u'.*?[\n。]+', l))
count += 1
if count == 10: # group 10 articles together before processing
yield texts
count, texts = 0, []
if texts:
yield texts
assert model in ['roberta', 'gpt', 'unilm'] # make sure the model type is supported
if model == 'roberta':
import jieba_fast as jieba
jieba.initialize()
def word_segment(text):
return jieba.lcut(text)
TD = TrainingDatasetRoBERTa(
tokenizer, word_segment, sequence_length=sequence_length
)
for i in range(10): # repeat the data 10 times
TD.process(
corpus=tqdm(some_texts()),
record_name='../corpus_tfrecord/corpus.%s.tfrecord' % i,
workers=workers,
max_queue_size=max_queue_size,
)
elif model == 'gpt':
TD = TrainingDatasetGPT(tokenizer, sequence_length=sequence_length)
TD.process(
corpus=tqdm(some_texts()),
record_name='../corpus_tfrecord/corpus.tfrecord',
workers=workers,
max_queue_size=max_queue_size,
)
elif model == 'unilm':
TD = TrainingDatasetUniLM(tokenizer, sequence_length=sequence_length)
TD.process(
corpus=tqdm(some_texts()),
record_name='../corpus_tfrecord/corpus.tfrecord',
workers=workers,
max_queue_size=max_queue_size,
)
#! -*- coding: utf-8 -*-
# Pretraining script, multi-GPU / TPU version
import os, re
os.environ['TF_KERAS'] = '1' # tf.keras is required
import tensorflow as tf
from data_utils import *
from bert4keras.models import build_transformer_model
from bert4keras.backend import keras, K
from bert4keras.optimizers import Adam
from bert4keras.optimizers import extend_with_weight_decay
from bert4keras.optimizers import extend_with_layer_adaptation
from bert4keras.optimizers import extend_with_piecewise_linear_lr
from bert4keras.optimizers import extend_with_gradient_accumulation
from keras.layers import Input, Lambda
from keras.models import Model
from keras.callbacks import Callback, CSVLogger
model = 'roberta'
# Corpus path and model save path.
# For TPU training the corpus must be stored on Google Cloud Storage and
# the path must start with gs://; for GPU training use an ordinary path.
model_saved_path = 'gs://xxxx/bert4keras/saved_model/bert_model.ckpt'
corpus_paths = [
'gs://xxxx/bert4keras/corpus/corpus.%s.tfrecord' % i for i in range(10)
]
# Other settings
sequence_length = 512
batch_size = 4096
config_path = '/home/spaces_ac_cn/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/home/spaces_ac_cn/chinese_L-12_H-768_A-12/bert_model.ckpt' # set to None to train from scratch
learning_rate = 0.00176
weight_decay_rate = 0.01
num_warmup_steps = 3125
num_train_steps = 125000
steps_per_epoch = 10000
grad_accum_steps = 16 # > 1 enables gradient accumulation
epochs = num_train_steps * grad_accum_steps // steps_per_epoch
exclude_from_weight_decay = ['Norm', 'bias']
exclude_from_layer_adaptation = ['Norm', 'bias']
tpu_address = 'grpc://xxx.xxx.xxx.xxx:8470' # set to None for multi-GPU runs
which_optimizer = 'lamb' # 'adam' or 'lamb'; both come with weight decay
lr_schedule = {
num_warmup_steps * grad_accum_steps: 1.0,
num_train_steps * grad_accum_steps: 0.0,
}
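# lr_schedule maps global step -> LR multiplier, interpolated piecewise-
# linearly: warm up from 0 to 1.0 over the warmup steps, then decay
# linearly back to 0 by the final step (both thresholds scaled by
# grad_accum_steps because each accumulation sub-step counts as a step).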
floatx = K.floatx()
# Load the dataset and build the data tensors
if model == 'roberta':
dataset = TrainingDatasetRoBERTa.load_tfrecord(
record_names=corpus_paths,
sequence_length=sequence_length,
batch_size=batch_size // grad_accum_steps,
)
elif model == 'gpt':
dataset = TrainingDatasetGPT.load_tfrecord(
record_names=corpus_paths,
sequence_length=sequence_length,
batch_size=batch_size // grad_accum_steps,
)
elif model == 'unilm':
dataset = TrainingDatasetUniLM.load_tfrecord(
record_names=corpus_paths,
sequence_length=sequence_length,
batch_size=batch_size // grad_accum_steps,
token_sep_id=3, # the id of [SEP] must be specified manually here
)
def build_transformer_model_with_mlm():
"""BERT model with an MLM head
"""
bert = build_transformer_model(
config_path, with_mlm='linear', return_keras_model=False
)
proba = bert.model.output
# auxiliary inputs
token_ids = Input(shape=(None,), dtype='int64', name='token_ids') # target ids
is_masked = Input(shape=(None,), dtype=floatx, name='is_masked') # mask flags
def mlm_loss(inputs):
"""Loss computation; must be wrapped as a layer
"""
y_true, y_pred, mask = inputs
loss = K.sparse_categorical_crossentropy(
y_true, y_pred, from_logits=True
)
loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
return loss
def mlm_acc(inputs):
"""Accuracy computation; must be wrapped as a layer
"""
y_true, y_pred, mask = inputs
y_true = K.cast(y_true, floatx)
acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
return acc
mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked])
mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked])
train_model = Model(
bert.model.inputs + [token_ids, is_masked], [mlm_loss, mlm_acc]
)
loss = {
'mlm_loss': lambda y_true, y_pred: y_pred,
'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
}
return bert, train_model, loss
def build_transformer_model_with_lm():
"""BERT model with an LM head
"""
bert = build_transformer_model(
config_path,
with_mlm='linear',
application='lm',
return_keras_model=False
)
token_ids = bert.model.inputs[0]
proba = bert.model.output
def lm_loss(inputs, mask=None):
"""Loss computation; must be wrapped as a layer
"""
y_true, y_pred = inputs
y_true, y_pred = y_true[:, 1:], y_pred[:, :-1]
if mask is None:
mask = 1.0
else:
mask = K.cast(mask[1][:, 1:], floatx)
loss = K.sparse_categorical_crossentropy(
y_true, y_pred, from_logits=True
)
loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
return loss
def lm_acc(inputs, mask=None):
"""Accuracy computation; must be wrapped as a layer
"""
y_true, y_pred = inputs
y_true, y_pred = K.cast(y_true[:, 1:], floatx), y_pred[:, :-1]
if mask is None:
mask = 1.0
else:
mask = K.cast(mask[1][:, 1:], floatx)
acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
return acc
lm_loss = Lambda(lm_loss, name='lm_loss')([token_ids, proba])
lm_acc = Lambda(lm_acc, name='lm_acc')([token_ids, proba])
train_model = Model(bert.model.inputs, [lm_loss, lm_acc])
loss = {
'lm_loss': lambda y_true, y_pred: y_pred,
'lm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
}
return bert, train_model, loss
def build_transformer_model_with_unilm():
"""BERT model with a UniLM head
"""
bert = build_transformer_model(
config_path,
with_mlm='linear',
application='unilm',
return_keras_model=False
)
token_ids = bert.model.inputs[0]
segment_ids = bert.model.inputs[1]
proba = bert.model.output
def unilm_loss(inputs, mask=None):
"""Loss computation; must be wrapped as a layer
"""
y_true, y_pred, segment_ids = inputs
y_true, y_pred = y_true[:, 1:], y_pred[:, :-1]
if mask is None:
mask = 1.0
else:
mask = K.cast(mask[1][:, 1:], floatx)
segment_ids = K.cast(segment_ids, floatx)
mask = mask * segment_ids[:, 1:]
loss = K.sparse_categorical_crossentropy(
y_true, y_pred, from_logits=True
)
loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon())
return loss
def unilm_acc(inputs, mask=None):
"""Accuracy computation; must be wrapped as a layer
"""
y_true, y_pred, segment_ids = inputs
y_true, y_pred = K.cast(y_true[:, 1:], floatx), y_pred[:, :-1]
if mask is None:
mask = 1.0
else:
mask = K.cast(mask[1][:, 1:], floatx)
segment_ids = K.cast(segment_ids, floatx)
mask = mask * segment_ids[:, 1:]
acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
return acc
token_proba_segment = [token_ids, proba, segment_ids]
unilm_loss = Lambda(unilm_loss, name='unilm_loss')(token_proba_segment)
unilm_acc = Lambda(unilm_acc, name='unilm_acc')(token_proba_segment)
train_model = Model(bert.model.inputs, [unilm_loss, unilm_acc])
loss = {
'unilm_loss': lambda y_true, y_pred: y_pred,
'unilm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred),
}
return bert, train_model, loss
def build_transformer_model_for_pretraining():
"""Build the training model; works on both TPU and GPU.
Note: stick to standard keras layer idioms throughout; the more
free-form "cut-and-graft" style of model surgery may fail to train on
TPU. Also be aware that TPUs do not support every tensorflow op, in
particular dynamic (variable-length) ops, so write such computations
with extra care.
"""
if model == 'roberta':
bert, train_model, loss = build_transformer_model_with_mlm()
elif model == 'gpt':
bert, train_model, loss = build_transformer_model_with_lm()
elif model == 'unilm':
bert, train_model, loss = build_transformer_model_with_unilm()
# optimizer
optimizer = extend_with_weight_decay(Adam)
if which_optimizer == 'lamb':
optimizer = extend_with_layer_adaptation(optimizer)
optimizer = extend_with_piecewise_linear_lr(optimizer)
optimizer_params = {
'learning_rate': learning_rate,
'lr_schedule': lr_schedule,
'weight_decay_rate': weight_decay_rate,
'exclude_from_weight_decay': exclude_from_weight_decay,
'exclude_from_layer_adaptation': exclude_from_layer_adaptation,
'bias_correction': False,
}
if grad_accum_steps > 1:
optimizer = extend_with_gradient_accumulation(optimizer)
optimizer_params['grad_accum_steps'] = grad_accum_steps
optimizer = optimizer(**optimizer_params)
# compile the model
train_model.compile(loss=loss, optimizer=optimizer)
# If a checkpoint is given, load it. Note: it must be loaded at exactly this point to avoid errors.
if checkpoint_path is not None:
bert.load_weights_from_checkpoint(checkpoint_path)
return train_model
if tpu_address is None:
# single-node multi-GPU mode (multi-node is similar but needs matching hardware/software; see https://tf.wiki)
strategy = tf.distribute.MirroredStrategy()
else:
# TPU mode
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
tpu=tpu_address
)
tf.config.experimental_connect_to_host(resolver.master())
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.experimental.TPUStrategy(resolver)
with strategy.scope():
train_model = build_transformer_model_for_pretraining()
train_model.summary()
class ModelCheckpoint(keras.callbacks.Callback):
"""Automatically save the latest model
"""
def on_epoch_end(self, epoch, logs=None):
self.model.save_weights(model_saved_path, overwrite=True)
# save the model
checkpoint = ModelCheckpoint()
# log to CSV
csv_logger = keras.callbacks.CSVLogger('training.log')
# train the model
train_model.fit(
dataset,
steps_per_epoch=steps_per_epoch,
epochs=epochs,
callbacks=[checkpoint, csv_logger],
)
#! -*- coding: utf-8 -*-
from setuptools import setup, find_packages
setup(
name='bert4keras',
version='0.10.5',
description='an elegant bert4keras',
long_description='bert4keras: https://github.com/bojone/bert4keras',
license='Apache License 2.0',
url='https://github.com/bojone/bert4keras',
author='bojone',
author_email='bojone@spaces.ac.cn',
install_requires=['keras<=2.3.1'],
packages=find_packages()
)
# Introduction
This test case is used for ResNet50 accuracy validation. The single-card run commands are as follows.
# Run examples
## fp32
python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/
## fp16
python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --amp --opt-level O1 --loss-scale=dynamic --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/
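The `--amp`, `--opt-level` and `--loss-scale` flags enable NVIDIA apex automatic mixed precision. A minimal, self-contained sketch of the pattern `main_acc.py` uses (the tiny linear model here is purely illustrative):
```
import torch
import torch.nn as nn
from apex import amp

model = nn.Linear(8, 2).cuda()  # stand-in for the real network
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# Patch model and optimizer for mixed precision.
model, optimizer = amp.initialize(model, optimizer,
                                  opt_level='O1', loss_scale='dynamic')
loss = model(torch.randn(4, 8).cuda()).sum()
# Scale the loss before backward so fp16 gradients do not underflow.
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()
```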
# Reference
[https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet)
import argparse
import os
import random
import shutil
import time
import warnings
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
use_cuda = torch.cuda.is_available()
if use_cuda:
try:
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
from apex import amp,optimizers
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to run this example."
)
model_names = sorted(name for name in models.__dict__
if name.islower() and not name.startswith("__")
and callable(models.__dict__[name]))
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('data', metavar='DIR',
help='path to dataset')
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
choices=model_names,
help='model architecture: ' +
' | '.join(model_names) +
' (default: resnet18)')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=90, type=int, metavar='N',
help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
metavar='N',
help='mini-batch size (default: 256), this is the total '
'batch size of all GPUs on the current node when '
'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
metavar='W', help='weight decay (default: 1e-4)',
dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=10, type=int,
metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
help='Use multi-processing distributed training to launch '
'N processes per node, which has N GPUs. This is the '
'fastest way to use PyTorch for either single node or '
'multi node data parallel training')
parser.add_argument("--save-path", default='/root', type=str, help='path to save checkpoint')
#aiss add
parser.add_argument(
"--amp",
action="store_true",
help="Run model AMP (automatic mixed precision) mode.",
)
parser.add_argument('--opt-level', type=str)
parser.add_argument('--loss-scale', type=str, default=None)
parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
best_acc1 = 0
def main():
args = parser.parse_args()
if args.seed is not None:
random.seed(args.seed)
torch.manual_seed(args.seed)
cudnn.deterministic = True
warnings.warn('You have chosen to seed training. '
'This will turn on the CUDNN deterministic setting, '
'which can slow down your training considerably! '
'You may see unexpected behavior when restarting '
'from checkpoints.')
if args.gpu is not None:
warnings.warn('You have chosen a specific GPU. This will completely '
'disable data parallelism.')
if args.dist_url == "env://" and args.world_size == -1:
args.world_size = int(os.environ["WORLD_SIZE"])
args.distributed = args.world_size > 1 or args.multiprocessing_distributed
ngpus_per_node = torch.cuda.device_count()
if args.multiprocessing_distributed:
# Since we have ngpus_per_node processes per node, the total world_size
# needs to be adjusted accordingly
args.world_size = ngpus_per_node * args.world_size
# Use torch.multiprocessing.spawn to launch distributed processes: the
# main_worker process function
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
else:
# Simply call main_worker function
main_worker(args.gpu, ngpus_per_node, args)
local_rank=0
def main_worker(gpu, ngpus_per_node, args):
global best_acc1
args.gpu = gpu
if args.gpu is not None:
print("Use GPU: {} for training".format(args.gpu))
if args.distributed:
if args.dist_url == "env://" and args.rank == -1:
args.rank = int(os.environ["RANK"])
if args.multiprocessing_distributed:
# For multiprocessing distributed training, rank needs to be the
# global rank among all the processes
args.rank = args.rank * ngpus_per_node + gpu
#args.gpu = args.rank % torch.cuda.device_count()
local_rank = args.rank % torch.cuda.device_count()
print("device count is ", torch.cuda.device_count())
torch.cuda.set_device(local_rank)
dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
world_size=args.world_size, rank=args.rank)
# create model
if args.pretrained:
print("=> using pre-trained model '{}'".format(args.arch))
model = models.__dict__[args.arch](pretrained=True)
else:
print("=> creating model '{}'".format(args.arch))
model = models.__dict__[args.arch]()
# define loss function (criterion) and optimizer
#criterion = nn.CrossEntropyLoss().cuda(args.gpu)
criterion = nn.CrossEntropyLoss().cuda()
model.cuda()
optimizer = torch.optim.SGD(model.parameters(), args.lr,
momentum=args.momentum,
weight_decay=args.weight_decay)
#add for amp
if args.amp:
model, optimizer = amp.initialize(model, optimizer, # fix: was "Model", which discarded the amp-patched model
opt_level=args.opt_level,
keep_batchnorm_fp32=args.keep_batchnorm_fp32,
loss_scale=args.loss_scale
)
if args.distributed:
# For multiprocessing distributed, DistributedDataParallel constructor
# should always set the single device scope, otherwise,
# DistributedDataParallel will use all available devices.
if args.gpu is not None:
torch.cuda.set_device(args.gpu)
model.cuda(args.gpu)
# When using a single GPU per process and per
# DistributedDataParallel, we need to divide the batch size
# ourselves based on the total number of GPUs we have
args.batch_size = int(args.batch_size / ngpus_per_node)
args.workers = int(args.workers / ngpus_per_node)
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
else:
#model.cuda()
# DistributedDataParallel will divide and allocate batch_size to all
# available GPUs if device_ids are not set
if args.amp:
model = DDP(model)
print("input R branch!!!!!!!!!!!!!!!!!!!")
else:
model = torch.nn.parallel.DistributedDataParallel(model)
elif args.gpu is not None:
torch.cuda.set_device(args.gpu)
model = model.cuda(args.gpu)
else:
# DataParallel will divide and allocate batch_size to all available GPUs
if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
model.features = torch.nn.DataParallel(model.features)
model.cuda()
else:
model = torch.nn.DataParallel(model).cuda()
# optionally resume from a checkpoint
if args.resume:
if os.path.isfile(args.resume):
print("=> loading checkpoint '{}'".format(args.resume))
if args.gpu is None:
checkpoint = torch.load(args.resume)
else:
# Map model to be loaded to specified single gpu.
loc = 'cuda:{}'.format(args.gpu)
checkpoint = torch.load(args.resume, map_location=loc)
args.start_epoch = checkpoint['epoch']
best_acc1 = checkpoint['best_acc1']
if args.gpu is not None:
# best_acc1 may be from a checkpoint from a different GPU
best_acc1 = best_acc1.to(args.gpu)
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
print("=> loaded checkpoint '{}' (epoch {})"
.format(args.resume, checkpoint['epoch']))
else:
print("=> no checkpoint found at '{}'".format(args.resume))
cudnn.benchmark = True
# Data loading code
traindir = os.path.join(args.data, 'train')
valdir = os.path.join(args.data, 'val')
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
train_dataset = datasets.ImageFolder(
traindir,
transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize,
]))
if args.distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
else:
train_sampler = None
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
num_workers=args.workers, pin_memory=True, sampler=train_sampler)
val_loader = torch.utils.data.DataLoader(
datasets.ImageFolder(valdir, transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
])),
batch_size=args.batch_size, shuffle=False,
num_workers=args.workers, pin_memory=True)
if args.evaluate:
validate(val_loader, model, criterion, args)
return
for epoch in range(args.start_epoch, args.epochs):
if args.distributed:
train_sampler.set_epoch(epoch)
adjust_learning_rate(optimizer, epoch, args)
# train for one epoch
train(train_loader, model, criterion, optimizer, epoch, args)
# evaluate on validation set
acc1 = validate(val_loader, model, criterion, args)
# remember best acc@1 and save checkpoint
is_best = acc1 > best_acc1
best_acc1 = max(acc1, best_acc1)
os.makedirs(args.save_path, exist_ok=True) # all ranks may reach here; tolerate concurrent creation
if not args.multiprocessing_distributed or (args.multiprocessing_distributed
and args.rank % ngpus_per_node == 0):
save_checkpoint({
'epoch': epoch + 1,
'arch': args.arch,
'state_dict': model.state_dict(),
'best_acc1': best_acc1,
'optimizer' : optimizer.state_dict(),
}, epoch, is_best, args.rank, args.save_path)
def train(train_loader, model, criterion, optimizer, epoch, args):
batch_time = AverageMeter('Time', ':6.3f')
data_time = AverageMeter('Data', ':6.3f')
losses = AverageMeter('Loss', ':.4e')
top1 = AverageMeter('Acc@1', ':6.2f')
top5 = AverageMeter('Acc@5', ':6.2f')
progress = ProgressMeter(
len(train_loader),
[batch_time, data_time, losses, top1, top5],
prefix="Epoch: [{}]".format(epoch))
# switch to train mode
model.train()
end = time.time()
for i, (images, target) in enumerate(train_loader):
# measure data loading time
data_time.update(time.time() - end)
if args.gpu is not None:
images = images.cuda(args.gpu, non_blocking=True)
target = target.cuda(args.gpu, non_blocking=True)
### aiss add: move tensors to the default GPU when args.gpu is not set
if use_cuda:
images = images.cuda()
target = target.cuda()
# compute output
output = model(images)
loss = criterion(output, target)
# measure accuracy and record loss
acc1, acc5 = accuracy(output, target, topk=(1, 5))
losses.update(loss.item(), images.size(0))
top1.update(acc1[0], images.size(0))
top5.update(acc5[0], images.size(0))
# compute gradient and do SGD step
optimizer.zero_grad()
#loss.backward()
##aiss add amp
if args.amp:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
optimizer.step()
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if i % args.print_freq == 0:
progress.display(i)
def validate(val_loader, model, criterion, args):
batch_time = AverageMeter('Time', ':6.3f')
losses = AverageMeter('Loss', ':.4e')
top1 = AverageMeter('Acc@1', ':6.2f')
top5 = AverageMeter('Acc@5', ':6.2f')
progress = ProgressMeter(
len(val_loader),
[batch_time, losses, top1, top5],
prefix='Test: ')
# switch to evaluate mode
model.eval()
with torch.no_grad():
end = time.time()
for i, (images, target) in enumerate(val_loader):
if args.gpu is not None:
images = images.cuda(args.gpu, non_blocking=True)
target = target.cuda(args.gpu, non_blocking=True)
if use_cuda:
images = images.cuda()
target = target.cuda()
# compute output
output = model(images)
loss = criterion(output, target)
# measure accuracy and record loss
acc1, acc5 = accuracy(output, target, topk=(1, 5))
losses.update(loss.item(), images.size(0))
top1.update(acc1[0], images.size(0))
top5.update(acc5[0], images.size(0))
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if i % args.print_freq == 0:
progress.display(i)
# TODO: this should also be done with the ProgressMeter
print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
.format(top1=top1, top5=top5))
return top1.avg
def save_checkpoint(state, epoch, is_best, rank, filename):
rank_path = filename + '/' + str(rank)
if not os.path.isdir(rank_path):
os.mkdir(rank_path)
filename = rank_path+'/checkpoint_'+str(epoch)+'.pth.tar'
torch.save(state, filename)
if is_best:
best_dir=rank_path+'/model_best.pth.tar'
shutil.copyfile(filename, best_dir)
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self, name, fmt=':f'):
self.name = name
self.fmt = fmt
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def __str__(self):
fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
return fmtstr.format(**self.__dict__)
class ProgressMeter(object):
def __init__(self, num_batches, meters, prefix=""):
self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
self.meters = meters
self.prefix = prefix
def display(self, batch):
entries = [self.prefix + self.batch_fmtstr.format(batch)]
entries += [str(meter) for meter in self.meters]
print('\t'.join(entries))
def _get_batch_fmtstr(self, num_batches):
num_digits = len(str(num_batches // 1))
fmt = '{:' + str(num_digits) + 'd}'
return '[' + fmt + '/' + fmt.format(num_batches) + ']'
def adjust_learning_rate(optimizer, epoch, args):
"""Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
lr = args.lr * (0.1 ** (epoch // 30))
for param_group in optimizer.param_groups:
param_group['lr'] = lr
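# e.g. with the default lr = 0.1: epochs 0-29 use 0.1, epochs 30-59 use
# 0.01, and epochs 60-89 use 0.001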
def accuracy(output, target, topk=(1,)):
"""Computes the accuracy over the k top predictions for the specified values of k"""
with torch.no_grad():
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
if __name__ == '__main__':
main()
# Introduction
This test case is for performance testing of PyTorch classification models.
* The script supports PyTorch's nccl and gloo distributed communication backends.
# Run
## Single card
python3 `pwd`/main_bench.py --batch-size=64 --a=resnet50 -j 24 --epochs=1 --synthetic /path/to/any/existing/folder
## Single node, multiple cards
mpirun -np 4 --bind-to none `pwd`/single_process.sh localhost inception_v3 64
## Distributed multi-card
mpirun -np $np --hostfile hostfile --bind-to none `pwd`/single_process.sh $dist_url resnet50 64
hostfile format example:
node1 slots=4
node2 slots=4
# Reference
[https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet)
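`main_bench.py` below times a fixed window of 1000 iterations (it starts the clock at iteration 10, stops at iteration 1010, and calls `torch.cuda.synchronize()` around the window). The reported throughput boils down to this sketch (names are illustrative):
```
def throughput(batch_size, world_size, elapsed_seconds, iters=1000):
    """Samples/sec over the timed window; world_size == -1 means one card."""
    world = world_size if world_size > 0 else 1
    return batch_size * iters * world / elapsed_seconds
```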
###########################
# add inception model
# with bench data v1
# 20191106: add synthetic data v2
# used for large-scale tests; the iterations per epoch change with the total dataset size
###########################
import argparse
import os
import random
import shutil
import time
import warnings
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
model_names = sorted(name for name in models.__dict__
if name.islower() and not name.startswith("__")
and callable(models.__dict__[name]))
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('data', metavar='DIR',
help='path to dataset')
parser.add_argument('--synthetic', action='store_true',
help='use synthetic data')
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
choices=model_names,
help='model architecture: ' +
' | '.join(model_names) +
' (default: resnet18)')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=90, type=int, metavar='N',
help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
metavar='N',
help='mini-batch size (default: 256), this is the total '
'batch size of all GPUs on the current node when '
'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
metavar='W', help='weight decay (default: 1e-4)',
dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=10, type=int,
metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
help='Use multi-processing distributed training to launch '
'N processes per node, which has N GPUs. This is the '
'fastest way to use PyTorch for either single node or '
'multi node data parallel training')
best_acc1 = 0
def main():
args = parser.parse_args()
if args.seed is not None:
random.seed(args.seed)
torch.manual_seed(args.seed)
cudnn.deterministic = True
warnings.warn('You have chosen to seed training. '
'This will turn on the CUDNN deterministic setting, '
'which can slow down your training considerably! '
'You may see unexpected behavior when restarting '
'from checkpoints.')
if args.gpu is not None:
warnings.warn('You have chosen a specific GPU. This will completely '
'disable data parallelism.')
if args.dist_url == "env://" and args.world_size == -1:
args.world_size = int(os.environ["WORLD_SIZE"])
args.distributed = args.world_size > 1 or args.multiprocessing_distributed
ngpus_per_node = torch.cuda.device_count()
if args.multiprocessing_distributed:
# Since we have ngpus_per_node processes per node, the total world_size
# needs to be adjusted accordingly
args.world_size = ngpus_per_node * args.world_size
# Use torch.multiprocessing.spawn to launch distributed processes: the
# main_worker process function
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
else:
# Simply call main_worker function
main_worker(args.gpu, ngpus_per_node, args)
def main_worker(gpu, ngpus_per_node, args):
global best_acc1
args.gpu = gpu
if args.gpu is not None:
print("Use GPU: {} for training".format(args.gpu))
if args.distributed:
if args.dist_url == "env://" and args.rank == -1:
args.rank = int(os.environ["RANK"])
if args.multiprocessing_distributed:
# For multiprocessing distributed training, rank needs to be the
# global rank among all the processes
args.rank = args.rank * ngpus_per_node + gpu
dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
world_size=args.world_size, rank=args.rank)
# create model
if args.pretrained:
print("=> using pre-trained model '{}'".format(args.arch))
model = models.__dict__[args.arch](pretrained=True)
else:
if (args.arch== "inception_v3"):
print("=> creating model '{}'".format(args.arch))
model = models.__dict__[args.arch](aux_logits=False)
else:
print("=> creating model '{}'".format(args.arch))
model = models.__dict__[args.arch]()
if args.distributed:
# For multiprocessing distributed, DistributedDataParallel constructor
# should always set the single device scope, otherwise,
# DistributedDataParallel will use all available devices.
if args.gpu is not None:
torch.cuda.set_device(args.gpu)
model.cuda(args.gpu)
# When using a single GPU per process and per
# DistributedDataParallel, we need to divide the batch size
# ourselves based on the total number of GPUs we have
args.batch_size = int(args.batch_size / ngpus_per_node)
args.workers = int(args.workers / ngpus_per_node)
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
else:
model.cuda()
# DistributedDataParallel will divide and allocate batch_size to all
# available GPUs if device_ids are not set
model = torch.nn.parallel.DistributedDataParallel(model)
elif args.gpu is not None:
torch.cuda.set_device(args.gpu)
model = model.cuda(args.gpu)
else:
# DataParallel will divide and allocate batch_size to all available GPUs
if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
model.features = torch.nn.DataParallel(model.features)
model.cuda()
else:
model = torch.nn.DataParallel(model).cuda()
# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss().cuda(args.gpu)
optimizer = torch.optim.SGD(model.parameters(), args.lr,
momentum=args.momentum,
weight_decay=args.weight_decay)
# optionally resume from a checkpoint
if args.resume:
if os.path.isfile(args.resume):
print("=> loading checkpoint '{}'".format(args.resume))
checkpoint = torch.load(args.resume)
args.start_epoch = checkpoint['epoch']
best_acc1 = checkpoint['best_acc1']
if args.gpu is not None:
# best_acc1 may be from a checkpoint from a different GPU
best_acc1 = best_acc1.to(args.gpu)
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
print("=> loaded checkpoint '{}' (epoch {})"
.format(args.resume, checkpoint['epoch']))
else:
print("=> no checkpoint found at '{}'".format(args.resume))
cudnn.benchmark = True
# Data loading code
traindir = os.path.join(args.data, 'train')
valdir = os.path.join(args.data, 'val')
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
####### added by aiss 0710: synthetic-data path
if args.synthetic:
print("using synthetic data!")
train_dataset = datasets.ImageFolder(
traindir,
transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize,
])) if not args.synthetic else \
datasets.FakeData(1281280, num_classes=1000, transform=transforms.ToTensor())
if args.distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
else:
train_sampler = None
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
num_workers=args.workers, pin_memory=False, sampler=train_sampler)
val_loader = torch.utils.data.DataLoader(
datasets.ImageFolder(valdir, transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
])) if not args.synthetic else
datasets.FakeData(20000, num_classes=1000, transform=transforms.ToTensor()),
batch_size=args.batch_size, shuffle=False,
num_workers=args.workers, pin_memory=False)
# if args.evaluate:
# validate(val_loader, model, criterion, args)
# return
##### added by aiss: stop flag, set by train() once the timing window ends
global is_stop
is_stop = 0
# HC debug
print("Begin training ...")
for epoch in range(args.start_epoch, args.epochs):
if args.distributed:
train_sampler.set_epoch(epoch)
adjust_learning_rate(optimizer, epoch, args)
# train for one epoch
train(train_loader, model, criterion, optimizer, epoch, args)
print("is_stop is ", is_stop)
if is_stop == 1:
break
#acc1 = validate(val_loader, model, criterion, args)
# #remember best acc@1 and save checkpoint
#is_best = acc1 > best_acc1
#best_acc1 = max(acc1, best_acc1)
#if not args.multiprocessing_distributed or (args.multiprocessing_distributed
# and args.rank % ngpus_per_node == 0):
# save_checkpoint({
# 'epoch': epoch + 1,
# 'arch': args.arch,
# 'state_dict': model.state_dict(),
# 'best_acc1': best_acc1,
# 'optimizer' : optimizer.state_dict(),
# }, is_best)
def train(train_loader, model, criterion, optimizer, epoch, args):
batch_time = AverageMeter('Time', ':6.3f')
######add by aiss 0708
# images_per_sec=AverageMeter('Perf', ':6.4f')
data_time = AverageMeter('Data', ':6.3f')
losses = AverageMeter('Loss', ':.4e')
top1 = AverageMeter('Acc@1', ':6.2f')
top5 = AverageMeter('Acc@5', ':6.2f')
progress = ProgressMeter(
len(train_loader),
[batch_time, data_time, losses, top1, top5],
prefix="Epoch: [{}]".format(epoch))
# switch to train mode
model.train()
end = time.time()
##########add by aiss##############
len_train_loader=len(train_loader)
print("iters per epoch: ",len(train_loader))
for i, (images, target) in enumerate(train_loader):
# measure data loading time
data_time.update(time.time() - end)
# added by aiss: start the timing window at iteration 10, after warm-up
if i == 10:
time_start=time.time()
torch.cuda.synchronize()
gpu_start_time=time.time()
if args.gpu is not None:
images = images.cuda(args.gpu, non_blocking=True)
target = target.cuda(args.gpu, non_blocking=True)
# compute output
output = model(images)
loss = criterion(output, target)
# measure accuracy and record loss
acc1, acc5 = accuracy(output, target, topk=(1, 5))
losses.update(loss.item(), images.size(0))
top1.update(acc1[0], images.size(0))
top5.update(acc5[0], images.size(0))
# compute gradient and do SGD step
optimizer.zero_grad()
loss.backward()
optimizer.step()
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if i % args.print_freq == 0:
progress.display(i)
#if i == len_train_loader-10:
if i == 1010: # end of the 1000-iteration timing window
global is_stop # without this, the assignment below would create a local and main_worker would never stop
is_stop = 1
time_end=end
time_print_step=time_end-time_start
##### wait for all GPU ops to finish ########
torch.cuda.synchronize()
gpu_end_time=time.time()
gpu_time_print_step=gpu_end_time-gpu_start_time
# HC debug
print("batch_size is ", args.batch_size)
print("world_size is ", args.world_size)
########aiss debug####
print("time_start:%s time_end:%s gap:%s" %(time_start,time_end,time_print_step))
print("gpu_start_time:%s gpu_end_time:%s gap:%s" %(gpu_start_time,gpu_end_time,gpu_time_print_step))
# 1000 = number of timed iterations (from iteration 10 to 1010)
if args.world_size == -1:
images_total_per_sec=args.batch_size * 1000 / time_print_step
else:
images_total_per_sec=args.batch_size * 1000 * args.world_size / time_print_step
print("Total Performance: {} samples/sec".format(images_total_per_sec))
break
def validate(val_loader, model, criterion, args):
batch_time = AverageMeter('Time', ':6.3f')
losses = AverageMeter('Loss', ':.4e')
top1 = AverageMeter('Acc@1', ':6.2f')
top5 = AverageMeter('Acc@5', ':6.2f')
progress = ProgressMeter(
len(val_loader),
[batch_time, losses, top1, top5],
prefix='Test: ')
# switch to evaluate mode
model.eval()
with torch.no_grad():
end = time.time()
for i, (images, target) in enumerate(val_loader):
if args.gpu is not None:
images = images.cuda(args.gpu, non_blocking=True)
target = target.cuda(args.gpu, non_blocking=True)
# compute output
output = model(images)
loss = criterion(output, target)
# measure accuracy and record loss
acc1, acc5 = accuracy(output, target, topk=(1, 5))
losses.update(loss.item(), images.size(0))
top1.update(acc1[0], images.size(0))
top5.update(acc5[0], images.size(0))
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if i % args.print_freq == 0:
progress.display(i)
# TODO: this should also be done with the ProgressMeter
print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
.format(top1=top1, top5=top5))
return top1.avg
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
torch.save(state, filename)
if is_best:
shutil.copyfile(filename, 'model_best.pth.tar')
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self, name, fmt=':f'):
self.name = name
self.fmt = fmt
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def __str__(self):
fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
return fmtstr.format(**self.__dict__)
class ProgressMeter(object):
def __init__(self, num_batches, meters, prefix=""):
self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
self.meters = meters
self.prefix = prefix
def display(self, batch):
entries = [self.prefix + self.batch_fmtstr.format(batch)]
entries += [str(meter) for meter in self.meters]
print('\t'.join(entries))
def _get_batch_fmtstr(self, num_batches):
num_digits = len(str(num_batches // 1))
fmt = '{:' + str(num_digits) + 'd}'
return '[' + fmt + '/' + fmt.format(num_batches) + ']'
def adjust_learning_rate(optimizer, epoch, args):
"""Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
lr = args.lr * (0.1 ** (epoch // 30))
for param_group in optimizer.param_groups:
param_group['lr'] = lr
def accuracy(output, target, topk=(1,)):
"""Computes the accuracy over the k top predictions for the specified values of k"""
with torch.no_grad():
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
if __name__ == '__main__':
main()
#!/bin/bash
#SBATCH -p caspra
#SBATCH -N 2
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
#SBATCH -J syn_nccl_torch
#SBATCH -o ./log/output.%j
#SBATCH -e ./log/output.%j
#SBATCH -x e01r1n07
#for rocm3.3
module rm compiler/rocm/2.9
module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
which mpirun
which python3
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
rm -f `pwd`/hostfile-dl-$SLURM_JOB_ID
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile-dl-$SLURM_JOB_ID
done
np=$(cat $hostfile|sort|uniq |wc -l)
np=$(($np*4))
nodename=$(cat $hostfile |sed -n "1p")
echo $nodename
dist_url=`echo $nodename | awk '{print $1}'`
#for single card
#echo mpirun -np 1 --allow-run-as-root --bind-to none `pwd`/single_process.sh $dist_url inception_v3 64
#mpirun -np 1 --allow-run-as-root --bind-to none `pwd`/single_process.sh $dist_url inception_v3 64
#for one node
#echo mpirun -np $np --allow-run-as-root --bind-to none `pwd`/single_process.sh $dist_url inception_v3 64
#mpirun -np $np --allow-run-as-root --bind-to none `pwd`/single_process.sh $dist_url inception_v3 64
#for multi-gpu
echo mpirun -np $np --allow-run-as-root --hostfile hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/single_process.sh $dist_url resnet50 64
mpirun -np $np --allow-run-as-root --hostfile hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/single_process.sh $dist_url resnet50 64
#!/bin/bash
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=ib0
export HSA_USERPTR_FOR_PAGED_MEM=0
#source /public/software/apps/DeepLearning/PyTorch/pytorch-env.sh
module rm compiler/rocm/2.9
module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
#source /public/home/aiss/Pytorch/env_rocm3.3_torch1.5.sh
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
APP="python3 `pwd`/main_bench.py --batch-size=${3} --a=${2} -j 24 --epochs=1 --dist-url tcp://${1}:34567 --dist-backend nccl --world-size=${comm_size} --rank=${comm_rank} --synthetic /public/software/apps/DeepLearning/Data/ImageNet-pytorch/"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#echo GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
#GLOO_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
# Introduction
This test case is used to test the PyTorch object detection model Faster R-CNN.
# Run
* The get_dataset function in train.py needs to be set up for your dataset: the location of the json annotation files, the number of classes, and so on (see the sketch after this section).
## Single card
python3 train.py --batch-size=2 -j 8 --epochs=26 --data-path=/path/to/datasets/folder --output-dir=/path/to/result/save/folder
## Single node, multiple cards
mpirun -np 4 --hostfile hostfile --bind-to none `pwd`/single_process.sh localhost
## Multiple nodes, multiple cards
mpirun -np $np --hostfile hostfile --bind-to none `pwd`/single_process.sh ${master_ip}
# Reference
[https://github.com/pytorch/vision/tree/master/references/detection](https://github.com/pytorch/vision/tree/master/references/detection)
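`train.py` itself is not part of this commit excerpt, so the following is only a hypothetical sketch of the kind of `get_dataset` adjustment described above; the dataset class, paths and `num_classes` value are all placeholder assumptions:
```
# Hypothetical sketch only: the real get_dataset in train.py is not shown here.
import torchvision

def get_dataset(root, image_set, transforms):
    # Point these at your own COCO-style json and image folder.
    ann_file = '%s/annotations/instances_%s2017.json' % (root, image_set)
    img_dir = '%s/%s2017' % (root, image_set)
    dataset = torchvision.datasets.CocoDetection(img_dir, ann_file,
                                                 transforms=transforms)
    num_classes = 91  # set to your dataset's class count (incl. background)
    return dataset, num_classes
```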
import json
import tempfile
import numpy as np
import copy
import time
import torch
import torch._six
from pycocotools.cocoeval import COCOeval
from pycocotools.coco import COCO
import pycocotools.mask as mask_util
from collections import defaultdict
import utils
class CocoEvaluator(object):
def __init__(self, coco_gt, iou_types):
assert isinstance(iou_types, (list, tuple))
coco_gt = copy.deepcopy(coco_gt)
self.coco_gt = coco_gt
self.iou_types = iou_types
self.coco_eval = {}
for iou_type in iou_types:
self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
self.img_ids = []
self.eval_imgs = {k: [] for k in iou_types}
def update(self, predictions):
img_ids = list(np.unique(list(predictions.keys())))
self.img_ids.extend(img_ids)
for iou_type in self.iou_types:
results = self.prepare(predictions, iou_type)
coco_dt = loadRes(self.coco_gt, results) if results else COCO()
coco_eval = self.coco_eval[iou_type]
coco_eval.cocoDt = coco_dt
coco_eval.params.imgIds = list(img_ids)
img_ids, eval_imgs = evaluate(coco_eval)
self.eval_imgs[iou_type].append(eval_imgs)
def synchronize_between_processes(self):
for iou_type in self.iou_types:
self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])
def accumulate(self):
for coco_eval in self.coco_eval.values():
coco_eval.accumulate()
def summarize(self):
for iou_type, coco_eval in self.coco_eval.items():
print("IoU metric: {}".format(iou_type))
coco_eval.summarize()
def prepare(self, predictions, iou_type):
if iou_type == "bbox":
return self.prepare_for_coco_detection(predictions)
elif iou_type == "segm":
return self.prepare_for_coco_segmentation(predictions)
elif iou_type == "keypoints":
return self.prepare_for_coco_keypoint(predictions)
else:
raise ValueError("Unknown iou type {}".format(iou_type))
def prepare_for_coco_detection(self, predictions):
coco_results = []
for original_id, prediction in predictions.items():
if len(prediction) == 0:
continue
boxes = prediction["boxes"]
boxes = convert_to_xywh(boxes).tolist()
scores = prediction["scores"].tolist()
labels = prediction["labels"].tolist()
coco_results.extend(
[
{
"image_id": original_id,
"category_id": labels[k],
"bbox": box,
"score": scores[k],
}
for k, box in enumerate(boxes)
]
)
return coco_results
def prepare_for_coco_segmentation(self, predictions):
coco_results = []
for original_id, prediction in predictions.items():
if len(prediction) == 0:
continue
scores = prediction["scores"]
labels = prediction["labels"]
masks = prediction["masks"]
masks = masks > 0.5
scores = prediction["scores"].tolist()
labels = prediction["labels"].tolist()
rles = [
mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0]
for mask in masks
]
for rle in rles:
rle["counts"] = rle["counts"].decode("utf-8")
coco_results.extend(
[
{
"image_id": original_id,
"category_id": labels[k],
"segmentation": rle,
"score": scores[k],
}
for k, rle in enumerate(rles)
]
)
return coco_results
def prepare_for_coco_keypoint(self, predictions):
coco_results = []
for original_id, prediction in predictions.items():
if len(prediction) == 0:
continue
boxes = prediction["boxes"]
boxes = convert_to_xywh(boxes).tolist()
scores = prediction["scores"].tolist()
labels = prediction["labels"].tolist()
keypoints = prediction["keypoints"]
keypoints = keypoints.flatten(start_dim=1).tolist()
coco_results.extend(
[
{
"image_id": original_id,
"category_id": labels[k],
'keypoints': keypoint,
"score": scores[k],
}
for k, keypoint in enumerate(keypoints)
]
)
return coco_results
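# Typical lifecycle (illustrative sketch; `coco_gt` and `res` are placeholders):
#   evaluator = CocoEvaluator(coco_gt, ["bbox"])
#   evaluator.update(res)                      # res maps image_id -> prediction dict, per batch
#   evaluator.synchronize_between_processes()  # merge results across workers
#   evaluator.accumulate()
#   evaluator.summarize()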
def convert_to_xywh(boxes):
xmin, ymin, xmax, ymax = boxes.unbind(1)
return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
def merge(img_ids, eval_imgs):
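    # gather per-process image ids and eval results, concatenate them,
    # then deduplicate below so each image is counted once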
all_img_ids = utils.all_gather(img_ids)
all_eval_imgs = utils.all_gather(eval_imgs)
merged_img_ids = []
for p in all_img_ids:
merged_img_ids.extend(p)
merged_eval_imgs = []
for p in all_eval_imgs:
merged_eval_imgs.append(p)
merged_img_ids = np.array(merged_img_ids)
merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
# keep only unique (and in sorted order) images
merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
merged_eval_imgs = merged_eval_imgs[..., idx]
return merged_img_ids, merged_eval_imgs
def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
img_ids, eval_imgs = merge(img_ids, eval_imgs)
img_ids = list(img_ids)
eval_imgs = list(eval_imgs.flatten())
coco_eval.evalImgs = eval_imgs
coco_eval.params.imgIds = img_ids
coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
#################################################################
# From pycocotools, just removed the prints and fixed
# a Python3 bug about unicode not defined
#################################################################
# Ideally, pycocotools wouldn't have hard-coded prints
# so that we could avoid copy-pasting those two functions
def createIndex(self):
# create index
# print('creating index...')
anns, cats, imgs = {}, {}, {}
imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
if 'annotations' in self.dataset:
for ann in self.dataset['annotations']:
imgToAnns[ann['image_id']].append(ann)
anns[ann['id']] = ann
if 'images' in self.dataset:
for img in self.dataset['images']:
imgs[img['id']] = img
if 'categories' in self.dataset:
for cat in self.dataset['categories']:
cats[cat['id']] = cat
if 'annotations' in self.dataset and 'categories' in self.dataset:
for ann in self.dataset['annotations']:
catToImgs[ann['category_id']].append(ann['image_id'])
# print('index created!')
# create class members
self.anns = anns
self.imgToAnns = imgToAnns
self.catToImgs = catToImgs
self.imgs = imgs
self.cats = cats
maskUtils = mask_util
def loadRes(self, resFile):
"""
Load result file and return a result api object.
:param resFile (str) : file name of result file
:return: res (obj) : result api object
"""
res = COCO()
res.dataset['images'] = [img for img in self.dataset['images']]
# print('Loading and preparing results...')
# tic = time.time()
if isinstance(resFile, torch._six.string_classes):
anns = json.load(open(resFile))
elif type(resFile) == np.ndarray:
anns = self.loadNumpyAnnotations(resFile)
else:
anns = resFile
    assert type(anns) == list, 'results is not an array of objects'
annsImgIds = [ann['image_id'] for ann in anns]
assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \
'Results do not correspond to current coco set'
if 'caption' in anns[0]:
imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns])
res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds]
for id, ann in enumerate(anns):
ann['id'] = id + 1
elif 'bbox' in anns[0] and not anns[0]['bbox'] == []:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
bb = ann['bbox']
x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
if 'segmentation' not in ann:
ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
ann['area'] = bb[2] * bb[3]
ann['id'] = id + 1
ann['iscrowd'] = 0
elif 'segmentation' in anns[0]:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
# now only support compressed RLE format as segmentation results
ann['area'] = maskUtils.area(ann['segmentation'])
if 'bbox' not in ann:
ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
ann['id'] = id + 1
ann['iscrowd'] = 0
elif 'keypoints' in anns[0]:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
s = ann['keypoints']
x = s[0::3]
y = s[1::3]
x1, x2, y1, y2 = np.min(x), np.max(x), np.min(y), np.max(y)
ann['area'] = (x2 - x1) * (y2 - y1)
ann['id'] = id + 1
ann['bbox'] = [x1, y1, x2 - x1, y2 - y1]
# print('DONE (t={:0.2f}s)'.format(time.time()- tic))
res.dataset['annotations'] = anns
createIndex(res)
return res
def evaluate(self):
'''
Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
:return: None
'''
# tic = time.time()
# print('Running per image evaluation...')
p = self.params
# add backward compatibility if useSegm is specified in params
if p.useSegm is not None:
p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
# print('Evaluate annotation type *{}*'.format(p.iouType))
p.imgIds = list(np.unique(p.imgIds))
if p.useCats:
p.catIds = list(np.unique(p.catIds))
p.maxDets = sorted(p.maxDets)
self.params = p
self._prepare()
# loop through images, area range, max detection number
catIds = p.catIds if p.useCats else [-1]
if p.iouType == 'segm' or p.iouType == 'bbox':
computeIoU = self.computeIoU
elif p.iouType == 'keypoints':
computeIoU = self.computeOks
self.ious = {
(imgId, catId): computeIoU(imgId, catId)
for imgId in p.imgIds
for catId in catIds}
evaluateImg = self.evaluateImg
maxDet = p.maxDets[-1]
evalImgs = [
evaluateImg(imgId, catId, areaRng, maxDet)
for catId in catIds
for areaRng in p.areaRng
for imgId in p.imgIds
]
# this is NOT in the pycocotools code, but could be done outside
evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
self._paramsEval = copy.deepcopy(self.params)
# toc = time.time()
# print('DONE (t={:0.2f}s).'.format(toc-tic))
return p.imgIds, evalImgs
#################################################################
# end of straight copy from pycocotools, just removing the prints
#################################################################
import copy
import os
import json
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import io
#import boto3
#import brainpp
import torch
import torch.utils.data
import torchvision
from pycocotools import mask as coco_mask
from pycocotools.coco import COCO
import transforms as T
from torchvision.datasets.vision import VisionDataset
class FilterAndRemapCocoCategories(object):
def __init__(self, categories, remap=True):
self.categories = categories
self.remap = remap
def __call__(self, image, target):
anno = target["annotations"]
anno = [obj for obj in anno if obj["category_id"] in self.categories]
if not self.remap:
target["annotations"] = anno
return image, target
anno = copy.deepcopy(anno)
for obj in anno:
obj["category_id"] = self.categories.index(obj["category_id"])
target["annotations"] = anno
return image, target
def convert_coco_poly_to_mask(segmentations, height, width):
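    # convert each object's polygon list to RLE, decode it to a binary mask,
    # and collapse multi-part masks into a single (H, W) uint8 mask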
masks = []
for polygons in segmentations:
rles = coco_mask.frPyObjects(polygons, height, width)
mask = coco_mask.decode(rles)
if len(mask.shape) < 3:
mask = mask[..., None]
mask = torch.as_tensor(mask, dtype=torch.uint8)
mask = mask.any(dim=2)
masks.append(mask)
if masks:
masks = torch.stack(masks, dim=0)
else:
masks = torch.zeros((0, height, width), dtype=torch.uint8)
return masks
class ConvertCocoPolysToMask(object):
def __call__(self, image, target):
w, h = image.size
image_id = target["image_id"]
image_id = torch.tensor([image_id])
anno = target["annotations"]
anno = [obj for obj in anno if obj['iscrowd'] == 0]
boxes = [obj["bbox"] for obj in anno]
# guard against no boxes via resizing
boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2]
boxes[:, 0::2].clamp_(min=0, max=w)
boxes[:, 1::2].clamp_(min=0, max=h)
classes = [obj["category_id"] for obj in anno]
classes = torch.tensor(classes, dtype=torch.int64)
keypoints = None
if anno and "keypoints" in anno[0]:
keypoints = [obj["keypoints"] for obj in anno]
keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
num_keypoints = keypoints.shape[0]
if num_keypoints:
keypoints = keypoints.view(num_keypoints, -1, 3)
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
boxes = boxes[keep]
classes = classes[keep]
if keypoints is not None:
keypoints = keypoints[keep]
target = {}
target["boxes"] = boxes
target["labels"] = classes
target["image_id"] = image_id
if keypoints is not None:
target["keypoints"] = keypoints
# for conversion to coco api
area = torch.tensor([obj["area"] for obj in anno])
iscrowd = torch.tensor([obj["iscrowd"] for obj in anno])
target["area"] = area
target["iscrowd"] = iscrowd
return image, target
min_keypoints_per_image = 10
def _has_only_empty_bbox(anno):
return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno)
def _count_visible_keypoints(anno):
return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno)
def _has_valid_annotation(anno):
# if it's empty, there is no annotation
if len(anno) == 0:
return False
# if all boxes have close to zero area, there is no annotation
if _has_only_empty_bbox(anno):
return False
    # keypoint tasks have a slightly different criterion for considering
    # whether an annotation is valid
if "keypoints" not in anno[0]:
return True
# for keypoint detection tasks, only consider valid images those
# containing at least min_keypoints_per_image
if _count_visible_keypoints(anno) >= min_keypoints_per_image:
return True
return False
def _coco_remove_images_without_annotations(dataset, cat_list=None):
assert isinstance(dataset, torchvision.datasets.CocoDetection) or isinstance(dataset, CocoDetection)
ids = []
empty = 0
for ds_idx, img_id in enumerate(dataset.ids):
ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None)
anno = dataset.coco.loadAnns(ann_ids)
if cat_list:
anno = [obj for obj in anno if obj["category_id"] in cat_list]
if _has_valid_annotation(anno):
ids.append(ds_idx)
else:
empty += 1
print("remove {} empty imgs without annos".format(empty))
dataset = torch.utils.data.Subset(dataset, ids)
return dataset
def convert_to_coco_api(ds):
coco_ds = COCO()
ann_id = 0
dataset = {'images': [], 'categories': [], 'annotations': []}
categories = set()
for img_idx in range(len(ds)):
# find better way to get target
# targets = ds.get_annotations(img_idx)
img, targets = ds[img_idx]
image_id = targets["image_id"].item()
img_dict = {}
img_dict['id'] = image_id
img_dict['height'] = img.shape[-2]
img_dict['width'] = img.shape[-1]
dataset['images'].append(img_dict)
bboxes = targets["boxes"]
bboxes[:, 2:] -= bboxes[:, :2]
bboxes = bboxes.tolist()
labels = targets['labels'].tolist()
areas = targets['area'].tolist()
iscrowd = targets['iscrowd'].tolist()
if 'masks' in targets:
masks = targets['masks']
# make masks Fortran contiguous for coco_mask
masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1)
if 'keypoints' in targets:
keypoints = targets['keypoints']
keypoints = keypoints.reshape(keypoints.shape[0], -1).tolist()
num_objs = len(bboxes)
for i in range(num_objs):
ann = {}
ann['image_id'] = image_id
ann['bbox'] = bboxes[i]
ann['category_id'] = labels[i]
categories.add(labels[i])
ann['area'] = areas[i]
ann['iscrowd'] = iscrowd[i]
ann['id'] = ann_id
if 'keypoints' in targets:
ann['keypoints'] = keypoints[i]
ann['num_keypoints'] = sum(k != 0 for k in keypoints[i][2::3])
dataset['annotations'].append(ann)
ann_id += 1
dataset['categories'] = [{'id': i} for i in sorted(categories)]
coco_ds.dataset = dataset
coco_ds.createIndex()
return coco_ds
def get_coco_api_from_dataset(dataset):
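    # unwrap up to 10 nested Subset wrappers to reach a CocoDetection; if none
    # is found, rebuild a COCO api object from the dataset itself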
for _ in range(10):
if isinstance(dataset, torchvision.datasets.CocoDetection):
break
if isinstance(dataset, torch.utils.data.Subset):
dataset = dataset.dataset
if isinstance(dataset, torchvision.datasets.CocoDetection):
return dataset.coco
return convert_to_coco_api(dataset)
class CocoDetection(VisionDataset):
def __init__(self, root, annFile, transforms):
super(CocoDetection, self).__init__(root, transforms=None, transform=None, target_transform=None)
from pycocotools.coco import COCO
self.coco = COCO(annFile)
self.ids = list(sorted(self.coco.imgs.keys()))
self._transforms = transforms
with open(annFile, "r") as f:
result = json.load(f)
catids = [k['id'] for k in result['categories']]
self.catid_inf = min(catids)
ids_to_remove = []
ids = []
for img_id in self.ids:
ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None)
anno = self.coco.loadAnns(ann_ids)
if all(
any(o <= 1 for o in obj["bbox"][2:])
for obj in anno
if obj["iscrowd"] == 0
):
ids_to_remove.append(img_id)
if _has_valid_annotation(anno):
ids.append(img_id)
print("remove {} illegal image".format(len(ids_to_remove)))
self.ids = [img_id for img_id in ids if img_id not in ids_to_remove]
def __getitem__(self, idx):
coco = self.coco
img_id = self.ids[idx]
ann_ids = coco.getAnnIds(imgIds=img_id)
target = coco.loadAnns(ann_ids)
target = dict(image_id=img_id, annotations=target)
path = coco.loadImgs(img_id)[0]['file_name']
img = Image.open(os.path.join(self.root, path)).convert('RGB')
#img, target = self.remapper(img, target)
if self._transforms is not None:
img, target = self._transforms(img, target)
target['labels'] = (target['labels'] - self.catid_inf + 1).long()
return img, target
def __len__(self):
return len(self.ids)
'''
class CocoDetection(torchvision.datasets.CocoDetection):
def __init__(self, img_folder, ann_file, transforms):
super(CocoDetection, self).__init__(img_folder, ann_file)
self._transforms = transforms
with open(ann_file, "r") as f:
result = json.load(f)
catids = [k['id'] for k in result['categories']]
self.catid_inf = min(catids)
self.num_classes = len(catids)
print(self.num_classes)
ids_to_remove = []
ids = []
for img_id in self.ids:
ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None)
anno = self.coco.loadAnns(ann_ids)
if all(
any(o <= 1 for o in obj["bbox"][2:])
for obj in anno
if obj["iscrowd"] == 0
):
ids_to_remove.append(img_id)
if _has_valid_annotation(anno):
ids.append(img_id)
print("remove {} illegal image".format(len(ids_to_remove)))
self.ids = [img_id for img_id in ids if img_id not in ids_to_remove]
def __getitem__(self, idx):
img, target = super(CocoDetection, self).__getitem__(idx)
image_id = self.ids[idx]
target = dict(image_id=image_id, annotations=target)
if self._transforms is not None:
img, target = self._transforms(img, target)
target['labels'] = (target['labels'] - self.catid_inf + 1).long()
return img, target
class OssCocoDetection(VisionDataset):
def __init__(self, root, annFile, transforms,
host='http://oss.{}.brainpp.cn'.format(brainpp.current_vm.site)):
super(OssCocoDetection, self).__init__(root, transforms=None, transform=None, target_transform=None)
from pycocotools.coco import COCO
self.coco = COCO(annFile)
self.ids = list(sorted(self.coco.imgs.keys()))
self.s3_client = boto3.client('s3', endpoint_url=host)
self._transforms = transforms
with open(annFile, "r") as f:
result = json.load(f)
catids = [k['id'] for k in result['categories']]
self.catid_inf = min(catids)
ids_to_remove = []
ids = []
for img_id in self.ids:
ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None)
anno = self.coco.loadAnns(ann_ids)
if all(
any(o <= 1 for o in obj["bbox"][2:])
for obj in anno
if obj["iscrowd"] == 0
):
ids_to_remove.append(img_id)
if _has_valid_annotation(anno):
ids.append(img_id)
print("remove {} illegal image".format(len(ids_to_remove)))
self.ids = [img_id for img_id in ids if img_id not in ids_to_remove]
def __getitem__(self, idx):
coco = self.coco
img_id = self.ids[idx]
ann_ids = coco.getAnnIds(imgIds=img_id)
target = coco.loadAnns(ann_ids)
target = dict(image_id=img_id, annotations=target)
path = coco.loadImgs(img_id)[0]['file_name']
img_obj = self.s3_client.get_object(
Bucket="generalDetection", Key=os.path.join(self.root, path))
img = Image.open(io.BytesIO(img_obj['Body'].read())).convert('RGB')
#img, target = self.remapper(img, target)
if self._transforms is not None:
img, target = self._transforms(img, target)
target['labels'] = (target['labels'] - self.catid_inf + 1).long()
return img, target
def __len__(self):
return len(self.ids)
def get_oss_coco(root, image_set, transforms, mode='instances'):
t = [ConvertCocoPolysToMask()]
if transforms is not None:
t.append(transforms)
transforms = T.Compose(t)
datasets = list()
for i_key, i_val in root.items():
dataset = OssCocoDetection(
i_val['img_dir'], i_val['ann_file'],
transforms=transforms)
if image_set == "train":
dataset = _coco_remove_images_without_annotations(dataset)
datasets.append(dataset)
dataset = datasets[0] # if len(datasets) == 1 else ConcatDataset(datasets)
# dataset = torch.utils.data.Subset(dataset, [i for i in range(500)])
return dataset
'''
def get_coco(root, image_set, transforms, mode='instances'):
t = [ConvertCocoPolysToMask()]
if transforms is not None:
t.append(transforms)
transforms = T.Compose(t)
img_folder = root[image_set]['img_dir']
ann_file = root[image_set]['ann_file']
dataset = CocoDetection(img_folder, ann_file, transforms=transforms)
if image_set == "train":
dataset = _coco_remove_images_without_annotations(dataset)
# dataset = torch.utils.data.Subset(dataset, [i for i in range(500)])
return dataset
def get_coco_kp(root, image_set, transforms):
return get_coco(root, image_set, transforms, mode="person_keypoints")
import math
import sys
import time
import torch
import torchvision.models.detection.mask_rcnn
from coco_utils import get_coco_api_from_dataset
from coco_eval import CocoEvaluator
import utils
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
model.train()
metric_logger = utils.MetricLogger(delimiter=" ")
metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
header = 'Epoch: [{}]'.format(epoch)
lr_scheduler = None
if epoch == 0:
warmup_factor = 1. / 1000
warmup_iters = min(1000, len(data_loader) - 1)
lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)
for images, targets in metric_logger.log_every(data_loader, print_freq, header):
images = list(image.to(device) for image in images)
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
loss_dict = model(images, targets)
losses = sum(loss for loss in loss_dict.values())
# reduce losses over all GPUs for logging purposes
loss_dict_reduced = utils.reduce_dict(loss_dict)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
loss_value = losses_reduced.item()
if not math.isfinite(loss_value):
print("Loss is {}, stopping training".format(loss_value))
print(loss_dict_reduced)
sys.exit(1)
optimizer.zero_grad()
losses.backward()
optimizer.step()
if lr_scheduler is not None:
lr_scheduler.step()
metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
metric_logger.update(lr=optimizer.param_groups[0]["lr"])
def _get_iou_types(model):
model_without_ddp = model
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
model_without_ddp = model.module
iou_types = ["bbox"]
if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN):
iou_types.append("segm")
if isinstance(model_without_ddp, torchvision.models.detection.KeypointRCNN):
iou_types.append("keypoints")
return iou_types
@torch.no_grad()
def evaluate(model, data_loader, device):
n_threads = torch.get_num_threads()
# FIXME remove this and make paste_masks_in_image run on the GPU
torch.set_num_threads(1)
cpu_device = torch.device("cpu")
model.eval()
metric_logger = utils.MetricLogger(delimiter=" ")
header = 'Test:'
coco = get_coco_api_from_dataset(data_loader.dataset)
iou_types = _get_iou_types(model)
coco_evaluator = CocoEvaluator(coco, iou_types)
for image, targets in metric_logger.log_every(data_loader, 100, header):
image = list(img.to(device) for img in image)
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
torch.cuda.synchronize()
model_time = time.time()
outputs = model(image)
outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
model_time = time.time() - model_time
res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
evaluator_time = time.time()
coco_evaluator.update(res)
evaluator_time = time.time() - evaluator_time
metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)
# gather the stats from all processes
metric_logger.synchronize_between_processes()
print("Averaged stats:", metric_logger)
coco_evaluator.synchronize_between_processes()
# accumulate predictions from all images
coco_evaluator.accumulate()
coco_evaluator.summarize()
torch.set_num_threads(n_threads)
return coco_evaluator
import bisect
from collections import defaultdict
import copy
import numpy as np
import torch
import torch.utils.data
from torch.utils.data.sampler import BatchSampler, Sampler
from torch.utils.model_zoo import tqdm
import torchvision
from PIL import Image
class GroupedBatchSampler(BatchSampler):
"""
Wraps another sampler to yield a mini-batch of indices.
It enforces that the batch only contain elements from the same group.
It also tries to provide mini-batches which follows an ordering which is
as close as possible to the ordering from the original sampler.
Arguments:
sampler (Sampler): Base sampler.
group_ids (list[int]): If the sampler produces indices in range [0, N),
`group_ids` must be a list of `N` ints which contains the group id of each sample.
The group ids must be a continuous set of integers starting from
0, i.e. they must be in the range [0, num_groups).
batch_size (int): Size of mini-batch.
"""
def __init__(self, sampler, group_ids, batch_size):
if not isinstance(sampler, Sampler):
raise ValueError(
"sampler should be an instance of "
"torch.utils.data.Sampler, but got sampler={}".format(sampler)
)
self.sampler = sampler
self.group_ids = group_ids
self.batch_size = batch_size
def __iter__(self):
buffer_per_group = defaultdict(list)
samples_per_group = defaultdict(list)
num_batches = 0
for idx in self.sampler:
group_id = self.group_ids[idx]
buffer_per_group[group_id].append(idx)
samples_per_group[group_id].append(idx)
if len(buffer_per_group[group_id]) == self.batch_size:
yield buffer_per_group[group_id]
num_batches += 1
del buffer_per_group[group_id]
assert len(buffer_per_group[group_id]) < self.batch_size
# now we have run out of elements that satisfy
# the group criteria, let's return the remaining
# elements so that the size of the sampler is
# deterministic
expected_num_batches = len(self)
num_remaining = expected_num_batches - num_batches
if num_remaining > 0:
# for the remaining batches, take first the buffers with largest number
# of elements
for group_id, _ in sorted(buffer_per_group.items(),
key=lambda x: len(x[1]), reverse=True):
remaining = self.batch_size - len(buffer_per_group[group_id])
buffer_per_group[group_id].extend(
samples_per_group[group_id][:remaining])
assert len(buffer_per_group[group_id]) == self.batch_size
yield buffer_per_group[group_id]
num_remaining -= 1
if num_remaining == 0:
break
assert num_remaining == 0
def __len__(self):
return len(self.sampler) // self.batch_size
def _compute_aspect_ratios_slow(dataset, indices=None):
print("Your dataset doesn't support the fast path for "
"computing the aspect ratios, so will iterate over "
"the full dataset and load every image instead. "
"This might take some time...")
if indices is None:
indices = range(len(dataset))
class SubsetSampler(Sampler):
def __init__(self, indices):
self.indices = indices
def __iter__(self):
return iter(self.indices)
def __len__(self):
return len(self.indices)
sampler = SubsetSampler(indices)
data_loader = torch.utils.data.DataLoader(
dataset, batch_size=1, sampler=sampler,
num_workers=14, # you might want to increase it for faster processing
collate_fn=lambda x: x[0])
aspect_ratios = []
with tqdm(total=len(dataset)) as pbar:
for _i, (img, _) in enumerate(data_loader):
pbar.update(1)
height, width = img.shape[-2:]
aspect_ratio = float(width) / float(height)
aspect_ratios.append(aspect_ratio)
return aspect_ratios
def _compute_aspect_ratios_custom_dataset(dataset, indices=None):
if indices is None:
indices = range(len(dataset))
aspect_ratios = []
for i in indices:
height, width = dataset.get_height_and_width(i)
aspect_ratio = float(width) / float(height)
aspect_ratios.append(aspect_ratio)
return aspect_ratios
def _compute_aspect_ratios_coco_dataset(dataset, indices=None):
if indices is None:
indices = range(len(dataset))
aspect_ratios = []
for i in indices:
img_info = dataset.coco.imgs[dataset.ids[i]]
aspect_ratio = float(img_info["width"]) / float(img_info["height"])
aspect_ratios.append(aspect_ratio)
return aspect_ratios
def _compute_aspect_ratios_voc_dataset(dataset, indices=None):
if indices is None:
indices = range(len(dataset))
aspect_ratios = []
for i in indices:
# this doesn't load the data into memory, because PIL loads it lazily
width, height = Image.open(dataset.images[i]).size
aspect_ratio = float(width) / float(height)
aspect_ratios.append(aspect_ratio)
return aspect_ratios
def _compute_aspect_ratios_subset_dataset(dataset, indices=None):
if indices is None:
indices = range(len(dataset))
ds_indices = [dataset.indices[i] for i in indices]
return compute_aspect_ratios(dataset.dataset, ds_indices)
def compute_aspect_ratios(dataset, indices=None):
if hasattr(dataset, "get_height_and_width"):
return _compute_aspect_ratios_custom_dataset(dataset, indices)
if isinstance(dataset, torchvision.datasets.CocoDetection):
return _compute_aspect_ratios_coco_dataset(dataset, indices)
if isinstance(dataset, torchvision.datasets.VOCDetection):
return _compute_aspect_ratios_voc_dataset(dataset, indices)
if isinstance(dataset, torch.utils.data.Subset):
return _compute_aspect_ratios_subset_dataset(dataset, indices)
# slow path
return _compute_aspect_ratios_slow(dataset, indices)
def _quantize(x, bins):
bins = copy.deepcopy(bins)
bins = sorted(bins)
quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
return quantized
def create_aspect_ratio_groups(dataset, k=0):
aspect_ratios = compute_aspect_ratios(dataset)
bins = (2 ** np.linspace(-1, 1, 2 * k + 1)).tolist() if k > 0 else [1.0]
groups = _quantize(aspect_ratios, bins)
# count number of elements per group
counts = np.unique(groups, return_counts=True)[1]
fbins = [0] + bins + [np.inf]
print("Using {} as bins for aspect ratio quantization".format(fbins))
print("Count of instances per bin: {}".format(counts))
return groups
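# Illustrative usage (a sketch; `dataset` is assumed to be a detection dataset):
#   train_sampler = torch.utils.data.RandomSampler(dataset)
#   group_ids = create_aspect_ratio_groups(dataset, k=3)
#   batch_sampler = GroupedBatchSampler(train_sampler, group_ids, batch_size=2)
# Each yielded batch then only mixes images from one aspect-ratio bin.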
#!/bin/bash
#SBATCH -p normal
#SBATCH -N 1
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
#SBATCH -J detection
#SBATCH -o ./log/output.%j
#SBATCH -e ./log/output.%j
module rm compiler/rocm/2.9
module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
which mpirun
which python3
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
rm `pwd`/hostfile-dl -f
#hostfile=./node_list
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile-dl
done
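# total MPI ranks: one rank per DCU, i.e. number of unique nodes x 4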
np=$(cat $hostfile|sort|uniq |wc -l)
np=$(($np*4))
nodename=$(cat $hostfile |sed -n "1p")
echo $nodename
dist_url=`echo $nodename | awk '{print $1}'`
echo mpirun -np $np --allow-run-as-root --hostfile hostfile-dl --bind-to none `pwd`/single_process.sh $dist_url
mpirun -np $np --allow-run-as-root --hostfile hostfile-dl --bind-to none `pwd`/single_process.sh $dist_url
#!/bin/bash
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export OMP_NUM_THREADS=6
export NCCL_SOCKET_IFNAME=ib0
export HSA_USERPTR_FOR_PAGED_MEM=0
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
module rm compiler/rocm/2.9
module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
APP="python3 `pwd`/train.py --batch-size=2 -j 8 --epochs=26 --dist-url tcp://${1}:34568 --world-size=${comm_size} --rank=${comm_rank} --output-dir `pwd`/output"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
r"""PyTorch Detection Training.
To run in a multi-gpu environment, use the distributed launcher::
python -m torch.distributed.launch --nproc_per_node=$NGPU --use_env \
train.py ... --world-size $NGPU
The default hyperparameters are tuned for training on 8 GPUs with 2 images per GPU:
    --lr 0.02 --batch-size 2 --world-size 8
If you use a different number of GPUs, scale the learning rate to 0.02 / 8 * $NGPU.
"""
import datetime
import os
import os.path as osp
import time
import torch
import torch.utils.data
from torch import nn
import torchvision
import torchvision.models.detection
import torchvision.models.detection.mask_rcnn
from coco_utils import get_coco, get_coco_kp
from group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups
from engine import train_one_epoch, evaluate
import utils
import transforms as T
def get_dataset(name, image_set, transform, data_path):
data_root_dir = data_path
train_dataset = {
'coco_2014_train': {
'img_dir': osp.join("objects365_raw_data/objects365/train"),
"ann_file": osp.join(
data_root_dir, "obj_anno/objects365_train_20190423.json")},
}
val_dataset = {
'coco_2014_train': {
'img_dir': osp.join("objects365_raw_data/objects365/val"),
"ann_file": osp.join(
data_root_dir, "obj_anno/objects365_val_20190423.json")},
}
    # get_oss_coco (OSS-backed loading) is commented out in coco_utils.py,
    # so use the local-filesystem loader here
    paths = {
        "cocotrain": (train_dataset, get_coco, 365),
        "cocoval": (val_dataset, get_coco, 365),
        "coco_kp": (data_path, get_coco_kp, 2)
    }
p, ds_fn, num_classes = paths[name+image_set]
ds = ds_fn(p, image_set=image_set, transforms=transform)
return ds, num_classes
def get_transform(train):
transforms = []
transforms.append(T.ToTensor())
if train:
transforms.append(T.RandomHorizontalFlip(0.5))
return T.Compose(transforms)
def main(args):
utils.init_distributed_mode(args)
print(args)
device = torch.device(args.device)
# Data loading code
print("Loading data")
    dataset, num_classes = get_dataset(args.dataset, "train", get_transform(train=True), args.data_path)
dataset_test, num_classes = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path)
print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)
    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)
data_loader_test = torch.utils.data.DataLoader(
dataset_test, batch_size=1,
sampler=test_sampler, num_workers=args.workers,
collate_fn=utils.collate_fn)
print("Creating model")
model = torchvision.models.detection.__dict__[args.model](num_classes=num_classes,
pretrained=args.pretrained)
model.to(device)
model_without_ddp = model
if args.distributed:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
model_without_ddp = model.module
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
# lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)
if args.resume:
checkpoint = torch.load(args.resume, map_location='cpu')
model_without_ddp.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
if args.test_only:
print("evaluating...")
evaluate(model, data_loader_test, device=device)
return
print("Start training")
start_time = time.time()
for epoch in range(args.epochs):
if args.distributed:
train_sampler.set_epoch(epoch)
train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq)
lr_scheduler.step()
if args.output_dir:
utils.save_on_master({
'model': model_without_ddp.state_dict(),
'optimizer': optimizer.state_dict(),
'lr_scheduler': lr_scheduler.state_dict(),
'args': args},
os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
# evaluate after every epoch
evaluate(model, data_loader_test, device=device)
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
description=__doc__)
parser.add_argument('--data-path', default='/data/code/vision_ori/dataset', help='dataset')
parser.add_argument('--dataset', default='coco', help='dataset')
parser.add_argument('--model', default='fasterrcnn_resnet50_fpn', help='model')
parser.add_argument('--device', default='cuda', help='device')
parser.add_argument('-b', '--batch-size', default=2, type=int,
help='images per gpu, the total batch size is $NGPU x batch_size')
parser.add_argument('--epochs', default=13, type=int, metavar='N',
help='number of total epochs to run')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='number of data loading workers (default: 4)')
parser.add_argument('--lr', default=0.02, type=float,
help='initial learning rate, 0.02 is the default value for training '
'on 8 gpus and 2 images_per_gpu')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
metavar='W', help='weight decay (default: 1e-4)',
dest='weight_decay')
parser.add_argument('--lr-step-size', default=8, type=int, help='decrease lr every step-size epochs')
    parser.add_argument('--lr-steps', default=[8, 11], nargs='+', type=int, help='epochs at which to decrease lr (MultiStepLR milestones)')
parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma')
parser.add_argument('--print-freq', default=20, type=int, help='print frequency')
parser.add_argument('--output-dir', default='.', help='path where to save')
parser.add_argument('--resume', default='', help='resume from checkpoint')
parser.add_argument('--aspect-ratio-group-factor', default=3, type=int)
parser.add_argument(
"--test-only",
dest="test_only",
help="Only test the model",
action="store_true",
)
parser.add_argument(
"--pretrained",
dest="pretrained",
help="Use pre-trained models from the modelzoo",
action="store_true",
)
# distributed training parameters
parser.add_argument('--world-size', default=1, type=int,
help='number of distributed processes')
parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')
args = parser.parse_args()
if args.output_dir:
utils.mkdir(args.output_dir)
main(args)
r"""PyTorch Detection Training.
To run in a multi-gpu environment, use the distributed launcher::
python -m torch.distributed.launch --nproc_per_node=$NGPU --use_env \
train.py ... --world-size $NGPU
The default hyperparameters are tuned for training on 8 GPUs with 2 images per GPU:
    --lr 0.02 --batch-size 2 --world-size 8
If you use a different number of GPUs, scale the learning rate to 0.02 / 8 * $NGPU.
"""
import datetime
import os
import os.path as osp
import time
import torch
import torch.utils.data
from torch import nn
import torchvision
import torchvision.models.detection
import torchvision.models.detection.mask_rcnn
from coco_utils import get_coco, get_coco_kp
from group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups
from engine import train_one_epoch, evaluate
import utils
import transforms as T
#### debug: optionally raise the interop thread pool size (PyTorch default is 16)
#print (torch.get_num_interop_threads())
#torch.set_num_interop_threads(24)
#print (torch.get_num_interop_threads())
def get_dataset(name, image_set, transform, data_path):
train_dataset = {
'train': {
'img_dir': osp.join(data_path, "train"),
"ann_file": osp.join(data_path, "objects365_Tiny_train.json")},
}
val_dataset = {
'val': {
'img_dir': osp.join(data_path, "val"),
"ann_file": osp.join(data_path, "objects365_Tiny_val.json")},
}
paths = {
"train": (train_dataset, get_coco, 66),
"val": (val_dataset, get_coco, 66),
}
p, ds_fn, num_classes = paths[image_set]
ds = ds_fn(p, image_set=image_set, transforms=transform)
return ds, num_classes
def get_transform(train):
transforms = []
transforms.append(T.ToTensor())
if train:
transforms.append(T.RandomHorizontalFlip(0.5))
return T.Compose(transforms)
def main(args):
utils.init_distributed_mode(args)
print(args)
device = torch.device(args.device)
# Data loading code
print("Loading data")
dataset, num_classes = get_dataset(args.dataset, "train", get_transform(train=True), args.data_path)
# dataset_test, _ = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path)
dataset_test, _ = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path)
print("num classes : {}".format(num_classes))
print("Creating data loaders")
if args.distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
else:
train_sampler = torch.utils.data.RandomSampler(dataset)
test_sampler = torch.utils.data.SequentialSampler(dataset_test)
if args.aspect_ratio_group_factor >= 0:
group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
else:
train_batch_sampler = torch.utils.data.BatchSampler(
train_sampler, args.batch_size, drop_last=True)
'''
train_batch_sampler = torch.utils.data.BatchSampler(
train_sampler, args.batch_size, drop_last=True)
'''
data_loader = torch.utils.data.DataLoader(
dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
collate_fn=utils.collate_fn)
data_loader_test = torch.utils.data.DataLoader(
dataset_test, batch_size=1,
sampler=test_sampler, num_workers=args.workers,
collate_fn=utils.collate_fn)
print("Creating model")
model = torchvision.models.detection.__dict__[args.model](num_classes=num_classes,
pretrained=args.pretrained)
model.to(device)
model_without_ddp = model
if args.distributed:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
model_without_ddp = model.module
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
# lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)
if args.resume:
checkpoint = torch.load(args.resume, map_location='cpu')
model_without_ddp.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        # resume: restore the epoch counter from the checkpoint
args.start_epoch = checkpoint['epoch'] + 1
if args.test_only:
evaluate(model, data_loader_test, device=device)
return
print("Start training")
start_time = time.time()
    # resume-aware loop: start from args.start_epoch (0 unless resuming)
for epoch in range(args.start_epoch, args.epochs):
if args.distributed:
train_sampler.set_epoch(epoch)
train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq)
lr_scheduler.step()
if args.output_dir:
utils.save_on_master({
'model': model_without_ddp.state_dict(),
'optimizer': optimizer.state_dict(),
'lr_scheduler': lr_scheduler.state_dict(),
'args': args,
'epoch': epoch},
os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
# evaluate after every epoch
evaluate(model, data_loader_test, device=device)
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
description=__doc__)
parser.add_argument('--data-path', default='/IO_data/data/object365/', help='dataset')
parser.add_argument('--dataset', default='objects365', help='dataset')
parser.add_argument('--model', default='fasterrcnn_resnet50_fpn', help='model')
parser.add_argument('--device', default='cuda', help='device')
parser.add_argument('-b', '--batch-size', default=2, type=int,
help='images per gpu, the total batch size is $NGPU x batch_size')
parser.add_argument('--epochs', default=26, type=int, metavar='N',
help='number of total epochs to run')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='number of data loading workers (default: 4)')
parser.add_argument('--lr', default=0.02, type=float,
help='initial learning rate, 0.02 is the default value for training '
'on 8 gpus and 2 images_per_gpu')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
metavar='W', help='weight decay (default: 1e-4)',
dest='weight_decay')
parser.add_argument('--lr-step-size', default=8, type=int, help='decrease lr every step-size epochs')
    parser.add_argument('--lr-steps', default=[16, 22], nargs='+', type=int, help='epochs at which to decrease lr (MultiStepLR milestones)')
parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma')
parser.add_argument('--print-freq', default=20, type=int, help='print frequency')
parser.add_argument('--output-dir', default='.', help='path where to save')
parser.add_argument('--resume', default='', help='resume from checkpoint')
    # resume support
parser.add_argument('--start_epoch', default=0, type=int, help='start epoch')
parser.add_argument('--aspect-ratio-group-factor', default=3, type=int)
parser.add_argument(
"--test-only",
dest="test_only",
help="Only test the model",
action="store_true",
)
parser.add_argument(
"--pretrained",
dest="pretrained",
help="Use pre-trained models from the modelzoo",
action="store_true",
)
# distributed training parameters
parser.add_argument('--world-size', default=1, type=int,
help='number of distributed processes')
parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')
parser.add_argument('--rank', default=-1, type=int, help='node rank for distributed training')
args = parser.parse_args()
if args.output_dir:
utils.mkdir(args.output_dir)
main(args)
import random
import torch
from torchvision.transforms import functional as F
def _flip_coco_person_keypoints(kps, width):
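    # COCO keypoint order: index 0 (nose) is symmetric; each remaining left/right
    # pair (eyes, ears, shoulders, elbows, wrists, hips, knees, ankles) is swapped.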
flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
flipped_data = kps[:, flip_inds]
flipped_data[..., 0] = width - flipped_data[..., 0]
# Maintain COCO convention that if visibility == 0, then x, y = 0
inds = flipped_data[..., 2] == 0
flipped_data[inds] = 0
return flipped_data
class Compose(object):
def __init__(self, transforms):
self.transforms = transforms
def __call__(self, image, target):
for t in self.transforms:
image, target = t(image, target)
return image, target
class RandomHorizontalFlip(object):
def __init__(self, prob):
self.prob = prob
def __call__(self, image, target):
if random.random() < self.prob:
height, width = image.shape[-2:]
image = image.flip(-1)
bbox = target["boxes"]
bbox[:, [0, 2]] = width - bbox[:, [2, 0]]
target["boxes"] = bbox
if "masks" in target:
target["masks"] = target["masks"].flip(-1)
if "keypoints" in target:
keypoints = target["keypoints"]
keypoints = _flip_coco_person_keypoints(keypoints, width)
target["keypoints"] = keypoints
return image, target
class ToTensor(object):
def __call__(self, image, target):
image = F.to_tensor(image)
return image, target