Commit b0f41f60 authored by yuguo960516's avatar yuguo960516
Browse files

resnet50

parent 456eb360
Pipeline #144 failed with stages
in 0 seconds
# Resnet50
## 模型介绍
使用OneFlow进行Resnet50的训练与推理
## 模型结构
Resnet50 网络中包含了 49 个卷积层、1个全连接层等。
## 数据集
我们考虑到 imagenet 数据集比较庞大,为了让用户可以使用 OneFlow 快速进行 Resnet50 的训练验证,这里采用 mini-imagenet 小数据集。如果需要使用完整的 imagenet 原始数据,请参考 https://github.com/Oneflow-Inc/models/tree/main/Vision/classification/image/resnet50#prepare-ofrecord-for-the-full-imagenet-dataset 将其转换为 ofrecord 格式。
OFRECORD_PATH="./mini-imagenet/ofrecord"
## 训练及推理
### 环境配置
提供[光源](https://www.sourcefind.cn/#/service-details)拉取的docker镜像:image.sourcefind.cn:5000/dcu/admin/base/oneflow:0.9.1-centos7.6-dtk-22.10.1-py39-latest
### fp32训练
单机单卡训练命令:
bash examples/train_graph_distributed_fp32.sh
修改examples/train_graph_distributed_fp32.sh中DEVICE_NUM_PER_NODE=4,单机四卡训练命令:
bash examples/train_graph_distributed_fp32.sh
### fp16训练
单机单卡训练命令:
bash examples/train_graph_distributed_fp16.sh
修改examples/train_graph_distributed_fp16.sh中DEVICE_NUM_PER_NODE=4,单机四卡训练命令:
bash examples/train_graph_distributed_fp16.sh
### 推理
执行推理命令:
bash examples/infer_graph.sh
## 训练性能和准确率数据
测试数据:mini-imagenet,使用的加速卡:4张DCU-Z100-16G。
测试结果如下表所示:
| 卡数 | batch size | 类型 | 性能 | Accuracy(%) |
| :------: | :------: | :------: |:------: | -------- |
| 1 | 128 | fp32 | 284 samples/s | - |
| 1 | 128 | fp16 | 318 samples/s | - |
| 4 | 128 | fp32 | 266*4=1064 samples/s | 76.5/50 epoch |
| 4 | 128 | fp16 | 286*4 = 1144 samples/s | 76.5/50 epoch |
## 参考
* https://github.com/Oneflow-Inc/models/tree/main/Vision/classification/image/resnet50
* https://github.com/Oneflow-Inc/oneflow
\ No newline at end of file
import oneflow as flow
import argparse
import numpy as np
import os
import time
from tqdm import tqdm
import sys
sys.path.append(".")
from models.resnet50 import resnet50
from utils.ofrecord_data_utils import OFRecordDataLoader
def _parse_args():
parser = argparse.ArgumentParser("flags for train resnet50")
parser.add_argument(
"--save_checkpoint_path",
type=str,
default="./checkpoints",
help="save checkpoint root dir",
)
parser.add_argument(
"--load_checkpoint", type=str, default="", help="load checkpoint"
)
parser.add_argument(
"--ofrecord_path", type=str, default="./ofrecord/", help="dataset path"
)
# training hyper-parameters
parser.add_argument(
"--learning_rate", type=float, default=0.001, help="learning rate"
)
parser.add_argument("--mom", type=float, default=0.9, help="momentum")
parser.add_argument("--epochs", type=int, default=10, help="training epochs")
parser.add_argument(
"--train_batch_size", type=int, default=16, help="train batch size"
)
parser.add_argument("--val_batch_size", type=int, default=4, help="val batch size")
parser.add_argument(
"--results", type=str, default="./results", help="tensorboard file path"
)
parser.add_argument("--tag", type=str, default="default", help="tag of experiment")
parser.add_argument(
"--print_interval", type=int, default=10, help="print info frequency"
)
return parser.parse_args()
def setup(args):
    """Build the train/val dataloaders plus an eager model and an nn.Graph
    model that start from identical weights.

    Args:
        args: parsed CLI namespace (see ``_parse_args``).

    Returns:
        dict with keys ``"train_dataloader"``, ``"val_dataloader"``,
        ``"eager"`` ([model, optimizer, criterion]) and
        ``"graph"`` ([model, train graph, eval graph]).
    """
    train_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="train",
        dataset_size=9469,
        batch_size=args.train_batch_size,
    )
    val_data_loader = OFRecordDataLoader(
        ofrecord_root=args.ofrecord_path,
        mode="val",
        dataset_size=3925,
        batch_size=args.val_batch_size,
    )

    # model setup: copy the eager weights into the graph model so both
    # execution modes train from the exact same initialization.
    eager_model = resnet50()
    graph_model = resnet50()
    graph_model.load_state_dict(eager_model.state_dict())
    eager_model.to("cuda")
    graph_model.to("cuda")

    # optimizer setup
    eager_optimizer = flow.optim.SGD(
        eager_model.parameters(), lr=args.learning_rate, momentum=args.mom
    )
    graph_optimizer = flow.optim.SGD(
        graph_model.parameters(), lr=args.learning_rate, momentum=args.mom
    )

    # criterion setup (the original created a second CrossEntropyLoss earlier
    # that was immediately overwritten here; that dead assignment is removed)
    criterion = flow.nn.CrossEntropyLoss()
    criterion = criterion.to("cuda")

    class ModelTrainGraph(flow.nn.Graph):
        """nn.Graph wrapper running forward, loss and backward in one call."""

        def __init__(self):
            super().__init__()
            self.graph_model = graph_model
            self.criterion = criterion
            self.add_optimizer(graph_optimizer)

        def build(self, image, label):
            logits = self.graph_model(image)
            loss = self.criterion(logits, label)
            loss.backward()
            return loss

    class ModelEvalGraph(flow.nn.Graph):
        """nn.Graph wrapper for gradient-free softmax inference."""

        def __init__(self):
            super().__init__()
            self.graph_model = graph_model

        def build(self, image):
            with flow.no_grad():
                logits = self.graph_model(image)
                predictions = logits.softmax()
            return predictions

    model_train_graph = ModelTrainGraph()
    model_eval_graph = ModelEvalGraph()

    dic = {
        "train_dataloader": train_data_loader,
        "val_dataloader": val_data_loader,
        "eager": [eager_model, eager_optimizer, criterion],
        "graph": [graph_model, model_train_graph, model_eval_graph],
    }
    return dic
class Trainer(object):
    """Drive the eager model and the nn.Graph model through identical
    training/validation workloads and record per-step losses, per-epoch
    accuracies, timings and parameter differences for later comparison."""

    def __init__(self, args):
        """Initialize empty metric accumulators.

        Args:
            args: parsed CLI namespace (see ``_parse_args``).
        """
        super().__init__()
        # Training losses, sampled every ``print_interval`` iterations.
        self.graph_losses = []
        self.eager_losses = []
        # Per-epoch top-1 validation accuracy for each mode.
        self.graph_acc = []
        self.eager_acc = []
        # Per-iteration step durations in seconds.
        self.graph_train_step_time_list = []
        self.eager_train_step_time_list = []
        # Per-epoch durations; each mode's entry excludes the time the
        # *other* mode spent inside the shared loop.
        self.graph_train_epoch_time_list = []
        self.eager_train_epoch_time_list = []
        self.graph_eval_epoch_time_list = []
        self.eager_eval_epoch_time_list = []
        # Mean absolute eager-vs-graph parameter difference after each step.
        self.eager_graph_model_diff_list = []
        self.graph_train_total_time = 0.0
        self.eager_train_total_time = 0.0
        self.graph_eval_total_time = 0.0
        self.eager_val_total_time = 0.0
        self.args = args

    def compare_eager_graph(self, compare_dic):
        """Train and validate both execution modes for ``args.epochs`` epochs.

        Args:
            compare_dic: dict produced by ``setup`` holding the dataloaders,
                the eager model/optimizer/criterion and the graph
                model/train-graph/eval-graph.
        """
        train_data_loader = compare_dic["train_dataloader"]
        val_data_loader = compare_dic["val_dataloader"]
        eager_model, eager_optimizer, criterion = compare_dic["eager"]
        graph_model, model_train_graph, model_eval_graph = compare_dic["graph"]
        all_samples = len(val_data_loader) * self.args.val_batch_size
        print_interval = self.args.print_interval
        print("start training")
        for epoch in range(self.args.epochs):
            # train
            eager_model.train()
            graph_model.train()
            start_training_time = time.time()
            total_graph_iter_time, total_eager_iter_time = 0, 0
            for b in range(len(train_data_loader)):
                image, label = train_data_loader()
                image = image.to("cuda")
                label = label.to("cuda")
                # oneflow graph train
                graph_iter_start_time = time.time()
                graph_loss = model_train_graph(image, label)
                graph_loss.numpy()  # for synchronize CPU and GPU, get accurate running time
                graph_iter_end_time = time.time()
                # oneflow eager train
                eager_iter_start_time = time.time()
                logits = eager_model(image)
                eager_loss = criterion(logits, label)
                eager_loss.backward()
                eager_optimizer.step()
                eager_optimizer.zero_grad()
                eager_loss.numpy()  # for synchronize CPU and GPU, get accurate running time
                eager_iter_end_time = time.time()
                # record how far the two models' weights have drifted apart
                model_param_diff = compare_model_params(eager_model, model_train_graph)
                self.eager_graph_model_diff_list.append(model_param_diff)
                # get time
                graph_iter_time = graph_iter_end_time - graph_iter_start_time
                eager_iter_time = eager_iter_end_time - eager_iter_start_time
                total_graph_iter_time += graph_iter_time
                total_eager_iter_time += eager_iter_time
                if b % print_interval == 0:
                    gl, el = graph_loss.numpy(), eager_loss.numpy()
                    print(
                        "epoch {} train iter {} ; graph loss {} eager loss {}; graph train time: {} eager train time {}".format(
                            epoch, b, gl, el, graph_iter_time, eager_iter_time
                        )
                    )
                    self.graph_losses.append(gl)
                    self.graph_train_step_time_list.append(graph_iter_time)
                    self.eager_losses.append(el)
                    self.eager_train_step_time_list.append(eager_iter_time)
            end_training_time = time.time()
            # Both modes share one loop, so each mode's epoch time is the
            # total wall time minus the other mode's iteration time.
            self.graph_train_epoch_time_list.append(
                end_training_time - start_training_time - total_eager_iter_time
            )
            self.eager_train_epoch_time_list.append(
                end_training_time - start_training_time - total_graph_iter_time
            )
            print("epoch %d train done, start validation" % epoch)
            # validate
            eager_model.eval()
            graph_model.eval()
            graph_correct, eager_correct = 0.0, 0.0
            eval_start_time = time.time()
            total_graph_infer_time, total_eager_infer_time = 0, 0
            for b in tqdm(range(len(val_data_loader))):
                image, label = val_data_loader()
                image = image.to("cuda")
                # graph val
                graph_infer_time = time.time()
                predictions = model_eval_graph(image)
                graph_preds = predictions.numpy()
                graph_clsidxs = np.argmax(graph_preds, axis=1)
                total_graph_infer_time += time.time() - graph_infer_time
                # eager val
                eager_infer_time = time.time()
                with flow.no_grad():
                    logits = eager_model(image)
                    predictions = logits.softmax()
                eager_preds = predictions.numpy()
                eager_clsidxs = np.argmax(eager_preds, axis=1)
                total_eager_infer_time += time.time() - eager_infer_time
                label_nd = label.numpy()
                for i in range(self.args.val_batch_size):
                    if graph_clsidxs[i] == label_nd[i]:
                        graph_correct += 1
                    if eager_clsidxs[i] == label_nd[i]:
                        eager_correct += 1
            eval_end_time = time.time()
            self.graph_eval_epoch_time_list.append(
                eval_end_time - eval_start_time - total_eager_infer_time
            )
            self.eager_eval_epoch_time_list.append(
                eval_end_time - eval_start_time - total_graph_infer_time
            )
            graph_top1_acc, eager_top1_acc = (
                graph_correct / all_samples,
                eager_correct / all_samples,
            )
            self.graph_acc.append(graph_top1_acc)
            self.eager_acc.append(eager_top1_acc)
            print(
                "epoch %d, graph top1 val acc: %f, eager top1 val acc: %f"
                % (epoch, graph_top1_acc, eager_top1_acc)
            )

    def save_report(self):
        """Write the eager-vs-graph summary report to
        ``<args.results>/check_report.txt``."""
        print("***** Save Report *****")
        # folder setup
        report_path = os.path.join(self.args.results)
        os.makedirs(report_path, exist_ok=True)
        # calculate absolute loss difference
        abs_loss_diff = abs(np.array(self.eager_losses) - np.array(self.graph_losses))
        # calculate losses linear correlation
        loss_corr = calc_corr(self.eager_losses, self.graph_losses)
        # calculate accuracy linear correlation
        acc_corr = calc_corr(self.eager_acc, self.graph_acc)
        # training time compare
        train_time_compare = time_compare(
            self.graph_train_epoch_time_list, self.eager_train_epoch_time_list
        )
        # validate time compare
        val_time_compare = time_compare(
            self.graph_eval_epoch_time_list, self.eager_eval_epoch_time_list
        )
        # eager graph model diff compare
        model_diff_compare = np.array(self.eager_graph_model_diff_list)
        # save report; a context manager closes the file even if a write
        # fails (the original leaked the handle on error)
        save_path = os.path.join(report_path, "check_report.txt")
        with open(save_path, "w") as writer:
            writer.write("Check Report\n")
            writer.write("Model: Resnet50\n")
            writer.write("Check Results Between Eager Model and Graph Model\n")
            writer.write("=================================================\n")
            writer.write("Loss Correlation: %.4f\n\n" % loss_corr)
            writer.write("Max Loss Difference: %.4f\n" % abs_loss_diff.max())
            writer.write("Min Loss Difference: %.4f\n" % abs_loss_diff.min())
            writer.write(
                "Loss Difference Range: (%.4f, %.4f)\n\n"
                % (abs_loss_diff.min(), abs_loss_diff.max())
            )
            writer.write(
                "Model Param Difference Range: (%.4f, %.4f)\n\n"
                % (model_diff_compare.min(), model_diff_compare.max())
            )
            writer.write("Accuracy Correlation: %.4f\n\n" % acc_corr)
            writer.write(
                "Train Time Compare: %.4f (Eager) : %.4f (Graph)\n\n"
                % (1.0, train_time_compare)
            )
            writer.write(
                "Val Time Compare: %.4f (Eager) : %.4f (Graph)"
                % (1.0, val_time_compare)
            )
        print("Report saved to: ", save_path)

    def save_result(self):
        """Dump every recorded metric list to ``<args.results>/<args.tag>/``."""
        # create folder
        training_results_path = os.path.join(self.args.results, self.args.tag)
        os.makedirs(training_results_path, exist_ok=True)
        print("***** Save Results *****")
        save_results(
            self.graph_losses, os.path.join(training_results_path, "graph_losses.txt")
        )
        save_results(
            self.eager_losses, os.path.join(training_results_path, "eager_losses.txt")
        )
        save_results(
            self.graph_acc, os.path.join(training_results_path, "graph_acc.txt")
        )
        save_results(
            self.eager_acc, os.path.join(training_results_path, "eager_acc.txt")
        )
        save_results(
            self.graph_train_step_time_list,
            os.path.join(training_results_path, "graph_train_step_time_list.txt"),
        )
        save_results(
            self.eager_train_step_time_list,
            os.path.join(training_results_path, "eager_train_step_time_list.txt"),
        )
        save_results(
            self.graph_train_epoch_time_list,
            os.path.join(training_results_path, "graph_train_epoch_time_list.txt"),
        )
        save_results(
            self.eager_train_epoch_time_list,
            os.path.join(training_results_path, "eager_train_epoch_time_list.txt"),
        )
        save_results(
            self.graph_eval_epoch_time_list,
            os.path.join(training_results_path, "graph_eval_epoch_time_list.txt"),
        )
        save_results(
            self.eager_eval_epoch_time_list,
            os.path.join(training_results_path, "eager_eval_epoch_time_list.txt"),
        )
        save_results(
            self.eager_graph_model_diff_list,
            os.path.join(training_results_path, "eager_graph_model_diff_list.txt"),
        )
        print("Results saved to: ", training_results_path)
def compare_model_params(eager_model, graph_model):
    """Return the mean absolute per-parameter difference between the eager
    model and the model wrapped inside an nn.Graph.

    Args:
        eager_model: the eager module.
        graph_model: an nn.Graph whose ``graph_model`` attribute holds the
            wrapped module (graph parameters expose the underlying tensor
            via ``_origin``).

    Returns:
        float: average over parameters of mean(|eager - graph|).
    """
    # Materialize both state dicts once instead of rebuilding them on every
    # loop iteration (the original called state_dict() twice per key).
    eager_state = eager_model.state_dict()
    graph_state = graph_model.graph_model.state_dict()
    num_params = len(eager_state.keys())
    sum_diff = 0.0
    for key in eager_state:
        mean_single_diff = (
            (eager_state[key] - graph_state[key]._origin).abs().mean()
        )
        sum_diff += mean_single_diff
    mean_diff = float(sum_diff.numpy() / num_params)
    return mean_diff
def save_results(training_info, file_path):
    """Write one value per line (formatted as ``%f``) to *file_path*.

    Uses a context manager so the file is closed even if a write fails
    (the original left the handle open on error).
    """
    with open(file_path, "w") as writer:
        for info in training_info:
            writer.write("%f\n" % info)
# report helpers
def square(lst):
    """Return a new list containing the square of every element of *lst*."""
    return [value ** 2 for value in lst]
# calculate correlation
def calc_corr(a, b):
    """Return the Pearson correlation coefficient between sequences a and b.

    Computed as cov(a, b) / (std(a) * std(b)) using the E[xy] - E[x]E[y]
    identity for the covariance.
    """
    mean_a = np.mean(a)
    mean_b = np.mean(b)
    mean_ab = np.mean([x * y for x, y in zip(a, b)])
    cov_ab = mean_ab - mean_a * mean_b
    var_a = np.mean([x ** 2 for x in a]) - mean_a ** 2
    var_b = np.mean([y ** 2 for y in b]) - mean_b ** 2
    return cov_ab / (np.sqrt(var_a) * np.sqrt(var_b))
def time_compare(a, b):
    """Return the mean of the elementwise ratios a[i] / b[i]."""
    ratios = np.divide(a, b)
    return ratios.mean()
if __name__ == "__main__":
    # Entry point: parse flags, build dataloaders/models, run the
    # eager-vs-graph comparison, then persist raw metrics and the report.
    args = _parse_args()
    trainer = Trainer(args)
    compare_dic = setup(args)
    print("init done")
    trainer.compare_eager_graph(compare_dic)
    # Drop references to the models/dataloaders before writing outputs.
    del compare_dic
    # save results
    trainer.save_result()
    trainer.save_report()
# Download the small ofrecord dataset if missing, prepare the checkpoint
# directory and run the eager-vs-graph consistency check.
set -aux

OFRECORD_PATH="ofrecord"
if [ ! -d "$OFRECORD_PATH" ]; then
    wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/imagenette_ofrecord.tar.gz
    tar zxf imagenette_ofrecord.tar.gz
fi

CHECKPOINT_PATH="checkpoints"
# mkdir -p is idempotent, so the existence test is no longer needed.
mkdir -p "$CHECKPOINT_PATH"

LEARNING_RATE=0.001
MOM=0.9
EPOCH=20
TRAIN_BATCH_SIZE=16
VAL_BATCH_SIZE=16

# BUGFIX: the original ended with a dangling line continuation ("\" with no
# following line); the last argument must not be followed by a backslash.
python3 check/check.py \
    --save_checkpoint_path "$CHECKPOINT_PATH" \
    --ofrecord_path "$OFRECORD_PATH" \
    --learning_rate $LEARNING_RATE \
    --mom $MOM \
    --epochs $EPOCH \
    --train_batch_size $TRAIN_BATCH_SIZE \
    --val_batch_size $VAL_BATCH_SIZE
\ No newline at end of file
import os
import argparse
import matplotlib.pyplot as plt
def _parse_args():
parser = argparse.ArgumentParser("flags for draw image")
parser.add_argument(
"--txt_root",
type=str,
default="./results/check_info/",
help="your txt root dir",
)
parser.add_argument(
"--save_root",
type=str,
default="./results/picture/",
help="your draw image save dir",
)
return parser.parse_args()
# helpers
def load_data(file_path):
    """Read a text file containing one float per line into a list."""
    with open(file_path, "r") as f:
        return [float(line.strip()) for line in f]
def draw_and_save(info_dic):
    """Render one comparison chart described by *info_dic* and save it.

    Overlays one labeled curve per txt file, or — when ``do_abs_minus`` is
    set — plots the elementwise absolute difference of exactly two files.
    """
    # info_dic: {
    #     "title": "compare_loss"
    #     "save_path": "your_save_path"
    #     "txts": [a.txt, b.txt],
    #     "names": [a_name, b_name],
    #     "xlabel": "epochs",
    #     "ylabel": "acc",
    #     "xlim": [0, 1], # Optional
    #     "ylim": [0, 1], # Optional
    #     "do_abs_minus": False # Optional
    # }
    title, save_path = info_dic["title"], info_dic["save_path"]
    txts, labels = info_dic["txts"], info_dic["names"]
    xlabel, ylabel = info_dic["xlabel"], info_dic["ylabel"]
    # 0 doubles as the "not provided" sentinel for the optional axis limits.
    xlim, ylim = info_dic.get("xlim", 0), info_dic.get("ylim", 0)
    do_abs_minus = info_dic.get("do_abs_minus", False)
    assert len(txts) == len(labels)
    # setup
    plt.rcParams["figure.dpi"] = 100
    # Reset the implicit global figure so successive calls don't overlap.
    plt.clf()
    plt.xlabel(xlabel, fontproperties="Times New Roman")
    plt.ylabel(ylabel, fontproperties="Times New Roman")
    if xlim:
        plt.xlim(xlim[0], xlim[1])
    if ylim:
        plt.ylim(ylim[0], ylim[1])
    if do_abs_minus:
        # Plot |data1 - data2| as a single curve.
        assert len(txts) == 2
        data1, data2 = load_data(txts[0]), load_data(txts[1])
        assert len(data1) == len(data2)
        idxs = [i for i in range(len(data1))]
        abs_data = [abs(data1[i] - data2[i]) for i in idxs]
        plt.plot(idxs, abs_data)
    else:
        # Overlay one labeled curve per txt file.
        for txt, label in zip(txts, labels):
            data = load_data(txt)
            idxs = [i for i in range(len(data))]
            plt.plot(idxs, data, label=label)
    plt.title(title)
    plt.legend(loc="upper right", frameon=True, fontsize=8)
    plt.savefig(save_path)
def add_pth(a, b):
    """Join two path components with the platform separator."""
    joined = os.path.join(a, b)
    return joined
if __name__ == "__main__":
    # Entry point: read the metric txt files dumped by check/check.py and
    # render one comparison chart per metric into save_root.
    args = _parse_args()
    txt_root = args.txt_root
    save_root = args.save_root
    assert os.path.exists(
        txt_root
    ), 'you should run "check/check.sh" before drawing graphs'
    # draw and save
    os.makedirs(save_root, exist_ok=True)
    # Per-epoch validation accuracy, eager vs graph.
    draw_and_save(
        {
            "title": "compare_acc",
            "save_path": add_pth(save_root, "acc.png"),
            "txts": [
                add_pth(txt_root, "eager_acc.txt"),
                add_pth(txt_root, "graph_acc.txt"),
            ],
            "names": ["eager_acc", "graph_acc"],
            "xlabel": "epochs",
            "ylabel": "acc",
        }
    )
    # Training loss curves, eager vs graph.
    draw_and_save(
        {
            "title": "compare_loss",
            "save_path": add_pth(save_root, "compare_loss.png"),
            "txts": [
                add_pth(txt_root, "eager_losses.txt"),
                add_pth(txt_root, "graph_losses.txt"),
            ],
            "names": ["eager_loss", "graph_loss"],
            "xlabel": "epochs",
            "ylabel": "loss",
        }
    )
    # Per-iteration training step time.
    draw_and_save(
        {
            "title": "compare_train_step_time",
            "save_path": add_pth(save_root, "compare_train_step_time.png"),
            "txts": [
                add_pth(txt_root, "eager_train_step_time_list.txt"),
                add_pth(txt_root, "graph_train_step_time_list.txt"),
            ],
            "names": ["eager_step_time", "graph_step_time"],
            "xlabel": "iters",
            "ylabel": "time(s)",
            "ylim": [0, 1],
        }
    )
    # Per-epoch training time.
    draw_and_save(
        {
            "title": "compare_train_epoch_time",
            "save_path": add_pth(save_root, "compare_train_epoch_time.png"),
            "txts": [
                add_pth(txt_root, "eager_train_epoch_time_list.txt"),
                add_pth(txt_root, "graph_train_epoch_time_list.txt"),
            ],
            "names": ["eager_epoch_time", "graph_epoch_time"],
            "xlabel": "epochs",
            "ylabel": "time(s)",
        }
    )
    # Per-epoch validation time.
    draw_and_save(
        {
            "title": "compare_eval_epoch_time",
            "save_path": add_pth(save_root, "compare_eval_epoch_time.png"),
            "txts": [
                add_pth(txt_root, "eager_eval_epoch_time_list.txt"),
                add_pth(txt_root, "graph_eval_epoch_time_list.txt"),
            ],
            "names": ["eager_eval_time", "graph_eval_time"],
            "xlabel": "epochs",
            "ylabel": "time(s)",
        }
    )
    # Elementwise |eager loss - graph loss| per sampled iteration.
    draw_and_save(
        {
            "title": "compare_abs_loss",
            "save_path": add_pth(save_root, "compare_abs_loss.png"),
            "txts": [
                add_pth(txt_root, "eager_losses.txt"),
                add_pth(txt_root, "graph_losses.txt"),
            ],
            "names": ["eager_loss", "graph_loss"],
            "xlabel": "iters",
            "ylabel": "abs_loss",
            "do_abs_minus": True,
        }
    )
    # Mean absolute parameter difference between the two models per step.
    draw_and_save(
        {
            "title": "compare_abs_model_param",
            "save_path": add_pth(save_root, "compare_abs_model_param.png"),
            "txts": [add_pth(txt_root, "eager_graph_model_diff_list.txt")],
            "names": ["model_param_abs_diff"],
            "xlabel": "iters",
            "ylabel": "abs_diff",
        }
    )
# Render the comparison charts from the metric txt files produced by check.sh.
set -aux

TXT_ROOT="results/default/"
SAVE_ROOT="results/picture/"

# BUGFIX: the original ended with a dangling line continuation after the
# last argument; the final argument must not be followed by a backslash.
python3 check/draw.py \
    --txt_root "$TXT_ROOT" \
    --save_root "$SAVE_ROOT"
\ No newline at end of file
import argparse
import math
import oneflow as flow
# Lazily-initialized, process-wide cache of the parsed arguments.
_GLOBAL_ARGS = None


def get_args():
    """Parse the command-line arguments once and reuse them on later calls."""
    global _GLOBAL_ARGS
    if _GLOBAL_ARGS is not None:
        return _GLOBAL_ARGS
    _GLOBAL_ARGS = parse_args()
    return _GLOBAL_ARGS
def str2bool(v):
    """Interpret a human-friendly string as a boolean for argparse.

    Raises argparse.ArgumentTypeError for any unrecognized spelling.
    """
    value = v.lower()
    if value in {"yes", "true", "t", "y", "1"}:
        return True
    if value in {"no", "false", "f", "n", "0"}:
        return False
    raise argparse.ArgumentTypeError("Unsupported value encountered.")
def parse_args(ignore_unknown_args=False):
    """Parse the OneFlow ResNet50 command-line arguments.

    Validates unsupported option combinations, derives the global batch
    sizes and batches-per-epoch from the world size, and prints the final
    namespace on rank 0.

    Args:
        ignore_unknown_args: when True, unknown flags are silently ignored
            (``parse_known_args``) instead of being an error.

    Returns:
        argparse.Namespace: the fully-resolved arguments.

    Raises:
        ValueError: for multi-node runs, ddp+graph, fp16 in eager mode,
            ddp without local metrics, or ddp with scale_grad.
    """
    parser = argparse.ArgumentParser(
        description="OneFlow ResNet50 Arguments", allow_abbrev=False
    )
    parser.add_argument(
        "--save",
        type=str,
        default=None,
        dest="save_path",
        help="root dir of saving checkpoint",
    )
    parser.add_argument(
        "--save-init",
        action="store_true",
        dest="save_init",
        help="save right on init model finished",
    )
    parser.add_argument(
        "--load",
        type=str,
        default=None,
        dest="load_path",
        help="root dir of loading checkpoint",
    )
    parser.add_argument(
        "--ofrecord-path",
        type=str,
        default="./ofrecord",
        dest="ofrecord_path",
        help="dataset path",
    )
    parser.add_argument(
        "--ofrecord-part-num",
        type=int,
        default=1,
        dest="ofrecord_part_num",
        help="ofrecord data part number",
    )
    parser.add_argument(
        "--use-gpu-decode",
        action="store_true",
        dest="use_gpu_decode",
        help="Use gpu decode.",
    )
    parser.add_argument(
        "--synthetic-data",
        action="store_true",
        dest="synthetic_data",
        help="Use synthetic data",
    )
    # fuse bn relu or bn add relu
    parser.add_argument(
        "--fuse-bn-relu",
        action="store_true",
        dest="fuse_bn_relu",
        help="Whether to use use fuse batch_normalization and relu.",
    )
    parser.add_argument(
        "--fuse-bn-add-relu",
        action="store_true",
        dest="fuse_bn_add_relu",
        help="Whether to use use fuse batch_normalization, add and relu.",
    )
    # training hyper-parameters
    parser.add_argument(
        "--train-batch-size",
        type=int,
        default=32,
        dest="train_batch_size",
        help="train batch size",
    )
    parser.add_argument(
        "--val-batch-size",
        type=int,
        default=32,
        dest="val_batch_size",
        help="val batch size",
    )
    parser.add_argument(
        "--train-global-batch-size",
        type=int,
        default=None,
        dest="train_global_batch_size",
        help="train batch size",
    )
    parser.add_argument(
        "--val-global-batch-size",
        type=int,
        default=None,
        dest="val_global_batch_size",
        help="val batch size",
    )
    parser.add_argument(
        "--num-devices-per-node",
        type=int,
        default=1,
        dest="num_devices_per_node",
        help="",
    )
    parser.add_argument(
        "--num-nodes",
        type=int,
        default=1,
        dest="num_nodes",
        help="node/machine number for training",
    )
    parser.add_argument("--lr", type=float, default=0.256, dest="learning_rate")
    parser.add_argument("--wd", type=float, default=1.0 / 32768, dest="weight_decay")
    parser.add_argument("--momentum", type=float, default=0.875, help="momentum")
    parser.add_argument(
        "--lr-decay-type",
        type=str,
        default="cosine",
        choices=["none", "cosine", "step"],
        dest="lr_decay_type",
        help="cosine, step",
    )
    parser.add_argument(
        "--grad-clipping",
        type=float,
        default=0.0,
        dest="grad_clipping",
        help="gradient clipping",
    )
    parser.add_argument(
        "--warmup-epochs",
        type=int,
        default=5,
        dest="warmup_epochs",
        help="the epochs to warmp-up lr to scaled large-batch value",
    )
    parser.add_argument("--legacy-init", action="store_true", dest="legacy_init")
    parser.add_argument(
        "--use-fp16", action="store_true", help="Run model in fp16 mode."
    )
    parser.add_argument(
        "--num-epochs", type=int, default=90, dest="num_epochs", help="number of epochs"
    )
    parser.add_argument(
        "--nccl-fusion-threshold-mb",
        type=int,
        default=16,
        dest="nccl_fusion_threshold_mb",
        help="NCCL fusion threshold megabytes, set to 0 to compatible with previous version of OneFlow.",
    )
    parser.add_argument(
        "--nccl-fusion-max-ops",
        type=int,
        default=24,
        dest="nccl_fusion_max_ops",
        help="Maximum number of ops of NCCL fusion, set to 0 to compatible with previous version of OneFlow.",
    )
    parser.add_argument(
        "--zero-init-residual",
        type=str2bool,
        default=True,
        nargs="?",
        const=True,
        dest="zero_init_residual",
    )
    parser.add_argument(
        "--scale-grad",
        action="store_true",
        dest="scale_grad",
        help="scale init grad with world_size",
    )
    # for data process
    parser.add_argument(
        "--num-classes",
        type=int,
        default=1000,
        dest="num_classes",
        help="num of pic classes",
    )
    parser.add_argument(
        "--channel-last", action="store_true", dest="channel_last",
    )
    parser.add_argument(
        "--samples-per-epoch",
        type=int,
        default=1281167,
        dest="samples_per_epoch",
        help="train pic number",
    )
    parser.add_argument(
        "--val-samples-per-epoch",
        type=int,
        default=50000,
        dest="val_samples_per_epoch",
        help="validation pic number",
    )
    parser.add_argument(
        "--label-smoothing",
        type=float,
        default=0.1,
        dest="label_smoothing",
        help="label smoothing factor",
    )
    parser.add_argument(
        "--batches-per-epoch", type=int, default=None, dest="batches_per_epoch",
    )
    parser.add_argument(
        "--val-batches-per-epoch", type=int, default=None, dest="val_batches_per_epoch",
    )
    parser.add_argument(
        "--total-batches", type=int, default=-1, dest="total_batches",
    )
    parser.add_argument("--skip-eval", action="store_true", dest="skip_eval")
    # log and loss print
    parser.add_argument(
        "--print-interval",
        type=int,
        default=100,
        dest="print_interval",
        help="print loss every n iteration",
    )
    parser.add_argument(
        "--print-timestamp", action="store_true", dest="print_timestamp",
    )
    parser.add_argument(
        "--metric-local",
        type=str2bool,
        default=True,
        nargs="?",
        const=True,
        dest="metric_local",
    )
    parser.add_argument(
        "--metric-train-acc",
        type=str2bool,
        default=True,
        nargs="?",
        const=True,
        dest="metric_train_acc",
    )
    parser.add_argument(
        "--gpu-stat-file",
        type=str,
        default=None,
        dest="gpu_stat_file",
        help="stat gpu utilization and memory usage when print",
    )
    parser.add_argument("--graph", action="store_true", help="Run model in graph mode.")
    parser.add_argument("--ddp", action="store_true", help="Run model in ddp mode.")

    if ignore_unknown_args:
        args, _ = parser.parse_known_args()
    else:
        args = parser.parse_args()

    # Reject option combinations the training code does not support.
    if args.num_nodes > 1:
        raise ValueError("NOT support num_nodes > 1")
    if args.ddp and args.graph:
        raise ValueError("graph and ddp can't be set at the same time")
    if args.use_fp16 and not args.graph:
        raise ValueError("NOT support fp16 in eager mode")
    if args.ddp and not args.metric_local:
        raise ValueError("metric_local must be set to True when with ddp")
    if args.ddp and args.scale_grad:
        raise ValueError("scale_grad is unavailable with ddp")

    # Derive global batch sizes from the per-device ones unless given.
    world_size = flow.env.get_world_size()
    if args.train_global_batch_size is None:
        args.train_global_batch_size = args.train_batch_size * world_size
    else:
        assert args.train_global_batch_size % args.train_batch_size == 0
    if args.val_global_batch_size is None:
        args.val_global_batch_size = args.val_batch_size * world_size
    else:
        assert args.val_global_batch_size % args.val_batch_size == 0

    if args.batches_per_epoch is None:
        # BUGFIX: the original applied math.ceil to an integer floor division
        # (samples // batch), which always floored and silently dropped the
        # final partial batch; divide first so the ceiling takes effect.
        args.batches_per_epoch = math.ceil(
            args.samples_per_epoch / args.train_global_batch_size
        )
    if args.val_batches_per_epoch is None:
        # Validation intentionally floors: only full batches are evaluated.
        args.val_batches_per_epoch = int(
            args.val_samples_per_epoch / args.val_global_batch_size
        )

    if flow.env.get_rank() == 0:
        _print_args(args)

    return args
def _print_args(args):
print("------------------------ arguments ------------------------", flush=True)
str_list = []
for arg in vars(args):
dots = "." * (48 - len(arg))
str_list.append(" {} {} {}".format(arg, dots, getattr(args, arg)))
for arg in sorted(str_list, key=lambda x: x.lower()):
print(arg, flush=True)
print("-------------------- end of arguments ---------------------", flush=True)
if __name__ == "__main__":
    # Running this module directly just parses and prints the arguments.
    get_args()
# set -aux
# Download the demo image and pretrained weights if missing, then run
# eager-mode inference on a single image.
PRETRAIN_MODEL_PATH="resnet50_imagenet_pretrain_model"
IMAGE_PATH="data/fish.jpg"

if [ ! -d "data" ]; then
    wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/cv/data.tar.gz
    tar zxf data.tar.gz
fi

if [ ! -d "$PRETRAIN_MODEL_PATH" ]; then
    wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/cv/classification/resnet50_imagenet_pretrain_model.tar.gz
    tar zxf resnet50_imagenet_pretrain_model.tar.gz
fi

# SRC_DIR=/path/to/models/resnet50
SRC_DIR=$(realpath $(dirname $0)/..)

# BUGFIX: removed the dangling trailing backslash after the last argument.
python3 $SRC_DIR/infer.py \
    --model "$PRETRAIN_MODEL_PATH" \
    --image "$IMAGE_PATH"
# set -aux
# Same as infer.sh but runs the model in nn.Graph mode (--graph).
PRETRAIN_MODEL_PATH="resnet50_imagenet_pretrain_model"
IMAGE_PATH="data/fish.jpg"

if [ ! -d "data" ]; then
    wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/cv/data.tar.gz
    tar zxf data.tar.gz
fi

if [ ! -d "$PRETRAIN_MODEL_PATH" ]; then
    wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/cv/classification/resnet50_imagenet_pretrain_model.tar.gz
    tar zxf resnet50_imagenet_pretrain_model.tar.gz
fi

# SRC_DIR=/path/to/models/resnet50
SRC_DIR=$(realpath $(dirname $0)/..)

# BUGFIX: removed the dangling trailing backslash after the last argument.
python3 $SRC_DIR/infer.py \
    --model "$PRETRAIN_MODEL_PATH" \
    --image "$IMAGE_PATH" \
    --graph
# set -aux
# Single-node DDP fp32 training on 4 devices with synthetic data
# (no dataset required; evaluation skipped).
DEVICE_NUM_PER_NODE=4
MASTER_ADDR=127.0.0.1
NUM_NODES=1
NODE_RANK=0
# Unbuffered Python output so training logs appear immediately.
export PYTHONUNBUFFERED=1
echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
export NCCL_LAUNCH_MODE=PARALLEL
echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
# export NCCL_DEBUG=INFO
# export ONEFLOW_DEBUG_MODE=True
CHECKPOINT_SAVE_PATH="./ddp_fp32_checkpoints"
if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then
mkdir $CHECKPOINT_SAVE_PATH
fi
OFRECORD_PATH="./mini-imagenet/ofrecord"
OFRECORD_PART_NUM=8
LEARNING_RATE=0.384
MOM=0.875
EPOCH=50
TRAIN_BATCH_SIZE=64
VAL_BATCH_SIZE=64
# SRC_DIR=/path/to/models/resnet50
SRC_DIR=$(realpath $(dirname $0)/..)
# Launch one training process per device via oneflow.distributed.launch.
python3 -m oneflow.distributed.launch \
--nproc_per_node $DEVICE_NUM_PER_NODE \
--nnodes $NUM_NODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
$SRC_DIR/train.py \
--save $CHECKPOINT_SAVE_PATH \
--ofrecord-path $OFRECORD_PATH \
--ofrecord-part-num $OFRECORD_PART_NUM \
--num-devices-per-node $DEVICE_NUM_PER_NODE \
--lr $LEARNING_RATE \
--momentum $MOM \
--num-epochs $EPOCH \
--train-batch-size $TRAIN_BATCH_SIZE \
--val-batch-size $VAL_BATCH_SIZE \
--synthetic-data \
--ddp \
--skip-eval
# set -aux
# Single-node DDP training on 8 devices with a background GPU stat monitor.
DEVICE_NUM_PER_NODE=8
MASTER_ADDR=127.0.0.1
NUM_NODES=1
NODE_RANK=0

export PYTHONUNBUFFERED=1
echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
export NCCL_LAUNCH_MODE=PARALLEL
echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
# export NCCL_DEBUG=INFO
# export ONEFLOW_DEBUG_MODE=True

CHECKPOINT_SAVE_PATH="./ddp_stat_checkpoints"
mkdir -p "$CHECKPOINT_SAVE_PATH"

OFRECORD_PATH=PATH_TO_IMAGENET_OFRECORD
OFRECORD_PART_NUM=256
LEARNING_RATE=0.768
MOM=0.875
EPOCH=50
TRAIN_BATCH_SIZE=96
VAL_BATCH_SIZE=50

# SRC_DIR=/path/to/models/resnet50
SRC_DIR=$(realpath $(dirname $0)/..)

# Start the GPU stat monitor in the background and remember its pid.
nohup python3 $SRC_DIR/utils/stat.py > stat.out 2>&1 & echo $! > stat.pid

# BUGFIX: the original left a trailing backslash after "--ddp", which made
# the following "kill" line part of the python command line — the stat
# monitor was never stopped and the trainer received bogus extra arguments.
python3 -m oneflow.distributed.launch \
    --nproc_per_node $DEVICE_NUM_PER_NODE \
    --nnodes $NUM_NODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    $SRC_DIR/train.py \
    --save $CHECKPOINT_SAVE_PATH \
    --ofrecord-path $OFRECORD_PATH \
    --ofrecord-part-num $OFRECORD_PART_NUM \
    --num-devices-per-node $DEVICE_NUM_PER_NODE \
    --lr $LEARNING_RATE \
    --momentum $MOM \
    --num-epochs $EPOCH \
    --train-batch-size $TRAIN_BATCH_SIZE \
    --val-batch-size $VAL_BATCH_SIZE \
    --ddp

# Stop the background stat monitor.
kill -15 $(cat stat.pid)
# set -aux
# Single-device eager-mode smoke-training run on mini-imagenet
# (downloads the dataset if missing; evaluation skipped).
export PYTHONUNBUFFERED=1
echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED

CHECKPOINT_SAVE_PATH="./eager_checkpoints"
mkdir -p "$CHECKPOINT_SAVE_PATH"

OFRECORD_PATH="./mini-imagenet/ofrecord"
if [ ! -d "$OFRECORD_PATH" ]; then
    wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/mini-imagenet.zip
    unzip mini-imagenet.zip
fi

OFRECORD_PART_NUM=8
LEARNING_RATE=0.256
MOM=0.875
EPOCH=50
TRAIN_BATCH_SIZE=64
VAL_BATCH_SIZE=64

# SRC_DIR=/path/to/models/resnet50
SRC_DIR=$(realpath $(dirname $0)/..)

# BUGFIX: removed the dangling trailing backslash after the last argument.
python3 $SRC_DIR/train.py \
    --ofrecord-path $OFRECORD_PATH \
    --ofrecord-part-num $OFRECORD_PART_NUM \
    --num-devices-per-node 1 \
    --lr $LEARNING_RATE \
    --momentum $MOM \
    --num-epochs $EPOCH \
    --warmup-epochs 0 \
    --train-batch-size $TRAIN_BATCH_SIZE \
    --val-batch-size $VAL_BATCH_SIZE \
    --save $CHECKPOINT_SAVE_PATH \
    --samples-per-epoch 64 \
    --val-samples-per-epoch 64 \
    --skip-eval
# set -aux
# Single-node distributed fp32 training (4 devices) resuming from a
# lazily-initialized checkpoint on the full ImageNet ofrecord.
DEVICE_NUM_PER_NODE=4
MASTER_ADDR=127.0.0.1
NUM_NODES=1
NODE_RANK=0

export PYTHONUNBUFFERED=1
echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
export NCCL_LAUNCH_MODE=PARALLEL
echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
# export NCCL_DEBUG=INFO

CHECKPOINT_SAVE_PATH=ddp_checkpoints
mkdir -p "$CHECKPOINT_SAVE_PATH"

CHECKPOINT_LOAD_PATH="./init_ckpt_by_lazy"
OFRECORD_PATH=/dataset/ImageNet/ofrecord/
OFRECORD_PART_NUM=256
LEARNING_RATE=0.256
MOM=0.875
EPOCH=90
TRAIN_BATCH_SIZE=64
VAL_BATCH_SIZE=50

# SRC_DIR=/path/to/models/resnet50
SRC_DIR=$(realpath $(dirname $0)/..)

# BUGFIX: removed the dangling trailing backslash after the last argument.
python3 -m oneflow.distributed.launch \
    --nproc_per_node $DEVICE_NUM_PER_NODE \
    --nnodes $NUM_NODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    $SRC_DIR/train.py \
    --save $CHECKPOINT_SAVE_PATH \
    --load $CHECKPOINT_LOAD_PATH \
    --ofrecord-path $OFRECORD_PATH \
    --ofrecord-part-num $OFRECORD_PART_NUM \
    --num-devices-per-node $DEVICE_NUM_PER_NODE \
    --lr $LEARNING_RATE \
    --momentum $MOM \
    --num-epochs $EPOCH \
    --train-batch-size $TRAIN_BATCH_SIZE \
    --val-batch-size $VAL_BATCH_SIZE
# set -aux
# Single-device nn.Graph smoke-training run on mini-imagenet
# (downloads the dataset if missing; evaluation skipped).
export PYTHONUNBUFFERED=1
echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED

CHECKPOINT_SAVE_PATH="./graph_checkpoints"
mkdir -p "$CHECKPOINT_SAVE_PATH"

OFRECORD_PATH="./mini-imagenet/ofrecord"
if [ ! -d "$OFRECORD_PATH" ]; then
    wget https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/mini-imagenet.zip
    unzip mini-imagenet.zip
fi

OFRECORD_PART_NUM=8
LEARNING_RATE=0.256
MOM=0.875
EPOCH=90
TRAIN_BATCH_SIZE=128
VAL_BATCH_SIZE=128

# SRC_DIR=/path/to/models/resnet50
SRC_DIR=$(realpath $(dirname $0)/..)

# BUGFIX: removed the dangling trailing backslash after the last argument.
python3 $SRC_DIR/train.py \
    --ofrecord-path $OFRECORD_PATH \
    --ofrecord-part-num $OFRECORD_PART_NUM \
    --num-devices-per-node 1 \
    --lr $LEARNING_RATE \
    --momentum $MOM \
    --num-epochs $EPOCH \
    --warmup-epochs 0 \
    --train-batch-size $TRAIN_BATCH_SIZE \
    --val-batch-size $VAL_BATCH_SIZE \
    --save $CHECKPOINT_SAVE_PATH \
    --samples-per-epoch 128 \
    --val-samples-per-epoch 128 \
    --scale-grad \
    --graph \
    --skip-eval
# set -aux
# Distributed nn.Graph fp16 training; set DEVICE_NUM_PER_NODE to the
# number of local accelerators (e.g. 4 for a single-node four-card run).
DEVICE_NUM_PER_NODE=1
MASTER_ADDR=127.0.0.1
NUM_NODES=1
NODE_RANK=0
# Unbuffered Python output so training logs appear immediately.
export PYTHONUNBUFFERED=1
echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
export NCCL_LAUNCH_MODE=PARALLEL
echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
# export NCCL_DEBUG=INFO
# export ONEFLOW_DEBUG_MODE=True
CHECKPOINT_SAVE_PATH="./graph_distributed_fp16_checkpoints"
if [ ! -d "$CHECKPOINT_SAVE_PATH" ]; then
mkdir $CHECKPOINT_SAVE_PATH
fi
OFRECORD_PATH="./mini-imagenet/ofrecord"
OFRECORD_PART_NUM=8
LEARNING_RATE=1.536
MOM=0.875
EPOCH=50
TRAIN_BATCH_SIZE=128
VAL_BATCH_SIZE=128
# SRC_DIR=/path/to/models/resnet50
SRC_DIR=$(realpath $(dirname $0)/..)
# Launch one training process per device via oneflow.distributed.launch.
python3 -m oneflow.distributed.launch \
--nproc_per_node $DEVICE_NUM_PER_NODE \
--nnodes $NUM_NODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
$SRC_DIR/train.py \
--save $CHECKPOINT_SAVE_PATH \
--ofrecord-path $OFRECORD_PATH \
--ofrecord-part-num $OFRECORD_PART_NUM \
--num-devices-per-node $DEVICE_NUM_PER_NODE \
--lr $LEARNING_RATE \
--momentum $MOM \
--num-epochs $EPOCH \
--train-batch-size $TRAIN_BATCH_SIZE \
--val-batch-size $VAL_BATCH_SIZE \
--graph \
--use-fp16 \
--metric-local True \
--metric-train-acc True
# set -aux
# Distributed launch topology; set DEVICE_NUM_PER_NODE=4 for single-node
# four-card training (see README).
DEVICE_NUM_PER_NODE=1
MASTER_ADDR=127.0.0.1
NUM_NODES=1
NODE_RANK=0

export PYTHONUNBUFFERED=1
echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
# NCCL must launch kernels in parallel mode for the multi-process launcher.
export NCCL_LAUNCH_MODE=PARALLEL
echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
# export NCCL_DEBUG=INFO
# export ONEFLOW_DEBUG_MODE=True

# Checkpoint output directory; mkdir -p is idempotent and race-free.
CHECKPOINT_SAVE_PATH="./graph_distributed_fp32_checkpoints"
mkdir -p "$CHECKPOINT_SAVE_PATH"

# mini-imagenet OFRecord dataset (see train_graph.sh for the download step).
OFRECORD_PATH="./mini-imagenet/ofrecord"
OFRECORD_PART_NUM=8

# Hyper-parameters for the fp32 run.
LEARNING_RATE=0.384
MOM=0.875
EPOCH=50
TRAIN_BATCH_SIZE=128
VAL_BATCH_SIZE=128

# Repository root, resolved relative to this script's location.
#SRC_DIR=/home/git/resnet50
SRC_DIR=$(realpath "$(dirname "$0")"/..)

python3 -m oneflow.distributed.launch \
    --nproc_per_node "$DEVICE_NUM_PER_NODE" \
    --nnodes "$NUM_NODES" \
    --node_rank "$NODE_RANK" \
    --master_addr "$MASTER_ADDR" \
    "$SRC_DIR"/train.py \
    --save "$CHECKPOINT_SAVE_PATH" \
    --ofrecord-path "$OFRECORD_PATH" \
    --ofrecord-part-num "$OFRECORD_PART_NUM" \
    --num-devices-per-node "$DEVICE_NUM_PER_NODE" \
    --lr "$LEARNING_RATE" \
    --momentum "$MOM" \
    --num-epochs "$EPOCH" \
    --train-batch-size "$TRAIN_BATCH_SIZE" \
    --val-batch-size "$VAL_BATCH_SIZE" \
    --scale-grad \
    --graph \
    --skip-eval \
    --fuse-bn-add-relu \
    --fuse-bn-relu
import oneflow as flow
from config import get_args
from models.optimizer import make_grad_scaler
from models.optimizer import make_static_grad_scaler
def make_train_graph(
    model, cross_entropy, data_loader, optimizer, lr_scheduler=None, *args, **kwargs
):
    """Build and return a ``TrainGraph`` wired with the given training components."""
    graph = TrainGraph(
        model, cross_entropy, data_loader, optimizer, lr_scheduler, *args, **kwargs
    )
    return graph
def make_eval_graph(model, data_loader):
    """Build and return an ``EvalGraph`` for validation over *data_loader*."""
    graph = EvalGraph(model, data_loader)
    return graph
class TrainGraph(flow.nn.Graph):
    """Static training graph: data loading, forward, loss, backward and
    parameter update fused into a single compiled call.

    Precision/scaling behavior comes from the global args: AMP with a dynamic
    grad scaler under ``--use-fp16``, or a static grad scaler when
    ``--scale-grad`` is set.
    """

    def __init__(
        self,
        model,
        cross_entropy,
        data_loader,
        optimizer,
        lr_scheduler=None,
        return_pred_and_label=True,
    ):
        super().__init__()
        args = get_args()
        self.return_pred_and_label = return_pred_and_label

        # Mixed precision needs a dynamic grad scaler; otherwise an optional
        # static scaler when gradient scaling was requested explicitly.
        if args.use_fp16:
            self.config.enable_amp(True)
            self.set_grad_scaler(make_grad_scaler())
        elif args.scale_grad:
            self.set_grad_scaler(make_static_grad_scaler())

        # Always-on kernel fusion switches.
        self.config.allow_fuse_add_to_output(True)
        self.config.allow_fuse_model_update_ops(True)

        # Disabling cudnn_conv_heuristic_search_algo enables dry-run, which
        # helps on a single device but has no effect with multiple devices —
        # so re-enable heuristic search when more than one node is involved.
        self.config.enable_cudnn_conv_heuristic_search_algo(False)
        self.world_size = flow.env.get_world_size()
        if self.world_size / args.num_devices_per_node > 1:
            self.config.enable_cudnn_conv_heuristic_search_algo(True)

        self.model = model
        self.cross_entropy = cross_entropy
        self.data_loader = data_loader
        self.add_optimizer(optimizer, lr_sch=lr_scheduler)

    def build(self):
        """One training step; returns (loss, pred, label) — pred/label are
        ``None`` unless ``return_pred_and_label`` is set."""
        image, label = self.data_loader()
        image, label = image.to("cuda"), label.to("cuda")
        logits = self.model(image)
        loss = self.cross_entropy(logits, label)
        pred = logits.softmax() if self.return_pred_and_label else None
        if not self.return_pred_and_label:
            label = None
        loss.backward()
        return loss, pred, label
class EvalGraph(flow.nn.Graph):
    """Static evaluation graph: a single forward pass that returns softmax
    predictions together with the ground-truth labels."""

    def __init__(self, model, data_loader):
        super().__init__()
        # Mirror the training precision: AMP when --use-fp16 was requested.
        if get_args().use_fp16:
            self.config.enable_amp(True)
        self.config.allow_fuse_add_to_output(False)
        self.data_loader = data_loader
        self.model = model

    def build(self):
        image, label = self.data_loader()
        image, label = image.to("cuda"), label.to("cuda")
        pred = self.model(image).softmax()
        return pred, label
import os
import sys
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))
)
import argparse
import numpy as np
import time
import oneflow as flow
from models.resnet50 import resnet50
from utils.imagenet1000_clsidx_to_labels import clsidx_2_labels
from utils.numpy_data_utils import load_image
def _parse_args():
    """Parse command-line flags for single-image resnet50 inference.

    Returns a namespace with ``model_path`` (pretrained weights directory),
    ``image_path`` (required input image) and ``graph`` (run in graph mode).
    """
    p = argparse.ArgumentParser("flags for test resnet50")
    p.add_argument(
        "--model",
        dest="model_path",
        type=str,
        default="./resnet50_imagenet_pretrain_model",
        help="model path",
    )
    p.add_argument(
        "--image",
        dest="image_path",
        type=str,
        required=True,
        default=None,
        help="input image path",
    )
    p.add_argument("--graph", action="store_true", help="Run model in graph mode.")
    return p.parse_args()
class InferGraph(flow.nn.Graph):
    """Static inference graph: gradient-free forward pass returning class
    probabilities (softmax over the model logits)."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def build(self, image):
        # No gradients are needed for inference.
        with flow.no_grad():
            probs = self.model(image).softmax()
        return probs
def main(args):
    """Load a pretrained resnet50 and classify a single image.

    Prints the predicted class name, its probability, and elapsed time for
    both model initialization and inference. Uses the compiled ``InferGraph``
    when ``--graph`` is given, eager mode otherwise.
    """
    start_t = time.perf_counter()
    print("***** Model Init *****")
    model = resnet50()
    model.load_state_dict(flow.load(args.model_path))
    model = model.to("cuda")
    model.eval()
    end_t = time.perf_counter()
    # Fix: original message misspelled "elapsed" as "escapled".
    print(f"***** Model Init Finish, time elapsed {end_t - start_t:.6f} s *****")

    if args.graph:
        model_graph = InferGraph(model)

    # Time the inference phase separately from model init.
    start_t = end_t
    image = load_image(args.image_path)
    image = flow.Tensor(image, device=flow.device("cuda"))
    if args.graph:
        pred = model_graph(image)
    else:
        pred = model(image).softmax()

    pred = pred.numpy()
    prob = np.max(pred)
    clsidx = np.argmax(pred)
    cls = clsidx_2_labels[clsidx]
    end_t = time.perf_counter()
    # f-string for consistency with the init message above (was str.format).
    print(
        f"predict image ({os.path.basename(args.image_path)}) prob: {prob:.5f}, "
        f"class name: {cls}, time elapsed: {end_t - start_t:.6f} s"
    )
if __name__ == "__main__":
    # Script entry point: parse CLI flags and run inference.
    main(_parse_args())
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment