"...git@developer.sourcefind.cn:wangsen/paddle_dbnet.git" did not exist on "967f06768b49d0b2ab79e537717dd5fdd0e8fafb"
Commit e3c596d9 authored by Leif

Merge remote-tracking branch 'origin/dygraph' into dygraph

parents 357657f0 efc09082
@@ -47,6 +47,13 @@ bash test_tipc/test_train_python.sh ./test_tipc/train_infer_python_PACT.txt 'lit
bash test_tipc/test_train_python.sh ./test_tipc/train_infer_python_FPGM.txt 'lite_train_lite_infer'
```
The configuration files for multi-machine multi-GPU runs are `train_infer_python_fleet.txt`, `train_infer_python_FPGM_fleet.txt`, and `train_infer_python_PACT_fleet.txt`.
Before running, modify the `gpu_list:xx.xx.xx.xx,yy.yy.yy.yy;0,1` entry in the configuration file, replacing `xx.xx.xx.xx` with the actual IP addresses, separated by `,` (one way to patch this entry programmatically is sketched after the command below). Note that, unlike single-machine training, launching multi-machine multi-GPU training requires running the command separately on every node. Taking multi-machine multi-GPU quantization training as an example, the command is:
```
bash test_tipc/test_train_python.sh ./test_tipc/train_infer_python_PACT_fleet.txt 'lite_train_lite_infer'
```
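As referenced above, a minimal sketch for patching the `gpu_list` entry before launch; the node IP addresses below are hypothetical placeholders, not values from this commit:

```python
from pathlib import Path

# Hypothetical node addresses; substitute your own.
config_path = Path("test_tipc/train_infer_python_PACT_fleet.txt")
node_ips = ["192.168.10.21", "192.168.10.22"]

lines = config_path.read_text().splitlines()
for i, line in enumerate(lines):
    if line.startswith("gpu_list:"):
        # Keep the GPU ids after ';', replace the IP list before it.
        _, _, gpu_ids = line.partition(";")
        lines[i] = "gpu_list:" + ",".join(node_ips) + ";" + gpu_ids
config_path.write_text("\n".join(lines) + "\n")
```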
After running the corresponding command, the run logs are automatically saved under the `test_tipc/output` folder. For example, after a run in 'lite_train_lite_infer' mode, the `test_tipc/extra_output` folder contains the following files:
```
......
@@ -35,7 +35,6 @@ use_share_conv_key=$(func_parser_key "${lines[13]}")
use_share_conv_list=$(func_parser_value "${lines[13]}")
run_train_py=$(func_parser_value "${lines[14]}")
LOG_PATH="./test_tipc/extra_output"
mkdir -p ${LOG_PATH}
status_log="${LOG_PATH}/results_python.log"
@@ -98,6 +97,8 @@ if [ ${MODE} = "lite_train_lite_infer" ] || [ ${MODE} = "whole_train_whole_infer
cmd="${python} ${run_train_py} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_checkpoints} ${set_autocast} ${set_batchsize} ${set_use_custom_op} ${set_model_type} ${set_use_share_conv} ${set_amp_config}"
elif [ ${#ips} -le 26 ];then # train with multi-gpu
cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train_py} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_checkpoints} ${set_autocast} ${set_batchsize} ${set_use_custom_op} ${set_model_type} ${set_use_share_conv} ${set_amp_config}"
else
cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train_py} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_checkpoints} ${set_autocast} ${set_batchsize} ${set_use_custom_op} ${set_model_type} ${set_use_share_conv} ${set_amp_config}"
fi
# run train
......
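The `[ ${#ips} -le 26 ]` test above is a string-length heuristic: a single IPv4 address is at most 15 characters, so a short `ips` value is taken to mean single-machine training, while a longer comma-separated list falls through to the multi-machine `--ips` launch. A minimal Python sketch of the same decision (the function name and addresses are illustrative, and note that two very short addresses could slip under the cutoff):

```python
def is_single_machine(ips: str) -> bool:
    """Mirror the shell check `[ ${#ips} -le 26 ]`: one IPv4 address is
    at most 15 chars; typical full-length two-node lists exceed 26."""
    return len(ips) <= 26

assert is_single_machine("192.168.10.21")                    # one node
assert not is_single_machine("192.168.10.21,192.168.10.22")  # two nodes
```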
@@ -4,9 +4,9 @@ python:python3.7
gpu_list:0|0,1
use_gpu:True|True
AMP.use_amp:True|False
-epoch:lite_train_lite_infer=20|whole_train_whole_infer=1000
+epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000
save_model_dir:./output/
-TRAIN.batch_size:lite_train_lite_infer=2|whole_train_whole_infer=4
+TRAIN.batch_size:lite_train_lite_infer=1280|whole_train_whole_infer=1280
pretrained_model:null
checkpoints:null
use_custom_relu:False|True
......
===========================train_params===========================
model_name:ch_PPOCRv2_det
python:python3.7
gpu_list:xx.xx.xx.xx,yy.yy.yy.yy;0,1
use_gpu:True
AMP.use_amp:True|False
epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000
save_model_dir:./output/
TRAIN.batch_size:lite_train_lite_infer=1280|whole_train_whole_infer=1280
pretrained_model:null
checkpoints:null
use_custom_relu:False|True
model_type:cls|cls_distill|cls_distill_multiopt
MODEL.siamese:False|True
norm_train:train.py -c mv3_large_x0_5.yml -o prune_train=True
quant_train:False
prune_train:False
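Each line in these TIPC configuration files is a `key:value` pair that `func_parser_key` and `func_parser_value` split on the first `:`; within a value, `|` separates alternative settings to iterate over, and `mode=value` pairs bind a value to a specific run mode. A minimal Python sketch of this convention (the parser function here is illustrative, not the shell helpers themselves):

```python
def parse_tipc_line(line: str):
    """Split a TIPC config line into key, alternatives, and per-mode values."""
    key, _, raw = line.partition(":")
    alternatives = raw.split("|")
    per_mode = {}
    for alt in alternatives:
        if "=" in alt:
            mode, _, value = alt.partition("=")
            per_mode[mode] = value
    return key, alternatives, per_mode

key, alts, per_mode = parse_tipc_line(
    "epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000")
assert key == "epoch" and per_mode["lite_train_lite_infer"] == "2"
```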
@@ -4,9 +4,9 @@ python:python3.7
gpu_list:0|0,1
use_gpu:True|True
AMP.use_amp:True|False
-epoch:lite_train_lite_infer=20|whole_train_whole_infer=1000
+epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000
save_model_dir:./output/
-TRAIN.batch_size:lite_train_lite_infer=2|whole_train_whole_infer=4
+TRAIN.batch_size:lite_train_lite_infer=1280|whole_train_whole_infer=1280
pretrained_model:null
checkpoints:null
use_custom_relu:False|True
......
===========================train_params===========================
model_name:ch_PPOCRv2_det
python:python3.7
gpu_list:xx.xx.xx.xx,yy.yy.yy.yy;0,1
use_gpu:True
AMP.use_amp:True|False
epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000
save_model_dir:./output/
TRAIN.batch_size:lite_train_lite_infer=1280|whole_train_whole_infer=1280
pretrained_model:null
checkpoints:null
use_custom_relu:False|True
model_type:cls|cls_distill|cls_distill_multiopt
MODEL.siamese:False|True
norm_train:train.py -c mv3_large_x0_5.yml -o quant_train=True
quant_train:False
prune_train:False
===========================train_params===========================
model_name:ch_PPOCRv2_det
python:python3.7
gpu_list:xx.xx.xx.xx,yy.yy.yy.yy;0,1
use_gpu:True
AMP.use_amp:True|False
epoch:lite_train_lite_infer=2|whole_train_whole_infer=1000
save_model_dir:./output/
TRAIN.batch_size:lite_train_lite_infer=1280|whole_train_whole_infer=1280
pretrained_model:null
checkpoints:null
use_custom_relu:False|True
model_type:cls|cls_distill|cls_distill_multiopt
MODEL.siamese:False|True
norm_train: train.py -c mv3_large_x0_5.yml -o
quant_train:False
prune_train:False
@@ -24,6 +24,7 @@ os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
import cv2
import copy
import numpy as np
import json
import time
import logging
from PIL import Image
@@ -128,6 +129,9 @@ def main(args):
    is_visualize = True
    font_path = args.vis_font_path
    drop_score = args.drop_score
    draw_img_save_dir = args.draw_img_save_dir
    os.makedirs(draw_img_save_dir, exist_ok=True)
    save_results = []
    # warm up 10 times
    if args.warmup:
@@ -157,6 +161,14 @@
        for text, score in rec_res:
            logger.debug("{}, {:.3f}".format(text, score))
        res = [{
            "transcription": rec_res[idx][0],
            "points": np.array(dt_boxes[idx]).astype(np.int32).tolist(),
        } for idx in range(len(dt_boxes))]
        save_pred = os.path.basename(image_file) + "\t" + json.dumps(
            res, ensure_ascii=False) + "\n"
        save_results.append(save_pred)
        if is_visualize:
            image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            boxes = dt_boxes
@@ -170,8 +182,6 @@ scores,
                scores,
                drop_score=drop_score,
                font_path=font_path)
-            draw_img_save_dir = args.draw_img_save_dir
-            os.makedirs(draw_img_save_dir, exist_ok=True)
            if flag:
                image_file = image_file[:-3] + "png"
            cv2.imwrite(
@@ -185,6 +195,9 @@ text_sys.text_detector.autolog.report()
        text_sys.text_detector.autolog.report()
        text_sys.text_recognizer.autolog.report()
    with open(os.path.join(draw_img_save_dir, "system_results.txt"), 'w') as f:
        f.writelines(save_results)

if __name__ == "__main__":
    args = utility.parse_args()
......
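With this change, the system predictions are also serialized one image per line as `<image name>\t<JSON list of {"transcription", "points"}>`. A minimal sketch for reading the saved `system_results.txt` back, assuming the default `draw_img_save_dir` of `./inference_results`:

```python
import json

# Each line is "<image name>\t<JSON predictions>" as written by f.writelines above.
with open("./inference_results/system_results.txt", encoding="utf-8") as f:
    for line in f:
        image_name, _, raw = line.rstrip("\n").partition("\t")
        for pred in json.loads(raw):
            print(image_name, pred["transcription"], pred["points"])
```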
@@ -146,6 +146,7 @@ def train(config,
          scaler=None):
    cal_metric_during_train = config['Global'].get('cal_metric_during_train',
                                                   False)
    calc_epoch_interval = config['Global'].get('calc_epoch_interval', 1)
    log_smooth_window = config['Global']['log_smooth_window']
    epoch_num = config['Global']['epoch_num']
    print_batch_step = config['Global']['print_batch_step']
@@ -244,6 +245,16 @@ optimizer.step()
            optimizer.step()
            optimizer.clear_grad()
            if cal_metric_during_train and epoch % calc_epoch_interval == 0:  # only rec and cls need
                batch = [item.numpy() for item in batch]
                if model_type in ['table', 'kie']:
                    eval_class(preds, batch)
                else:
                    post_result = post_process_class(preds, batch[1])
                    eval_class(post_result, batch)
                metric = eval_class.get_metric()
                train_stats.update(metric)
            train_batch_time = time.time() - reader_start
            train_batch_cost += train_batch_time
            eta_meter.update(train_batch_time)
@@ -258,16 +269,6 @@ stats['lr'] = lr
            stats['lr'] = lr
            train_stats.update(stats)
-            if cal_metric_during_train:  # only rec and cls need
-                batch = [item.numpy() for item in batch]
-                if model_type in ['table', 'kie']:
-                    eval_class(preds, batch)
-                else:
-                    post_result = post_process_class(preds, batch[1])
-                    eval_class(post_result, batch)
-                metric = eval_class.get_metric()
-                train_stats.update(metric)
            if vdl_writer is not None and dist.get_rank() == 0:
                for k, v in train_stats.get().items():
                    vdl_writer.add_scalar('TRAIN/{}'.format(k), v, global_step)
@@ -277,12 +278,13 @@ (global_step > 0 and global_step % print_batch_step == 0) or
                    (global_step > 0 and global_step % print_batch_step == 0) or
                    (idx >= len(train_dataloader) - 1)):
                logs = train_stats.log()
                eta_sec = ((epoch_num + 1 - epoch) * \
                    len(train_dataloader) - idx - 1) * eta_meter.avg
                eta_sec_format = str(datetime.timedelta(seconds=int(eta_sec)))
                strs = 'epoch: [{}/{}], global_step: {}, {}, avg_reader_cost: ' \
                       '{:.5f} s, avg_batch_cost: {:.5f} s, avg_samples: {}, ' \
-                      'ips: {:.5f}, eta: {}'.format(
+                      'ips: {:.5f} samples/s, eta: {}'.format(
                    epoch, epoch_num, global_step, logs,
                    train_reader_cost / print_batch_step,
                    train_batch_cost / print_batch_step,
......
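Net effect of the train.py changes: the in-training metric update moves ahead of the timing block and is now gated by the new `Global.calc_epoch_interval` key (default 1, i.e. every epoch), and the `ips` log value gains a `samples/s` unit. A minimal sketch of the gating, with an illustrative interval value rather than the commit's default:

```python
# Hypothetical settings: metrics refresh only on epochs 5, 10, 15, 20.
cal_metric_during_train = True
calc_epoch_interval = 5  # Global.calc_epoch_interval; the commit defaults to 1

for epoch in range(1, 21):
    if cal_metric_during_train and epoch % calc_epoch_interval == 0:
        print(f"epoch {epoch}: update training metrics")
```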