Commit 84ce34bd authored by Leif's avatar Leif
Browse files

Merge remote-tracking branch 'origin/dygraph' into dygraph

parents f470ede8 529133fb
#!/bin/bash
source tests/common_func.sh
source test_tipc/common_func.sh
FILENAME=$1
dataline=$(awk 'NR==52, NR==66{print}' $FILENAME)
......@@ -35,7 +35,7 @@ cpp_benchmark_key=$(func_parser_key "${lines[14]}")
cpp_benchmark_value=$(func_parser_value "${lines[14]}")
LOG_PATH="./tests/output"
LOG_PATH="./test_tipc/output"
mkdir -p ${LOG_PATH}
status_log="${LOG_PATH}/results_cpp.log"
......
#!/bin/bash
source ./common_func.sh
export LD_LIBRARY_PATH=${PWD}:$LD_LIBRARY_PATH
FILENAME=$1
dataline=$(awk 'NR==101, NR==110{print}' $FILENAME)
echo $dataline
# parser params
IFS=$'\n'
lines=(${dataline})
# parser lite inference
lite_inference_cmd=$(func_parser_value "${lines[1]}")
lite_model_dir_list=$(func_parser_value "${lines[2]}")
lite_cpu_threads_list=$(func_parser_value "${lines[3]}")
lite_batch_size_list=$(func_parser_value "${lines[4]}")
lite_power_mode_list=$(func_parser_value "${lines[5]}")
lite_infer_img_dir_list=$(func_parser_value "${lines[6]}")
lite_config_dir=$(func_parser_value "${lines[7]}")
lite_rec_dict_dir=$(func_parser_value "${lines[8]}")
lite_benchmark_value=$(func_parser_value "${lines[9]}")
LOG_PATH="./output"
mkdir -p ${LOG_PATH}
status_log="${LOG_PATH}/results.log"
function func_lite(){
IFS='|'
_script=$1
_lite_model=$2
_log_path=$3
_img_dir=$4
_config=$5
if [[ $lite_model =~ "slim" ]]; then
precision="INT8"
else
precision="FP32"
fi
is_single_img=$(echo $_img_dir | grep -E ".jpg|.jpeg|.png|.JPEG|.JPG")
if [[ "$is_single_img" != "" ]]; then
single_img="True"
else
single_img="False"
fi
# lite inference
for num_threads in ${lite_cpu_threads_list[*]}; do
for power_mode in ${lite_power_mode_list[*]}; do
for batchsize in ${lite_batch_size_list[*]}; do
model_name=$(echo $lite_model | awk -F "/" '{print $NF}')
_save_log_path="${_log_path}/lite_${model_name}_precision_${precision}_batchsize_${batchsize}_threads_${num_threads}_powermode_${power_mode}_singleimg_${single_img}.log"
command="${_script} ${lite_model} ${precision} ${num_threads} ${batchsize} ${power_mode} ${_img_dir} ${_config} ${lite_benchmark_value} > ${_save_log_path} 2>&1"
eval ${command}
status_check $? "${command}" "${status_log}"
done
done
done
}
echo "################### run test ###################"
IFS="|"
for lite_model in ${lite_model_dir_list[*]}; do
#run lite inference
for img_dir in ${lite_infer_img_dir_list[*]}; do
func_lite "${lite_inference_cmd}" "${lite_model}" "${LOG_PATH}" "${img_dir}" "${lite_config_dir}"
done
done
#!/bin/bash
source tests/common_func.sh
source test_tipc/common_func.sh
FILENAME=$1
dataline=$(awk 'NR==67, NR==83{print}' $FILENAME)
......@@ -36,8 +36,8 @@ web_precision_key=$(func_parser_key "${lines[15]}")
web_precision_list=$(func_parser_value "${lines[15]}")
pipeline_py=$(func_parser_value "${lines[16]}")
LOG_PATH="../../tests/output"
mkdir -p ./tests/output
LOG_PATH="../../test_tipc/output"
mkdir -p ./test_tipc/output
status_log="${LOG_PATH}/results_serving.log"
function func_serving(){
......
#!/bin/bash
source tests/common_func.sh
source test_tipc/common_func.sh
FILENAME=$1
# MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer', 'infer', 'klquant_infer']
# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer', 'klquant_whole_infer']
MODE=$2
dataline=$(awk 'NR==1, NR==51{print}' $FILENAME)
......@@ -59,6 +59,7 @@ export_key1=$(func_parser_key "${lines[33]}")
export_value1=$(func_parser_value "${lines[33]}")
export_key2=$(func_parser_key "${lines[34]}")
export_value2=$(func_parser_value "${lines[34]}")
inference_dir=$(func_parser_value "${lines[35]}")
# parser inference model
infer_model_dir_list=$(func_parser_value "${lines[36]}")
......@@ -88,7 +89,7 @@ infer_key1=$(func_parser_key "${lines[50]}")
infer_value1=$(func_parser_value "${lines[50]}")
# parser klquant_infer
if [ ${MODE} = "klquant_infer" ]; then
if [ ${MODE} = "klquant_whole_infer" ]; then
dataline=$(awk 'NR==82, NR==98{print}' $FILENAME)
lines=(${dataline})
# parser inference model
......@@ -119,7 +120,7 @@ if [ ${MODE} = "klquant_infer" ]; then
infer_value1=$(func_parser_value "${lines[15]}")
fi
LOG_PATH="./tests/output"
LOG_PATH="./test_tipc/output"
mkdir -p ${LOG_PATH}
status_log="${LOG_PATH}/results_python.log"
......@@ -202,7 +203,7 @@ function func_inference(){
done
}
if [ ${MODE} = "infer" ] || [ ${MODE} = "klquant_infer" ]; then
if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
GPUID=$3
if [ ${#GPUID} -le 0 ];then
env=" "
......@@ -245,6 +246,7 @@ else
for gpu in ${gpu_list[*]}; do
use_gpu=${USE_GPU_KEY[Count]}
Count=$(($Count + 1))
ips=""
if [ ${gpu} = "-1" ];then
env=""
elif [ ${#gpu} -le 1 ];then
......@@ -264,6 +266,11 @@ else
env=" "
fi
for autocast in ${autocast_list[*]}; do
if [ ${autocast} = "amp" ]; then
set_amp_config="Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True"
else
set_amp_config=" "
fi
for trainer in ${trainer_list[*]}; do
flag_quant=False
if [ ${trainer} = ${pact_key} ]; then
......@@ -290,7 +297,6 @@ else
if [ ${run_train} = "null" ]; then
continue
fi
set_autocast=$(func_set_params "${autocast_key}" "${autocast}")
set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}")
set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}")
......@@ -306,11 +312,11 @@ else
set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
if [ ${#gpu} -le 2 ];then # train with cpu or single gpu
cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} "
elif [ ${#gpu} -le 15 ];then # train with multi-gpu
cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1}"
cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} "
elif [ ${#ips} -le 26 ];then # train with multi-gpu
cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
else # train with multi-machine
cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}"
cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${set_use_gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
fi
# run train
eval "unset CUDA_VISIBLE_DEVICES"
......@@ -342,7 +348,13 @@ else
#run inference
eval $env
save_infer_path="${save_log}"
func_inference "${python}" "${inference_py}" "${save_infer_path}" "${LOG_PATH}" "${train_infer_img_dir}" "${flag_quant}"
if [ ${inference_dir} != "null" ] && [ ${inference_dir} != '##' ]; then
infer_model_dir="${save_infer_path}/${inference_dir}"
else
infer_model_dir=${save_infer_path}
fi
func_inference "${python}" "${inference_py}" "${infer_model_dir}" "${LOG_PATH}" "${train_infer_img_dir}" "${flag_quant}"
eval "unset CUDA_VISIBLE_DEVICES"
fi
done # done with: for trainer in ${trainer_list[*]}; do
......
......@@ -159,7 +159,8 @@ def train(config,
eval_class,
pre_best_model_dict,
logger,
vdl_writer=None):
vdl_writer=None,
scaler=None):
cal_metric_during_train = config['Global'].get('cal_metric_during_train',
False)
log_smooth_window = config['Global']['log_smooth_window']
......@@ -211,33 +212,49 @@ def train(config,
for epoch in range(start_epoch, epoch_num + 1):
train_dataloader = build_dataloader(
config, 'Train', device, logger, seed=epoch)
train_batch_cost = 0.0
train_reader_cost = 0.0
batch_sum = 0
batch_start = time.time()
train_run_cost = 0.0
total_samples = 0
reader_start = time.time()
max_iter = len(train_dataloader) - 1 if platform.system(
) == "Windows" else len(train_dataloader)
for idx, batch in enumerate(train_dataloader):
profiler.add_profiler_step(profiler_options)
train_reader_cost += time.time() - batch_start
train_reader_cost += time.time() - reader_start
if idx >= max_iter:
break
lr = optimizer.get_lr()
images = batch[0]
if use_srn:
model_average = True
if model_type == 'table' or extra_input:
preds = model(images, data=batch[1:])
train_start = time.time()
# use amp
if scaler:
with paddle.amp.auto_cast():
if model_type == 'table' or extra_input:
preds = model(images, data=batch[1:])
else:
preds = model(images)
else:
preds = model(images)
if model_type == 'table' or extra_input:
preds = model(images, data=batch[1:])
else:
preds = model(images)
loss = loss_class(preds, batch)
avg_loss = loss['loss']
avg_loss.backward()
optimizer.step()
if scaler:
scaled_avg_loss = scaler.scale(avg_loss)
scaled_avg_loss.backward()
scaler.minimize(optimizer, scaled_avg_loss)
else:
avg_loss.backward()
optimizer.step()
optimizer.clear_grad()
train_batch_cost += time.time() - batch_start
batch_sum += len(images)
train_run_cost += time.time() - train_start
total_samples += len(images)
if not isinstance(lr_scheduler, float):
lr_scheduler.step()
......@@ -268,12 +285,13 @@ def train(config,
logs = train_stats.log()
strs = 'epoch: [{}/{}], iter: {}, {}, reader_cost: {:.5f} s, batch_cost: {:.5f} s, samples: {}, ips: {:.5f}'.format(
epoch, epoch_num, global_step, logs, train_reader_cost /
print_batch_step, train_batch_cost / print_batch_step,
batch_sum, batch_sum / train_batch_cost)
print_batch_step, (train_reader_cost + train_run_cost) /
print_batch_step, total_samples,
total_samples / (train_reader_cost + train_run_cost))
logger.info(strs)
train_batch_cost = 0.0
train_reader_cost = 0.0
batch_sum = 0
train_run_cost = 0.0
total_samples = 0
# eval
if global_step > start_eval_step and \
(global_step - start_eval_step) % eval_batch_step == 0 and dist.get_rank() == 0:
......@@ -326,7 +344,7 @@ def train(config,
global_step)
global_step += 1
optimizer.clear_grad()
batch_start = time.time()
reader_start = time.time()
if dist.get_rank() == 0:
save_model(
model,
......@@ -367,7 +385,11 @@ def eval(model,
with paddle.no_grad():
total_frame = 0.0
total_time = 0.0
pbar = tqdm(total=len(valid_dataloader), desc='eval model:')
pbar = tqdm(
total=len(valid_dataloader),
desc='eval model:',
position=0,
leave=True)
max_iter = len(valid_dataloader) - 1 if platform.system(
) == "Windows" else len(valid_dataloader)
for idx, batch in enumerate(valid_dataloader):
......@@ -436,8 +458,6 @@ def get_center(model, eval_dataloader, post_process_class):
batch = [item.numpy() for item in batch]
# Obtain usable results from post-processing methods
total_time += time.time() - start
# Evaluate the results of the current batch
post_result = post_process_class(preds, batch[1])
#update char_center
......@@ -480,11 +500,6 @@ def preprocess(is_train=False):
'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE',
'SEED'
]
windows_not_support_list = ['PSE']
if platform.system() == "Windows" and alg in windows_not_support_list:
logger.warning('{} is not support in Windows now'.format(
windows_not_support_list))
sys.exit()
device = 'gpu:{}'.format(dist.ParallelEnv().dev_id) if use_gpu else 'cpu'
device = paddle.set_device(device)
......
......@@ -102,10 +102,27 @@ def main(config, device, logger, vdl_writer):
if valid_dataloader is not None:
logger.info('valid dataloader has {} iters'.format(
len(valid_dataloader)))
use_amp = config["Global"].get("use_amp", False)
if use_amp:
AMP_RELATED_FLAGS_SETTING = {
'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
'FLAGS_max_inplace_grad_add': 8,
}
paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
scale_loss = config["Global"].get("scale_loss", 1.0)
use_dynamic_loss_scaling = config["Global"].get(
"use_dynamic_loss_scaling", False)
scaler = paddle.amp.GradScaler(
init_loss_scaling=scale_loss,
use_dynamic_loss_scaling=use_dynamic_loss_scaling)
else:
scaler = None
# start train
program.train(config, train_dataloader, valid_dataloader, device, model,
loss_class, optimizer, lr_scheduler, post_process_class,
eval_class, pre_best_model_dict, logger, vdl_writer)
eval_class, pre_best_model_dict, logger, vdl_writer, scaler)
def test_reader(config, device, logger):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment