Unverified commit 26c16324 authored by Evezerest, committed by GitHub

Merge branch 'dygraph' into dygraph

parents d9549ce6 0b37c118
#!/bin/bash
source tests/common_func.sh
source test_tipc/common_func.sh
FILENAME=$1
dataline=$(awk 'NR==52, NR==66{print}' $FILENAME)
......@@ -35,7 +35,7 @@ cpp_benchmark_key=$(func_parser_key "${lines[14]}")
cpp_benchmark_value=$(func_parser_value "${lines[14]}")
LOG_PATH="./tests/output"
LOG_PATH="./test_tipc/output"
mkdir -p ${LOG_PATH}
status_log="${LOG_PATH}/results_cpp.log"
......
#!/bin/bash
source ./common_func.sh
export LD_LIBRARY_PATH=${PWD}:$LD_LIBRARY_PATH
FILENAME=$1
dataline=$(awk 'NR==101, NR==110{print}' $FILENAME)
echo $dataline
# parser params
IFS=$'\n'
lines=(${dataline})
# parser lite inference
lite_inference_cmd=$(func_parser_value "${lines[1]}")
lite_model_dir_list=$(func_parser_value "${lines[2]}")
lite_cpu_threads_list=$(func_parser_value "${lines[3]}")
lite_batch_size_list=$(func_parser_value "${lines[4]}")
lite_power_mode_list=$(func_parser_value "${lines[5]}")
lite_infer_img_dir_list=$(func_parser_value "${lines[6]}")
lite_config_dir=$(func_parser_value "${lines[7]}")
lite_rec_dict_dir=$(func_parser_value "${lines[8]}")
lite_benchmark_value=$(func_parser_value "${lines[9]}")
LOG_PATH="./output"
mkdir -p ${LOG_PATH}
status_log="${LOG_PATH}/results.log"
function func_lite(){
IFS='|'
_script=$1
_lite_model=$2
_log_path=$3
_img_dir=$4
_config=$5
if [[ $_lite_model =~ "slim" ]]; then
precision="INT8"
else
precision="FP32"
fi
is_single_img=$(echo $_img_dir | grep -E ".jpg|.jpeg|.png|.JPEG|.JPG")
if [[ "$is_single_img" != "" ]]; then
single_img="True"
else
single_img="False"
fi
# lite inference
for num_threads in ${lite_cpu_threads_list[*]}; do
for power_mode in ${lite_power_mode_list[*]}; do
for batchsize in ${lite_batch_size_list[*]}; do
model_name=$(echo $_lite_model | awk -F "/" '{print $NF}')
_save_log_path="${_log_path}/lite_${model_name}_precision_${precision}_batchsize_${batchsize}_threads_${num_threads}_powermode_${power_mode}_singleimg_${single_img}.log"
command="${_script} ${lite_model} ${precision} ${num_threads} ${batchsize} ${power_mode} ${_img_dir} ${_config} ${lite_benchmark_value} > ${_save_log_path} 2>&1"
eval ${command}
status_check $? "${command}" "${status_log}"
done
done
done
}
echo "################### run test ###################"
IFS="|"
for lite_model in ${lite_model_dir_list[*]}; do
#run lite inference
for img_dir in ${lite_infer_img_dir_list[*]}; do
func_lite "${lite_inference_cmd}" "${lite_model}" "${LOG_PATH}" "${img_dir}" "${lite_config_dir}"
done
done
#!/bin/bash
source test_tipc/common_func.sh
FILENAME=$1
dataline=$(cat ${FILENAME})
lines=(${dataline})
# common params
model_name=$(func_parser_value "${lines[1]}")
python=$(func_parser_value "${lines[2]}")
# parser params
dataline=$(awk 'NR==111, NR==123{print}' $FILENAME)
IFS=$'\n'
lines=(${dataline})
# parser paddle2onnx
padlle2onnx_cmd=$(func_parser_value "${lines[1]}")
infer_model_dir_key=$(func_parser_key "${lines[2]}")
infer_model_dir_value=$(func_parser_value "${lines[2]}")
model_filename_key=$(func_parser_key "${lines[3]}")
model_filename_value=$(func_parser_value "${lines[3]}")
params_filename_key=$(func_parser_key "${lines[4]}")
params_filename_value=$(func_parser_value "${lines[4]}")
save_file_key=$(func_parser_key "${lines[5]}")
save_file_value=$(func_parser_value "${lines[5]}")
opset_version_key=$(func_parser_key "${lines[6]}")
opset_version_value=$(func_parser_value "${lines[6]}")
enable_onnx_checker_key=$(func_parser_key "${lines[7]}")
enable_onnx_checker_value=$(func_parser_value "${lines[7]}")
# parser onnx inference
inference_py=$(func_parser_value "${lines[8]}")
use_gpu_key=$(func_parser_key "${lines[9]}")
use_gpu_value=$(func_parser_value "${lines[9]}")
det_model_key=$(func_parser_key "${lines[10]}")
image_dir_key=$(func_parser_key "${lines[11]}")
image_dir_value=$(func_parser_value "${lines[11]}")
LOG_PATH="./test_tipc/output"
mkdir -p ./test_tipc/output
status_log="${LOG_PATH}/results_paddle2onnx.log"
function func_paddle2onnx(){
IFS='|'
_script=$1
# paddle2onnx
_save_log_path="${LOG_PATH}/paddle2onnx_infer_cpu.log"
set_dirname=$(func_set_params "${infer_model_dir_key}" "${infer_model_dir_value}")
set_model_filename=$(func_set_params "${model_filename_key}" "${model_filename_value}")
set_params_filename=$(func_set_params "${params_filename_key}" "${params_filename_value}")
set_save_model=$(func_set_params "${save_file_key}" "${save_file_value}")
set_opset_version=$(func_set_params "${opset_version_key}" "${opset_version_value}")
set_enable_onnx_checker=$(func_set_params "${enable_onnx_checker_key}" "${enable_onnx_checker_value}")
trans_model_cmd="${padlle2onnx_cmd} ${set_dirname} ${set_model_filename} ${set_params_filename} ${set_save_model} ${set_opset_version} ${set_enable_onnx_checker}"
eval $trans_model_cmd
last_status=${PIPESTATUS[0]}
status_check $last_status "${trans_model_cmd}" "${status_log}"
# python inference
set_gpu=$(func_set_params "${use_gpu_key}" "${use_gpu_value}")
set_model_dir=$(func_set_params "${det_model_key}" "${save_file_value}")
set_img_dir=$(func_set_params "${image_dir_key}" "${image_dir_value}")
infer_model_cmd="${python} ${inference_py} ${set_gpu} ${set_img_dir} ${set_model_dir} --use_onnx=True > ${_save_log_path} 2>&1 "
eval $infer_model_cmd
last_status=${PIPESTATUS[0]}
status_check $last_status "${infer_model_cmd}" "${status_log}"
}
echo "################### run test ###################"
export Count=0
IFS="|"
func_paddle2onnx
\ No newline at end of file
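As a quick sanity check outside this harness, the converted model can be opened directly with onnxruntime. A minimal sketch, assuming the converter wrote ./inference/det_onnx/model.onnx and that the graph accepts a 1x3x640x640 float input (both are illustrative assumptions, not values read from the test config):

# Hedged sketch: load the converted ONNX model and run one dummy forward pass.
# The model path and the input shape are assumptions for illustration only.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("./inference/det_onnx/model.onnx")
input_name = sess.get_inputs()[0].name
dummy = np.random.rand(1, 3, 640, 640).astype("float32")
outputs = sess.run(None, {input_name: dummy})
print("output shapes:", [o.shape for o in outputs])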
#!/bin/bash
source tests/common_func.sh
source test_tipc/common_func.sh
FILENAME=$1
dataline=$(awk 'NR==67, NR==83{print}' $FILENAME)
......@@ -36,8 +36,8 @@ web_precision_key=$(func_parser_key "${lines[15]}")
web_precision_list=$(func_parser_value "${lines[15]}")
pipeline_py=$(func_parser_value "${lines[16]}")
LOG_PATH="../../tests/output"
mkdir -p ./tests/output
LOG_PATH="../../test_tipc/output"
mkdir -p ./test_tipc/output
status_log="${LOG_PATH}/results_serving.log"
function func_serving(){
......
#!/bin/bash
source tests/common_func.sh
source test_tipc/common_func.sh
FILENAME=$1
# MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer', 'infer', 'klquant_infer']
# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer', 'klquant_whole_infer']
MODE=$2
dataline=$(awk 'NR==1, NR==51{print}' $FILENAME)
......@@ -59,6 +59,7 @@ export_key1=$(func_parser_key "${lines[33]}")
export_value1=$(func_parser_value "${lines[33]}")
export_key2=$(func_parser_key "${lines[34]}")
export_value2=$(func_parser_value "${lines[34]}")
inference_dir=$(func_parser_value "${lines[35]}")
# parser inference model
infer_model_dir_list=$(func_parser_value "${lines[36]}")
......@@ -88,7 +89,7 @@ infer_key1=$(func_parser_key "${lines[50]}")
infer_value1=$(func_parser_value "${lines[50]}")
# parser klquant_infer
if [ ${MODE} = "klquant_infer" ]; then
if [ ${MODE} = "klquant_whole_infer" ]; then
dataline=$(awk 'NR==82, NR==98{print}' $FILENAME)
lines=(${dataline})
# parser inference model
......@@ -119,7 +120,7 @@ if [ ${MODE} = "klquant_infer" ]; then
infer_value1=$(func_parser_value "${lines[15]}")
fi
LOG_PATH="./tests/output"
LOG_PATH="./test_tipc/output"
mkdir -p ${LOG_PATH}
status_log="${LOG_PATH}/results_python.log"
......@@ -202,7 +203,7 @@ function func_inference(){
done
}
if [ ${MODE} = "infer" ] || [ ${MODE} = "klquant_infer" ]; then
if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
GPUID=$3
if [ ${#GPUID} -le 0 ];then
env=" "
......@@ -245,6 +246,7 @@ else
for gpu in ${gpu_list[*]}; do
use_gpu=${USE_GPU_KEY[Count]}
Count=$(($Count + 1))
ips=""
if [ ${gpu} = "-1" ];then
env=""
elif [ ${#gpu} -le 1 ];then
......@@ -264,6 +266,11 @@ else
env=" "
fi
for autocast in ${autocast_list[*]}; do
if [ ${autocast} = "amp" ]; then
set_amp_config="Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True"
else
set_amp_config=" "
fi
for trainer in ${trainer_list[*]}; do
flag_quant=False
if [ ${trainer} = ${pact_key} ]; then
......@@ -290,7 +297,6 @@ else
if [ ${run_train} = "null" ]; then
continue
fi
set_autocast=$(func_set_params "${autocast_key}" "${autocast}")
set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}")
set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}")
......@@ -306,11 +312,11 @@ else
set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
if [ ${#gpu} -le 2 ];then # train with cpu or single gpu
cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} "
elif [ ${#gpu} -le 15 ];then # train with multi-gpu
cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1}"
cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} "
elif [ ${#ips} -le 26 ];then # train with multi-gpu
cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
else # train with multi-machine
cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}"
cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
fi
# run train
eval "unset CUDA_VISIBLE_DEVICES"
......@@ -342,7 +348,13 @@ else
#run inference
eval $env
save_infer_path="${save_log}"
func_inference "${python}" "${inference_py}" "${save_infer_path}" "${LOG_PATH}" "${train_infer_img_dir}" "${flag_quant}"
if [ ${inference_dir} != "null" ] && [ ${inference_dir} != '##' ]; then
infer_model_dir="${save_infer_path}/${inference_dir}"
else
infer_model_dir=${save_infer_path}
fi
func_inference "${python}" "${inference_py}" "${infer_model_dir}" "${LOG_PATH}" "${train_infer_img_dir}" "${flag_quant}"
eval "unset CUDA_VISIBLE_DEVICES"
fi
done # done with: for trainer in ${trainer_list[*]}; do
......
......@@ -47,6 +47,7 @@ class TextClassifier(object):
self.postprocess_op = build_post_process(postprocess_params)
self.predictor, self.input_tensor, self.output_tensors, _ = \
utility.create_predictor(args, 'cls', logger)
self.use_onnx = args.use_onnx
def resize_norm_img(self, img):
imgC, imgH, imgW = self.cls_image_shape
......@@ -100,10 +101,16 @@ class TextClassifier(object):
norm_img_batch = np.concatenate(norm_img_batch)
norm_img_batch = norm_img_batch.copy()
self.input_tensor.copy_from_cpu(norm_img_batch)
self.predictor.run()
prob_out = self.output_tensors[0].copy_to_cpu()
self.predictor.try_shrink_memory()
if self.use_onnx:
input_dict = {}
input_dict[self.input_tensor.name] = norm_img_batch
outputs = self.predictor.run(self.output_tensors, input_dict)
prob_out = outputs[0]
else:
self.input_tensor.copy_from_cpu(norm_img_batch)
self.predictor.run()
prob_out = self.output_tensors[0].copy_to_cpu()
self.predictor.try_shrink_memory()
cls_result = self.postprocess_op(prob_out)
elapse += time.time() - starttime
for rno in range(len(cls_result)):
......
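The same two-way branch recurs in the detector and recognizer below: with --use_onnx the predictor is an onnxruntime InferenceSession whose run() takes a feed dict, while the Paddle Inference path copies the batch into the input handle and reads each output handle back. A condensed sketch of the pattern (a hypothetical helper, not a function from the repository):

# Hedged sketch of the shared dispatch; "predictor", "input_tensor" and
# "output_tensors" follow the tuple returned by utility.create_predictor.
def run_predictor(predictor, input_tensor, output_tensors, batch, use_onnx):
    if use_onnx:
        # onnxruntime path: output_tensors is None, so all outputs are returned.
        return predictor.run(output_tensors, {input_tensor.name: batch})
    input_tensor.copy_from_cpu(batch)
    predictor.run()
    return [t.copy_to_cpu() for t in output_tensors]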
......@@ -38,6 +38,7 @@ class TextDetector(object):
def __init__(self, args):
self.args = args
self.det_algorithm = args.det_algorithm
self.use_onnx = args.use_onnx
pre_process_list = [{
'DetResizeForTest': {
'limit_side_len': args.det_limit_side_len,
......@@ -100,7 +101,12 @@ class TextDetector(object):
else:
logger.info("unknown det_algorithm:{}".format(self.det_algorithm))
sys.exit(0)
if self.use_onnx:
pre_process_list[0] = {
'DetResizeForTest': {
'image_shape': [640, 640]
}
}
self.preprocess_op = create_operators(pre_process_list)
self.postprocess_op = build_post_process(postprocess_params)
self.predictor, self.input_tensor, self.output_tensors, self.config = utility.create_predictor(
......@@ -198,15 +204,19 @@ class TextDetector(object):
if self.args.benchmark:
self.autolog.times.stamp()
self.input_tensor.copy_from_cpu(img)
self.predictor.run()
outputs = []
for output_tensor in self.output_tensors:
output = output_tensor.copy_to_cpu()
outputs.append(output)
if self.args.benchmark:
self.autolog.times.stamp()
if self.use_onnx:
input_dict = {}
input_dict[self.input_tensor.name] = img
outputs = self.predictor.run(self.output_tensors, input_dict)
else:
self.input_tensor.copy_from_cpu(img)
self.predictor.run()
outputs = []
for output_tensor in self.output_tensors:
output = output_tensor.copy_to_cpu()
outputs.append(output)
if self.args.benchmark:
self.autolog.times.stamp()
preds = {}
if self.det_algorithm == "EAST":
......
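The resize op is pinned to [640, 640] above because the exported ONNX detection graph is assumed to have a static input shape, unlike the dynamic-shape Paddle model. A rough sketch of what that fixed resize amounts to (an illustrative stand-in, not the actual DetResizeForTest operator):

# Hedged sketch: force the image to the static shape expected by the ONNX graph
# and keep the ratios needed to map detected boxes back to the original image.
import cv2
import numpy as np

def resize_for_onnx_det(img, target_hw=(640, 640)):
    h, w = img.shape[:2]
    resized = cv2.resize(img, (target_hw[1], target_hw[0]))  # (width, height)
    ratio_h = target_hw[0] / float(h)
    ratio_w = target_hw[1] / float(w)
    return resized, np.array([ratio_h, ratio_w], dtype="float32")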
......@@ -73,6 +73,7 @@ class TextRecognizer(object):
self.predictor, self.input_tensor, self.output_tensors, self.config = \
utility.create_predictor(args, 'rec', logger)
self.benchmark = args.benchmark
self.use_onnx = args.use_onnx
if args.benchmark:
import auto_log
pid = os.getpid()
......@@ -106,8 +107,9 @@ class TextRecognizer(object):
return norm_img.astype(np.float32) / 128. - 1.
assert imgC == img.shape[2]
max_wh_ratio = max(max_wh_ratio, imgW / imgH)
imgW = int((32 * max_wh_ratio))
if self.use_onnx:
imgW = 100
h, w = img.shape[:2]
ratio = w / float(h)
if math.ceil(imgH * ratio) > imgW:
......@@ -297,51 +299,72 @@ class TextRecognizer(object):
gsrm_slf_attn_bias1_list,
gsrm_slf_attn_bias2_list,
]
input_names = self.predictor.get_input_names()
for i in range(len(input_names)):
input_tensor = self.predictor.get_input_handle(input_names[
i])
input_tensor.copy_from_cpu(inputs[i])
self.predictor.run()
outputs = []
for output_tensor in self.output_tensors:
output = output_tensor.copy_to_cpu()
outputs.append(output)
if self.benchmark:
self.autolog.times.stamp()
preds = {"predict": outputs[2]}
if self.use_onnx:
input_dict = {}
input_dict[self.input_tensor.name] = norm_img_batch
outputs = self.predictor.run(self.output_tensors,
input_dict)
preds = {"predict": outputs[2]}
else:
input_names = self.predictor.get_input_names()
for i in range(len(input_names)):
input_tensor = self.predictor.get_input_handle(
input_names[i])
input_tensor.copy_from_cpu(inputs[i])
self.predictor.run()
outputs = []
for output_tensor in self.output_tensors:
output = output_tensor.copy_to_cpu()
outputs.append(output)
if self.benchmark:
self.autolog.times.stamp()
preds = {"predict": outputs[2]}
elif self.rec_algorithm == "SAR":
valid_ratios = np.concatenate(valid_ratios)
inputs = [
norm_img_batch,
valid_ratios,
]
input_names = self.predictor.get_input_names()
for i in range(len(input_names)):
input_tensor = self.predictor.get_input_handle(input_names[
i])
input_tensor.copy_from_cpu(inputs[i])
self.predictor.run()
outputs = []
for output_tensor in self.output_tensors:
output = output_tensor.copy_to_cpu()
outputs.append(output)
if self.benchmark:
self.autolog.times.stamp()
preds = outputs[0]
else:
self.input_tensor.copy_from_cpu(norm_img_batch)
self.predictor.run()
outputs = []
for output_tensor in self.output_tensors:
output = output_tensor.copy_to_cpu()
outputs.append(output)
if self.benchmark:
self.autolog.times.stamp()
if len(outputs) != 1:
preds = outputs
if self.use_onnx:
input_dict = {}
input_dict[self.input_tensor.name] = norm_img_batch
outputs = self.predictor.run(self.output_tensors,
input_dict)
preds = outputs[0]
else:
input_names = self.predictor.get_input_names()
for i in range(len(input_names)):
input_tensor = self.predictor.get_input_handle(
input_names[i])
input_tensor.copy_from_cpu(inputs[i])
self.predictor.run()
outputs = []
for output_tensor in self.output_tensors:
output = output_tensor.copy_to_cpu()
outputs.append(output)
if self.benchmark:
self.autolog.times.stamp()
preds = outputs[0]
else:
if self.use_onnx:
input_dict = {}
input_dict[self.input_tensor.name] = norm_img_batch
outputs = self.predictor.run(self.output_tensors,
input_dict)
preds = outputs[0]
else:
self.input_tensor.copy_from_cpu(norm_img_batch)
self.predictor.run()
outputs = []
for output_tensor in self.output_tensors:
output = output_tensor.copy_to_cpu()
outputs.append(output)
if self.benchmark:
self.autolog.times.stamp()
if len(outputs) != 1:
preds = outputs
else:
preds = outputs[0]
rec_result = self.postprocess_op(preds)
for rno in range(len(rec_result)):
rec_res[indices[beg_img_no + rno]] = rec_result[rno]
......
......@@ -121,6 +121,7 @@ def init_args():
parser.add_argument("--save_log_path", type=str, default="./log_output/")
parser.add_argument("--show_log", type=str2bool, default=True)
parser.add_argument("--use_onnx", type=str2bool, default=False)
return parser
......@@ -144,152 +145,163 @@ def create_predictor(args, mode, logger):
if model_dir is None:
logger.info("not find {} model file path {}".format(mode, model_dir))
sys.exit(0)
model_file_path = model_dir + "/inference.pdmodel"
params_file_path = model_dir + "/inference.pdiparams"
if not os.path.exists(model_file_path):
raise ValueError("not find model file path {}".format(model_file_path))
if not os.path.exists(params_file_path):
raise ValueError("not find params file path {}".format(
params_file_path))
config = inference.Config(model_file_path, params_file_path)
if hasattr(args, 'precision'):
if args.precision == "fp16" and args.use_tensorrt:
precision = inference.PrecisionType.Half
elif args.precision == "int8":
precision = inference.PrecisionType.Int8
else:
precision = inference.PrecisionType.Float32
if args.use_onnx:
import onnxruntime as ort
model_file_path = model_dir
if not os.path.exists(model_file_path):
raise ValueError("not find model file path {}".format(
model_file_path))
sess = ort.InferenceSession(model_file_path)
return sess, sess.get_inputs()[0], None, None
else:
precision = inference.PrecisionType.Float32
if args.use_gpu:
gpu_id = get_infer_gpuid()
if gpu_id is None:
raise ValueError(
"Not found GPU in current device. Please check your device or set args.use_gpu as False"
)
config.enable_use_gpu(args.gpu_mem, 0)
if args.use_tensorrt:
config.enable_tensorrt_engine(
precision_mode=precision,
max_batch_size=args.max_batch_size,
min_subgraph_size=args.min_subgraph_size)
# skip the minimum trt subgraph
if mode == "det":
min_input_shape = {
"x": [1, 3, 50, 50],
"conv2d_92.tmp_0": [1, 120, 20, 20],
"conv2d_91.tmp_0": [1, 24, 10, 10],
"conv2d_59.tmp_0": [1, 96, 20, 20],
"nearest_interp_v2_1.tmp_0": [1, 256, 10, 10],
"nearest_interp_v2_2.tmp_0": [1, 256, 20, 20],
"conv2d_124.tmp_0": [1, 256, 20, 20],
"nearest_interp_v2_3.tmp_0": [1, 64, 20, 20],
"nearest_interp_v2_4.tmp_0": [1, 64, 20, 20],
"nearest_interp_v2_5.tmp_0": [1, 64, 20, 20],
"elementwise_add_7": [1, 56, 2, 2],
"nearest_interp_v2_0.tmp_0": [1, 256, 2, 2]
}
max_input_shape = {
"x": [1, 3, 2000, 2000],
"conv2d_92.tmp_0": [1, 120, 400, 400],
"conv2d_91.tmp_0": [1, 24, 200, 200],
"conv2d_59.tmp_0": [1, 96, 400, 400],
"nearest_interp_v2_1.tmp_0": [1, 256, 200, 200],
"conv2d_124.tmp_0": [1, 256, 400, 400],
"nearest_interp_v2_2.tmp_0": [1, 256, 400, 400],
"nearest_interp_v2_3.tmp_0": [1, 64, 400, 400],
"nearest_interp_v2_4.tmp_0": [1, 64, 400, 400],
"nearest_interp_v2_5.tmp_0": [1, 64, 400, 400],
"elementwise_add_7": [1, 56, 400, 400],
"nearest_interp_v2_0.tmp_0": [1, 256, 400, 400]
}
opt_input_shape = {
"x": [1, 3, 640, 640],
"conv2d_92.tmp_0": [1, 120, 160, 160],
"conv2d_91.tmp_0": [1, 24, 80, 80],
"conv2d_59.tmp_0": [1, 96, 160, 160],
"nearest_interp_v2_1.tmp_0": [1, 256, 80, 80],
"nearest_interp_v2_2.tmp_0": [1, 256, 160, 160],
"conv2d_124.tmp_0": [1, 256, 160, 160],
"nearest_interp_v2_3.tmp_0": [1, 64, 160, 160],
"nearest_interp_v2_4.tmp_0": [1, 64, 160, 160],
"nearest_interp_v2_5.tmp_0": [1, 64, 160, 160],
"elementwise_add_7": [1, 56, 40, 40],
"nearest_interp_v2_0.tmp_0": [1, 256, 40, 40]
}
min_pact_shape = {
"nearest_interp_v2_26.tmp_0": [1, 256, 20, 20],
"nearest_interp_v2_27.tmp_0": [1, 64, 20, 20],
"nearest_interp_v2_28.tmp_0": [1, 64, 20, 20],
"nearest_interp_v2_29.tmp_0": [1, 64, 20, 20]
}
max_pact_shape = {
"nearest_interp_v2_26.tmp_0": [1, 256, 400, 400],
"nearest_interp_v2_27.tmp_0": [1, 64, 400, 400],
"nearest_interp_v2_28.tmp_0": [1, 64, 400, 400],
"nearest_interp_v2_29.tmp_0": [1, 64, 400, 400]
}
opt_pact_shape = {
"nearest_interp_v2_26.tmp_0": [1, 256, 160, 160],
"nearest_interp_v2_27.tmp_0": [1, 64, 160, 160],
"nearest_interp_v2_28.tmp_0": [1, 64, 160, 160],
"nearest_interp_v2_29.tmp_0": [1, 64, 160, 160]
}
min_input_shape.update(min_pact_shape)
max_input_shape.update(max_pact_shape)
opt_input_shape.update(opt_pact_shape)
elif mode == "rec":
min_input_shape = {"x": [1, 3, 32, 10]}
max_input_shape = {"x": [args.rec_batch_num, 3, 32, 2000]}
opt_input_shape = {"x": [args.rec_batch_num, 3, 32, 320]}
elif mode == "cls":
min_input_shape = {"x": [1, 3, 48, 10]}
max_input_shape = {"x": [args.rec_batch_num, 3, 48, 2000]}
opt_input_shape = {"x": [args.rec_batch_num, 3, 48, 320]}
model_file_path = model_dir + "/inference.pdmodel"
params_file_path = model_dir + "/inference.pdiparams"
if not os.path.exists(model_file_path):
raise ValueError("not find model file path {}".format(
model_file_path))
if not os.path.exists(params_file_path):
raise ValueError("not find params file path {}".format(
params_file_path))
config = inference.Config(model_file_path, params_file_path)
if hasattr(args, 'precision'):
if args.precision == "fp16" and args.use_tensorrt:
precision = inference.PrecisionType.Half
elif args.precision == "int8":
precision = inference.PrecisionType.Int8
else:
precision = inference.PrecisionType.Float32
else:
min_input_shape = {"x": [1, 3, 10, 10]}
max_input_shape = {"x": [1, 3, 1000, 1000]}
opt_input_shape = {"x": [1, 3, 500, 500]}
config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape,
opt_input_shape)
precision = inference.PrecisionType.Float32
if args.use_gpu:
gpu_id = get_infer_gpuid()
if gpu_id is None:
raise ValueError(
"Not found GPU in current device. Please check your device or set args.use_gpu as False"
)
config.enable_use_gpu(args.gpu_mem, 0)
if args.use_tensorrt:
config.enable_tensorrt_engine(
precision_mode=precision,
max_batch_size=args.max_batch_size,
min_subgraph_size=args.min_subgraph_size)
# skip the minimum trt subgraph
if mode == "det":
min_input_shape = {
"x": [1, 3, 50, 50],
"conv2d_92.tmp_0": [1, 120, 20, 20],
"conv2d_91.tmp_0": [1, 24, 10, 10],
"conv2d_59.tmp_0": [1, 96, 20, 20],
"nearest_interp_v2_1.tmp_0": [1, 256, 10, 10],
"nearest_interp_v2_2.tmp_0": [1, 256, 20, 20],
"conv2d_124.tmp_0": [1, 256, 20, 20],
"nearest_interp_v2_3.tmp_0": [1, 64, 20, 20],
"nearest_interp_v2_4.tmp_0": [1, 64, 20, 20],
"nearest_interp_v2_5.tmp_0": [1, 64, 20, 20],
"elementwise_add_7": [1, 56, 2, 2],
"nearest_interp_v2_0.tmp_0": [1, 256, 2, 2]
}
max_input_shape = {
"x": [1, 3, 2000, 2000],
"conv2d_92.tmp_0": [1, 120, 400, 400],
"conv2d_91.tmp_0": [1, 24, 200, 200],
"conv2d_59.tmp_0": [1, 96, 400, 400],
"nearest_interp_v2_1.tmp_0": [1, 256, 200, 200],
"conv2d_124.tmp_0": [1, 256, 400, 400],
"nearest_interp_v2_2.tmp_0": [1, 256, 400, 400],
"nearest_interp_v2_3.tmp_0": [1, 64, 400, 400],
"nearest_interp_v2_4.tmp_0": [1, 64, 400, 400],
"nearest_interp_v2_5.tmp_0": [1, 64, 400, 400],
"elementwise_add_7": [1, 56, 400, 400],
"nearest_interp_v2_0.tmp_0": [1, 256, 400, 400]
}
opt_input_shape = {
"x": [1, 3, 640, 640],
"conv2d_92.tmp_0": [1, 120, 160, 160],
"conv2d_91.tmp_0": [1, 24, 80, 80],
"conv2d_59.tmp_0": [1, 96, 160, 160],
"nearest_interp_v2_1.tmp_0": [1, 256, 80, 80],
"nearest_interp_v2_2.tmp_0": [1, 256, 160, 160],
"conv2d_124.tmp_0": [1, 256, 160, 160],
"nearest_interp_v2_3.tmp_0": [1, 64, 160, 160],
"nearest_interp_v2_4.tmp_0": [1, 64, 160, 160],
"nearest_interp_v2_5.tmp_0": [1, 64, 160, 160],
"elementwise_add_7": [1, 56, 40, 40],
"nearest_interp_v2_0.tmp_0": [1, 256, 40, 40]
}
min_pact_shape = {
"nearest_interp_v2_26.tmp_0": [1, 256, 20, 20],
"nearest_interp_v2_27.tmp_0": [1, 64, 20, 20],
"nearest_interp_v2_28.tmp_0": [1, 64, 20, 20],
"nearest_interp_v2_29.tmp_0": [1, 64, 20, 20]
}
max_pact_shape = {
"nearest_interp_v2_26.tmp_0": [1, 256, 400, 400],
"nearest_interp_v2_27.tmp_0": [1, 64, 400, 400],
"nearest_interp_v2_28.tmp_0": [1, 64, 400, 400],
"nearest_interp_v2_29.tmp_0": [1, 64, 400, 400]
}
opt_pact_shape = {
"nearest_interp_v2_26.tmp_0": [1, 256, 160, 160],
"nearest_interp_v2_27.tmp_0": [1, 64, 160, 160],
"nearest_interp_v2_28.tmp_0": [1, 64, 160, 160],
"nearest_interp_v2_29.tmp_0": [1, 64, 160, 160]
}
min_input_shape.update(min_pact_shape)
max_input_shape.update(max_pact_shape)
opt_input_shape.update(opt_pact_shape)
elif mode == "rec":
min_input_shape = {"x": [1, 3, 32, 10]}
max_input_shape = {"x": [args.rec_batch_num, 3, 32, 2000]}
opt_input_shape = {"x": [args.rec_batch_num, 3, 32, 320]}
elif mode == "cls":
min_input_shape = {"x": [1, 3, 48, 10]}
max_input_shape = {"x": [args.rec_batch_num, 3, 48, 2000]}
opt_input_shape = {"x": [args.rec_batch_num, 3, 48, 320]}
else:
min_input_shape = {"x": [1, 3, 10, 10]}
max_input_shape = {"x": [1, 3, 1000, 1000]}
opt_input_shape = {"x": [1, 3, 500, 500]}
config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape,
opt_input_shape)
else:
config.disable_gpu()
if hasattr(args, "cpu_threads"):
config.set_cpu_math_library_num_threads(args.cpu_threads)
else:
# default cpu threads as 10
config.set_cpu_math_library_num_threads(10)
if args.enable_mkldnn:
# cache 10 different shapes for mkldnn to avoid memory leak
config.set_mkldnn_cache_capacity(10)
config.enable_mkldnn()
if args.precision == "fp16":
config.enable_mkldnn_bfloat16()
# enable memory optim
config.enable_memory_optim()
config.disable_glog_info()
config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
if mode == 'table':
config.delete_pass("fc_fuse_pass") # not supported for table
config.switch_use_feed_fetch_ops(False)
config.switch_ir_optim(True)
# create predictor
predictor = inference.create_predictor(config)
input_names = predictor.get_input_names()
for name in input_names:
input_tensor = predictor.get_input_handle(name)
output_names = predictor.get_output_names()
output_tensors = []
for output_name in output_names:
output_tensor = predictor.get_output_handle(output_name)
output_tensors.append(output_tensor)
return predictor, input_tensor, output_tensors, config
config.disable_gpu()
if hasattr(args, "cpu_threads"):
config.set_cpu_math_library_num_threads(args.cpu_threads)
else:
# default cpu threads as 10
config.set_cpu_math_library_num_threads(10)
if args.enable_mkldnn:
# cache 10 different shapes for mkldnn to avoid memory leak
config.set_mkldnn_cache_capacity(10)
config.enable_mkldnn()
if args.precision == "fp16":
config.enable_mkldnn_bfloat16()
# enable memory optim
config.enable_memory_optim()
config.disable_glog_info()
config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
if mode == 'table':
config.delete_pass("fc_fuse_pass") # not supported for table
config.switch_use_feed_fetch_ops(False)
config.switch_ir_optim(True)
# create predictor
predictor = inference.create_predictor(config)
input_names = predictor.get_input_names()
for name in input_names:
input_tensor = predictor.get_input_handle(name)
output_names = predictor.get_output_names()
output_tensors = []
for output_name in output_names:
output_tensor = predictor.get_output_handle(output_name)
output_tensors.append(output_tensor)
return predictor, input_tensor, output_tensors, config
def get_infer_gpuid():
......
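For the Paddle Inference path with TensorRT, the per-tensor min/max/opt shape dictionaries above are what let variable-sized OCR inputs run inside an otherwise static TRT engine. A compact sketch of the same mechanism for a single input tensor (the tensor name, shapes, and GPU memory value are illustrative assumptions):

# Hedged sketch: register dynamic-shape bounds for one input tensor "x".
from paddle import inference

config = inference.Config("inference.pdmodel", "inference.pdiparams")
config.enable_use_gpu(500, 0)  # 500 MB initial GPU memory, device 0 (assumed)
config.enable_tensorrt_engine(
    precision_mode=inference.PrecisionType.Float32,
    max_batch_size=1,
    min_subgraph_size=15)
config.set_trt_dynamic_shape_info(
    {"x": [1, 3, 50, 50]},      # smallest shape the engine must accept
    {"x": [1, 3, 2000, 2000]},  # largest shape
    {"x": [1, 3, 640, 640]})    # shape the engine is tuned for
predictor = inference.create_predictor(config)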
......@@ -159,7 +159,8 @@ def train(config,
eval_class,
pre_best_model_dict,
logger,
vdl_writer=None):
vdl_writer=None,
scaler=None):
cal_metric_during_train = config['Global'].get('cal_metric_during_train',
False)
log_smooth_window = config['Global']['log_smooth_window']
......@@ -211,33 +212,49 @@ def train(config,
for epoch in range(start_epoch, epoch_num + 1):
train_dataloader = build_dataloader(
config, 'Train', device, logger, seed=epoch)
train_batch_cost = 0.0
train_reader_cost = 0.0
batch_sum = 0
batch_start = time.time()
train_run_cost = 0.0
total_samples = 0
reader_start = time.time()
max_iter = len(train_dataloader) - 1 if platform.system(
) == "Windows" else len(train_dataloader)
for idx, batch in enumerate(train_dataloader):
profiler.add_profiler_step(profiler_options)
train_reader_cost += time.time() - batch_start
train_reader_cost += time.time() - reader_start
if idx >= max_iter:
break
lr = optimizer.get_lr()
images = batch[0]
if use_srn:
model_average = True
if model_type == 'table' or extra_input:
preds = model(images, data=batch[1:])
train_start = time.time()
# use amp
if scaler:
with paddle.amp.auto_cast():
if model_type == 'table' or extra_input:
preds = model(images, data=batch[1:])
else:
preds = model(images)
else:
preds = model(images)
if model_type == 'table' or extra_input:
preds = model(images, data=batch[1:])
else:
preds = model(images)
loss = loss_class(preds, batch)
avg_loss = loss['loss']
avg_loss.backward()
optimizer.step()
if scaler:
scaled_avg_loss = scaler.scale(avg_loss)
scaled_avg_loss.backward()
scaler.minimize(optimizer, scaled_avg_loss)
else:
avg_loss.backward()
optimizer.step()
optimizer.clear_grad()
train_batch_cost += time.time() - batch_start
batch_sum += len(images)
train_run_cost += time.time() - train_start
total_samples += len(images)
if not isinstance(lr_scheduler, float):
lr_scheduler.step()
......@@ -268,12 +285,13 @@ def train(config,
logs = train_stats.log()
strs = 'epoch: [{}/{}], iter: {}, {}, reader_cost: {:.5f} s, batch_cost: {:.5f} s, samples: {}, ips: {:.5f}'.format(
epoch, epoch_num, global_step, logs, train_reader_cost /
print_batch_step, train_batch_cost / print_batch_step,
batch_sum, batch_sum / train_batch_cost)
print_batch_step, (train_reader_cost + train_run_cost) /
print_batch_step, total_samples,
total_samples / (train_reader_cost + train_run_cost))
logger.info(strs)
train_batch_cost = 0.0
train_reader_cost = 0.0
batch_sum = 0
train_run_cost = 0.0
total_samples = 0
# eval
if global_step > start_eval_step and \
(global_step - start_eval_step) % eval_batch_step == 0 and dist.get_rank() == 0:
......@@ -326,7 +344,7 @@ def train(config,
global_step)
global_step += 1
optimizer.clear_grad()
batch_start = time.time()
reader_start = time.time()
if dist.get_rank() == 0:
save_model(
model,
......@@ -367,7 +385,11 @@ def eval(model,
with paddle.no_grad():
total_frame = 0.0
total_time = 0.0
pbar = tqdm(total=len(valid_dataloader), desc='eval model:')
pbar = tqdm(
total=len(valid_dataloader),
desc='eval model:',
position=0,
leave=True)
max_iter = len(valid_dataloader) - 1 if platform.system(
) == "Windows" else len(valid_dataloader)
for idx, batch in enumerate(valid_dataloader):
......@@ -436,8 +458,6 @@ def get_center(model, eval_dataloader, post_process_class):
batch = [item.numpy() for item in batch]
# Obtain usable results from post-processing methods
total_time += time.time() - start
# Evaluate the results of the current batch
post_result = post_process_class(preds, batch[1])
#update char_center
......@@ -480,11 +500,6 @@ def preprocess(is_train=False):
'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE',
'SEED'
]
windows_not_support_list = ['PSE']
if platform.system() == "Windows" and alg in windows_not_support_list:
logger.warning('{} is not support in Windows now'.format(
windows_not_support_list))
sys.exit()
device = 'gpu:{}'.format(dist.ParallelEnv().dev_id) if use_gpu else 'cpu'
device = paddle.set_device(device)
......
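The scaler branch above follows the standard Paddle AMP recipe: run the forward pass under auto_cast, scale the loss before backward, and let the scaler drive the optimizer step. A minimal self-contained sketch of that step order (the model and data here are toy stand-ins, not the OCR pipeline):

# Hedged sketch of the AMP step used above, on a toy linear model.
import paddle

model = paddle.nn.Linear(10, 1)
optimizer = paddle.optimizer.Adam(parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024.0,
                               use_dynamic_loss_scaling=True)
x = paddle.randn([4, 10])
y = paddle.randn([4, 1])

with paddle.amp.auto_cast():
    loss = paddle.nn.functional.mse_loss(model(x), y)
scaled = scaler.scale(loss)          # scale to keep fp16 gradients representable
scaled.backward()
scaler.minimize(optimizer, scaled)   # unscale, step, and update the loss scale
optimizer.clear_grad()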
......@@ -102,10 +102,27 @@ def main(config, device, logger, vdl_writer):
if valid_dataloader is not None:
logger.info('valid dataloader has {} iters'.format(
len(valid_dataloader)))
use_amp = config["Global"].get("use_amp", False)
if use_amp:
AMP_RELATED_FLAGS_SETTING = {
'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
'FLAGS_max_inplace_grad_add': 8,
}
paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
scale_loss = config["Global"].get("scale_loss", 1.0)
use_dynamic_loss_scaling = config["Global"].get(
"use_dynamic_loss_scaling", False)
scaler = paddle.amp.GradScaler(
init_loss_scaling=scale_loss,
use_dynamic_loss_scaling=use_dynamic_loss_scaling)
else:
scaler = None
# start train
program.train(config, train_dataloader, valid_dataloader, device, model,
loss_class, optimizer, lr_scheduler, post_process_class,
eval_class, pre_best_model_dict, logger, vdl_writer)
eval_class, pre_best_model_dict, logger, vdl_writer, scaler)
def test_reader(config, device, logger):
......
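These Global.* keys are exactly what the TIPC harness injects when autocast is "amp" (set_amp_config="Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True"), so the same config path serves both manual runs and the test scripts. A hedged sketch of the lookup, with an inline dict standing in for the parsed YAML plus -o overrides:

# Hedged sketch: map the Global overrides to GradScaler construction.
import paddle

config = {"Global": {"use_amp": True,
                     "scale_loss": 1024.0,
                     "use_dynamic_loss_scaling": True}}

scaler = None
if config["Global"].get("use_amp", False):
    scaler = paddle.amp.GradScaler(
        init_loss_scaling=config["Global"].get("scale_loss", 1.0),
        use_dynamic_loss_scaling=config["Global"].get(
            "use_dynamic_loss_scaling", False))
# scaler is then passed through to program.train(..., scaler=scaler) as above.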