Merge branch 'release/2.5' of https://github.com/PaddlePaddle/PaddleOCR into release/2.5

b40735dd · andyjpaddle · b9b6c222 · be9ed6ab · b40735dd · b40735dd
Commit b40735dd authored May 11, 2022 by andyjpaddle
4 changed files
--- a/ppstructure/README.md
+++ b/ppstructure/README.md
@@ -40,7 +40,7 @@ The main features of PP-Structure are as follows:

 ### 4.1 Layout analysis and table recognition

-<img src="../doc/table/ppstructure.GIF" width="100%"/>
+<img src="docs/table/ppstructure.GIF" width="100%"/>

 The figure shows the pipeline of layout analysis + table recognition. The image is first divided into four areas (image, text, title and table) by layout analysis. OCR detection and recognition is then performed on the three areas of image, text and title, while table recognition is performed on the table area, and the image is also stored for later use.

@@ -48,7 +48,7 @@ The figure shows the pipeline of layout analysis + table recognition. The image

 * SER
 *
-![](../doc/vqa/result_ser/zh_val_0_ser.jpg) | ![](../doc/vqa/result_ser/zh_val_42_ser.jpg)
+![](docs/vqa/result_ser/zh_val_0_ser.jpg) | ![](docs/vqa/result_ser/zh_val_42_ser.jpg)
 ---|---

 Different colored boxes in the figure represent different categories. For xfun dataset, there are three categories: query, answer and header:
@@ -62,7 +62,7 @@ The corresponding category and OCR recognition results are also marked at the to

 * RE

-![](../doc/vqa/result_re/zh_val_21_re.jpg) | ![](../doc/vqa/result_re/zh_val_40_re.jpg)
+![](docs/vqa/result_re/zh_val_21_re.jpg) | ![](docs/vqa/result_re/zh_val_40_re.jpg)
 ---|---


@@ -76,7 +76,7 @@ Start from [Quick Installation](./docs/quickstart.md)

 ### 6.1 Layout analysis and table recognition

-![pipeline](../doc/table/pipeline.jpg)
+![pipeline](docs/table/pipeline.jpg)

 In PP-Structure, the image will be divided into 5 types of areas: **text, title, image, list and table**. For the first 4 types of areas, the PP-OCR system is used directly to complete text detection and recognition. For the table area, after the table structuring process, the table in the image is converted into an Excel file with the same table style.


--- a/tools/infer/predict_rec.py
+++ b/tools/infer/predict_rec.py
@@ -131,7 +131,7 @@ class TextRecognizer(object):
        padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
        padding_im[:, :, 0:resized_w] = resized_image
        return padding_im
-    
+
    def resize_norm_img_svtr(self, img, image_shape):

        imgC, imgH, imgW = image_shape
@@ -274,7 +274,7 @@ class TextRecognizer(object):
                wh_ratio = w * 1.0 / h
                max_wh_ratio = max(max_wh_ratio, wh_ratio)
            for ino in range(beg_img_no, end_img_no):
-               
+
                if self.rec_algorithm == "SAR":
                    norm_img, _, _, valid_ratio = self.resize_norm_img_sar(
                        img_list[indices[ino]], self.rec_image_shape)
@@ -296,8 +296,8 @@ class TextRecognizer(object):
                    gsrm_slf_attn_bias2_list.append(norm_img[4])
                    norm_img_batch.append(norm_img[0])
                elif self.rec_algorithm == "SVTR":
-                    norm_img = self.resize_norm_img_svtr(
-                        img_list[indices[ino]], self.rec_image_shape)
+                    norm_img = self.resize_norm_img_svtr(img_list[indices[ino]],
+                                                         self.rec_image_shape)
                    norm_img = norm_img[np.newaxis, :]
                    norm_img_batch.append(norm_img)
                else:
@@ -405,9 +405,13 @@ def main(args):
    valid_image_file_list = []
    img_list = []

+    logger.info(
+        "In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', "
+        "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320"
+    )
    # warmup 2 times
    if args.warmup:
-        img = np.random.uniform(0, 255, [32, 320, 3]).astype(np.uint8)
+        img = np.random.uniform(0, 255, [48, 320, 3]).astype(np.uint8)
        for i in range(2):
            res = text_recognizer([img] * int(args.rec_batch_num))


--- a/tools/infer/predict_system.py
+++ b/tools/infer/predict_system.py
@@ -133,6 +133,9 @@ def main(args):
    os.makedirs(draw_img_save_dir, exist_ok=True)
    save_results = []

+    logger.info("In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', "
+                "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320")
+                
    # warm up 10 times
    if args.warmup:
        img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8)

--- a/tools/infer/utility.py
+++ b/tools/infer/utility.py
@@ -79,9 +79,9 @@ def init_args():
    parser.add_argument("--det_fce_box_type", type=str, default='poly')

    # params for text recognizer
-    parser.add_argument("--rec_algorithm", type=str, default='CRNN')
+    parser.add_argument("--rec_algorithm", type=str, default='SVTR_LCNet')
    parser.add_argument("--rec_model_dir", type=str)
-    parser.add_argument("--rec_image_shape", type=str, default="3, 32, 320")
+    parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320")
    parser.add_argument("--rec_batch_num", type=int, default=6)
    parser.add_argument("--max_text_length", type=int, default=25)
    parser.add_argument(
@@ -269,11 +269,11 @@ def create_predictor(args, mode, logger):
                max_input_shape.update(max_pact_shape)
                opt_input_shape.update(opt_pact_shape)
            elif mode == "rec":
-                if args.rec_algorithm != "CRNN":
+                if args.rec_algorithm not in ["CRNN", "SVTR_LCNet"]:
                    use_dynamic_shape = False
                imgH = int(args.rec_image_shape.split(',')[-2])
                min_input_shape = {"x": [1, 3, imgH, 10]}
-                max_input_shape = {"x": [args.rec_batch_num, 3, imgH, 1536]}
+                max_input_shape = {"x": [args.rec_batch_num, 3, imgH, 2304]}
                opt_input_shape = {"x": [args.rec_batch_num, 3, imgH, 320]}
            elif mode == "cls":
                min_input_shape = {"x": [1, 3, 48, 10]}
@@ -320,7 +320,7 @@ def create_predictor(args, mode, logger):
 def get_output_tensors(args, mode, predictor):
    output_names = predictor.get_output_names()
    output_tensors = []
-    if mode == "rec" and args.rec_algorithm == "CRNN":
+    if mode == "rec" and args.rec_algorithm in ["CRNN", "SVTR_LCNet"]:
        output_name = 'softmax_0.tmp_0'
        if output_name in output_names:
            return [predictor.get_output_handle(output_name)]