"vscode:/vscode.git/clone" did not exist on "8107bbac0c0a5a0c2de10c8d7d0f779eda8957da"
Commit b40735dd authored by andyjpaddle's avatar andyjpaddle
Browse files

Merge branch 'release/2.5' of https://github.com/PaddlePaddle/PaddleOCR into release/2.5

parents b9b6c222 be9ed6ab
......@@ -40,7 +40,7 @@ The main features of PP-Structure are as follows:
### 4.1 Layout analysis and table recognition
<img src="../doc/table/ppstructure.GIF" width="100%"/>
<img src="docs/table/ppstructure.GIF" width="100%"/>
The figure shows the pipeline of layout analysis + table recognition. The image is first divided by layout analysis into four types of areas: image, text, title and table. OCR detection and recognition are then performed on the image, text and title areas, and table recognition is performed on the table area; the image is also stored for later use.
......@@ -48,7 +48,7 @@ The figure shows the pipeline of layout analysis + table recognition. The image
* SER
*
![](../doc/vqa/result_ser/zh_val_0_ser.jpg) | ![](../doc/vqa/result_ser/zh_val_42_ser.jpg)
![](docs/vqa/result_ser/zh_val_0_ser.jpg) | ![](docs/vqa/result_ser/zh_val_42_ser.jpg)
---|---
Different colored boxes in the figure represent different categories. For xfun dataset, there are three categories: query, answer and header:
......@@ -62,7 +62,7 @@ The corresponding category and OCR recognition results are also marked at the to
* RE
![](../doc/vqa/result_re/zh_val_21_re.jpg) | ![](../doc/vqa/result_re/zh_val_40_re.jpg)
![](docs/vqa/result_re/zh_val_21_re.jpg) | ![](docs/vqa/result_re/zh_val_40_re.jpg)
---|---
......@@ -76,7 +76,7 @@ Start from [Quick Installation](./docs/quickstart.md)
### 6.1 Layout analysis and table recognition
![pipeline](../doc/table/pipeline.jpg)
![pipeline](docs/table/pipeline.jpg)
In PP-Structure, the image will be divided into 5 types of areas: **text, title, image, list and table**. For the first 4 types of areas, the PP-OCR system is used directly to complete the text detection and recognition. For the table area, after the table structuring process, the table in the image is converted into an Excel file with the same table style.
......
......@@ -131,7 +131,7 @@ class TextRecognizer(object):
padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:resized_w] = resized_image
return padding_im
def resize_norm_img_svtr(self, img, image_shape):
imgC, imgH, imgW = image_shape
......@@ -274,7 +274,7 @@ class TextRecognizer(object):
wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio)
for ino in range(beg_img_no, end_img_no):
if self.rec_algorithm == "SAR":
norm_img, _, _, valid_ratio = self.resize_norm_img_sar(
img_list[indices[ino]], self.rec_image_shape)
......@@ -296,8 +296,8 @@ class TextRecognizer(object):
gsrm_slf_attn_bias2_list.append(norm_img[4])
norm_img_batch.append(norm_img[0])
elif self.rec_algorithm == "SVTR":
norm_img = self.resize_norm_img_svtr(
img_list[indices[ino]], self.rec_image_shape)
norm_img = self.resize_norm_img_svtr(img_list[indices[ino]],
self.rec_image_shape)
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
else:
......@@ -405,9 +405,13 @@ def main(args):
valid_image_file_list = []
img_list = []
logger.info(
"In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', "
"if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320"
)
# warmup 2 times
if args.warmup:
img = np.random.uniform(0, 255, [32, 320, 3]).astype(np.uint8)
img = np.random.uniform(0, 255, [48, 320, 3]).astype(np.uint8)
for i in range(2):
res = text_recognizer([img] * int(args.rec_batch_num))
......
......@@ -133,6 +133,9 @@ def main(args):
os.makedirs(draw_img_save_dir, exist_ok=True)
save_results = []
logger.info("In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', "
"if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320")
# warm up 10 times
if args.warmup:
img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8)
......
......@@ -79,9 +79,9 @@ def init_args():
parser.add_argument("--det_fce_box_type", type=str, default='poly')
# params for text recognizer
parser.add_argument("--rec_algorithm", type=str, default='CRNN')
parser.add_argument("--rec_algorithm", type=str, default='SVTR_LCNet')
parser.add_argument("--rec_model_dir", type=str)
parser.add_argument("--rec_image_shape", type=str, default="3, 32, 320")
parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320")
parser.add_argument("--rec_batch_num", type=int, default=6)
parser.add_argument("--max_text_length", type=int, default=25)
parser.add_argument(
......@@ -269,11 +269,11 @@ def create_predictor(args, mode, logger):
max_input_shape.update(max_pact_shape)
opt_input_shape.update(opt_pact_shape)
elif mode == "rec":
if args.rec_algorithm != "CRNN":
if args.rec_algorithm not in ["CRNN", "SVTR_LCNet"]:
use_dynamic_shape = False
imgH = int(args.rec_image_shape.split(',')[-2])
min_input_shape = {"x": [1, 3, imgH, 10]}
max_input_shape = {"x": [args.rec_batch_num, 3, imgH, 1536]}
max_input_shape = {"x": [args.rec_batch_num, 3, imgH, 2304]}
opt_input_shape = {"x": [args.rec_batch_num, 3, imgH, 320]}
elif mode == "cls":
min_input_shape = {"x": [1, 3, 48, 10]}
......@@ -320,7 +320,7 @@ def create_predictor(args, mode, logger):
def get_output_tensors(args, mode, predictor):
output_names = predictor.get_output_names()
output_tensors = []
if mode == "rec" and args.rec_algorithm == "CRNN":
if mode == "rec" and args.rec_algorithm in ["CRNN", "SVTR_LCNet"]:
output_name = 'softmax_0.tmp_0'
if output_name in output_names:
return [predictor.get_output_handle(output_name)]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment