title={EAST: an efficient and accurate scene text detector},
author={Zhou, Xinyu and Yao, Cong and Wen, He and Wang, Yuzhi and Zhou, Shuchang and He, Weiran and Liang, Jiajun},
booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition},
pages={5551--5560},
year={2017}
}
2. DB:
@article{liao2019real,
title={Real-time Scene Text Detection with Differentiable Binarization},
author={Liao, Minghui and Wan, Zhaoyi and Yao, Cong and Chen, Kai and Bai, Xiang},
journal={arXiv preprint arXiv:1911.08947},
year={2019}
}
3. DTRB:
@inproceedings{baek2019wrong,
title={What is wrong with scene text recognition model comparisons? dataset and model analysis},
author={Baek, Jeonghun and Kim, Geewook and Lee, Junyeop and Park, Sungrae and Han, Dongyoon and Yun, Sangdoo and Oh, Seong Joon and Lee, Hwalsuk},
booktitle={Proceedings of the IEEE International Conference on Computer Vision},
pages={4715--4723},
year={2019}
}
4. SAST:
@inproceedings{wang2019single,
title={A Single-Shot Arbitrarily-Shaped Text Detector based on Context Attended Multi-Task Learning},
author={Wang, Pengfei and Zhang, Chengquan and Qi, Fei and Huang, Zuming and En, Mengyi and Han, Junyu and Liu, Jingtuo and Ding, Errui and Shi, Guangming},
booktitle={Proceedings of the 27th ACM International Conference on Multimedia},
pages={1277--1285},
year={2019}
}
5. SRN:
@article{yu2020towards,
title={Towards Accurate Scene Text Recognition with Semantic Reasoning Networks},
author={Yu, Deli and Li, Xuan and Zhang, Chengquan and Han, Junyu and Liu, Jingtuo and Ding, Errui},
journal={arXiv preprint arXiv:2003.12294},
year={2020}
}
6. end2end-psl:
@inproceedings{sun2019chinese,
title={Chinese Street View Text: Large-scale Chinese Text Reading with Partially Supervised Learning},
author={Sun, Yipeng and Liu, Jiaming and Liu, Wei and Han, Junyu and Ding, Errui and Liu, Jingtuo},
booktitle={Proceedings of the IEEE International Conference on Computer Vision},
For testing our Chinese OCR online:https://www.paddlepaddle.org.cn/hub/scene/ocr
...
...
@@ -34,7 +35,7 @@ For testing our Chinese OCR online:https://www.paddlepaddle.org.cn/hub/scene/o

The picture above is the result of our lightweight Chinese OCR model. For more testing results, please see the end of the article [lightweight Chinese OCR results](#lightweight-Chinese-OCR-results)and[General Chinese OCR results](#General-Chinese-OCR-results).
The picture above is the result of our lightweight Chinese OCR model. For more testing results, please see the end of the article [lightweight Chinese OCR results](#lightweight-Chinese-OCR-results),[General Chinese OCR results](#General-Chinese-OCR-results) and [Support for space Recognition Model](#Space-Chinese-OCR-results).
#### 1. ENVIRONMENT CONFIGURATION
...
...
@@ -45,22 +46,42 @@ Please see [Quick installation](./doc/doc_en/installation_en.md)
#### (1) Download lightweight Chinese OCR models
*If wget is not installed in the windows system, you can copy the link to the browser to download the model. After model downloaded, unzip it and place it in the corresponding directory*
Copy the detection and recognition 'inference model' address in [Chinese model List](#Supported-Chinese-model-list), download and unpack:
```
mkdir inference && cd inference
# Download the detection part of the Chinese OCR and decompress it
wget {url/of/detection/inference_model} && tar xf {name/of/detection/inference_model/package}
# Download the recognition part of the Chinese OCR and decompress it
wget {url/of/recognition/inference_model} && tar xf {name/of/recognition/inference_model/package}
cd ..
```
Take lightweight Chinese OCR model as an example:
```
mkdir inference && cd inference
# Download the detection part of the lightweight Chinese OCR and decompress it
wget https://paddleocr.bj.bcebos.com/ch_models/ch_det_mv3_db_infer.tar && tar xf ch_det_mv3_db_infer.tar
# Download the recognition part of the lightweight Chinese OCR and decompress it
wget https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_infer.tar && tar xf ch_rec_mv3_crnn_infer.tar
# Download the space-recognized part of the lightweight Chinese OCR and decompress it
wget https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance_infer.tar && tar xf ch_rec_mv3_crnn_enhance_infer.tar
cd ..
```
#### (2) Download General Chinese OCR models
After the decompression is completed, the file structure should be as follows:
```
mkdir inference && cd inference
# Download the detection part of the general Chinese OCR model and decompress it
wget https://paddleocr.bj.bcebos.com/ch_models/ch_det_r50_vd_db_infer.tar && tar xf ch_det_r50_vd_db_infer.tar
# Download the recognition part of the generic Chinese OCR model and decompress it
wget https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_infer.tar && tar xf ch_rec_r34_vd_crnn_infer.tar
cd ..
|-inference
|-ch_rec_mv3_crnn
|- model
|- params
|-ch_det_mv3_db
|- model
|- params
...
```
#### 3. SINGLE IMAGE AND BATCH PREDICTION
...
...
@@ -85,6 +106,13 @@ To run inference of the Generic Chinese OCR model, follow these steps above to d
To run inference of the space-Generic Chinese OCR model, follow these steps above to download the corresponding models and update the relevant parameters. Examples are as follows:
```
# Prediction on a single image by specifying image path to image_dir
For more text detection and recognition models, please refer to the document [Inference](./doc/doc_en/inference_en.md)
## DOCUMENTATION
...
...
@@ -92,7 +120,9 @@ For more text detection and recognition models, please refer to the document [In
-[Text detection model training/evaluation/prediction](./doc/doc_en/detection_en.md)
-[Text recognition model training/evaluation/prediction](./doc/doc_en/recognition_en.md)
-[Inference](./doc/doc_en/inference_en.md)
-[Introduction of yml file](./doc/doc_en/config_en.md)
-[Dataset](./doc/doc_en/datasets_en.md)
-[FAQ]((#FAQ)
## TEXT DETECTION ALGORITHM
...
...
@@ -145,15 +175,15 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r
We use [LSVT](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/datasets_en.md#1-icdar2019-lsvt) dataset and cropout 30w traning data from original photos by using position groundtruth and make some calibration needed. In addition, based on the LSVT corpus, 500w synthetic data is generated to train the Chinese model. The related configuration and pre-trained models are as follows:
|lightweight Chinese model|MobileNetV3|rec_chinese_lite_train.yml|[Download link](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn.tar)|
|General Chinese OCR model|Resnet34_vd|rec_chinese_common_train.yml|[Download link](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn.tar)|
|lightweight Chinese model|MobileNetV3|rec_chinese_lite_train.yml|[Download link](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn.tar)|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance_infer.tar) & [pre-trained model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_mv3_crnn_enhance.tar)|
|General Chinese OCR model|Resnet34_vd|rec_chinese_common_train.yml|[Download link](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn.tar)|[inference model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance_infer.tar) & [pre-trained model](https://paddleocr.bj.bcebos.com/ch_models/ch_rec_r34_vd_crnn_enhance.tar)|
Please refer to the document for training guide and use of PaddleOCR text recognition algorithms [Text recognition model training/evaluation/prediction](./doc/doc_en/recognition_en.md)
"Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES via export CUDA_VISIBLE_DEVICES=cuda_device_id."
)
self.config.ir_optim=True
self.config.gpu_mem=8000
#params for text detector
self.config.det_algorithm=det_algorithm
self.config.det_model_dir=det_model_dir
# self.config.det_model_dir = "./inference/det/"
#DB parmas
self.config.det_db_thresh=0.3
self.config.det_db_box_thresh=0.5
self.config.det_db_unclip_ratio=2.0
#EAST parmas
self.config.det_east_score_thresh=0.8
self.config.det_east_cover_thresh=0.1
self.config.det_east_nms_thresh=0.2
defread_images(self,paths=[]):
images=[]
forimg_pathinpaths:
assertos.path.isfile(
img_path),"The {} isn't a valid file.".format(img_path)
img=cv2.imread(img_path)
ifimgisNone:
logger.info("error in loading image:{}".format(img_path))
continue
images.append(img)
returnimages
defdet_text(self,
images=[],
paths=[],
det_max_side_len=960,
draw_img_save='ocr_det_result',
visualization=False):
"""
Get the text box in the predicted images.
Args:
images (list(numpy.ndarray)): images data, shape of each is [H, W, C]. If images not paths
paths (list[str]): The paths of images. If paths not images
use_gpu (bool): Whether to use gpu. Default false.
output_dir (str): The directory to store output images.
visualization (bool): Whether to save image or not.
box_thresh(float): the threshold of the detected text box's confidence
Returns:
res (list): The result of text detection box and save path of images.
"Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES via export CUDA_VISIBLE_DEVICES=cuda_device_id."
)
self.config.ir_optim=True
self.config.gpu_mem=8000
#params for text recognizer
self.config.rec_algorithm=rec_algorithm
self.config.rec_model_dir=rec_model_dir
# self.config.rec_model_dir = "./inference/rec/"
self.config.rec_image_shape="3, 32, 320"
self.config.rec_char_type='ch'
self.config.rec_batch_num=rec_batch_num
self.config.rec_char_dict_path=rec_char_dict_path
self.config.use_space_char=True
defread_images(self,paths=[]):
images=[]
forimg_pathinpaths:
assertos.path.isfile(
img_path),"The {} isn't a valid file.".format(img_path)
img=cv2.imread(img_path)
ifimgisNone:
logger.info("error in loading image:{}".format(img_path))
continue
images.append(img)
returnimages
defrec_text(self,
images=[],
paths=[]):
"""
Get the text box in the predicted images.
Args:
images (list(numpy.ndarray)): images data, shape of each is [H, W, C]. If images not paths
paths (list[str]): The paths of images. If paths not images
Returns:
res (list): The result of text detection box and save path of images.
"Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES via export CUDA_VISIBLE_DEVICES=cuda_device_id."
)
self.config.ir_optim=True
self.config.gpu_mem=8000
#params for text detector
self.config.det_algorithm=det_algorithm
self.config.det_model_dir=det_model_dir
# self.config.det_model_dir = "./inference/det/"
#DB parmas
self.config.det_db_thresh=0.3
self.config.det_db_box_thresh=0.5
self.config.det_db_unclip_ratio=2.0
#EAST parmas
self.config.det_east_score_thresh=0.8
self.config.det_east_cover_thresh=0.1
self.config.det_east_nms_thresh=0.2
#params for text recognizer
self.config.rec_algorithm=rec_algorithm
self.config.rec_model_dir=rec_model_dir
# self.config.rec_model_dir = "./inference/rec/"
self.config.rec_image_shape="3, 32, 320"
self.config.rec_char_type='ch'
self.config.rec_batch_num=rec_batch_num
self.config.rec_char_dict_path=rec_char_dict_path
self.config.use_space_char=True
defread_images(self,paths=[]):
images=[]
forimg_pathinpaths:
assertos.path.isfile(
img_path),"The {} isn't a valid file.".format(img_path)
img=cv2.imread(img_path)
ifimgisNone:
logger.info("error in loading image:{}".format(img_path))
continue
images.append(img)
returnimages
defrecognize_text(self,
images=[],
paths=[],
det_max_side_len=960,
draw_img_save='ocr_result',
visualization=False,
text_thresh=0.5):
"""
Get the chinese texts in the predicted images.
Args:
images (list(numpy.ndarray)): images data, shape of each is [H, W, C]. If images not paths
paths (list[str]): The paths of images. If paths not images
use_gpu (bool): Whether to use gpu.
batch_size(int): the program deals once with one
output_dir (str): The directory to store output images.
visualization (bool): Whether to save image or not.
box_thresh(float): the threshold of the detected text box's confidence
text_thresh(float): the threshold of the recognize chinese texts' confidence
Returns:
res (list): The result of chinese texts and save path of images.