Merge branch 'dygraph' into dygraph

191c9dee · Evezerest · GitHub · 3c6d5512 · 8def6786 · 191c9dee
Unverified Commit 191c9dee authored Mar 18, 2022 by Evezerest Committed by GitHub Mar 18, 2022
20 changed files
--- a/doc/doc_ch/recognition.md
+++ b/doc/doc_ch/recognition.md
@@ -11,6 +11,7 @@
    - [2.1 数据增强](#数据增强)
    - [2.2 通用模型训练](#通用模型训练)
    - [2.3 多语言模型训练](#多语言模型训练)
+    - [2.4 知识蒸馏训练](#知识蒸馏训练)
 - [3 评估](#评估)
 - [4 预测](#预测)
 - [5 转Inference模型测试](#Inference)
@@ -368,6 +369,13 @@ Eval:
    label_file_list: ["./train_data/french_val.txt"]
    ...
 ```
+
+<a name="知识蒸馏训练"></a>
+
+### 2.4 知识蒸馏训练
+
+PaddleOCR支持了基于知识蒸馏的文本识别模型训练过程，更多内容可以参考[知识蒸馏说明文档](./knowledge_distillation.md)。
+
 <a name="评估"></a>
 ## 3 评估


--- a/doc/doc_en/detection_en.md
+++ b/doc/doc_en/detection_en.md
@@ -9,6 +9,7 @@ This section uses the icdar2015 dataset as an example to introduce the training,
  * [2.1 Start Training](#21-start-training)
  * [2.2 Load Trained Model and Continue Training](#22-load-trained-model-and-continue-training)
  * [2.3 Training with New Backbone](#23-training-with-new-backbone)
+  * [2.4 Training with knowledge distillation](#24)
 - [3. Evaluation and Test](#3-evaluation-and-test)
  * [3.1 Evaluation](#31-evaluation)
  * [3.2 Test](#32-test)
@@ -174,6 +175,11 @@ After adding the four-part modules of the network, you only need to configure th

 **NOTE**: More details about replace Backbone and other mudule can be found in [doc](add_new_algorithm_en.md).

+
+### 2.4 Training with knowledge distillation
+
+Knowledge distillation is supported in PaddleOCR for text detection training process. For more details, please refer to [doc](./knowledge_distillation_en.md).
+
 ## 3. Evaluation and Test

 ### 3.1 Evaluation

--- a/doc/doc_en/models_list_en.md
+++ b/doc/doc_en/models_list_en.md
@@ -94,6 +94,8 @@ For more supported languages, please refer to : [Multi-language model](./multi_l
 ## 4. Paddle-Lite Model
 |Version|Introduction|Model size|Detection model|Text Direction model|Recognition model|Paddle-Lite branch|
 |---|---|---|---|---|---|---|
+|PP-OCRv2|extra-lightweight chinese OCR optimized model|11M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10|
+|PP-OCRv2(slim)|extra-lightweight chinese OCR optimized model|4.6M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_slim_opt.nb)|v2.10|
 |PP-OCRv2|extra-lightweight chinese OCR optimized model|11M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer_opt.nb)|v2.9|
 |PP-OCRv2(slim)|extra-lightweight chinese OCR optimized model|4.9M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_opt.nb)|v2.9|
 |V2.0|ppocr_v2.0 extra-lightweight chinese OCR optimized model|7.8M|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_opt.nb)|v2.9|

--- a/doc/doc_en/quickstart_en.md
+++ b/doc/doc_en/quickstart_en.md

 # PaddleOCR Quick Start

-[PaddleOCR Quick Start](#paddleocr-quick-start)
-
 + [1. Install PaddleOCR Whl Package](#1-install-paddleocr-whl-package)
 * [2. Easy-to-Use](#2-easy-to-use)
  + [2.1 Use by Command Line](#21-use-by-command-line)

--- a/doc/doc_en/recognition_en.md
+++ b/doc/doc_en/recognition_en.md
@@ -10,6 +10,7 @@
    - [2.1 Data Augmentation](#Data_Augmentation)
    - [2.2 General Training](#Training)
    - [2.3 Multi-language Training](#Multi_language)
+    - [2.4 Training with Knowledge Distillation](#kd)

 - [3. Evaluation](#EVALUATION)

@@ -361,6 +362,12 @@ Eval:
    ...
 ```

+<a name="kd"></a>
+
+### 2.4 Training with Knowledge Distillation
+
+Knowledge distillation is supported in PaddleOCR for text recognition training process. For more details, please refer to [doc](./knowledge_distillation_en.md).
+
 <a name="EVALUATION"></a>

 ## 3. Evalution

--- a/ppocr/data/imaug/__init__.py
+++ b/ppocr/data/imaug/__init__.py
@@ -22,7 +22,8 @@ from .make_shrink_map import MakeShrinkMap
 from .random_crop_data import EastRandomCropData, RandomCropImgMask
 from .make_pse_gt import MakePseGt

-from .rec_img_aug import RecAug, RecResizeImg, ClsResizeImg, SRNRecResizeImg, NRTRRecResizeImg, SARRecResizeImg
+from .rec_img_aug import RecAug, RecResizeImg, ClsResizeImg, \
+    SRNRecResizeImg, NRTRRecResizeImg, SARRecResizeImg, PRENResizeImg
 from .randaugment import RandAugment
 from .copy_paste import CopyPaste
 from .ColorJitter import ColorJitter
@@ -36,6 +37,9 @@ from .gen_table_mask import *

 from .vqa import *

+from .fce_aug import *
+from .fce_targets import FCENetTargets
+

 def transform(data, ops=None):
    """ transform """

--- a/ppocr/data/imaug/fce_aug.py
+++ b/ppocr/data/imaug/fce_aug.py
--- a/ppocr/data/imaug/fce_targets.py
+++ b/ppocr/data/imaug/fce_targets.py
--- a/ppocr/data/imaug/label_ops.py
+++ b/ppocr/data/imaug/label_ops.py
@@ -785,6 +785,53 @@ class SARLabelEncode(BaseRecLabelEncode):
        return [self.padding_idx]


+class PRENLabelEncode(BaseRecLabelEncode):
+    def __init__(self,
+                 max_text_length,
+                 character_dict_path,
+                 use_space_char=False,
+                 **kwargs):
+        super(PRENLabelEncode, self).__init__(
+            max_text_length, character_dict_path, use_space_char)
+
+    def add_special_char(self, dict_character):
+        padding_str = '<PAD>'  # 0 
+        end_str = '<EOS>'  # 1
+        unknown_str = '<UNK>'  # 2
+
+        dict_character = [padding_str, end_str, unknown_str] + dict_character
+        self.padding_idx = 0
+        self.end_idx = 1
+        self.unknown_idx = 2
+
+        return dict_character
+
+    def encode(self, text):
+        if len(text) == 0 or len(text) >= self.max_text_len:
+            return None
+        if self.lower:
+            text = text.lower()
+        text_list = []
+        for char in text:
+            if char not in self.dict:
+                text_list.append(self.unknown_idx)
+            else:
+                text_list.append(self.dict[char])
+        text_list.append(self.end_idx)
+        if len(text_list) < self.max_text_len:
+            text_list += [self.padding_idx] * (
+                self.max_text_len - len(text_list))
+        return text_list
+
+    def __call__(self, data):
+        text = data['label']
+        encoded_text = self.encode(text)
+        if encoded_text is None:
+            return None
+        data['label'] = np.array(encoded_text)
+        return data
+
+
 class VQATokenLabelEncode(object):
    """
    Label encode for NLP VQA methods

--- a/ppocr/data/imaug/operators.py
+++ b/ppocr/data/imaug/operators.py
@@ -23,14 +23,20 @@ import sys
 import six
 import cv2
 import numpy as np
+import math


 class DecodeImage(object):
    """ decode image """

-    def __init__(self, img_mode='RGB', channel_first=False, **kwargs):
+    def __init__(self,
+                 img_mode='RGB',
+                 channel_first=False,
+                 ignore_orientation=False,
+                 **kwargs):
        self.img_mode = img_mode
        self.channel_first = channel_first
+        self.ignore_orientation = ignore_orientation

    def __call__(self, data):
        img = data['image']
@@ -41,7 +47,11 @@ class DecodeImage(object):
            assert type(img) is bytes and len(
                img) > 0, "invalid input 'img' in DecodeImage"
        img = np.frombuffer(img, dtype='uint8')
-        img = cv2.imdecode(img, 1)
+        if self.ignore_orientation:
+            img = cv2.imdecode(img, cv2.IMREAD_IGNORE_ORIENTATION |
+                               cv2.IMREAD_COLOR)
+        else:
+            img = cv2.imdecode(img, 1)
        if img is None:
            return None
        if self.img_mode == 'GRAY':
@@ -156,6 +166,44 @@ class KeepKeys(object):
        return data_list


+class Pad(object):
+    def __init__(self, size=None, size_div=32, **kwargs):
+        if size is not None and not isinstance(size, (int, list, tuple)):
+            raise TypeError("Type of target_size is invalid. Now is {}".format(
+                type(size)))
+        if isinstance(size, int):
+            size = [size, size]
+        self.size = size
+        self.size_div = size_div
+
+    def __call__(self, data):
+
+        img = data['image']
+        img_h, img_w = img.shape[0], img.shape[1]
+        if self.size:
+            resize_h2, resize_w2 = self.size
+            assert (
+                img_h < resize_h2 and img_w < resize_w2
+            ), '(h, w) of target size should be greater than (img_h, img_w)'
+        else:
+            resize_h2 = max(
+                int(math.ceil(img.shape[0] / self.size_div) * self.size_div),
+                self.size_div)
+            resize_w2 = max(
+                int(math.ceil(img.shape[1] / self.size_div) * self.size_div),
+                self.size_div)
+        img = cv2.copyMakeBorder(
+            img,
+            0,
+            resize_h2 - img_h,
+            0,
+            resize_w2 - img_w,
+            cv2.BORDER_CONSTANT,
+            value=0)
+        data['image'] = img
+        return data
+
+
 class Resize(object):
    def __init__(self, size=(640, 640), **kwargs):
        self.size = size

--- a/ppocr/data/imaug/rec_img_aug.py
+++ b/ppocr/data/imaug/rec_img_aug.py
--- a/ppocr/data/simple_dataset.py
+++ b/ppocr/data/simple_dataset.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import numpy as np
 import os
+import json
 import random
 import traceback
 from paddle.io import Dataset

--- a/ppocr/losses/__init__.py
+++ b/ppocr/losses/__init__.py
--- a/ppocr/losses/basic_loss.py
+++ b/ppocr/losses/basic_loss.py
--- a/ppocr/losses/det_fce_loss.py
+++ b/ppocr/losses/det_fce_loss.py
--- a/ppocr/losses/rec_pren_loss.py
+++ b/ppocr/losses/rec_pren_loss.py
--- a/ppocr/metrics/__init__.py
+++ b/ppocr/metrics/__init__.py
--- a/ppocr/metrics/det_metric.py
+++ b/ppocr/metrics/det_metric.py
--- a/ppocr/modeling/backbones/__init__.py
+++ b/ppocr/modeling/backbones/__init__.py
--- a/ppocr/modeling/backbones/det_resnet_vd.py
+++ b/ppocr/modeling/backbones/det_resnet_vd.py