Commit fcc70660 authored by Leif's avatar Leif
Browse files

Merge remote-tracking branch 'origin/dygraph' into dygraph

parents 80aced81 013db618
This diff is collapsed.
try:
from PyQt5.QtGui import *
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
except ImportError:
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from libs.utils import newIcon
import time
import datetime
import json
import cv2
import numpy as np
BB = QDialogButtonBox
class DataPartitionDialog(QDialog):
def __init__(self, parent=None):
super().__init__()
self.parnet = parent
self.title = 'DATA PARTITION'
self.train_ratio = 70
self.val_ratio = 15
self.test_ratio = 15
self.initUI()
def initUI(self):
self.setWindowTitle(self.title)
self.setWindowModality(Qt.ApplicationModal)
self.flag_accept = True
if self.parnet.lang == 'ch':
msg = "导出JSON前请保存所有图像的标注且关闭EXCEL!"
else:
msg = "Please save all the annotations and close the EXCEL before exporting JSON!"
info_msg = QLabel(msg, self)
info_msg.setWordWrap(True)
info_msg.setStyleSheet("color: red")
info_msg.setFont(QFont('Arial', 12))
train_lbl = QLabel('Train split: ', self)
train_lbl.setFont(QFont('Arial', 15))
val_lbl = QLabel('Valid split: ', self)
val_lbl.setFont(QFont('Arial', 15))
test_lbl = QLabel('Test split: ', self)
test_lbl.setFont(QFont('Arial', 15))
self.train_input = QLineEdit(self)
self.train_input.setFont(QFont('Arial', 15))
self.val_input = QLineEdit(self)
self.val_input.setFont(QFont('Arial', 15))
self.test_input = QLineEdit(self)
self.test_input.setFont(QFont('Arial', 15))
self.train_input.setText(str(self.train_ratio))
self.val_input.setText(str(self.val_ratio))
self.test_input.setText(str(self.test_ratio))
validator = QIntValidator(0, 100)
self.train_input.setValidator(validator)
self.val_input.setValidator(validator)
self.test_input.setValidator(validator)
gridlayout = QGridLayout()
gridlayout.addWidget(info_msg, 0, 0, 1, 2)
gridlayout.addWidget(train_lbl, 1, 0)
gridlayout.addWidget(val_lbl, 2, 0)
gridlayout.addWidget(test_lbl, 3, 0)
gridlayout.addWidget(self.train_input, 1, 1)
gridlayout.addWidget(self.val_input, 2, 1)
gridlayout.addWidget(self.test_input, 3, 1)
bb = BB(BB.Ok | BB.Cancel, Qt.Horizontal, self)
bb.button(BB.Ok).setIcon(newIcon('done'))
bb.button(BB.Cancel).setIcon(newIcon('undo'))
bb.accepted.connect(self.validate)
bb.rejected.connect(self.cancel)
gridlayout.addWidget(bb, 4, 0, 1, 2)
self.setLayout(gridlayout)
self.show()
def validate(self):
self.flag_accept = True
self.accept()
def cancel(self):
self.flag_accept = False
self.reject()
def getStatus(self):
return self.flag_accept
def getDataPartition(self):
self.train_ratio = int(self.train_input.text())
self.val_ratio = int(self.val_input.text())
self.test_ratio = int(self.test_input.text())
return self.train_ratio, self.val_ratio, self.test_ratio
def closeEvent(self, event):
self.flag_accept = False
self.reject()
...@@ -161,6 +161,77 @@ def get_rotate_crop_image(img, points): ...@@ -161,6 +161,77 @@ def get_rotate_crop_image(img, points):
print(e) print(e)
def boxPad(box, imgShape, pad : int) -> np.array:
"""
Pad a box with [pad] pixels on each side.
"""
box = np.array(box, dtype=np.int32)
box[0][0], box[0][1] = box[0][0] - pad, box[0][1] - pad
box[1][0], box[1][1] = box[1][0] + pad, box[1][1] - pad
box[2][0], box[2][1] = box[2][0] + pad, box[2][1] + pad
box[3][0], box[3][1] = box[3][0] - pad, box[3][1] + pad
h, w, _ = imgShape
box[:,0] = np.clip(box[:,0], 0, w)
box[:,1] = np.clip(box[:,1], 0, h)
return box
def OBB2HBB(obb) -> np.array:
"""
Convert Oriented Bounding Box to Horizontal Bounding Box.
"""
hbb = np.zeros(4, dtype=np.int32)
hbb[0] = min(obb[:, 0])
hbb[1] = min(obb[:, 1])
hbb[2] = max(obb[:, 0])
hbb[3] = max(obb[:, 1])
return hbb
def expand_list(merged, html_list):
'''
Fill blanks according to merged cells
'''
sr, er, sc, ec = merged
for i in range(sr, er):
for j in range(sc, ec):
html_list[i][j] = None
html_list[sr][sc] = ''
if ec - sc > 1:
html_list[sr][sc] += " colspan={}".format(ec - sc)
if er - sr > 1:
html_list[sr][sc] += " rowspan={}".format(er - sr)
return html_list
def convert_token(html_list):
'''
Convert raw html to label format
'''
token_list = ["<tbody>"]
# final html list:
for row in html_list:
token_list.append("<tr>")
for col in row:
if col == None:
continue
elif col == 'td':
token_list.extend(["<td>", "</td>"])
else:
token_list.append("<td")
if 'colspan' in col:
_, n = col.split('colspan=')
token_list.append(" colspan=\"{}\"".format(n))
if 'rowspan' in col:
_, n = col.split('rowspan=')
token_list.append(" rowspan=\"{}\"".format(n))
token_list.extend([">", "</td>"])
token_list.append("</tr>")
token_list.append("</tbody>")
return token_list
def stepsInfo(lang='en'): def stepsInfo(lang='en'):
if lang == 'ch': if lang == 'ch':
msg = "1. 安装与运行:使用上述命令安装与运行程序。\n" \ msg = "1. 安装与运行:使用上述命令安装与运行程序。\n" \
......
...@@ -84,7 +84,7 @@ mhelp=Help ...@@ -84,7 +84,7 @@ mhelp=Help
iconList=Icon List iconList=Icon List
detectionBoxposition=Detection box position detectionBoxposition=Detection box position
recognitionResult=Recognition result recognitionResult=Recognition result
creatPolygon=Create Quadrilateral creatPolygon=Create PolygonBox
rotateLeft=Left turn 90 degrees rotateLeft=Left turn 90 degrees
rotateRight=Right turn 90 degrees rotateRight=Right turn 90 degrees
drawSquares=Draw Squares drawSquares=Draw Squares
...@@ -110,3 +110,6 @@ lockBoxDetail=Lock selected box/Unlock all box ...@@ -110,3 +110,6 @@ lockBoxDetail=Lock selected box/Unlock all box
keyListTitle=Key List keyListTitle=Key List
keyDialogTip=Enter object label keyDialogTip=Enter object label
keyChange=Change Box Key keyChange=Change Box Key
TableRecognition=Table Recognition
cellreRecognition=Cell Re-Recognition
exportJSON=export JSON(PubTabNet)
...@@ -84,7 +84,7 @@ mhelp=帮助 ...@@ -84,7 +84,7 @@ mhelp=帮助
iconList=缩略图 iconList=缩略图
detectionBoxposition=检测框位置 detectionBoxposition=检测框位置
recognitionResult=识别结果 recognitionResult=识别结果
creatPolygon=四点标注 creatPolygon=多边形标注
drawSquares=正方形标注 drawSquares=正方形标注
rotateLeft=图片左旋转90度 rotateLeft=图片左旋转90度
rotateRight=图片右旋转90度 rotateRight=图片右旋转90度
...@@ -109,4 +109,7 @@ lockBox=锁定框/解除锁定框 ...@@ -109,4 +109,7 @@ lockBox=锁定框/解除锁定框
lockBoxDetail=若当前没有框处于锁定状态则锁定选中的框,若存在锁定框则解除所有锁定框的锁定状态 lockBoxDetail=若当前没有框处于锁定状态则锁定选中的框,若存在锁定框则解除所有锁定框的锁定状态
keyListTitle=关键词列表 keyListTitle=关键词列表
keyDialogTip=请输入类型名称 keyDialogTip=请输入类型名称
keyChange=更改Box关键字类别 keyChange=更改Box关键字类别
\ No newline at end of file TableRecognition=表格识别
cellreRecognition=单元格重识别
exportJSON=导出表格JSON标注
\ No newline at end of file
...@@ -19,12 +19,9 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools ...@@ -19,12 +19,9 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools
**Recent updates** **Recent updates**
- 2021.12.21 OCR open source online course starts. The lesson starts at 8:30 every night and lasts for ten days. Free registration: https://aistudio.baidu.com/aistudio/course/introduce/25207 - 2021.12.21 release PaddleOCR v2.4, release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR), 1 key information extraction algorithm (SDMGR, [tutorial](./ppstructure/docs/kie_en.md)) and 3 DocVQA algorithms (LayoutLM, LayoutLMv2, LayoutXLM, [tutorial](./ppstructure/vqa)).
- 2021.12.21 release PaddleOCR v2.4, release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR), 1 key information extraction algorithm (SDMGR, [tutorial](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/docs/kie.md)) and 3 DocVQA algorithms (LayoutLM, LayoutLMv2, LayoutXLM, [tutorial](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4/ppstructure/vqa)). - 2021.9.7 release PaddleOCR v2.3, [PP-OCRv2](./doc/doc_en/ppocr_introduction_en.md#pp-ocrv2) is proposed. The inference speed of PP-OCRv2 is 220% higher than that of PP-OCR server in CPU device. The F-score of PP-OCRv2 is 7% higher than that of PP-OCR mobile.
- PaddleOCR R&D team would like to share the key points of PP-OCRv2, at 20:15 pm on September 8th, [Course Address](https://aistudio.baidu.com/aistudio/education/group/info/6758). - 2021.8.3 released PaddleOCR v2.2, add a new structured documents analysis toolkit, i.e., [PP-Structure](./ppstructure/README.md), support layout analysis and table recognition (One-key to export chart images to Excel files).
- 2021.9.7 release PaddleOCR v2.3, [PP-OCRv2](#PP-OCRv2) is proposed. The inference speed of PP-OCRv2 is 220% higher than that of PP-OCR server in CPU device. The F-score of PP-OCRv2 is 7% higher than that of PP-OCR mobile.
- 2021.8.3 released PaddleOCR v2.2, add a new structured documents analysis toolkit, i.e., [PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README.md), support layout analysis and table recognition (One-key to export chart images to Excel files).
- 2021.4.8 release end-to-end text recognition algorithm [PGNet](https://www.aaai.org/AAAI21Papers/AAAI-2885.WangP.pdf) which is published in AAAI 2021. Find tutorial [here](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/pgnet_en.md);release multi language recognition [models](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/multi_languages_en.md), support more than 80 languages recognition; especically, the performance of [English recognition model](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/models_list_en.md#English) is Optimized.
- [more](./doc/doc_en/update_en.md) - [more](./doc/doc_en/update_en.md)
...@@ -81,7 +78,6 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel ...@@ -81,7 +78,6 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel
## Tutorials ## Tutorials
- [Environment Preparation](./doc/doc_en/environment_en.md) - [Environment Preparation](./doc/doc_en/environment_en.md)
- [Quick Start](./doc/doc_en/quickstart_en.md)
- [PP-OCR 🔥](./doc/doc_en/ppocr_introduction_en.md) - [PP-OCR 🔥](./doc/doc_en/ppocr_introduction_en.md)
- [Quick Start](./doc/doc_en/quickstart_en.md) - [Quick Start](./doc/doc_en/quickstart_en.md)
- [Model Zoo](./doc/doc_en/models_en.md) - [Model Zoo](./doc/doc_en/models_en.md)
......
...@@ -27,10 +27,9 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 ...@@ -27,10 +27,9 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
## 近期更新 ## 近期更新
- 2021.12.21《动手学OCR · 十讲》课程开讲,12月21日起每晚八点半线上授课![免费报名地址](https://aistudio.baidu.com/aistudio/course/introduce/25207) - 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR,[文档](./ppstructure/docs/kie.md)),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM,[文档](./ppstructure/vqa))。
- 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR,[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.4/ppstructure/docs/kie.md)),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM,[文档](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.4/ppstructure/vqa))。 - 2021.9.7 发布PaddleOCR v2.3与[PP-OCRv2](./doc/doc_ch/ppocr_introduction.md#pp-ocrv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。
- 2021.9.7 发布PaddleOCR v2.3与[PP-OCRv2](#PP-OCRv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。 - 2021.8.3 发布PaddleOCR v2.2,新增文档结构分析[PP-Structure](./ppstructure/README_ch.md)工具包,支持版面分析与表格识别(含Excel导出)。
- 2021.8.3 发布PaddleOCR v2.2,新增文档结构分析[PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README_ch.md)工具包,支持版面分析与表格识别(含Excel导出)。
> [更多](./doc/doc_ch/update.md) > [更多](./doc/doc_ch/update.md)
...@@ -83,7 +82,6 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 ...@@ -83,7 +82,6 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力
## 文档教程 ## 文档教程
- [运行环境准备](./doc/doc_ch/environment.md) - [运行环境准备](./doc/doc_ch/environment.md)
- [快速开始(中英文/多语言/文档分析)](./doc/doc_ch/quickstart.md)
- [PP-OCR文本检测识别🔥](./doc/doc_ch/ppocr_introduction.md) - [PP-OCR文本检测识别🔥](./doc/doc_ch/ppocr_introduction.md)
- [快速开始](./doc/doc_ch/quickstart.md) - [快速开始](./doc/doc_ch/quickstart.md)
- [模型库](./doc/doc_ch/models_list.md) - [模型库](./doc/doc_ch/models_list.md)
......
...@@ -129,7 +129,7 @@ Loss: ...@@ -129,7 +129,7 @@ Loss:
key: head_out key: head_out
multi_head: True multi_head: True
- DistillationSARLoss: - DistillationSARLoss:
weight: 0.5 weight: 1.0
model_name_list: ["Student", "Teacher"] model_name_list: ["Student", "Teacher"]
key: head_out key: head_out
multi_head: True multi_head: True
......
...@@ -166,6 +166,10 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) { ...@@ -166,6 +166,10 @@ void CRNNRecognizer::LoadModel(const std::string &model_dir) {
config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_); config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_);
} }
// get pass_builder object
auto pass_builder = config.pass_builder();
// delete "matmul_transpose_reshape_fuse_pass"
pass_builder->DeletePass("matmul_transpose_reshape_fuse_pass");
config.SwitchUseFeedFetchOps(false); config.SwitchUseFeedFetchOps(false);
// true for multiple input // true for multiple input
config.SwitchSpecifyInputNames(true); config.SwitchSpecifyInputNames(true);
......
...@@ -36,8 +36,8 @@ op: ...@@ -36,8 +36,8 @@ op:
#det模型路径 #det模型路径
model_config: ./ppocr_det_v3_serving model_config: ./ppocr_det_v3_serving
#Fetch结果列表,以client_config中fetch_var的alias_name为准 #Fetch结果列表,以client_config中fetch_var的alias_name为准,不设置默认取全部输出变量
fetch_list: ["sigmoid_0.tmp_0"] #fetch_list: ["sigmoid_0.tmp_0"]
#计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡 #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
devices: "0" devices: "0"
...@@ -62,8 +62,8 @@ op: ...@@ -62,8 +62,8 @@ op:
#rec模型路径 #rec模型路径
model_config: ./ppocr_rec_v3_serving model_config: ./ppocr_rec_v3_serving
#Fetch结果列表,以client_config中fetch_var的alias_name为准 #Fetch结果列表,以client_config中fetch_var的alias_name为准, 不设置默认取全部输出变量
fetch_list: ["softmax_5.tmp_0"] #fetch_list:
#计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡 #计算硬件ID,当devices为""或不写时为CPU预测;当devices为"0", "0,1,2"时为GPU预测,表示使用的GPU卡
devices: "0" devices: "0"
......
...@@ -393,7 +393,7 @@ class OCRReader(object): ...@@ -393,7 +393,7 @@ class OCRReader(object):
return norm_img_batch[0] return norm_img_batch[0]
def postprocess(self, outputs, with_score=False): def postprocess(self, outputs, with_score=False):
preds = outputs["softmax_5.tmp_0"] preds = list(outputs.values())[0]
try: try:
preds = preds.numpy() preds = preds.numpy()
except: except:
...@@ -404,8 +404,11 @@ class OCRReader(object): ...@@ -404,8 +404,11 @@ class OCRReader(object):
preds_idx, preds_prob, is_remove_duplicate=True) preds_idx, preds_prob, is_remove_duplicate=True)
return text return text
from argparse import ArgumentParser,RawDescriptionHelpFormatter
from argparse import ArgumentParser, RawDescriptionHelpFormatter
import yaml import yaml
class ArgsParser(ArgumentParser): class ArgsParser(ArgumentParser):
def __init__(self): def __init__(self):
super(ArgsParser, self).__init__( super(ArgsParser, self).__init__(
...@@ -441,16 +444,16 @@ class ArgsParser(ArgumentParser): ...@@ -441,16 +444,16 @@ class ArgsParser(ArgumentParser):
s = s.strip() s = s.strip()
k, v = s.split('=') k, v = s.split('=')
v = self._parse_helper(v) v = self._parse_helper(v)
print(k,v, type(v)) print(k, v, type(v))
cur = config cur = config
parent = cur parent = cur
for kk in k.split("."): for kk in k.split("."):
if kk not in cur: if kk not in cur:
cur[kk] = {} cur[kk] = {}
parent = cur parent = cur
cur = cur[kk] cur = cur[kk]
else: else:
parent = cur parent = cur
cur = cur[kk] cur = cur[kk]
parent[k.split(".")[-1]] = v parent[k.split(".")[-1]] = v
return config return config
\ No newline at end of file
...@@ -56,7 +56,7 @@ class DetOp(Op): ...@@ -56,7 +56,7 @@ class DetOp(Op):
return {"x": det_img[np.newaxis, :].copy()}, False, None, "" return {"x": det_img[np.newaxis, :].copy()}, False, None, ""
def postprocess(self, input_dicts, fetch_dict, data_id, log_id): def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
det_out = fetch_dict["sigmoid_0.tmp_0"] det_out = list(fetch_dict.values())[0]
ratio_list = [ ratio_list = [
float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
] ]
......
...@@ -55,7 +55,7 @@ class DetOp(Op): ...@@ -55,7 +55,7 @@ class DetOp(Op):
return {"x": det_img[np.newaxis, :].copy()}, False, None, "" return {"x": det_img[np.newaxis, :].copy()}, False, None, ""
def postprocess(self, input_dicts, fetch_dict, data_id, log_id): def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
det_out = fetch_dict["sigmoid_0.tmp_0"] det_out = list(fetch_dict.values())[0]
ratio_list = [ ratio_list = [
float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w float(self.new_h) / self.ori_h, float(self.new_w) / self.ori_w
] ]
......
This diff is collapsed.
This diff is collapsed.
...@@ -38,8 +38,9 @@ PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模 ...@@ -38,8 +38,9 @@ PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模
#### PP-OCRv3 #### PP-OCRv3
PP-OCRv3在PP-OCRv2的基础上进一步升级。检测模型仍然基于DB算法,优化策略采用了带残差注意力机制的FPN结构RSEFPN、增大感受野的PAN结构LKPAN、基于DML训练的更优的教师模型;识别模型将base模型从CRNN替换成了IJCAI 2022论文[SVTR](https://arxiv.org/abs/2205.00159),并采用SVTR轻量化、带指导训练CTC、数据增广策略RecConAug、自监督训练的更好的预训练模型、无标签数据的使用进行模型加速和效果提升。更多细节请参考PP-OCRv3[技术报告](./PP-OCRv3_introduction.md) PP-OCRv3在PP-OCRv2的基础上,针对检测模型和识别模型,进行了共计9个方面的升级:
- PP-OCRv3检测模型对PP-OCRv2中的CML协同互学习文本检测蒸馏策略进行了升级,分别针对教师模型和学生模型进行进一步效果优化。其中,在对教师模型优化时,提出了大感受野的PAN结构LK-PAN和引入了DML蒸馏策略;在对学生模型优化时,提出了残差注意力机制的FPN结构RSE-FPN。
- PP-OCRv3的识别模块是基于文本识别算法[SVTR](https://arxiv.org/abs/2205.00159)优化。SVTR不再采用RNN结构,通过引入Transformers结构更加有效地挖掘文本行图像的上下文信息,从而提升文本识别能力。PP-OCRv3通过轻量级文本识别网络SVTR_LCNet、Attention损失指导CTC损失训练策略、挖掘文字上下文信息的数据增广策略TextConAug、TextRotNet自监督预训练模型、UDML联合互学习策略、UIM无标注数据挖掘方案,6个方面进行模型加速和效果提升。
PP-OCRv3系统pipeline如下: PP-OCRv3系统pipeline如下:
...@@ -47,6 +48,9 @@ PP-OCRv3系统pipeline如下: ...@@ -47,6 +48,9 @@ PP-OCRv3系统pipeline如下:
<img src="../ppocrv3_framework.png" width="800"> <img src="../ppocrv3_framework.png" width="800">
</div> </div>
更多细节请参考PP-OCRv3[技术报告](./PP-OCRv3_introduction.md)
<a name="2"></a> <a name="2"></a>
## 2. 特性 ## 2. 特性
......
...@@ -59,15 +59,13 @@ cd /path/to/ppocr_img ...@@ -59,15 +59,13 @@ cd /path/to/ppocr_img
如果不使用提供的测试图片,可以将下方`--image_dir`参数替换为相应的测试图片路径。 如果不使用提供的测试图片,可以将下方`--image_dir`参数替换为相应的测试图片路径。
**注意** whl包默认使用`PP-OCRv3`模型,识别模型使用的输入shape为`3,48,320`, 因此如果使用识别功能,需要添加参数`--rec_image_shape 3,48,320`,如果不使用默认的`PP-OCRv3`模型,则无需设置该参数。
<a name="211"></a> <a name="211"></a>
#### 2.1.1 中英文模型 #### 2.1.1 中英文模型
* 检测+方向分类器+识别全流程:`--use_angle_cls true`设置使用方向分类器识别180度旋转文字,`--use_gpu false`设置不使用GPU * 检测+方向分类器+识别全流程:`--use_angle_cls true`设置使用方向分类器识别180度旋转文字,`--use_gpu false`设置不使用GPU
```bash ```bash
paddleocr --image_dir ./imgs/11.jpg --use_angle_cls true --use_gpu false --rec_image_shape 3,48,320 paddleocr --image_dir ./imgs/11.jpg --use_angle_cls true --use_gpu false
``` ```
结果是一个list,每个item包含了文本框,文字和识别置信度 结果是一个list,每个item包含了文本框,文字和识别置信度
...@@ -94,7 +92,7 @@ cd /path/to/ppocr_img ...@@ -94,7 +92,7 @@ cd /path/to/ppocr_img
- 单独使用识别:设置`--det``false` - 单独使用识别:设置`--det``false`
```bash ```bash
paddleocr --image_dir ./imgs_words/ch/word_1.jpg --det false --rec_image_shape 3,48,320 paddleocr --image_dir ./imgs_words/ch/word_1.jpg --det false
``` ```
结果是一个list,每个item只包含识别结果和识别置信度 结果是一个list,每个item只包含识别结果和识别置信度
...@@ -104,16 +102,16 @@ cd /path/to/ppocr_img ...@@ -104,16 +102,16 @@ cd /path/to/ppocr_img
``` ```
如需使用2.0模型,请指定参数`--version PP-OCR`,paddleocr默认使用PP-OCRv3模型(`--versioin PP-OCRv3`)。更多whl包使用可参考[whl包文档](./whl.md) 如需使用2.0模型,请指定参数`--ocr_version PP-OCR`,paddleocr默认使用PP-OCRv3模型(`--ocr_version PP-OCRv3`)。更多whl包使用可参考[whl包文档](./whl.md)
<a name="212"></a> <a name="212"></a>
#### 2.1.2 多语言模型 #### 2.1.2 多语言模型
Paddleocr目前支持80个语种,可以通过修改`--lang`参数进行切换,对于英文模型,指定`--lang=en`, PP-OCRv3目前只支持中文和英文模型,其他多语言模型会陆续更新 PaddleOCR目前支持80个语种,可以通过修改`--lang`参数进行切换,对于英文模型,指定`--lang=en`
``` bash ``` bash
paddleocr --image_dir ./imgs_en/254.jpg --lang=en --rec_image_shape 3,48,320 paddleocr --image_dir ./imgs_en/254.jpg --lang=en
``` ```
<div align="center"> <div align="center">
......
# 更新 # 更新
- 2022.5.7 添加对[Weights & Biases](https://docs.wandb.ai/)训练日志记录工具的支持。
- 2021.12.21 《OCR十讲》课程开讲,12月21日起每晚八点半线上授课! 【免费】报名地址:https://aistudio.baidu.com/aistudio/course/introduce/25207 - 2021.12.21 《OCR十讲》课程开讲,12月21日起每晚八点半线上授课! 【免费】报名地址:https://aistudio.baidu.com/aistudio/course/introduce/25207
- 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM)。 - 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM)。
- 2021.9.7 发布PaddleOCR v2.3,发布[PP-OCRv2](#PP-OCRv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。 - 2021.9.7 发布PaddleOCR v2.3,发布[PP-OCRv2](#PP-OCRv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7%。
......
...@@ -199,12 +199,10 @@ for line in result: ...@@ -199,12 +199,10 @@ for line in result:
paddleocr -h paddleocr -h
``` ```
**注意** whl包默认使用`PP-OCRv3`模型,识别模型使用的输入shape为`3,48,320`, 因此如果使用识别功能,需要添加参数`--rec_image_shape 3,48,320`,如果不使用默认的`PP-OCRv3`模型,则无需设置该参数。
* 检测+方向分类器+识别全流程 * 检测+方向分类器+识别全流程
```bash ```bash
paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true --rec_image_shape 3,48,320 paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true
``` ```
结果是一个list,每个item包含了文本框,文字和识别置信度 结果是一个list,每个item包含了文本框,文字和识别置信度
...@@ -217,7 +215,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true --rec_image ...@@ -217,7 +215,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true --rec_image
* 检测+识别 * 检测+识别
```bash ```bash
paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec_image_shape 3,48,320 paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg
``` ```
结果是一个list,每个item包含了文本框,文字和识别置信度 结果是一个list,每个item包含了文本框,文字和识别置信度
...@@ -230,7 +228,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec_image_shape 3,48,320 ...@@ -230,7 +228,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec_image_shape 3,48,320
* 方向分类器+识别 * 方向分类器+识别
```bash ```bash
paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false --rec_image_shape 3,48,320 paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false
``` ```
结果是一个list,每个item只包含识别结果和识别置信度 结果是一个list,每个item只包含识别结果和识别置信度
...@@ -256,7 +254,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec false ...@@ -256,7 +254,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec false
* 单独执行识别 * 单独执行识别
```bash ```bash
paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --det false --rec_image_shape 3,48,320 paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --det false
``` ```
结果是一个list,每个item只包含识别结果和识别置信度 结果是一个list,每个item只包含识别结果和识别置信度
...@@ -416,4 +414,4 @@ im_show.save('result.jpg') ...@@ -416,4 +414,4 @@ im_show.save('result.jpg')
| cls | 前向时是否启动分类 (命令行模式下使用use_angle_cls控制前向是否启动分类) | FALSE | | cls | 前向时是否启动分类 (命令行模式下使用use_angle_cls控制前向是否启动分类) | FALSE |
| show_log | 是否打印logger信息 | FALSE | | show_log | 是否打印logger信息 | FALSE |
| type | 执行ocr或者表格结构化, 值可选['ocr','structure'] | ocr | | type | 执行ocr或者表格结构化, 值可选['ocr','structure'] | ocr |
| ocr_version | OCR模型版本,可选PP-OCRv3, PP-OCRv2, PP-OCR。PP-OCRv3 目前仅支持中、英文的检测识别模型,方向分类器模型;PP-OCRv2 目前仅支持中文的检测和识别模型;PP-OCR支持中文的检测,识别,多语种识别,方向分类器等模型 | PP-OCRv3 | | ocr_version | OCR模型版本,可选PP-OCRv3, PP-OCRv2, PP-OCR。PP-OCRv3 支持中、英文的检测识别、多语种识别,方向分类器模型;PP-OCRv2 目前仅支持中文的检测和识别模型;PP-OCR支持中文的检测,识别,多语种识别,方向分类器等模型 | PP-OCRv3 |
...@@ -36,6 +36,7 @@ Take rec_chinese_lite_train_v2.0.yml as an example ...@@ -36,6 +36,7 @@ Take rec_chinese_lite_train_v2.0.yml as an example
| pretrained_model | Set the path of the pre-trained model | ./pretrain_models/CRNN/best_accuracy | \ | | pretrained_model | Set the path of the pre-trained model | ./pretrain_models/CRNN/best_accuracy | \ |
| checkpoints | set model parameter path | None | Used to load parameters after interruption to continue training| | checkpoints | set model parameter path | None | Used to load parameters after interruption to continue training|
| use_visualdl | Set whether to enable visualdl for visual log display | False | [Tutorial](https://www.paddlepaddle.org.cn/paddle/visualdl) | | use_visualdl | Set whether to enable visualdl for visual log display | False | [Tutorial](https://www.paddlepaddle.org.cn/paddle/visualdl) |
| use_wandb | Set whether to enable W&B for visual log display | False | [Documentation](https://docs.wandb.ai/)
| infer_img | Set inference image path or folder path | ./infer_img | \|| | infer_img | Set inference image path or folder path | ./infer_img | \||
| character_dict_path | Set dictionary path | ./ppocr/utils/ppocr_keys_v1.txt | If the character_dict_path is None, model can only recognize number and lower letters | | character_dict_path | Set dictionary path | ./ppocr/utils/ppocr_keys_v1.txt | If the character_dict_path is None, model can only recognize number and lower letters |
| max_text_length | Set the maximum length of text | 25 | \ | | max_text_length | Set the maximum length of text | 25 | \ |
...@@ -66,7 +67,7 @@ In PaddleOCR, the network is divided into four stages: Transform, Backbone, Neck ...@@ -66,7 +67,7 @@ In PaddleOCR, the network is divided into four stages: Transform, Backbone, Neck
| :---------------------: | :---------------------: | :--------------: | :--------------------: | | :---------------------: | :---------------------: | :--------------: | :--------------------: |
| model_type | Network Type | rec | Currently support`rec`,`det`,`cls` | | model_type | Network Type | rec | Currently support`rec`,`det`,`cls` |
| algorithm | Model name | CRNN | See [algorithm_overview](./algorithm_overview_en.md) for the support list | | algorithm | Model name | CRNN | See [algorithm_overview](./algorithm_overview_en.md) for the support list |
| **Transform** | Set the transformation method | - | Currently only recognition algorithms are supported, see [ppocr/modeling/transforms](../../ppocr/modeling/transforms) for details | | **Transform** | Set the transformation method | - | Currently only recognition algorithms are supported, see [ppocr/modeling/transform](../../ppocr/modeling/transforms) for details |
| name | Transformation class name | TPS | Currently supports `TPS` | | name | Transformation class name | TPS | Currently supports `TPS` |
| num_fiducial | Number of TPS control points | 20 | Ten on the top and bottom | | num_fiducial | Number of TPS control points | 20 | Ten on the top and bottom |
| loc_lr | Localization network learning rate | 0.1 | \ | | loc_lr | Localization network learning rate | 0.1 | \ |
...@@ -130,6 +131,17 @@ In PaddleOCR, the network is divided into four stages: Transform, Backbone, Neck ...@@ -130,6 +131,17 @@ In PaddleOCR, the network is divided into four stages: Transform, Backbone, Neck
| drop_last | Whether to discard the last incomplete mini-batch because the number of samples in the data set cannot be divisible by batch_size | True | \ | | drop_last | Whether to discard the last incomplete mini-batch because the number of samples in the data set cannot be divisible by batch_size | True | \ |
| num_workers | The number of sub-processes used to load data, if it is 0, the sub-process is not started, and the data is loaded in the main process | 8 | \ | | num_workers | The number of sub-processes used to load data, if it is 0, the sub-process is not started, and the data is loaded in the main process | 8 | \ |
### Weights & Biases ([W&B](../../ppocr/utils/loggers/wandb_logger.py))
| Parameter | Use | Defaults | Note |
| :---------------------: | :---------------------: | :--------------: | :--------------------: |
| project | Project to which the run is to be logged | uncategorized | \
| name | Alias/Name of the run | Randomly generated by wandb | \
| id | ID of the run | Randomly generated by wandb | \
| entity | User or team to which the run is being logged | The logged in user | \
| save_dir | local directory in which all the models and other data is saved | wandb | \
| config | model configuration | None | \
<a name="3-multilingual-config-file-generation"></a> <a name="3-multilingual-config-file-generation"></a>
## 3. Multilingual Config File Generation ## 3. Multilingual Config File Generation
...@@ -233,4 +245,4 @@ For more supported languages, please refer to : [Multi-language model](https://g ...@@ -233,4 +245,4 @@ For more supported languages, please refer to : [Multi-language model](https://g
The multi-language model training method is the same as the Chinese model. The training data set is 100w synthetic data. A small amount of fonts and test data can be downloaded using the following two methods. The multi-language model training method is the same as the Chinese model. The training data set is 100w synthetic data. A small amount of fonts and test data can be downloaded using the following two methods.
* [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA),Extraction code:frgi. * [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA),Extraction code:frgi.
* [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) * [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment