Commit 6c7ff9c7 authored by LDOUBLEV's avatar LDOUBLEV
Browse files

fix conflict

parents ac91a9e1 9b8f587e
...@@ -47,16 +47,13 @@ void Normalize::Run(cv::Mat *im, const std::vector<float> &mean, ...@@ -47,16 +47,13 @@ void Normalize::Run(cv::Mat *im, const std::vector<float> &mean,
e /= 255.0; e /= 255.0;
} }
(*im).convertTo(*im, CV_32FC3, e); (*im).convertTo(*im, CV_32FC3, e);
for (int h = 0; h < im->rows; h++) { std::vector<cv::Mat> bgr_channels(3);
for (int w = 0; w < im->cols; w++) { cv::split(*im, bgr_channels);
im->at<cv::Vec3f>(h, w)[0] = for (auto i = 0; i < bgr_channels.size(); i++) {
(im->at<cv::Vec3f>(h, w)[0] - mean[0]) * scale[0]; bgr_channels[i].convertTo(bgr_channels[i], CV_32FC1, 1.0 * scale[i],
im->at<cv::Vec3f>(h, w)[1] = (0.0 - mean[i]) * scale[i]);
(im->at<cv::Vec3f>(h, w)[1] - mean[1]) * scale[1];
im->at<cv::Vec3f>(h, w)[2] =
(im->at<cv::Vec3f>(h, w)[2] - mean[2]) * scale[2];
}
} }
cv::merge(bgr_channels, *im);
} }
void ResizeImgType0::Run(const cv::Mat &img, cv::Mat &resize_img, void ResizeImgType0::Run(const cv::Mat &img, cv::Mat &resize_img,
......
...@@ -29,7 +29,8 @@ deploy/hubserving/ocr_system/ ...@@ -29,7 +29,8 @@ deploy/hubserving/ocr_system/
### 1. 准备环境 ### 1. 准备环境
```shell ```shell
# 安装paddlehub # 安装paddlehub
pip3 install paddlehub --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple # paddlehub 需要 python>3.6.2
pip3 install paddlehub==2.1.0 --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple
``` ```
### 2. 下载推理模型 ### 2. 下载推理模型
......
...@@ -30,7 +30,8 @@ The following steps take the 2-stage series service as an example. If only the d ...@@ -30,7 +30,8 @@ The following steps take the 2-stage series service as an example. If only the d
### 1. Prepare the environment ### 1. Prepare the environment
```shell ```shell
# Install paddlehub # Install paddlehub
pip3 install paddlehub --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple # python>3.6.2 is required bt paddlehub
pip3 install paddlehub==2.1.0 --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple
``` ```
### 2. Download inference model ### 2. Download inference model
......
...@@ -37,6 +37,17 @@ from paddleslim.dygraph.quant import QAT ...@@ -37,6 +37,17 @@ from paddleslim.dygraph.quant import QAT
from ppocr.data import build_dataloader from ppocr.data import build_dataloader
def export_single_model(quanter, model, infer_shape, save_path, logger):
quanter.save_quantized_model(
model,
save_path,
input_spec=[
paddle.static.InputSpec(
shape=[None] + infer_shape, dtype='float32')
])
logger.info('inference QAT model is saved to {}'.format(save_path))
def main(): def main():
############################################################################################################ ############################################################################################################
# 1. quantization configs # 1. quantization configs
...@@ -76,14 +87,21 @@ def main(): ...@@ -76,14 +87,21 @@ def main():
# for rec algorithm # for rec algorithm
if hasattr(post_process_class, 'character'): if hasattr(post_process_class, 'character'):
char_num = len(getattr(post_process_class, 'character')) char_num = len(getattr(post_process_class, 'character'))
config['Architecture']["Head"]['out_channels'] = char_num if config['Architecture']["algorithm"] in ["Distillation",
]: # distillation model
for key in config['Architecture']["Models"]:
config['Architecture']["Models"][key]["Head"][
'out_channels'] = char_num
else: # base rec model
config['Architecture']["Head"]['out_channels'] = char_num
model = build_model(config['Architecture']) model = build_model(config['Architecture'])
# get QAT model # get QAT model
quanter = QAT(config=quant_config) quanter = QAT(config=quant_config)
quanter.quantize(model) quanter.quantize(model)
init_model(config, model, logger) init_model(config, model)
model.eval() model.eval()
# build metric # build metric
...@@ -92,25 +110,30 @@ def main(): ...@@ -92,25 +110,30 @@ def main():
# build dataloader # build dataloader
valid_dataloader = build_dataloader(config, 'Eval', device, logger) valid_dataloader = build_dataloader(config, 'Eval', device, logger)
use_srn = config['Architecture']['algorithm'] == "SRN"
model_type = config['Architecture']['model_type']
# start eval # start eval
metirc = program.eval(model, valid_dataloader, post_process_class, metric = program.eval(model, valid_dataloader, post_process_class,
eval_class) eval_class, model_type, use_srn)
logger.info('metric eval ***************') logger.info('metric eval ***************')
for k, v in metirc.items(): for k, v in metric.items():
logger.info('{}:{}'.format(k, v)) logger.info('{}:{}'.format(k, v))
save_path = '{}/inference'.format(config['Global']['save_inference_dir'])
infer_shape = [3, 32, 100] if config['Architecture'][ infer_shape = [3, 32, 100] if config['Architecture'][
'model_type'] != "det" else [3, 640, 640] 'model_type'] != "det" else [3, 640, 640]
quanter.save_quantized_model( save_path = config["Global"]["save_inference_dir"]
model,
save_path, arch_config = config["Architecture"]
input_spec=[ if arch_config["algorithm"] in ["Distillation", ]: # distillation model
paddle.static.InputSpec( for idx, name in enumerate(model.model_name_list):
shape=[None] + infer_shape, dtype='float32') sub_model_save_path = os.path.join(save_path, name, "inference")
]) export_single_model(quanter, model.model_list[idx], infer_shape,
logger.info('inference QAT model is saved to {}'.format(save_path)) sub_model_save_path, logger)
else:
save_path = os.path.join(save_path, "inference")
export_single_model(quanter, model, infer_shape, save_path, logger)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -109,9 +109,18 @@ def main(config, device, logger, vdl_writer): ...@@ -109,9 +109,18 @@ def main(config, device, logger, vdl_writer):
# for rec algorithm # for rec algorithm
if hasattr(post_process_class, 'character'): if hasattr(post_process_class, 'character'):
char_num = len(getattr(post_process_class, 'character')) char_num = len(getattr(post_process_class, 'character'))
config['Architecture']["Head"]['out_channels'] = char_num if config['Architecture']["algorithm"] in ["Distillation",
]: # distillation model
for key in config['Architecture']["Models"]:
config['Architecture']["Models"][key]["Head"][
'out_channels'] = char_num
else: # base rec model
config['Architecture']["Head"]['out_channels'] = char_num
model = build_model(config['Architecture']) model = build_model(config['Architecture'])
quanter = QAT(config=quant_config, act_preprocess=PACT)
quanter.quantize(model)
if config['Global']['distributed']: if config['Global']['distributed']:
model = paddle.DataParallel(model) model = paddle.DataParallel(model)
...@@ -132,8 +141,6 @@ def main(config, device, logger, vdl_writer): ...@@ -132,8 +141,6 @@ def main(config, device, logger, vdl_writer):
logger.info('train dataloader has {} iters, valid dataloader has {} iters'. logger.info('train dataloader has {} iters, valid dataloader has {} iters'.
format(len(train_dataloader), len(valid_dataloader))) format(len(train_dataloader), len(valid_dataloader)))
quanter = QAT(config=quant_config, act_preprocess=PACT)
quanter.quantize(model)
# start train # start train
program.train(config, train_dataloader, valid_dataloader, device, model, program.train(config, train_dataloader, valid_dataloader, device, model,
......
...@@ -111,9 +111,9 @@ ...@@ -111,9 +111,9 @@
| 字段 | 用途 | 默认值 | 备注 | | 字段 | 用途 | 默认值 | 备注 |
| :---------------------: | :---------------------: | :--------------: | :--------------------: | | :---------------------: | :---------------------: | :--------------: | :--------------------: |
| **dataset** | 每次迭代返回一个样本 | - | - | | **dataset** | 每次迭代返回一个样本 | - | - |
| name | dataset类名 | SimpleDataSet | 目前支持`SimpleDataSet``LMDBDateSet` | | name | dataset类名 | SimpleDataSet | 目前支持`SimpleDataSet``LMDBDataSet` |
| data_dir | 数据集图片存放路径 | ./train_data | \ | | data_dir | 数据集图片存放路径 | ./train_data | \ |
| label_file_list | 数据标签路径 | ["./train_data/train_list.txt"] | dataset为LMDBDateSet时不需要此参数 | | label_file_list | 数据标签路径 | ["./train_data/train_list.txt"] | dataset为LMDBDataSet时不需要此参数 |
| ratio_list | 数据集的比例 | [1.0] | 若label_file_list中有两个train_list,且ratio_list为[0.4,0.6],则从train_list1中采样40%,从train_list2中采样60%组合整个dataset | | ratio_list | 数据集的比例 | [1.0] | 若label_file_list中有两个train_list,且ratio_list为[0.4,0.6],则从train_list1中采样40%,从train_list2中采样60%组合整个dataset |
| transforms | 对图片和标签进行变换的方法列表 | [DecodeImage,CTCLabelEncode,RecResizeImg,KeepKeys] | 见[ppocr/data/imaug](../../ppocr/data/imaug) | | transforms | 对图片和标签进行变换的方法列表 | [DecodeImage,CTCLabelEncode,RecResizeImg,KeepKeys] | 见[ppocr/data/imaug](../../ppocr/data/imaug) |
| **loader** | dataloader相关 | - | | | **loader** | dataloader相关 | - | |
......
...@@ -18,9 +18,9 @@ PaddleOCR 也提供了数据格式转换脚本,可以将官网 label 转换支 ...@@ -18,9 +18,9 @@ PaddleOCR 也提供了数据格式转换脚本,可以将官网 label 转换支
``` ```
# 将官网下载的标签文件转换为 train_icdar2015_label.txt # 将官网下载的标签文件转换为 train_icdar2015_label.txt
python gen_label.py --mode="det" --root_path="icdar_c4_train_imgs/" \ python gen_label.py --mode="det" --root_path="/path/to/icdar_c4_train_imgs/" \
--input_path="ch4_training_localization_transcription_gt" \ --input_path="/path/to/ch4_training_localization_transcription_gt" \
--output_label="train_icdar2015_label.txt" --output_label="/path/to/train_icdar2015_label.txt"
``` ```
解压数据集和下载标注文件后,PaddleOCR/train_data/ 有两个文件夹和两个文件,分别是: 解压数据集和下载标注文件后,PaddleOCR/train_data/ 有两个文件夹和两个文件,分别是:
......
...@@ -147,12 +147,12 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs/00018069.jpg" --det_m ...@@ -147,12 +147,12 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs/00018069.jpg" --det_m
如果输入图片的分辨率比较大,而且想使用更大的分辨率预测,可以设置det_limit_side_len 为想要的值,比如1216: 如果输入图片的分辨率比较大,而且想使用更大的分辨率预测,可以设置det_limit_side_len 为想要的值,比如1216:
``` ```
python3 tools/infer/predict_det.py --image_dir="./doc/imgs/2.jpg" --det_model_dir="./inference/det_db/" --det_limit_type=max --det_limit_side_len=1216 python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./inference/det_db/" --det_limit_type=max --det_limit_side_len=1216
``` ```
如果想使用CPU进行预测,执行命令如下 如果想使用CPU进行预测,执行命令如下
``` ```
python3 tools/infer/predict_det.py --image_dir="./doc/imgs/2.jpg" --det_model_dir="./inference/det_db/" --use_gpu=False python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./inference/det_db/" --use_gpu=False
``` ```
<a name="DB文本检测模型推理"></a> <a name="DB文本检测模型推理"></a>
...@@ -221,7 +221,7 @@ python3 tools/export_model.py -c configs/det/det_r50_vd_sast_totaltext.yml -o Gl ...@@ -221,7 +221,7 @@ python3 tools/export_model.py -c configs/det/det_r50_vd_sast_totaltext.yml -o Gl
``` ```
**SAST文本检测模型推理,需要设置参数`--det_algorithm="SAST"`,同时,还需要增加参数`--det_sast_polygon=True`,**可以执行如下命令: SAST文本检测模型推理,需要设置参数`--det_algorithm="SAST"`,同时,还需要增加参数`--det_sast_polygon=True`,可以执行如下命令:
``` ```
python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_sast_polygon=True python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_sast_polygon=True
``` ```
......
# 知识蒸馏
## 1. 简介
### 1.1 知识蒸馏介绍
近年来,深度神经网络在计算机视觉、自然语言处理等领域被验证是一种极其有效的解决问题的方法。通过构建合适的神经网络,加以训练,最终网络模型的性能指标基本上都会超过传统算法。
在数据量足够大的情况下,通过合理构建网络模型的方式增加其参数量,可以显著改善模型性能,但是这又带来了模型复杂度急剧提升的问题。大模型在实际场景中使用的成本较高。
深度神经网络一般有较多的参数冗余,目前有几种主要的方法对模型进行压缩,减小其参数量。如裁剪、量化、知识蒸馏等,其中知识蒸馏是指使用教师模型(teacher model)去指导学生模型(student model)学习特定任务,保证小模型在参数量不变的情况下,得到比较大的性能提升。
此外,在知识蒸馏任务中,也衍生出了互学习的模型训练方法,论文[Deep Mutual Learning](https://arxiv.org/abs/1706.00384)中指出,使用两个完全相同的模型在训练的过程中互相监督,可以达到比单个模型训练更好的效果。
### 1.2 PaddleOCR知识蒸馏简介
无论是大模型蒸馏小模型,还是小模型之间互相学习,更新参数,他们本质上是都是不同模型之间输出或者特征图(feature map)之间的相互监督,区别仅在于 (1) 模型是否需要固定参数。(2) 模型是否需要加载预训练模型。
对于大模型蒸馏小模型的情况,大模型一般需要加载预训练模型并固定参数;对于小模型之间互相蒸馏的情况,小模型一般都不加载预训练模型,参数也都是可学习的状态。
在知识蒸馏任务中,不只有2个模型之间进行蒸馏的情况,多个模型之间互相学习的情况也非常普遍。因此在知识蒸馏代码框架中,也有必要支持该种类别的蒸馏方法。
PaddleOCR中集成了知识蒸馏的算法,具体地,有以下几个主要的特点:
- 支持任意网络的互相学习,不要求子网络结构完全一致或者具有预训练模型;同时子网络数量也没有任何限制,只需要在配置文件中添加即可。
- 支持loss函数通过配置文件任意配置,不仅可以使用某种loss,也可以使用多种loss的组合
- 支持知识蒸馏训练、预测、评估与导出等所有模型相关的环境,方便使用与部署。
通过知识蒸馏,在中英文通用文字识别任务中,不增加任何预测耗时的情况下,可以给模型带来3%以上的精度提升,结合学习率调整策略以及模型结构微调策略,最终提升提升超过5%。
## 2. 配置文件解析
在知识蒸馏训练的过程中,数据预处理、优化器、学习率、全局的一些属性没有任何变化。模型结构、损失函数、后处理、指标计算等模块的配置文件需要进行微调。
下面以识别与检测的知识蒸馏配置文件为例,对知识蒸馏的训练与配置进行解析。
### 2.1 识别配置文件解析
配置文件在[rec_chinese_lite_train_distillation_v2.1.yml](../../configs/rec/ch_ppocr_v2.1/rec_chinese_lite_train_distillation_v2.1.yml)
#### 2.1.1 模型结构
知识蒸馏任务中,模型结构配置如下所示。
```yaml
Architecture:
model_type: &model_type "rec" # 模型类别,rec、det等,每个子网络的的模型类别都与
name: DistillationModel # 结构名称,蒸馏任务中,为DistillationModel,用于构建对应的结构
algorithm: Distillation # 算法名称
Models: # 模型,包含子网络的配置信息
Teacher: # 子网络名称,至少需要包含`pretrained`与`freeze_params`信息,其他的参数为子网络的构造参数
pretrained: # 该子网络是否需要加载预训练模型
freeze_params: false # 是否需要固定参数
return_all_feats: true # 子网络的参数,表示是否需要返回所有的features,如果为False,则只返回最后的输出
model_type: *model_type # 模型类别
algorithm: CRNN # 子网络的算法名称,该子网络剩余参与均为构造参数,与普通的模型训练配置一致
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
Neck:
name: SequenceEncoder
encoder_type: rnn
hidden_size: 64
Head:
name: CTCHead
mid_channels: 96
fc_decay: 0.00002
Student: # 另外一个子网络,这里给的是DML的蒸馏示例,两个子网络结构相同,均需要学习参数
pretrained: # 下面的组网参数同上
freeze_params: false
return_all_feats: true
model_type: *model_type
algorithm: CRNN
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
Neck:
name: SequenceEncoder
encoder_type: rnn
hidden_size: 64
Head:
name: CTCHead
mid_channels: 96
fc_decay: 0.00002
```
当然,这里如果希望添加更多的子网络进行训练,也可以按照`Student``Teacher`的添加方式,在配置文件中添加相应的字段。比如说如果希望有3个模型互相监督,共同训练,那么`Architecture`可以写为如下格式。
```yaml
Architecture:
model_type: &model_type "rec"
name: DistillationModel
algorithm: Distillation
Models:
Teacher:
pretrained:
freeze_params: false
return_all_feats: true
model_type: *model_type
algorithm: CRNN
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
Neck:
name: SequenceEncoder
encoder_type: rnn
hidden_size: 64
Head:
name: CTCHead
mid_channels: 96
fc_decay: 0.00002
Student:
pretrained:
freeze_params: false
return_all_feats: true
model_type: *model_type
algorithm: CRNN
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
Neck:
name: SequenceEncoder
encoder_type: rnn
hidden_size: 64
Head:
name: CTCHead
mid_channels: 96
fc_decay: 0.00002
Student2: # 知识蒸馏任务中引入的新的子网络,其他部分与上述配置相同
pretrained:
freeze_params: false
return_all_feats: true
model_type: *model_type
algorithm: CRNN
Transform:
Backbone:
name: MobileNetV1Enhance
scale: 0.5
Neck:
name: SequenceEncoder
encoder_type: rnn
hidden_size: 64
Head:
name: CTCHead
mid_channels: 96
fc_decay: 0.00002
```
最终该模型训练时,包含3个子网络:`Teacher`, `Student`, `Student2`
蒸馏模型`DistillationModel`类的具体实现代码可以参考[distillation_model.py](../../ppocr/modeling/architectures/distillation_model.py)
最终模型`forward`输出为一个字典,key为所有的子网络名称,例如这里为`Student``Teacher`,value为对应子网络的输出,可以为`Tensor`(只返回该网络的最后一层)和`dict`(也返回了中间的特征信息)。
在识别任务中,为了添加更多损失函数,保证蒸馏方法的可扩展性,将每个子网络的输出保存为`dict`,其中包含子模块输出。以该识别模型为例,每个子网络的输出结果均为`dict`,key包含`backbone_out`,`neck_out`, `head_out``value`为对应模块的tensor,最终对于上述配置文件,`DistillationModel`的输出格式如下。
```json
{
"Teacher": {
"backbone_out": tensor,
"neck_out": tensor,
"head_out": tensor,
},
"Student": {
"backbone_out": tensor,
"neck_out": tensor,
"head_out": tensor,
}
}
```
#### 2.1.2 损失函数
知识蒸馏任务中,损失函数配置如下所示。
```yaml
Loss:
name: CombinedLoss # 损失函数名称,基于改名称,构建用于损失函数的类
loss_config_list: # 损失函数配置文件列表,为CombinedLoss的必备函数
- DistillationCTCLoss: # 基于蒸馏的CTC损失函数,继承自标准的CTC loss
weight: 1.0 # 损失函数的权重,loss_config_list中,每个损失函数的配置都必须包含该字段
model_name_list: ["Student", "Teacher"] # 对于蒸馏模型的预测结果,提取这两个子网络的输出,与gt计算CTC loss
key: head_out # 取子网络输出dict中,该key对应的tensor
- DistillationDMLLoss: # 蒸馏的DML损失函数,继承自标准的DMLLoss
weight: 1.0 # 权重
act: "softmax" # 激活函数,对输入使用激活函数处理,可以为softmax, sigmoid或者为None,默认为None
model_name_pairs: # 用于计算DML loss的子网络名称对,如果希望计算其他子网络的DML loss,可以在列表下面继续填充
- ["Student", "Teacher"]
key: head_out # 取子网络输出dict中,该key对应的tensor
- DistillationDistanceLoss: # 蒸馏的距离损失函数
weight: 1.0 # 权重
mode: "l2" # 距离计算方法,目前支持l1, l2, smooth_l1
model_name_pairs: # 用于计算distance loss的子网络名称对
- ["Student", "Teacher"]
key: backbone_out # 取子网络输出dict中,该key对应的tensor
```
上述损失函数中,所有的蒸馏损失函数均继承自标准的损失函数类,主要功能为: 对蒸馏模型的输出进行解析,找到用于计算损失的中间节点(tensor),再使用标准的损失函数类去计算。
以上述配置为例,最终蒸馏训练的损失函数包含下面3个部分。
- `Student``Teacher`的最终输出(`head_out`)与gt的CTC loss,权重为1。在这里因为2个子网络都需要更新参数,因此2者都需要计算与g的loss。
- `Student``Teacher`的最终输出(`head_out`)之间的DML loss,权重为1。
- `Student``Teacher`的骨干网络输出(`backbone_out`)之间的l2 loss,权重为1。
关于`CombinedLoss`更加具体的实现可以参考: [combined_loss.py](../../ppocr/losses/combined_loss.py#L23)。关于`DistillationCTCLoss`等蒸馏损失函数更加具体的实现可以参考[distillation_loss.py](../../ppocr/losses/distillation_loss.py)
#### 2.1.3 后处理
知识蒸馏任务中,后处理配置如下所示。
```yaml
PostProcess:
name: DistillationCTCLabelDecode # 蒸馏任务的CTC解码后处理,继承自标准的CTCLabelDecode类
model_name: ["Student", "Teacher"] # 对于蒸馏模型的预测结果,提取这两个子网络的输出,进行解码
key: head_out # 取子网络输出dict中,该key对应的tensor
```
以上述配置为例,最终会同时计算`Student``Teahcer` 2个子网络的CTC解码输出,返回一个`dict``key`为用于处理的子网络名称,`value`为用于处理的子网络列表。
关于`DistillationCTCLabelDecode`更加具体的实现可以参考: [rec_postprocess.py](../../ppocr/postprocess/rec_postprocess.py#L128)
#### 2.1.4 指标计算
知识蒸馏任务中,指标计算配置如下所示。
```yaml
Metric:
name: DistillationMetric # 蒸馏任务的CTC解码后处理,继承自标准的CTCLabelDecode类
base_metric_name: RecMetric # 指标计算的基类,对于模型的输出,会基于该类,计算指标
main_indicator: acc # 指标的名称
key: "Student" # 选取该子网络的 main_indicator 作为作为保存保存best model的判断标准
```
以上述配置为例,最终会使用`Student`子网络的acc指标作为保存best model的判断指标,同时,日志中也会打印出所有子网络的acc指标。
关于`DistillationMetric`更加具体的实现可以参考: [distillation_metric.py](../../ppocr/metrics/distillation_metric.py#L24)
### 2.2 检测配置文件解析
* coming soon!
...@@ -243,7 +243,7 @@ Optimizer: ...@@ -243,7 +243,7 @@ Optimizer:
Train: Train:
dataset: dataset:
# 数据集格式,支持LMDBDateSet以及SimpleDataSet # 数据集格式,支持LMDBDataSet以及SimpleDataSet
name: SimpleDataSet name: SimpleDataSet
# 数据集路径 # 数据集路径
data_dir: ./train_data/ data_dir: ./train_data/
...@@ -263,7 +263,7 @@ Train: ...@@ -263,7 +263,7 @@ Train:
Eval: Eval:
dataset: dataset:
# 数据集格式,支持LMDBDateSet以及SimpleDataSet # 数据集格式,支持LMDBDataSet以及SimpleDataSet
name: SimpleDataSet name: SimpleDataSet
# 数据集路径 # 数据集路径
data_dir: ./train_data data_dir: ./train_data
...@@ -330,6 +330,8 @@ PaddleOCR目前已支持80种(除中文外)语种识别,`configs/rec/multi ...@@ -330,6 +330,8 @@ PaddleOCR目前已支持80种(除中文外)语种识别,`configs/rec/multi
``` ```
意大利文由拉丁字母组成,因此执行完命令后会得到名为 rec_latin_lite_train.yml 的配置文件。
2. 手动修改配置文件 2. 手动修改配置文件
您也可以手动修改模版中的以下几个字段: 您也可以手动修改模版中的以下几个字段:
...@@ -375,7 +377,9 @@ PaddleOCR目前已支持80种(除中文外)语种识别,`configs/rec/multi ...@@ -375,7 +377,9 @@ PaddleOCR目前已支持80种(除中文外)语种识别,`configs/rec/multi
更多支持语种请参考: [多语言模型](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/multi_languages.md#%E8%AF%AD%E7%A7%8D%E7%BC%A9%E5%86%99) 更多支持语种请参考: [多语言模型](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/multi_languages.md#%E8%AF%AD%E7%A7%8D%E7%BC%A9%E5%86%99)
多语言模型训练方式与中文模型一致,训练数据集均为100w的合成数据,少量的字体可以在 [百度网盘](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA) 上下载,提取码:frgi。 多语言模型训练方式与中文模型一致,训练数据集均为100w的合成数据,少量的字体可以通过下面两种方式下载。
* [百度网盘](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA)。提取码:frgi。
* [google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view)
如您希望在现有模型效果的基础上调优,请参考下列说明修改配置文件: 如您希望在现有模型效果的基础上调优,请参考下列说明修改配置文件:
...@@ -393,7 +397,7 @@ Global: ...@@ -393,7 +397,7 @@ Global:
Train: Train:
dataset: dataset:
# 数据集格式,支持LMDBDateSet以及SimpleDataSet # 数据集格式,支持LMDBDataSet以及SimpleDataSet
name: SimpleDataSet name: SimpleDataSet
# 数据集路径 # 数据集路径
data_dir: ./train_data/ data_dir: ./train_data/
...@@ -403,7 +407,7 @@ Train: ...@@ -403,7 +407,7 @@ Train:
Eval: Eval:
dataset: dataset:
# 数据集格式,支持LMDBDateSet以及SimpleDataSet # 数据集格式,支持LMDBDataSet以及SimpleDataSet
name: SimpleDataSet name: SimpleDataSet
# 数据集路径 # 数据集路径
data_dir: ./train_data data_dir: ./train_data
......
...@@ -5,26 +5,32 @@ ...@@ -5,26 +5,32 @@
### 1.1 安装whl包 ### 1.1 安装whl包
pip安装 pip安装
```bash ```bash
pip install "paddleocr>=2.0.1" # 推荐使用2.0.1+版本 pip install "paddleocr>=2.0.1" # 推荐使用2.0.1+版本
``` ```
本地构建并安装 本地构建并安装
```bash ```bash
python3 setup.py bdist_wheel python3 setup.py bdist_wheel
pip3 install dist/paddleocr-x.x.x-py3-none-any.whl # x.x.x是paddleocr的版本号 pip3 install dist/paddleocr-x.x.x-py3-none-any.whl # x.x.x是paddleocr的版本号
``` ```
## 2 使用 ## 2 使用
### 2.1 代码使用 ### 2.1 代码使用
paddleocr whl包会自动下载ppocr轻量级模型作为默认模型,可以根据第3节**自定义模型**进行自定义更换。 paddleocr whl包会自动下载ppocr轻量级模型作为默认模型,可以根据第3节**自定义模型**进行自定义更换。
* 检测+方向分类器+识别全流程 * 检测+方向分类器+识别全流程
```python ```python
from paddleocr import PaddleOCR, draw_ocr from paddleocr import PaddleOCR, draw_ocr
# Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换 # Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换
# 参数依次为`ch`, `en`, `french`, `german`, `korean`, `japan`。 # 参数依次为`ch`, `en`, `french`, `german`, `korean`, `japan`。
ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs/11.jpg' img_path = 'PaddleOCR/doc/imgs/11.jpg'
result = ocr.ocr(img_path, cls=True) result = ocr.ocr(img_path, cls=True)
for line in result: for line in result:
...@@ -32,6 +38,7 @@ for line in result: ...@@ -32,6 +38,7 @@ for line in result:
# 显示结果 # 显示结果
from PIL import Image from PIL import Image
image = Image.open(img_path).convert('RGB') image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result] boxes = [line[0] for line in result]
txts = [line[1][0] for line in result] txts = [line[1][0] for line in result]
...@@ -40,31 +47,36 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc ...@@ -40,31 +47,36 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc
im_show = Image.fromarray(im_show) im_show = Image.fromarray(im_show)
im_show.save('result.jpg') im_show.save('result.jpg')
``` ```
结果是一个list,每个item包含了文本框,文字和识别置信度 结果是一个list,每个item包含了文本框,文字和识别置信度
```bash ```bash
[[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]] [[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]]
[[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]] [[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]]
[[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]] [[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]]
...... ......
``` ```
结果可视化 结果可视化
<div align="center"> <div align="center">
<img src="../imgs_results/whl/11_det_rec.jpg" width="800"> <img src="../imgs_results/whl/11_det_rec.jpg" width="800">
</div> </div>
* 检测+识别 * 检测+识别
```python ```python
from paddleocr import PaddleOCR, draw_ocr from paddleocr import PaddleOCR, draw_ocr
ocr = PaddleOCR() # need to run only once to download and load model into memory
ocr = PaddleOCR() # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs/11.jpg' img_path = 'PaddleOCR/doc/imgs/11.jpg'
result = ocr.ocr(img_path) result = ocr.ocr(img_path, cls=False)
for line in result: for line in result:
print(line) print(line)
# 显示结果 # 显示结果
from PIL import Image from PIL import Image
image = Image.open(img_path).convert('RGB') image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result] boxes = [line[0] for line in result]
txts = [line[1][0] for line in result] txts = [line[1][0] for line in result]
...@@ -73,38 +85,46 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc ...@@ -73,38 +85,46 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc
im_show = Image.fromarray(im_show) im_show = Image.fromarray(im_show)
im_show.save('result.jpg') im_show.save('result.jpg')
``` ```
结果是一个list,每个item包含了文本框,文字和识别置信度 结果是一个list,每个item包含了文本框,文字和识别置信度
```bash ```bash
[[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]] [[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]]
[[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]] [[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]]
[[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]] [[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]]
...... ......
``` ```
结果可视化 结果可视化
<div align="center"> <div align="center">
<img src="../imgs_results/whl/11_det_rec.jpg" width="800"> <img src="../imgs_results/whl/11_det_rec.jpg" width="800">
</div> </div>
* 方向分类器+识别 * 方向分类器+识别
```python ```python
from paddleocr import PaddleOCR from paddleocr import PaddleOCR
ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory
ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg'
result = ocr.ocr(img_path, det=False, cls=True) result = ocr.ocr(img_path, det=False, cls=True)
for line in result: for line in result:
print(line) print(line)
``` ```
结果是一个list,每个item只包含识别结果和识别置信度 结果是一个list,每个item只包含识别结果和识别置信度
```bash ```bash
['韩国小馆', 0.9907421] ['韩国小馆', 0.9907421]
``` ```
* 单独执行检测 * 单独执行检测
```python ```python
from paddleocr import PaddleOCR, draw_ocr from paddleocr import PaddleOCR, draw_ocr
ocr = PaddleOCR() # need to run only once to download and load model into memory
ocr = PaddleOCR() # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs/11.jpg' img_path = 'PaddleOCR/doc/imgs/11.jpg'
result = ocr.ocr(img_path, rec=False) result = ocr.ocr(img_path, rec=False)
for line in result: for line in result:
...@@ -118,13 +138,16 @@ im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/Pa ...@@ -118,13 +138,16 @@ im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/Pa
im_show = Image.fromarray(im_show) im_show = Image.fromarray(im_show)
im_show.save('result.jpg') im_show.save('result.jpg')
``` ```
结果是一个list,每个item只包含文本框 结果是一个list,每个item只包含文本框
```bash ```bash
[[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]] [[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]]
[[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]] [[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]]
[[128.0, 397.0], [273.0, 397.0], [273.0, 414.0], [128.0, 414.0]] [[128.0, 397.0], [273.0, 397.0], [273.0, 414.0], [128.0, 414.0]]
...... ......
``` ```
结果可视化 结果可视化
...@@ -133,29 +156,37 @@ im_show.save('result.jpg') ...@@ -133,29 +156,37 @@ im_show.save('result.jpg')
</div> </div>
* 单独执行识别 * 单独执行识别
```python ```python
from paddleocr import PaddleOCR from paddleocr import PaddleOCR
ocr = PaddleOCR() # need to run only once to download and load model into memory
ocr = PaddleOCR() # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg'
result = ocr.ocr(img_path, det=False) result = ocr.ocr(img_path, det=False)
for line in result: for line in result:
print(line) print(line)
``` ```
结果是一个list,每个item只包含识别结果和识别置信度 结果是一个list,每个item只包含识别结果和识别置信度
```bash ```bash
['韩国小馆', 0.9907421] ['韩国小馆', 0.9907421]
``` ```
* 单独执行方向分类器 * 单独执行方向分类器
```python ```python
from paddleocr import PaddleOCR from paddleocr import PaddleOCR
ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory
ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg'
result = ocr.ocr(img_path, det=False, rec=False, cls=True) result = ocr.ocr(img_path, det=False, rec=False, cls=True)
for line in result: for line in result:
print(line) print(line)
``` ```
结果是一个list,每个item只包含分类结果和分类置信度 结果是一个list,每个item只包含分类结果和分类置信度
```bash ```bash
['0', 0.9999924] ['0', 0.9999924]
``` ```
...@@ -163,15 +194,19 @@ for line in result: ...@@ -163,15 +194,19 @@ for line in result:
### 2.2 通过命令行使用 ### 2.2 通过命令行使用
查看帮助信息 查看帮助信息
```bash ```bash
paddleocr -h paddleocr -h
``` ```
* 检测+方向分类器+识别全流程 * 检测+方向分类器+识别全流程
```bash ```bash
paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true
``` ```
结果是一个list,每个item包含了文本框,文字和识别置信度 结果是一个list,每个item包含了文本框,文字和识别置信度
```bash ```bash
[[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]] [[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]]
[[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]] [[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]]
...@@ -180,10 +215,13 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true ...@@ -180,10 +215,13 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true
``` ```
* 检测+识别 * 检测+识别
```bash ```bash
paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg
``` ```
结果是一个list,每个item包含了文本框,文字和识别置信度 结果是一个list,每个item包含了文本框,文字和识别置信度
```bash ```bash
[[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]] [[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]]
[[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]] [[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]]
...@@ -192,20 +230,25 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg ...@@ -192,20 +230,25 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg
``` ```
* 方向分类器+识别 * 方向分类器+识别
```bash ```bash
paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false
``` ```
结果是一个list,每个item只包含识别结果和识别置信度 结果是一个list,每个item只包含识别结果和识别置信度
```bash ```bash
['韩国小馆', 0.9907421] ['韩国小馆', 0.9907421]
``` ```
* 单独执行检测 * 单独执行检测
```bash ```bash
paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec false paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec false
``` ```
结果是一个list,每个item只包含文本框 结果是一个list,每个item只包含文本框
```bash ```bash
[[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]] [[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]]
[[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]] [[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]]
...@@ -214,34 +257,42 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec false ...@@ -214,34 +257,42 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec false
``` ```
* 单独执行识别 * 单独执行识别
```bash ```bash
paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --det false paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --det false
``` ```
结果是一个list,每个item只包含识别结果和识别置信度 结果是一个list,每个item只包含识别结果和识别置信度
```bash ```bash
['韩国小馆', 0.9907421] ['韩国小馆', 0.9907421]
``` ```
* 单独执行方向分类器 * 单独执行方向分类器
```bash ```bash
paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false --rec false paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false --rec false
``` ```
结果是一个list,每个item只包含分类结果和分类置信度 结果是一个list,每个item只包含分类结果和分类置信度
```bash ```bash
['0', 0.9999924] ['0', 0.9999924]
``` ```
## 3 自定义模型 ## 3 自定义模型
当内置模型无法满足需求时,需要使用到自己训练的模型。
首先,参照[inference.md](./inference.md) 第一节转换将检测、分类和识别模型转换为inference模型,然后按照如下方式使用 当内置模型无法满足需求时,需要使用到自己训练的模型。 首先,参照[inference.md](./inference.md) 第一节转换将检测、分类和识别模型转换为inference模型,然后按照如下方式使用
### 3.1 代码使用 ### 3.1 代码使用
```python ```python
from paddleocr import PaddleOCR, draw_ocr from paddleocr import PaddleOCR, draw_ocr
# 模型路径下必须含有model和params文件 # 模型路径下必须含有model和params文件
ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', rec_char_dict_path='{your_rec_char_dict_path}', cls_model_dir='{your_cls_model_dir}', use_angle_cls=True) ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}',
rec_char_dict_path='{your_rec_char_dict_path}', cls_model_dir='{your_cls_model_dir}',
use_angle_cls=True)
img_path = 'PaddleOCR/doc/imgs/11.jpg' img_path = 'PaddleOCR/doc/imgs/11.jpg'
result = ocr.ocr(img_path, cls=True) result = ocr.ocr(img_path, cls=True)
for line in result: for line in result:
...@@ -249,6 +300,7 @@ for line in result: ...@@ -249,6 +300,7 @@ for line in result:
# 显示结果 # 显示结果
from PIL import Image from PIL import Image
image = Image.open(img_path).convert('RGB') image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result] boxes = [line[0] for line in result]
txts = [line[1][0] for line in result] txts = [line[1][0] for line in result]
...@@ -269,11 +321,13 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_ ...@@ -269,11 +321,13 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_
### 4.1 网络图片 ### 4.1 网络图片
- 代码使用 - 代码使用
```python ```python
from paddleocr import PaddleOCR, draw_ocr from paddleocr import PaddleOCR, draw_ocr, download_with_progressbar
# Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换 # Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换
# 参数依次为`ch`, `en`, `french`, `german`, `korean`, `japan`。 # 参数依次为`ch`, `en`, `french`, `german`, `korean`, `japan`。
ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory
img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg' img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg'
result = ocr.ocr(img_path, cls=True) result = ocr.ocr(img_path, cls=True)
for line in result: for line in result:
...@@ -281,7 +335,9 @@ for line in result: ...@@ -281,7 +335,9 @@ for line in result:
# 显示结果 # 显示结果
from PIL import Image from PIL import Image
image = Image.open(img_path).convert('RGB')
download_with_progressbar(img_path, 'tmp.jpg')
image = Image.open('tmp.jpg').convert('RGB')
boxes = [line[0] for line in result] boxes = [line[0] for line in result]
txts = [line[1][0] for line in result] txts = [line[1][0] for line in result]
scores = [line[1][1] for line in result] scores = [line[1][1] for line in result]
...@@ -289,18 +345,24 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc ...@@ -289,18 +345,24 @@ im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc
im_show = Image.fromarray(im_show) im_show = Image.fromarray(im_show)
im_show.save('result.jpg') im_show.save('result.jpg')
``` ```
- 命令行模式 - 命令行模式
```bash ```bash
paddleocr --image_dir http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg --use_angle_cls=true paddleocr --image_dir http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg --use_angle_cls=true
``` ```
### 4.2 numpy数组 ### 4.2 numpy数组
仅通过代码使用时支持numpy数组作为输入 仅通过代码使用时支持numpy数组作为输入
```python ```python
import cv2
from paddleocr import PaddleOCR, draw_ocr from paddleocr import PaddleOCR, draw_ocr
# Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换 # Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换
# 参数依次为`ch`, `en`, `french`, `german`, `korean`, `japan`。 # 参数依次为`ch`, `en`, `french`, `german`, `korean`, `japan`。
ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs/11.jpg' img_path = 'PaddleOCR/doc/imgs/11.jpg'
img = cv2.imread(img_path) img = cv2.imread(img_path)
# img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY), 如果你自己训练的模型支持灰度图,可以将这句话的注释取消 # img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY), 如果你自己训练的模型支持灰度图,可以将这句话的注释取消
...@@ -310,6 +372,7 @@ for line in result: ...@@ -310,6 +372,7 @@ for line in result:
# 显示结果 # 显示结果
from PIL import Image from PIL import Image
image = Image.open(img_path).convert('RGB') image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result] boxes = [line[0] for line in result]
txts = [line[1][0] for line in result] txts = [line[1][0] for line in result]
...@@ -355,3 +418,5 @@ im_show.save('result.jpg') ...@@ -355,3 +418,5 @@ im_show.save('result.jpg')
| det | 前向时使用启动检测 | TRUE | | det | 前向时使用启动检测 | TRUE |
| rec | 前向时是否启动识别 | TRUE | | rec | 前向时是否启动识别 | TRUE |
| cls | 前向时是否启动分类 (命令行模式下使用use_angle_cls控制前向是否启动分类) | FALSE | | cls | 前向时是否启动分类 (命令行模式下使用use_angle_cls控制前向是否启动分类) | FALSE |
| show_log | 是否打印det和rec等信息 | FALSE |
| type | 执行ocr或者表格结构化, 值可选['ocr','structure'] | ocr |
...@@ -110,9 +110,9 @@ In ppocr, the network is divided into four stages: Transform, Backbone, Neck and ...@@ -110,9 +110,9 @@ In ppocr, the network is divided into four stages: Transform, Backbone, Neck and
| Parameter | Use | Defaults | Note | | Parameter | Use | Defaults | Note |
| :---------------------: | :---------------------: | :--------------: | :--------------------: | | :---------------------: | :---------------------: | :--------------: | :--------------------: |
| **dataset** | Return one sample per iteration | - | - | | **dataset** | Return one sample per iteration | - | - |
| name | dataset class name | SimpleDataSet | Currently support`SimpleDataSet`,`LMDBDateSet` | | name | dataset class name | SimpleDataSet | Currently support`SimpleDataSet`,`LMDBDataSet` |
| data_dir | Image folder path | ./train_data | \ | | data_dir | Image folder path | ./train_data | \ |
| label_file_list | Groundtruth file path | ["./train_data/train_list.txt"] | This parameter is not required when dataset is LMDBDateSet | | label_file_list | Groundtruth file path | ["./train_data/train_list.txt"] | This parameter is not required when dataset is LMDBDataSet |
| ratio_list | Ratio of data set | [1.0] | If there are two train_lists in label_file_list and ratio_list is [0.4,0.6], 40% will be sampled from train_list1, and 60% will be sampled from train_list2 to combine the entire dataset | | ratio_list | Ratio of data set | [1.0] | If there are two train_lists in label_file_list and ratio_list is [0.4,0.6], 40% will be sampled from train_list1, and 60% will be sampled from train_list2 to combine the entire dataset |
| transforms | List of methods to transform images and labels | [DecodeImage,CTCLabelEncode,RecResizeImg,KeepKeys] | see[ppocr/data/imaug](../../ppocr/data/imaug) | | transforms | List of methods to transform images and labels | [DecodeImage,CTCLabelEncode,RecResizeImg,KeepKeys] | see[ppocr/data/imaug](../../ppocr/data/imaug) |
| **loader** | dataloader related | - | | | **loader** | dataloader related | - | |
......
...@@ -154,12 +154,12 @@ Set as `limit_type='min', det_limit_side_len=960`, it means that the shortest si ...@@ -154,12 +154,12 @@ Set as `limit_type='min', det_limit_side_len=960`, it means that the shortest si
If the resolution of the input picture is relatively large and you want to use a larger resolution prediction, you can set det_limit_side_len to the desired value, such as 1216: If the resolution of the input picture is relatively large and you want to use a larger resolution prediction, you can set det_limit_side_len to the desired value, such as 1216:
``` ```
python3 tools/infer/predict_det.py --image_dir="./doc/imgs/22.jpg" --det_model_dir="./inference/det_db/" --det_limit_type=max --det_limit_side_len=1216 python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./inference/det_db/" --det_limit_type=max --det_limit_side_len=1216
``` ```
If you want to use the CPU for prediction, execute the command as follows If you want to use the CPU for prediction, execute the command as follows
``` ```
python3 tools/infer/predict_det.py --image_dir="./doc/imgs/22.jpg" --det_model_dir="./inference/det_db/" --use_gpu=False python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./inference/det_db/" --use_gpu=False
``` ```
<a name="DB_DETECTION"></a> <a name="DB_DETECTION"></a>
...@@ -230,7 +230,7 @@ First, convert the model saved in the SAST text detection training process into ...@@ -230,7 +230,7 @@ First, convert the model saved in the SAST text detection training process into
python3 tools/export_model.py -c configs/det/det_r50_vd_sast_totaltext.yml -o Global.pretrained_model=./det_r50_vd_sast_totaltext_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_sast_tt python3 tools/export_model.py -c configs/det/det_r50_vd_sast_totaltext.yml -o Global.pretrained_model=./det_r50_vd_sast_totaltext_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_sast_tt
``` ```
**For SAST curved text detection model inference, you need to set the parameter `--det_algorithm="SAST"` and `--det_sast_polygon=True`**, run the following command: For SAST curved text detection model inference, you need to set the parameter `--det_algorithm="SAST"` and `--det_sast_polygon=True`, run the following command:
``` ```
python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_sast_polygon=True python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_sast_polygon=True
......
...@@ -237,7 +237,7 @@ Optimizer: ...@@ -237,7 +237,7 @@ Optimizer:
Train: Train:
dataset: dataset:
# Type of dataset,we support LMDBDateSet and SimpleDataSet # Type of dataset,we support LMDBDataSet and SimpleDataSet
name: SimpleDataSet name: SimpleDataSet
# Path of dataset # Path of dataset
data_dir: ./train_data/ data_dir: ./train_data/
...@@ -257,7 +257,7 @@ Train: ...@@ -257,7 +257,7 @@ Train:
Eval: Eval:
dataset: dataset:
# Type of dataset,we support LMDBDateSet and SimpleDataSet # Type of dataset,we support LMDBDataSet and SimpleDataSet
name: SimpleDataSet name: SimpleDataSet
# Path of dataset # Path of dataset
data_dir: ./train_data data_dir: ./train_data
...@@ -329,6 +329,7 @@ There are two ways to create the required configuration file:: ...@@ -329,6 +329,7 @@ There are two ways to create the required configuration file::
... ...
``` ```
Italian is made up of Latin letters, so after executing the command, you will get the rec_latin_lite_train.yml.
2. Manually modify the configuration file 2. Manually modify the configuration file
...@@ -375,7 +376,9 @@ Currently, the multi-language algorithms supported by PaddleOCR are: ...@@ -375,7 +376,9 @@ Currently, the multi-language algorithms supported by PaddleOCR are:
For more supported languages, please refer to : [Multi-language model](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/multi_languages_en.md#4-support-languages-and-abbreviations) For more supported languages, please refer to : [Multi-language model](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/multi_languages_en.md#4-support-languages-and-abbreviations)
The multi-language model training method is the same as the Chinese model. The training data set is 100w synthetic data. A small amount of fonts and test data can be downloaded on [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA),Extraction code:frgi. The multi-language model training method is the same as the Chinese model. The training data set is 100w synthetic data. A small amount of fonts and test data can be downloaded using the following two methods.
* [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA),Extraction code:frgi.
* [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view)
If you want to finetune on the basis of the existing model effect, please refer to the following instructions to modify the configuration file: If you want to finetune on the basis of the existing model effect, please refer to the following instructions to modify the configuration file:
...@@ -394,7 +397,7 @@ Global: ...@@ -394,7 +397,7 @@ Global:
Train: Train:
dataset: dataset:
# Type of dataset,we support LMDBDateSet and SimpleDataSet # Type of dataset,we support LMDBDataSet and SimpleDataSet
name: SimpleDataSet name: SimpleDataSet
# Path of dataset # Path of dataset
data_dir: ./train_data/ data_dir: ./train_data/
...@@ -404,7 +407,7 @@ Train: ...@@ -404,7 +407,7 @@ Train:
Eval: Eval:
dataset: dataset:
# Type of dataset,we support LMDBDateSet and SimpleDataSet # Type of dataset,we support LMDBDataSet and SimpleDataSet
name: SimpleDataSet name: SimpleDataSet
# Path of dataset # Path of dataset
data_dir: ./train_data data_dir: ./train_data
......
...@@ -15,8 +15,6 @@ ...@@ -15,8 +15,6 @@
- 2020.6.8 Add [datasets](./datasets_en.md) and keep updating - 2020.6.8 Add [datasets](./datasets_en.md) and keep updating
- 2020.6.5 Support exporting `attention` model to `inference_model` - 2020.6.5 Support exporting `attention` model to `inference_model`
- 2020.6.5 Support separate prediction and recognition, output result score - 2020.6.5 Support separate prediction and recognition, output result score
- 2020.6.5 Support exporting `attention` model to `inference_model`
- 2020.6.5 Support separate prediction and recognition, output result score
- 2020.5.30 Provide Lightweight Chinese OCR online experience - 2020.5.30 Provide Lightweight Chinese OCR online experience
- 2020.5.30 Model prediction and training support on Windows system - 2020.5.30 Model prediction and training support on Windows system
- 2020.5.30 Open source general Chinese OCR model - 2020.5.30 Open source general Chinese OCR model
......
...@@ -59,7 +59,7 @@ Visualization of results ...@@ -59,7 +59,7 @@ Visualization of results
from paddleocr import PaddleOCR,draw_ocr from paddleocr import PaddleOCR,draw_ocr
ocr = PaddleOCR(lang='en') # need to run only once to download and load model into memory ocr = PaddleOCR(lang='en') # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
result = ocr.ocr(img_path) result = ocr.ocr(img_path, cls=False)
for line in result: for line in result:
print(line) print(line)
...@@ -305,7 +305,8 @@ paddleocr --image_dir http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-f ...@@ -305,7 +305,8 @@ paddleocr --image_dir http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-f
Support numpy array as input only when used by code Support numpy array as input only when used by code
```python ```python
from paddleocr import PaddleOCR, draw_ocr import cv2
from paddleocr import PaddleOCR, draw_ocr, download_with_progressbar
ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory
img_path = 'PaddleOCR/doc/imgs/11.jpg' img_path = 'PaddleOCR/doc/imgs/11.jpg'
img = cv2.imread(img_path) img = cv2.imread(img_path)
...@@ -316,7 +317,9 @@ for line in result: ...@@ -316,7 +317,9 @@ for line in result:
# show result # show result
from PIL import Image from PIL import Image
image = Image.open(img_path).convert('RGB')
download_with_progressbar(img_path, 'tmp.jpg')
image = Image.open('tmp.jpg').convert('RGB')
boxes = [line[0] for line in result] boxes = [line[0] for line in result]
txts = [line[1][0] for line in result] txts = [line[1][0] for line in result]
scores = [line[1][1] for line in result] scores = [line[1][1] for line in result]
...@@ -362,3 +365,5 @@ im_show.save('result.jpg') ...@@ -362,3 +365,5 @@ im_show.save('result.jpg')
| det | Enable detction when `ppocr.ocr` func exec | TRUE | | det | Enable detction when `ppocr.ocr` func exec | TRUE |
| rec | Enable recognition when `ppocr.ocr` func exec | TRUE | | rec | Enable recognition when `ppocr.ocr` func exec | TRUE |
| cls | Enable classification when `ppocr.ocr` func exec((Use use_angle_cls in command line mode to control whether to start classification in the forward direction) | FALSE | | cls | Enable classification when `ppocr.ocr` func exec((Use use_angle_cls in command line mode to control whether to start classification in the forward direction) | FALSE |
| show_log | Whether to print log in det and rec | FALSE |
| type | Perform ocr or table structuring, the value is selected in ['ocr','structure'] | ocr |
\ No newline at end of file
doc/joinus.PNG

99.7 KB | W: | H:

doc/joinus.PNG

213 KB | W: | H:

doc/joinus.PNG
doc/joinus.PNG
doc/joinus.PNG
doc/joinus.PNG
  • 2-up
  • Swipe
  • Onion skin
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment