Commit ed43fc11 authored by wanglch's avatar wanglch
Browse files

Initial commit

parents
Pipeline #2703 canceled with stages
# Byte-compiled / optimized / DLL files
__pycache__/
.ipynb_checkpoints/
*.py[cod]
*$py.class
# C extensions
*.so
inference/
inference_results/
output/
train_data/
log/
*.DS_Store
*.vs
*.user
*~
*.vscode
*.idea
*.log
.clang-format
.clang_format.hook
build/
dist/
paddleocr.egg-info/
/deploy/android_demo/app/OpenCV/
/deploy/android_demo/app/PaddleLite/
/deploy/android_demo/app/.cxx/
/deploy/android_demo/app/cache/
test_tipc/web/models/
test_tipc/web/node_modules/
# pre-commit configuration: each entry under `repos` pins a hook repository
# to a revision and lists the hooks to run from it.
repos:
  # General repository hygiene checks.
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      # Reject newly added files larger than 512 kB.
      - id: check-added-large-files
        args: ['--maxkb=512']
      - id: check-case-conflict
      - id: check-merge-conflict
      - id: check-symlinks
      - id: detect-private-key
      - id: end-of-file-fixer
      # Only strip trailing whitespace in C/C++/CUDA and Python sources.
      - id: trailing-whitespace
        files: '\.(c|cc|cxx|cpp|cu|h|hpp|hxx|py)$'
  # Line-ending and tab normalisation.
  - repo: https://github.com/Lucas-C/pre-commit-hooks
    rev: v1.5.5
    hooks:
      - id: remove-crlf
      # Only replace tabs in C/C++/CUDA and Python sources.
      - id: remove-tabs
        files: '\.(c|cc|cxx|cpp|cu|h|hpp|hxx|py)$'
  # Locally defined hook: runs the repository's .clang_format.hook script
  # through bash on matching C/C++/CUDA/proto files.
  - repo: local
    hooks:
      - id: clang-format
        name: clang-format
        description: Format files with ClangFormat
        entry: bash .clang_format.hook -i
        language: system
        files: '\.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$'
  # For Python files
  - repo: https://github.com/psf/black.git
    rev: 24.10.0
    hooks:
      - id: black
        files: '(.*\.(py|pyi|bzl)|BUILD|.*\.BUILD|WORKSPACE)$'
  # Flake8
  - repo: https://github.com/pycqa/flake8
    rev: 7.1.1
    hooks:
      # Only the error classes listed in --select are reported.
      - id: flake8
        args:
          - --count
          - --select=E9,F63,F7,F82,E721
          - --show-source
          - --statistics
        # NOTE(review): assumed to be a hook-level exclude for flake8
        # (the source lost its indentation) - confirm against the repo.
        exclude: '^benchmark/|^test_tipc/'
[style]
based_on_style = pep8
column_limit = 80
FROM image.sourcefind.cn:5000/dcu/admin/base/paddlepaddle:3.0.0-py3.10-dtk24.04.3-ubuntu20.04
\ No newline at end of file
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
prune .github
prune applications
prune benchmark
prune configs
prune deploy
prune doc
prune docs
prune overrides
prune ppocr/ext_op
prune ppocr/losses
prune ppocr/metrics
prune ppocr/modeling
prune ppocr/optimizer
prune ppstructure/docs
prune test_tipc
prune tests
exclude .clang_format.hook
exclude .gitignore
exclude .pre-commit-config.yaml
exclude .style.yapf
exclude mkdocs.yml
exclude train.sh
OCR.png

61 KB

# CRNN
## 论文
[An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition](https://arxiv.org/abs/1507.05717)
## 模型结构
CRNN模型,即将CNN与RNN网络结合,共同训练。主要用于在一定程度上实现端到端(end-to-end)地对不定长的文本序列进行识别,不用先对单个文字进行切割,而是将文本识别转化为时序依赖的序列学习问题,就是基于图像的序列识别。这里有一个很精彩的改动:一共有四个最大池化层,但是最后两个池化层的窗口尺寸由2x2改为1x2,也就是图片的高度减半了四次,而宽度则只减半了两次。这是因为文本图像多数都是高较小而宽较长,所以其feature map也是这种高小宽长的矩形形状,使用1x2的池化窗口可以尽量保证不丢失在宽度方向的信息,更适合英文字母识别。
<div align=center>
<img src="./imgs/arch.png"/>
</div>
## 算法原理
CRNN(卷积循环神经网络)是一种用于光学字符识别(OCR)的深度学习模型,它结合了卷积神经网络(CNN)强大的特征提取能力、循环神经网络(RNN)对序列信息的建模能力,以及连接时序分类(CTC)损失函数来解决不定长序列对齐问题。CNN从输入图像中提取特征图,RNN对特征序列进行建模并预测字符概率分布,CTC则将这些概率分布转化为最终的文本序列。CRNN能够实现端到端的文字识别,无需显式地进行字符分割,同时能够处理不定长的文本序列,广泛应用于自然场景文字识别、文档扫描与识别等领域,具有较高的识别准确率和鲁棒性。
<div align=center>
<img src="./imgs/theory.png"/>
</div>
## 环境配置
### Docker(方法一)
推荐使用docker方式运行, 此处提供[光源](https://www.sourcefind.cn/#/service-details)拉取docker镜像的地址与使用步骤
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/paddlepaddle:3.0.0-py3.10-dtk24.04.3-ubuntu20.04
docker run -it --shm-size=1024G -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal:/opt/hyhal --network=host --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name CRNN <your IMAGE ID> bash # <your IMAGE ID>为以上拉取的docker的镜像ID替换
git clone https://developer.sourcefind.cn/codes/modelzoo/crnn_paddle
cd /path/your_code_data/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install paddleocr
# 需要关联动态库包
export LD_LIBRARY_PATH=/opt/dtk-24.04.3/lib:/opt/dtk-24.04.3/cuda/extras/CUPTI/lib64:/opt/dtk-24.04.3/cuda/targets/x86_64-linux/lib:$LD_LIBRARY_PATH
```
Tips:以上dtk驱动、python、paddlepaddle等DCU相关工具版本需要严格一一对应。
### Dockerfile(方法二)
此处提供dockerfile的使用方法
```
git clone https://developer.sourcefind.cn/codes/modelzoo/crnn_paddle
docker build -t crnn:latest .
docker run --shm-size 500g --network=host --name=crnn --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v 项目地址(绝对路径):/home/ -v /opt/hyhal:/opt/hyhal:ro -it <your IMAGE ID> bash
cd /path/your_code_data/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install paddleocr
# 需要关联动态库包
export LD_LIBRARY_PATH=/opt/dtk-24.04.3/lib:/opt/dtk-24.04.3/cuda/extras/CUPTI/lib64:/opt/dtk-24.04.3/cuda/targets/x86_64-linux/lib:$LD_LIBRARY_PATH
```
### Anaconda(方法三)
此处提供本地配置、编译的详细步骤,例如:
关于本项目DCU显卡所需的特殊深度学习库可从[光合](https://developer.sourcefind.cn/tool/)开发者社区下载安装。
```
DTK驱动:dtk24.04.3
python:3.10
paddlepaddle:3.0.0
```
`Tips:以上dtk驱动、python、paddlepaddle等DCU相关工具版本需要严格一一对应`
其它非深度学习库参照requirements.txt安装:
```
git clone https://developer.sourcefind.cn/codes/modelzoo/crnn_paddle
cd /path/your_code_data/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install paddleocr
export LD_LIBRARY_PATH=/opt/dtk-24.04.3/lib:/opt/dtk-24.04.3/cuda/extras/CUPTI/lib64:/opt/dtk-24.04.3/cuda/targets/x86_64-linux/lib:$LD_LIBRARY_PATH
```
## 数据集
在本测试中可以使用[icdar2015](./train_data/icdar2015/rec)数据集。本项目已经提供并整理了可直接用于验证的数据集文件,可在train_data/icdar2015/rec目录下找到。
训练集应有如下文件结构:
```
|-train_data
|-rec
|- rec_gt_train.txt
|- train
|- word_001.png
|- word_002.jpg
|- word_003.jpg
| ...
```
## 训练
训练前需要完成两件事,首先是下载模型,然后是修改配置文件。
### 模型下载
下载地址 https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_bilstm_ctc_v2.0_train.tar
```
cd PaddleOCR/
# 下载CRNN的预训练模型
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_bilstm_ctc_v2.0_train.tar
# 解压模型参数
cd pretrain_models
tar -xf rec_r34_vd_none_bilstm_ctc_v2.0_train.tar && rm -rf rec_r34_vd_none_bilstm_ctc_v2.0_train.tar
```
### 配置文件
```
# 本项目的配置文件已经修改,其余的模型及任务可以参考该配置文件中的配置方式
rec/rec_r34_vd_none_bilstm_ctc.yml
```
## 推理
### 单机单卡
```
# 预测使用的配置文件必须与训练一致
python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml -o Global.pretrained_model='./PaddleOCR-main/pretrain_models/rec_r34_vd_none_bilstm_ctc_v2.0_train/best_accuracy' Global.infer_img='./PaddleOCR-main/docs/images/ppocrv4.png'
```
## result
<div align=left>
<img src="./imgs/result1.png"/>
</div>
### 精度
## 应用场景
### 算法类别
`文字识别`
### 热点应用行业
`科研,教育,政府,金融`
## 预训练权重
CRNN PaddlePaddle下载地址为:[rec_r34_vd_none_bilstm_ctc.yml](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_bilstm_ctc_v2.0_train.tar)
## 源码仓库及问题反馈
- https://developer.sourcefind.cn/codes/modelzoo/crnn_paddle
## 参考资料
- https://github.com/PaddlePaddle/PaddleOCR/
移步[docs](https://paddlepaddle.github.io/PaddleOCR/latest/applications/overview.html)
*.html linguist-language=python
*.ipynb linguist-language=python
.DS_Store
*.pth
*.pyc
*.pyo
*.log
*.tmp
*.pkl
__pycache__/
.idea/
output/
test/*.jpg
datasets/
index/
train_log/
log/
profiling_log/
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# Real-time Scene Text Detection with Differentiable Binarization
**note**: some code is inherited from [WenmuZhou/DBNet.pytorch](https://github.com/WenmuZhou/DBNet.pytorch)
[中文解读](https://zhuanlan.zhihu.com/p/94677957)
![network](imgs/paper/db.jpg)
## update
2020-06-07: 添加灰度图训练,训练灰度图时需要在配置里移除`dataset.args.transforms.Normalize`
## Install Using Conda
```
conda env create -f environment.yml
git clone https://github.com/WenmuZhou/DBNet.paddle.git
cd DBNet.paddle/
```
or
## Install Manually
```bash
conda create -n dbnet python=3.6
conda activate dbnet
conda install ipython pip
# python dependencies
pip install -r requirement.txt
# clone repo
git clone https://github.com/WenmuZhou/DBNet.paddle.git
cd DBNet.paddle/
```
## Requirements
* paddlepaddle 2.4+
## Download
TBD
## Data Preparation
Training data: prepare a text `train.txt` in the following format, use '\t' as a separator
```
./datasets/train/img/001.jpg ./datasets/train/gt/001.txt
```
Validation data: prepare a text `test.txt` in the following format, use '\t' as a separator
```
./datasets/test/img/001.jpg ./datasets/test/gt/001.txt
```
- Store images in the `img` folder
- Store groundtruth in the `gt` folder
The groundtruth can be `.txt` files, with the following format:
```
x1, y1, x2, y2, x3, y3, x4, y4, annotation
```
## Train
1. config the `dataset['train']['dataset']['data_path']`, `dataset['validate']['dataset']['data_path']` in [config/icdar2015_resnet18_fpn_DBhead_polyLR.yaml](config/icdar2015_resnet18_fpn_DBhead_polyLR.yaml)
* . single gpu train
```bash
bash single_gpu_train.sh
```
* . Multi-gpu training
```bash
bash multi_gpu_train.sh
```
## Test
[eval.py](tools/eval.py) is used to test model on test dataset
1. config `model_path` in [eval.sh](eval.sh)
2. use following script to test
```bash
bash eval.sh
```
## Predict
[predict.py](tools/predict.py) Can be used to inference on all images in a folder
1. config `model_path`,`input_folder`,`output_folder` in [predict.sh](predict.sh)
2. use following script to predict
```
bash predict.sh
```
You can change the `model_path` in the `predict.sh` file to your model location.
tips: if result is not good, you can change `thre` in [predict.sh](predict.sh)
## Export Model
[export_model.py](tools/export_model.py) can be used to export a trained model as an inference model
use following script to export inference model
```
python tools/export_model.py --config_file config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml -o trainer.resume_checkpoint=model_best.pth trainer.output_dir=output/infer
```
## Paddle Inference infer
[infer.py](tools/infer.py) Can be used to inference on all images in a folder
use following script to run inference with the exported model
```
python tools/infer.py --model-dir=output/infer/ --img-path imgs/paper/db.jpg
```
<h2 id="Performance">Performance</h2>
### [ICDAR 2015](http://rrc.cvc.uab.es/?ch=4)
only train on ICDAR2015 dataset
| Method | image size (short size) |learning rate | Precision (%) | Recall (%) | F-measure (%) | FPS |
|:--------------------------:|:-------:|:--------:|:--------:|:------------:|:---------------:|:-----:|
| ImageNet-resnet50-FPN-DBHead(torch) |736 |1e-3|90.19 | 78.14 | 83.88 | 27 |
| ImageNet-resnet50-FPN-DBHead(paddle) |736 |1e-3| 89.47 | 79.03 | 83.92 | 27 |
| ImageNet-resnet50-FPN-DBHead(paddle_amp) |736 |1e-3| 88.62 | 79.95 | 84.06 | 27 |
### examples
TBD
### reference
1. https://arxiv.org/pdf/1911.08947.pdf
2. https://github.com/WenmuZhou/DBNet.pytorch
**If this repository helps you,please star it. Thanks.**
from .base_trainer import BaseTrainer
from .base_dataset import BaseDataSet
# -*- coding: utf-8 -*-
# @Time : 2019/12/4 13:12
# @Author : zhoujun
import copy
from paddle.io import Dataset
from data_loader.modules import *
class BaseDataSet(Dataset):
    """Base dataset that loads an image per sample and runs pre-processing.

    Subclasses must implement :meth:`load_data`. Each item it returns is
    expected to contain the keys ``img_path``, ``img_name``, ``text_polys``,
    ``texts`` and ``ignore_tags`` (only the first item is checked).
    """

    # Maximum number of samples tried by ``__getitem__`` before giving up.
    # The original implementation retried through unbounded recursion, which
    # ended in an opaque ``RecursionError`` when every sample was unreadable.
    _MAX_LOAD_ATTEMPTS = 100

    def __init__(
        self,
        data_path: str,
        img_mode,
        pre_processes,
        filter_keys,
        ignore_tags,
        transform=None,
        target_transform=None,
    ):
        """Load the sample list and build the pre-processing pipeline.

        :param data_path: passed unchanged to :meth:`load_data`.
        :param img_mode: one of ``"RGB"``, ``"BGR"``, ``"BRG"`` or ``"GRAY"``.
        :param pre_processes: iterable of ``{"type": ..., "args": ...}``
            entries (``"args"`` optional), or ``None`` for no pre-processing.
        :param filter_keys: keys removed from every returned sample.
        :param ignore_tags: stored on the instance before ``load_data`` runs.
        :param transform: optional callable applied to ``data["img"]``.
        :param target_transform: stored on the instance; not used by this
            base class.
        """
        # NOTE(review): "BRG" looks like a typo for "BGR". It is kept so that
        # existing configs keep working; "BGR" is accepted as well. Any mode
        # other than "RGB"/"GRAY" leaves the image in OpenCV's channel order.
        assert img_mode in ["RGB", "BGR", "BRG", "GRAY"]
        # Must be set before load_data(), which subclasses may rely on.
        self.ignore_tags = ignore_tags
        self.data_list = self.load_data(data_path)
        item_keys = ["img_path", "img_name", "text_polys", "texts", "ignore_tags"]
        for item in item_keys:
            assert (
                item in self.data_list[0]
            ), "data_list from load_data must contains {}".format(item_keys)
        self.img_mode = img_mode
        self.filter_keys = filter_keys
        self.transform = transform
        self.target_transform = target_transform
        self._init_pre_processes(pre_processes)

    def _init_pre_processes(self, pre_processes):
        """Instantiate every pre-processing step into ``self.aug``.

        A dict ``args`` is expanded as keyword arguments; any other value is
        passed as a single positional argument.
        """
        self.aug = []
        if pre_processes is None:
            return
        for aug in pre_processes:
            args = aug["args"] if "args" in aug else {}
            # SECURITY(review): eval() executes whatever string the config
            # supplies as "type". Only load trusted configuration files, or
            # replace this with an explicit name -> class registry.
            factory = eval(aug["type"])
            if isinstance(args, dict):
                step = factory(**args)
            else:
                step = factory(args)
            self.aug.append(step)

    def load_data(self, data_path: str) -> list:
        """Load the data into a list.

        :param data_path: folder or file that stores the data.
        :return: a list whose items contain the keys ``img_path``,
            ``img_name``, ``text_polys``, ``texts`` and ``ignore_tags``.
        """
        raise NotImplementedError

    def apply_pre_processes(self, data):
        """Run ``data`` through every step in ``self.aug``, in order."""
        for aug in self.aug:
            data = aug(data)
        return data

    def _load_sample(self, index):
        """Load, pre-process and filter the sample at ``index``."""
        # Deep copy so pre-processing never mutates the cached sample.
        data = copy.deepcopy(self.data_list[index])
        # imread flag 1 loads a colour image, 0 loads grayscale.
        im = cv2.imread(data["img_path"], 1 if self.img_mode != "GRAY" else 0)
        if im is None:
            # cv2.imread signals an unreadable file by returning None.
            raise FileNotFoundError(
                "failed to read image: {}".format(data["img_path"])
            )
        if self.img_mode == "RGB":
            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
        data["img"] = im
        # First two dimensions of the image array as loaded.
        data["shape"] = [im.shape[0], im.shape[1]]
        data = self.apply_pre_processes(data)
        if self.transform:
            data["img"] = self.transform(data["img"])
        # assumes text_polys is array-like with .tolist() - TODO confirm
        data["text_polys"] = data["text_polys"].tolist()
        if self.filter_keys is not None and len(self.filter_keys):
            return {k: v for k, v in data.items() if k not in self.filter_keys}
        return data

    def __getitem__(self, index):
        """Return the sample at ``index``, retrying random samples on failure.

        A sample that cannot be loaded or processed is replaced by a randomly
        chosen one, as in the original best-effort behaviour. Only
        ``Exception`` subclasses trigger a retry, so ``KeyboardInterrupt`` and
        ``SystemExit`` propagate normally.

        :raises RuntimeError: if ``_MAX_LOAD_ATTEMPTS`` consecutive samples
            fail; the last underlying error is chained as the cause.
        """
        last_error = None
        for _ in range(self._MAX_LOAD_ATTEMPTS):
            try:
                return self._load_sample(index)
            except Exception as err:
                last_error = err
                index = np.random.randint(self.__len__())
        raise RuntimeError(
            "could not load a valid sample after {} attempts".format(
                self._MAX_LOAD_ATTEMPTS
            )
        ) from last_error

    def __len__(self):
        """Return the number of samples produced by ``load_data``."""
        return len(self.data_list)
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:50
# @Author : zhoujun
import os
import pathlib
import shutil
from pprint import pformat
import anyconfig
import paddle
import numpy as np
import random
from paddle.jit import to_static
from paddle.static import InputSpec
from utils import setup_logger
class BaseTrainer:
    """Shared training scaffolding.

    Sets up the output directory, logging, seeding, LR scheduler, optimizer,
    checkpoint resume/finetune, optional AMP and DataParallel. Subclasses
    implement ``_train_epoch``, ``_eval``, ``_on_epoch_finish`` and
    ``_on_train_finish``.
    """

    def __init__(
        self,
        config,
        model,
        criterion,
        train_loader,
        validate_loader,
        metric_cls,
        post_process=None,
    ):
        """
        :param config: nested configuration dict (modified in place)
        :param model: network to train; must expose ``name`` and ``parameters()``
        :param criterion: loss object, stored as ``self.criterion``
        :param train_loader: training data loader
        :param validate_loader: validation data loader, or None
        :param metric_cls: metric object; required when ``validate_loader`` is given
        :param post_process: post-processing callable; required when
            ``validate_loader`` is given
        """
        # abspath(__name__) joins the module name onto the current working
        # directory, so its parent is the working directory itself.
        config["trainer"]["output_dir"] = os.path.join(
            str(pathlib.Path(os.path.abspath(__name__)).parent),
            config["trainer"]["output_dir"],
        )
        config["name"] = config["name"] + "_" + model.name
        self.save_dir = config["trainer"]["output_dir"]
        self.checkpoint_dir = os.path.join(self.save_dir, "checkpoint")

        os.makedirs(self.checkpoint_dir, exist_ok=True)

        self.global_step = 0
        self.start_epoch = 0
        self.config = config
        self.criterion = criterion
        # logger and tensorboard
        self.visualdl_enable = self.config["trainer"].get("visual_dl", False)
        self.epochs = self.config["trainer"]["epochs"]
        self.log_iter = self.config["trainer"]["log_iter"]
        # Only rank 0 writes the config dump and owns a logger; every other
        # rank goes through logger_info(), which is a no-op for them.
        if paddle.distributed.get_rank() == 0:
            anyconfig.dump(config, os.path.join(self.save_dir, "config.yaml"))
            self.logger = setup_logger(os.path.join(self.save_dir, "train.log"))
            self.logger_info(pformat(self.config))

        self.model = self.apply_to_static(model)

        # device
        if (
            paddle.device.cuda.device_count() > 0
            and paddle.device.is_compiled_with_cuda()
        ):
            self.with_cuda = True
            random.seed(self.config["trainer"]["seed"])
            np.random.seed(self.config["trainer"]["seed"])
            paddle.seed(self.config["trainer"]["seed"])
        else:
            self.with_cuda = False
        self.logger_info("train with and paddle {}".format(paddle.__version__))
        # metrics
        self.metrics = {
            "recall": 0,
            "precision": 0,
            "hmean": 0,
            "train_loss": float("inf"),
            "best_model_epoch": 0,
        }

        self.train_loader = train_loader
        if validate_loader is not None:
            assert post_process is not None and metric_cls is not None
        self.validate_loader = validate_loader
        self.post_process = post_process
        self.metric_cls = metric_cls
        self.train_loader_len = len(train_loader)
        if self.validate_loader is not None:
            self.logger_info(
                "train dataset has {} samples,{} in dataloader, validate dataset has {} samples,{} in dataloader".format(
                    len(self.train_loader.dataset),
                    self.train_loader_len,
                    len(self.validate_loader.dataset),
                    len(self.validate_loader),
                )
            )
        else:
            self.logger_info(
                "train dataset has {} samples,{} in dataloader".format(
                    len(self.train_loader.dataset), self.train_loader_len
                )
            )

        # The scheduler must exist first: it is handed to the optimizer as
        # its learning_rate.
        self._initialize_scheduler()
        self._initialize_optimizer()

        # resume or finetune
        if self.config["trainer"]["resume_checkpoint"] != "":
            self._load_checkpoint(
                self.config["trainer"]["resume_checkpoint"], resume=True
            )
        elif self.config["trainer"]["finetune_checkpoint"] != "":
            self._load_checkpoint(
                self.config["trainer"]["finetune_checkpoint"], resume=False
            )

        if self.visualdl_enable and paddle.distributed.get_rank() == 0:
            from visualdl import LogWriter

            self.writer = LogWriter(self.save_dir)

        # mixed-precision training
        self.amp = self.config.get("amp", None)
        if self.amp == "None":
            self.amp = None
        if self.amp:
            self.amp["scaler"] = paddle.amp.GradScaler(
                init_loss_scaling=self.amp.get("scale_loss", 1024),
                use_dynamic_loss_scaling=self.amp.get("use_dynamic_loss_scaling", True),
            )
            self.model, self.optimizer = paddle.amp.decorate(
                models=self.model,
                optimizers=self.optimizer,
                level=self.amp.get("amp_level", "O2"),
            )

        # distributed training
        if paddle.device.cuda.device_count() > 1:
            self.model = paddle.DataParallel(self.model)
        # make inverse Normalize
        self.UN_Normalize = False
        for t in self.config["dataset"]["train"]["dataset"]["args"]["transforms"]:
            if t["type"] == "Normalize":
                self.normalize_mean = t["args"]["mean"]
                self.normalize_std = t["args"]["std"]
                self.UN_Normalize = True

    def apply_to_static(self, model):
        """Wrap ``model`` with ``to_static`` when ``trainer.to_static`` is set.

        :param model: the dynamic-graph model
        :return: the converted model, or ``model`` unchanged when disabled
        """
        support_to_static = self.config["trainer"].get("to_static", False)
        if support_to_static:
            print("static")
            specs = [InputSpec([None, 3, -1, -1])]
            model = to_static(model, input_spec=specs)
            self.logger_info(
                "Successfully to apply @to_static with specs: {}".format(specs)
            )
        return model

    def train(self):
        """
        Full training logic
        """
        for epoch in range(self.start_epoch + 1, self.epochs + 1):
            self.epoch_result = self._train_epoch(epoch)
            self._on_epoch_finish()
        if paddle.distributed.get_rank() == 0 and self.visualdl_enable:
            self.writer.close()
        self._on_train_finish()

    def _train_epoch(self, epoch):
        """
        Training logic for an epoch

        :param epoch: Current epoch number
        """
        raise NotImplementedError

    def _eval(self, epoch):
        """
        eval logic for an epoch

        :param epoch: Current epoch number
        """
        raise NotImplementedError

    def _on_epoch_finish(self):
        """Hook called by :meth:`train` after every epoch."""
        raise NotImplementedError

    def _on_train_finish(self):
        """Hook called by :meth:`train` once all epochs are done."""
        raise NotImplementedError

    def _save_checkpoint(self, epoch, file_name):
        """
        Saving checkpoints

        :param epoch: current epoch number
        :param file_name: file name of the checkpoint, created inside
            ``self.checkpoint_dir``
        """
        state_dict = self.model.state_dict()
        state = {
            "epoch": epoch,
            "global_step": self.global_step,
            "state_dict": state_dict,
            "optimizer": self.optimizer.state_dict(),
            "config": self.config,
            "metrics": self.metrics,
        }
        filename = os.path.join(self.checkpoint_dir, file_name)
        paddle.save(state, filename)

    def _load_checkpoint(self, checkpoint_path, resume):
        """
        Resume from saved checkpoints

        :param checkpoint_path: Checkpoint path to be resumed
        :param resume: when True also restore step/epoch counters, optimizer
            state and metrics; when False only the model weights are loaded
        """
        self.logger_info("Loading checkpoint: {} ...".format(checkpoint_path))
        checkpoint = paddle.load(checkpoint_path)
        self.model.set_state_dict(checkpoint["state_dict"])
        if resume:
            self.global_step = checkpoint["global_step"]
            self.start_epoch = checkpoint["epoch"]
            # NOTE(review): __init__ builds the scheduler before calling this
            # method, so this config write does not change that scheduler
            # here; confirm whether a subclass rebuilds it from the config.
            self.config["lr_scheduler"]["args"]["last_epoch"] = self.start_epoch
            # self.scheduler.load_state_dict(checkpoint['scheduler'])
            self.optimizer.set_state_dict(checkpoint["optimizer"])
            if "metrics" in checkpoint:
                self.metrics = checkpoint["metrics"]
            self.logger_info(
                "resume from checkpoint {} (epoch {})".format(
                    checkpoint_path, self.start_epoch
                )
            )
        else:
            self.logger_info("finetune from checkpoint {}".format(checkpoint_path))

    def _initialize(self, name, module, *args, **kwargs):
        """Instantiate ``module.<config[name]["type"]>`` from the config.

        :param name: top-level config section holding ``type`` and optional ``args``
        :param module: object the class named by ``type`` is looked up on
        :param args: positional arguments passed to the constructor
        :param kwargs: extra keyword arguments; they must not collide with
            keys already present in the section's ``args``
        :return: the constructed object
        """
        module_name = self.config[name]["type"]
        # Work on a copy: updating the config's own dict would leak the
        # injected kwargs into self.config (which is stored in checkpoints)
        # and make a second call with the same kwargs fail the assert below.
        # `or {}` also covers a section whose `args:` is present but null.
        module_args = dict(self.config[name].get("args") or {})
        assert all(
            [k not in module_args for k in kwargs]
        ), "Overwriting kwargs given in config file is not allowed"
        module_args.update(kwargs)
        return getattr(module, module_name)(*args, **module_args)

    def _initialize_scheduler(self):
        """Build ``self.lr_scheduler`` from the ``lr_scheduler`` config section."""
        self.lr_scheduler = self._initialize("lr_scheduler", paddle.optimizer.lr)

    def _initialize_optimizer(self):
        """Build ``self.optimizer`` from the ``optimizer`` config section."""
        self.optimizer = self._initialize(
            "optimizer",
            paddle.optimizer,
            parameters=self.model.parameters(),
            learning_rate=self.lr_scheduler,
        )

    def inverse_normalize(self, batch_img):
        """Undo the Normalize transform on channels 0-2 of ``batch_img`` in place.

        Does nothing when the training transforms contained no "Normalize"
        entry. Nothing is returned; ``batch_img`` itself is modified.
        """
        if not self.UN_Normalize:
            return
        for channel in range(3):
            batch_img[:, channel, :, :] = (
                batch_img[:, channel, :, :] * self.normalize_std[channel]
                + self.normalize_mean[channel]
            )

    def logger_info(self, s):
        """Log ``s`` at INFO level on rank 0; silently ignored on other ranks."""
        if paddle.distributed.get_rank() == 0:
            self.logger.info(s)
# Base dataset configuration for training on SynthText.
name: DBNet

dataset:
  train:
    dataset:
      type: SynthTextDataset  # dataset type
      args:
        data_path: ''  # SynthTextDataset root directory
        # Pre-processing pipeline: augmentation followed by label-map generation.
        pre_processes:
          - type: IaaAugment  # augmentations applied through imgaug
            args:
              - type: Fliplr
                args:
                  p: 0.5
              - type: Affine
                args:
                  rotate: [-10, 10]
              - type: Resize
                args:
                  size: [0.5, 3]
          - type: EastRandomCropData
            args:
              size: [640, 640]
              max_tries: 50
              keep_ratio: true
          - type: MakeBorderMap
            args:
              shrink_ratio: 0.4
          - type: MakeShrinkMap
            args:
              shrink_ratio: 0.4
              min_text_size: 8
        # Transforms applied to the image.
        transforms:
          - type: ToTensor
            args: {}
          - type: Normalize
            args:
              mean: [0.485, 0.456, 0.406]
              std: [0.229, 0.224, 0.225]
        img_mode: RGB
        # Keys removed from the sample dict before it is returned.
        filter_keys:
          - img_path
          - img_name
          - text_polys
          - texts
          - ignore_tags
          - shape
        ignore_tags: ['*', '###']
    loader:
      batch_size: 1
      shuffle: true
      num_workers: 0
      collate_fn: ''
# DBNet with a resnet18 backbone, FPN neck and DBHead, trained on SynthText.
name: DBNet
# Configuration file(s) listed as the base of this one.
base:
  - config/SynthText.yaml

arch:
  type: Model
  backbone:
    type: resnet18
    pretrained: true
  neck:
    type: FPN
    inner_channels: 256
  head:
    type: DBHead
    out_channels: 2
    k: 50

post_processing:
  type: SegDetectorRepresenter
  args:
    thresh: 0.3
    box_thresh: 0.7
    max_candidates: 1000
    unclip_ratio: 1.5  # from paper

metric:
  type: QuadMetric
  args:
    is_output_polygon: false

loss:
  type: DBLoss
  alpha: 1
  beta: 10
  ohem_ratio: 3

# NOTE(review): `lr` and `amsgrad` resemble torch.optim.Adam arguments;
# confirm the optimizer class selected by `type` accepts them.
optimizer:
  type: Adam
  args:
    lr: 0.001
    weight_decay: 0
    amsgrad: true

lr_scheduler:
  type: WarmupPolyLR
  args:
    warmup_epoch: 3

trainer:
  seed: 2
  epochs: 1200
  log_iter: 10
  show_images_iter: 50
  resume_checkpoint: ''
  finetune_checkpoint: ''
  output_dir: output
  visual_dl: false

# Mixed-precision settings.
amp:
  scale_loss: 1024
  amp_level: O2
  custom_white_list: []
  custom_black_list:
    - exp
    - sigmoid
    - concat

dataset:
  train:
    dataset:
      args:
        data_path: ./datasets/SynthText
        img_mode: RGB
    loader:
      batch_size: 2
      shuffle: true
      num_workers: 6
      collate_fn: ''
# Base dataset configuration for training and validating on ICDAR2015.
name: DBNet

dataset:
  train:
    dataset:
      type: ICDAR2015Dataset  # dataset type
      args:
        # File(s) listing "img_path \t gt_path" pairs.
        data_path:
          - ''
        # Pre-processing pipeline: augmentation followed by label-map generation.
        pre_processes:
          - type: IaaAugment  # augmentations applied through imgaug
            args:
              - type: Fliplr
                args:
                  p: 0.5
              - type: Affine
                args:
                  rotate: [-10, 10]
              - type: Resize
                args:
                  size: [0.5, 3]
          - type: EastRandomCropData
            args:
              size: [640, 640]
              max_tries: 50
              keep_ratio: true
          - type: MakeBorderMap
            args:
              shrink_ratio: 0.4
              thresh_min: 0.3
              thresh_max: 0.7
          - type: MakeShrinkMap
            args:
              shrink_ratio: 0.4
              min_text_size: 8
        # Transforms applied to the image.
        transforms:
          - type: ToTensor
            args: {}
          - type: Normalize
            args:
              mean: [0.485, 0.456, 0.406]
              std: [0.229, 0.224, 0.225]
        img_mode: RGB
        # Keys removed from the sample dict before it is returned.
        filter_keys:
          - img_path
          - img_name
          - text_polys
          - texts
          - ignore_tags
          - shape
        ignore_tags: ['*', '###']
    loader:
      batch_size: 1
      shuffle: true
      num_workers: 0
      collate_fn: ''

  validate:
    dataset:
      type: ICDAR2015Dataset
      args:
        data_path:
          - ''
        pre_processes:
          - type: ResizeShortSize
            args:
              short_size: 736
              resize_text_polys: false
        transforms:
          - type: ToTensor
            args: {}
          - type: Normalize
            args:
              mean: [0.485, 0.456, 0.406]
              std: [0.229, 0.224, 0.225]
        img_mode: RGB
        filter_keys: []
        ignore_tags: ['*', '###']
    loader:
      batch_size: 1
      shuffle: true
      num_workers: 0
      collate_fn: ICDARCollectFN
# DBNet with a deformable_resnet18 backbone, FPN neck and DBHead on ICDAR2015.
name: DBNet
# Configuration file(s) listed as the base of this one.
base:
  - config/icdar2015.yaml

arch:
  type: Model
  backbone:
    type: deformable_resnet18
    pretrained: true
  neck:
    type: FPN
    inner_channels: 256
  head:
    type: DBHead
    out_channels: 2
    k: 50

post_processing:
  type: SegDetectorRepresenter
  args:
    thresh: 0.3
    box_thresh: 0.7
    max_candidates: 1000
    unclip_ratio: 1.5  # from paper

metric:
  type: QuadMetric
  args:
    is_output_polygon: false

loss:
  type: DBLoss
  alpha: 1
  beta: 10
  ohem_ratio: 3

# NOTE(review): `lr` and `amsgrad` resemble torch.optim.Adam arguments;
# confirm the optimizer class selected by `type` accepts them.
optimizer:
  type: Adam
  args:
    lr: 0.001
    weight_decay: 0
    amsgrad: true

lr_scheduler:
  type: WarmupPolyLR
  args:
    warmup_epoch: 3

trainer:
  seed: 2
  epochs: 1200
  log_iter: 10
  show_images_iter: 50
  resume_checkpoint: ''
  finetune_checkpoint: ''
  output_dir: output
  visual_dl: false

# Mixed-precision settings.
amp:
  scale_loss: 1024
  amp_level: O2
  custom_white_list: []
  custom_black_list:
    - exp
    - sigmoid
    - concat

dataset:
  train:
    dataset:
      args:
        data_path:
          - ./datasets/train.txt
        img_mode: RGB
    loader:
      batch_size: 1
      shuffle: true
      num_workers: 6
      collate_fn: ''

  validate:
    dataset:
      args:
        data_path:
          - ./datasets/test.txt
        pre_processes:
          - type: ResizeShortSize
            args:
              short_size: 736
              resize_text_polys: false
        img_mode: RGB
    loader:
      batch_size: 1
      shuffle: true
      num_workers: 6
      collate_fn: ICDARCollectFN
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment