Commit 2ee12253 authored by LDOUBLEV's avatar LDOUBLEV
Browse files

Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleOCR into fixocr

parents bee2b15e 52ae30b0
//
// Created by fujiayi on 2020/7/3.
//
#pragma once
#include <vector>
#include <opencv2/opencv.hpp>
// Declared here, defined in another translation unit.
// NOTE(review): presumably the input shape expected by the recognition model — confirm at the definition.
extern const std::vector<int> REC_IMAGE_SHAPE;
// NOTE(review): definition not visible from this header; presumably crops the region
// described by the point list `box` out of `srcimage` — confirm against the implementation.
cv::Mat get_rotate_crop_image(const cv::Mat &srcimage, const std::vector<std::vector<int>>& box);
// NOTE(review): definition not visible from this header; presumably resizes `img` for the
// CRNN recognizer using the given width/height ratio — confirm against the implementation.
cv::Mat crnn_resize_img(const cv::Mat& img, float wh_ratio);
// Returns the zero-based offset of the largest element in [first, last).
// When several elements compare equal to the maximum, the first one wins
// (std::max_element semantics); an empty range yields 0.
template<class ForwardIterator>
inline size_t argmax(ForwardIterator first, ForwardIterator last) {
    const ForwardIterator largest = std::max_element(first, last);
    return std::distance(first, largest);
}
\ No newline at end of file
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
#include "ocr_clipper.hpp"
// Computes the offset distance used to expand a 4-point polygon:
//   distance = polygon_area * unclip_ratio / polygon_perimeter
// `box` is read as 4 points, each stored as {x, y}. The area comes from the
// shoelace formula, the perimeter from the summed edge lengths. The result is
// written to `distance`.
static void getcontourarea(float **box, float unclip_ratio, float &distance) {
    const int num_points = 4;
    float twice_signed_area = 0.0f;
    float perimeter = 0.0f;
    for (int cur = 0; cur < num_points; cur++) {
        const int nxt = (cur + 1) % num_points;  // wraps so the last edge closes the polygon
        const float *a = box[cur];
        const float *b = box[nxt];
        twice_signed_area += a[0] * b[1] - a[1] * b[0];
        const float dx = a[0] - b[0];
        const float dy = a[1] - b[1];
        perimeter += sqrtf(dx * dx + dy * dy);
    }
    // The shoelace sum is signed (depends on winding order), hence the absolute value.
    const float area = fabs(float(twice_signed_area / 2.0));
    distance = area * unclip_ratio / perimeter;
}
// Expands the 4-point polygon `box` outwards with ClipperLib and returns the
// minimum-area rotated rectangle enclosing the expanded outline.
//
// box: 4 points, each stored as {x, y}; coordinates are truncated to integers
//      before being handed to Clipper.
// The offset distance is area * unclip_ratio / perimeter (see getcontourarea),
// with unclip_ratio fixed at 2.0.
static cv::RotatedRect unclip(float **box) {
    const float unclip_ratio = 2.0f;
    float distance = 1.0f;
    getcontourarea(box, unclip_ratio, distance);

    ClipperLib::ClipperOffset offset;
    ClipperLib::Path p;
    p << ClipperLib::IntPoint(int(box[0][0]), int(box[0][1]))
      << ClipperLib::IntPoint(int(box[1][0]), int(box[1][1]))
      << ClipperLib::IntPoint(int(box[2][0]), int(box[2][1]))
      << ClipperLib::IntPoint(int(box[3][0]), int(box[3][1]));
    offset.AddPath(p, ClipperLib::jtRound, ClipperLib::etClosedPolygon);

    ClipperLib::Paths soln;
    offset.Execute(soln, distance);

    // Gather the vertices of every returned path. Each path is walked over its
    // own length: the previous code used the size of the *last* path as the
    // bound for all paths, which reads out of bounds (or drops vertices) when
    // the paths differ in length.
    std::vector<cv::Point2f> points;
    for (const ClipperLib::Path &path : soln) {
        for (const ClipperLib::IntPoint &pt : path) {
            points.emplace_back(static_cast<float>(pt.X), static_cast<float>(pt.Y));
        }
    }
    return cv::minAreaRect(points);
}
// Copies a float matrix into a freshly allocated array of `mat.rows` row
// pointers, each pointing at `mat.cols` floats. Elements are read with
// mat.at<float>(). The caller owns the result and must delete[] every row
// and then the outer array.
static float **Mat2Vec(cv::Mat mat) {
    const int rows = mat.rows;
    const int cols = mat.cols;
    float **out = new float *[rows];
    for (int r = 0; r < rows; ++r) {
        float *row = new float[cols];
        for (int c = 0; c < cols; ++c) {
            row[c] = mat.at<float>(r, c);
        }
        out[r] = row;
    }
    return out;
}
// In-place quicksort of the row pointers s[l..r] (inclusive) in ascending
// order of each row's first element, s[k][0]. Only the pointers are moved;
// the rows themselves are untouched. The first element of the range is the
// pivot.
static void quickSort(float **s, int l, int r) {
    if (l >= r) {
        return;
    }
    int lo = l;
    int hi = r;
    float *pivot_row = s[l];
    const float pivot_key = pivot_row[0];
    // Classic "hole" partition: the slot at lo or hi is always free, and rows
    // are shifted into it from the opposite end.
    while (lo < hi) {
        while (lo < hi && s[hi][0] >= pivot_key) {
            --hi;
        }
        if (lo < hi) {
            s[lo] = s[hi];
            ++lo;
        }
        while (lo < hi && s[lo][0] < pivot_key) {
            ++lo;
        }
        if (lo < hi) {
            s[hi] = s[lo];
            --hi;
        }
    }
    s[lo] = pivot_row;  // the free slot is the pivot's final position
    quickSort(s, l, lo - 1);
    quickSort(s, lo + 1, r);
}
// In-place quicksort of box[l..r] (inclusive) in ascending order of the
// coordinate box[k][axis]. The first element of the range is the pivot.
static void quickSort_vector(std::vector<std::vector<int>> &box, int l, int r, int axis) {
    if (l >= r) {
        return;
    }
    int lo = l;
    int hi = r;
    const int pivot_key = box[l][axis];
    // Lift the pivot out; its old slot becomes the free "hole" that elements
    // are shifted into from alternating ends. The hole is never read.
    std::vector<int> pivot = std::move(box[l]);
    while (lo < hi) {
        while (lo < hi && box[hi][axis] >= pivot_key) {
            --hi;
        }
        if (lo < hi) {
            box[lo] = std::move(box[hi]);
            ++lo;
        }
        while (lo < hi && box[lo][axis] < pivot_key) {
            ++lo;
        }
        if (lo < hi) {
            box[hi] = std::move(box[lo]);
            --hi;
        }
    }
    box[lo] = std::move(pivot);  // the hole is the pivot's final position
    quickSort_vector(box, l, lo - 1, axis);
    quickSort_vector(box, lo + 1, r, axis);
}
// Reorders four {x, y} points. After sorting by x, the two points with the
// smallest x form the left pair and the other two the right pair; within each
// pair the point with the smaller y comes first. The result is
//   {left/smaller-y, right/smaller-y, right/larger-y, left/larger-y}.
// Expects exactly four points: indices 0..3 are accessed unconditionally.
static std::vector<std::vector<int>> order_points_clockwise(std::vector<std::vector<int>> pts) {
    // `pts` is already a private copy (passed by value), so sort it directly.
    quickSort_vector(pts, 0, int(pts.size() - 1), 0);

    const std::vector<int> &left_a = pts[0];
    const std::vector<int> &left_b = pts[1];
    const std::vector<int> &right_a = pts[2];
    const std::vector<int> &right_b = pts[3];
    const bool flip_left = left_a[1] > left_b[1];
    const bool flip_right = right_a[1] > right_b[1];

    std::vector<std::vector<int>> rect;
    rect.reserve(4);
    rect.push_back(flip_left ? left_b : left_a);
    rect.push_back(flip_right ? right_b : right_a);
    rect.push_back(flip_right ? right_a : right_b);
    rect.push_back(flip_left ? left_a : left_b);
    return rect;
}
// Converts a rotated rectangle into its four corner points in a fixed order
// and reports the rectangle's shorter side through `ssid`.
//
// Returned order (after sorting the corners by x):
//   [0] left pair, smaller y    [1] right pair, smaller y
//   [2] right pair, larger y    [3] left pair, larger y
// The array is allocated by Mat2Vec; the caller owns it and must delete[]
// each row and then the outer array.
static float **get_mini_boxes(cv::RotatedRect box, float &ssid) {
    if (box.size.width >= box.size.height) {
        ssid = box.size.height;
    } else {
        ssid = box.size.width;
    }

    cv::Mat corners;
    cv::boxPoints(box, corners);
    float **array = Mat2Vec(corners);
    quickSort(array, 0, 3);  // ascending x: [0],[1] are the left pair, [2],[3] the right pair

    float *left_a = array[0];
    float *left_b = array[1];
    float *right_a = array[2];
    float *right_b = array[3];
    const bool left_b_first = left_b[1] <= left_a[1];
    const bool right_b_first = right_b[1] <= right_a[1];

    array[0] = left_b_first ? left_b : left_a;
    array[1] = right_b_first ? right_b : right_a;
    array[2] = right_b_first ? right_a : right_b;
    array[3] = left_b_first ? left_a : left_b;
    return array;
}
// Limits x to the range [min, max]. The upper bound is checked first, so if
// the bounds are inverted (min > max) a value above max still returns max.
template <class T> T clamp(T x, T min, T max) {
    return (x > max) ? max : ((x < min) ? min : x);
}
// Limits x to the range [min, max]; the upper bound is checked before the lower one.
static float clampf(float x, float min, float max) {
    if (x > max) {
        return max;
    }
    return (x < min) ? min : x;
}
float box_score_fast(float **box_array, cv::Mat pred) {
auto array = box_array;
int width = pred.cols;
int height = pred.rows;
float box_x[4] = {array[0][0], array[1][0], array[2][0], array[3][0]};
float box_y[4] = {array[0][1], array[1][1], array[2][1], array[3][1]};
int xmin = clamp(int(std::floorf(*(std::min_element(box_x, box_x + 4)))), 0, width - 1);
int xmax = clamp(int(std::ceilf(*(std::max_element(box_x, box_x + 4)))), 0, width - 1);
int ymin = clamp(int(std::floorf(*(std::min_element(box_y, box_y + 4)))), 0, height - 1);
int ymax = clamp(int(std::ceilf(*(std::max_element(box_y, box_y + 4)))), 0, height - 1);
cv::Mat mask;
mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
cv::Point root_point[4];
root_point[0] = cv::Point(int(array[0][0]) - xmin, int(array[0][1]) - ymin);
root_point[1] = cv::Point(int(array[1][0]) - xmin, int(array[1][1]) - ymin);
root_point[2] = cv::Point(int(array[2][0]) - xmin, int(array[2][1]) - ymin);
root_point[3] = cv::Point(int(array[3][0]) - xmin, int(array[3][1]) - ymin);
const cv::Point *ppt[1] = {root_point};
int npt[] = {4};
cv::fillPoly(mask, ppt, npt, 1, cv::Scalar(1));
cv::Mat croppedImg;
pred(cv::Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1)).copyTo(croppedImg);
auto score = cv::mean(croppedImg, mask)[0];
return score;
}
namespace {
// Releases a point array returned by get_mini_boxes(): four rows allocated
// with new[] plus the outer array (cv::boxPoints always yields 4 vertices).
struct MiniBoxDeleter {
    void operator()(float **box) const {
        for (int row = 0; row < 4; ++row) {
            delete[] box[row];
        }
        delete[] box;
    }
};
using MiniBoxPtr = std::unique_ptr<float *[], MiniBoxDeleter>;
}  // namespace

// Extracts text boxes from a binarized segmentation map.
//
// pred:   score map used to rate each candidate box (see box_score_fast).
// bitmap: binary map whose contours are the box candidates.
// Returns one entry per accepted box: 4 points, each {x, y}, scaled from
// `bitmap` coordinates to `pred` coordinates and clamped to [0, cols]/[0, rows].
//
// At most 1000 contours are examined. A candidate is dropped when its shorter
// side is < 3, when its mean score is < 0.5, or when the shorter side of the
// expanded ("unclipped") box is < 5.
std::vector<std::vector<std::vector<int>>>
boxes_from_bitmap(const cv::Mat& pred, const cv::Mat& bitmap) {
    const int min_size = 3;
    const int max_candidates = 1000;
    const float box_thresh = 0.5;
    const int width = bitmap.cols;
    const int height = bitmap.rows;
    const int dest_width = pred.cols;
    const int dest_height = pred.rows;

    std::vector<std::vector<cv::Point>> contours;
    std::vector<cv::Vec4i> hierarchy;
    cv::findContours(bitmap, contours, hierarchy, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);
    const size_t num_contours = std::min(contours.size(), static_cast<size_t>(max_candidates));

    std::vector<std::vector<std::vector<int>>> boxes;
    for (size_t idx = 0; idx < num_contours; ++idx) {
        float ssid = 0.0f;
        const cv::RotatedRect box = cv::minAreaRect(contours[idx]);
        // The arrays from get_mini_boxes are heap-allocated; owning them through
        // MiniBoxPtr frees them on every exit path, including the early
        // `continue`s below (they were previously leaked for every contour).
        const MiniBoxPtr mini_box(get_mini_boxes(box, ssid));
        if (ssid < min_size) {
            continue;
        }
        const float score = box_score_fast(mini_box.get(), pred);
        if (score < box_thresh) {
            continue;
        }

        const cv::RotatedRect clipbox = unclip(mini_box.get());
        const MiniBoxPtr cliparray(get_mini_boxes(clipbox, ssid));
        if (ssid < min_size + 2) {
            continue;
        }

        std::vector<std::vector<int>> intcliparray;
        intcliparray.reserve(4);
        for (int num_pt = 0; num_pt < 4; num_pt++) {
            const float x = clampf(roundf(cliparray[num_pt][0] / float(width) * float(dest_width)), 0,
                                   float(dest_width));
            const float y = clampf(roundf(cliparray[num_pt][1] / float(height) * float(dest_height)), 0,
                                   float(dest_height));
            intcliparray.push_back(std::vector<int>{int(x), int(y)});
        }
        boxes.push_back(std::move(intcliparray));
    }
    return boxes;
}
// Returns the larger of two ints.
int _max(int a, int b) {
    return (b > a) ? b : a;
}
// Returns the smaller of two ints.
int _min(int a, int b) {
    return (b <= a) ? b : a;
}
// Maps detected boxes back to the source image and drops boxes that are too small.
//
// o_boxes: detected boxes, each 4 points stored as {x, y}.
// ratio_h / ratio_w: each y is divided by ratio_h and each x by ratio_w
//                    (the result is truncated to int).
// srcimg:  only its size is used; coordinates are clamped to
//          [0, cols - 1] x [0, rows - 1].
// Returns the reordered (see order_points_clockwise), rescaled boxes whose
// edge lengths |p0 p1| and |p0 p3|, truncated to int, are both greater than 10.
std::vector<std::vector<std::vector<int>>>
filter_tag_det_res(const std::vector<std::vector<std::vector<int>>>& o_boxes,
                   float ratio_h, float ratio_w,const cv::Mat& srcimg) {
    const int oriimg_h = srcimg.rows;
    const int oriimg_w = srcimg.cols;

    std::vector<std::vector<std::vector<int>>> root_points;
    root_points.reserve(o_boxes.size());
    for (const std::vector<std::vector<int>> &raw_box : o_boxes) {
        std::vector<std::vector<int>> box = order_points_clockwise(raw_box);
        // Iterate over this box's own points (the loop bound used to be taken
        // from the first box regardless of which box was being processed).
        for (std::vector<int> &pt : box) {
            pt[0] = _min(_max(static_cast<int>(pt[0] / ratio_w), 0), oriimg_w - 1);
            pt[1] = _min(_max(static_cast<int>(pt[1] / ratio_h), 0), oriimg_h - 1);
        }

        // Edge p0->p1 is treated as the width, edge p0->p3 as the height.
        const double wx = box[0][0] - box[1][0];
        const double wy = box[0][1] - box[1][1];
        const double hx = box[0][0] - box[3][0];
        const double hy = box[0][1] - box[3][1];
        const int rect_width = int(sqrt(wx * wx + wy * wy));
        const int rect_height = int(sqrt(hx * hx + hy * hy));
        if (rect_width <= 10 || rect_height <= 10) {
            continue;
        }
        root_points.push_back(box);
    }
    return root_points;
}
/*
using namespace std;
// read data from txt file
cv::Mat readtxt2(std::string path, int imgw, int imgh, int imgc) {
std::cout << "read data file from txt file! " << std::endl;
ifstream in(path);
string line;
int count = 0;
int i = 0, j = 0;
std::vector<float> img_mean = {0.485, 0.456, 0.406};
std::vector<float> img_std = {0.229, 0.224, 0.225};
float trainData[imgh][imgw*imgc];
while (getline(in, line)) {
stringstream ss(line);
double x;
while (ss >> x) {
// trainData[i][j] = float(x) * img_std[j % 3] + img_mean[j % 3];
trainData[i][j] = float(x);
j++;
}
i++;
j = 0;
}
cv::Mat pred_map(imgh, imgw*imgc, CV_32FC1, (float *) trainData);
cv::Mat reshape_img = pred_map.reshape(imgc, imgh);
return reshape_img;
}
*/
//using namespace std;
//
//void writetxt(vector<vector<float>> data, std::string save_path){
//
// ofstream fout(save_path);
//
// for (int i = 0; i < data.size(); i++) {
// for (int j=0; j< data[0].size(); j++){
// fout << data[i][j] << " ";
// }
// fout << endl;
// }
// fout << endl;
// fout.close();
//}
//
// Created by fujiayi on 2020/7/2.
//
#pragma once
// Finds contours in `bitmap`, scores each candidate box against `pred`, expands
// the accepted ones and returns them as lists of 4 integer {x, y} points.
std::vector<std::vector<std::vector<int>>> boxes_from_bitmap(const cv::Mat& pred, const cv::Mat& bitmap);
// Reorders the points of each box, divides x by ratio_w and y by ratio_h, clamps the
// result to the bounds of `srcimg`, and drops boxes whose width or height is <= 10.
std::vector<std::vector<std::vector<int>>>
filter_tag_det_res(const std::vector<std::vector<std::vector<int>>>& o_boxes,
float ratio_h, float ratio_w, const cv::Mat& srcimg);
\ No newline at end of file
//
// timer.h
// face_demo
//
// Created by Li,Xiaoyang(SYS) on 2019/8/20.
// Copyright © 2019年 Li,Xiaoyang(SYS). All rights reserved.
//
#ifndef timer_h
#define timer_h
#include <chrono>
#include <list>
// Records elapsed-time samples (in milliseconds) between start()/end() pairs
// and reports simple statistics over them. Not synchronized; use one Timer per
// thread or guard it externally.
class Timer final {
public:
    Timer() = default;
    ~Timer() = default;

    // Discards all recorded samples.
    void clear() {
        ms_time.clear();
    }

    // Marks the beginning of a measured interval.
    void start() {
        tstart = Clock::now();
    }

    // Marks the end of the interval begun by the last start() and stores its
    // length in milliseconds as a new sample.
    void end() {
        tend = Clock::now();
        const std::chrono::duration<float, std::milli> elapsed = tend - tstart;
        ms_time.push_back(elapsed.count());
    }

    // Mean of all samples in milliseconds; 0 when there are none.
    float get_average_ms() const {
        if (ms_time.empty()) {
            return 0.f;
        }
        return get_sum_ms() / ms_time.size();
    }

    // Sum of all samples in milliseconds; 0 when there are none.
    float get_sum_ms() const {
        float sum = 0.f;
        for (const float sample : ms_time) {
            sum += sample;
        }
        return sum;
    }

    // Returns the sample at percentile `tile` (0-100) of the sorted samples.
    // Returns -1 when `tile` is outside [0, 100] (including NaN) and -2 when
    // no samples have been recorded. Sorts the stored samples in place.
    float get_tile_time(float tile) {
        if (!(tile >= 0 && tile <= 100)) {
            return -1.f;
        }
        const int total_items = (int)ms_time.size();
        if (total_items <= 0) {
            return -2.f;
        }
        ms_time.sort();
        int pos = (int)(tile * total_items / 100);
        // tile == 100 maps to pos == total_items, one past the last sample;
        // clamp so the largest sample is returned instead of dereferencing end().
        if (pos >= total_items) {
            pos = total_items - 1;
        }
        auto it = ms_time.begin();
        for (int i = 0; i < pos; ++i) {
            ++it;
        }
        return *it;
    }

    // Returns a copy of the recorded samples (milliseconds).
    const std::list<float> get_time_stat() const {
        return ms_time;
    }

private:
    // steady_clock is monotonic, so intervals cannot come out negative or
    // skewed when the system clock is adjusted between start() and end().
    using Clock = std::chrono::steady_clock;

    std::chrono::time_point<Clock> tstart;
    std::chrono::time_point<Clock> tend;
    std::list<float> ms_time;
};
#endif /* timer_h */
......@@ -13,7 +13,7 @@
预计6月中下旬会先后发布基于Serving的服务部署方案和基于Paddle Lite的移动端部署方案,欢迎持续关注。
5. **自研算法发布时间**
自研算法SAST、SRN、End2End-PSL都将在6-7月陆续发布,敬请期待。
自研算法SAST、SRN、End2End-PSL都将在7-8月陆续发布,敬请期待。
6. **如何在Windows或Mac系统上运行**
PaddleOCR已完成Windows和Mac系统适配,运行时注意两点:1、在[快速安装](./installation.md)时,如果不想安装docker,可跳过第一步,直接从第二步安装paddle开始。2、inference模型下载时,如果没有安装wget,可直接点击模型链接或将链接地址复制到浏览器进行下载,并解压放置到相应目录。
......
## 手写中文OCR数据集
这里整理了常用手写中文数据集,持续更新中,欢迎各位小伙伴贡献数据集~
# 手写OCR数据集
这里整理了常用手写数据集,持续更新中,欢迎各位小伙伴贡献数据集~
- [中科院自动化研究所-手写中文数据集](#中科院自动化研究所-手写中文数据集)
- [NIST手写单字数据集-英文](#NIST手写单字数据集-英文)
<a name="中科院自动化研究所-手写中文数据集"></a>
#### 1、中科院自动化研究所-手写中文数据集
## 中科院自动化研究所-手写中文数据集
- **数据来源**:http://www.nlpr.ia.ac.cn/databases/handwriting/Download.html
- **数据简介**:包含在线和离线两类手写单字数据,包含GB2312-80中的3755个一级汉字,共由720人手写完成。在线部分(HWDB)总共包含约210万个训练样本,53万个测试样本;离线部分(OLHWDB)总共包含约210万个训练样本,53万个测试样本。
![](../datasets/CASIA_0.jpg)
(a) 五张单字图片样例
- **数据简介**
* 包含在线和离线两类手写数据,`HWDB1.0~1.2`总共有3895135个手写单字样本,分属7356类(7185个汉字和171个英文字母、数字、符号);`HWDB2.0~2.2`总共有5091页图像,分割为52230个文本行和1349414个文字。所有文字和文本样本均存为灰度图像。部分单字样本图片如下所示。
![](../datasets/CASIA_0.jpg)
- **下载地址**:http://www.nlpr.ia.ac.cn/databases/handwriting/Download.html
- **使用建议**:数据为单字,白色背景,可以大量合成文字行进行训练。白色背景可以处理成透明状态,方便添加各种背景。对于需要语义的情况,建议从真实语料出发,抽取单字组成文字行
<a name="NIST手写单字数据集-英文"></a>
## NIST手写单字数据集-英文(NIST Handprinted Forms and Characters Database)
- **数据来源**: [https://www.nist.gov/srd/nist-special-database-19](https://www.nist.gov/srd/nist-special-database-19)
- **数据简介**: NIST19数据集适用于手写文档和字符识别的模型训练,从3600位作者的手写样本表格中提取得到,总共包含81万张字符图片。其中9张图片示例如下。
![](../datasets/nist_demo.png)
- **下载地址**: [https://www.nist.gov/srd/nist-special-database-19](https://www.nist.gov/srd/nist-special-database-19)
......@@ -74,7 +74,7 @@ python3 tools/infer/predict_det.py --image_dir="./doc/imgs/2.jpg" --det_model_di
可视化文本检测结果默认保存到 ./inference_results 文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下:
![](imgs_results/det_res_2.jpg)
![](../imgs_results/det_res_2.jpg)
通过设置参数det_max_side_len的大小,改变检测算法中图片规范化的最大值。当图片的长宽都小于det_max_side_len,则使用原图预测,否则将图片等比例缩放到最大值,进行预测。该参数默认设置为det_max_side_len=960. 如果输入图片的分辨率比较大,而且想使用更大的分辨率预测,可以执行如下命令:
......
......@@ -2,9 +2,10 @@
经测试PaddleOCR可在glibc 2.23上运行,您也可以测试其他glibc版本或安装glibc 2.23
PaddleOCR 工作环境
- PaddlePaddle1.7
- PaddlePaddle 1.7+
- python3
- glibc 2.23
- cuDNN 7.6+ (GPU)
建议使用我们提供的docker运行PaddleOCR,有关docker使用请参考[链接](https://docs.docker.com/get-started/)
......
......@@ -96,6 +96,7 @@ $ hub serving start --modules [Module1==Version1, Module2==Version2, ...] \
**注意:**
- 使用配置文件启动服务时,其他参数会被忽略。
- 如果使用GPU预测(即,`use_gpu`置为`true`),则需要在启动服务之前,设置CUDA_VISIBLE_DEVICES环境变量,如:```export CUDA_VISIBLE_DEVICES=0```,否则不用设置。
- **`use_gpu`不可与`use_multiprocess`同时为`true`**。
如,使用GPU 3号卡启动串联服务:
```shell
......@@ -120,6 +121,25 @@ hub serving start -c deploy/hubserving/ocr_system/config.json
访问示例:
```python tools/test_hubserving.py http://127.0.0.1:8868/predict/ocr_system ./doc/imgs/```
## 返回结果格式说明
返回结果为列表(list),列表中的每一项为词典(dict),词典一共可能包含3种字段,信息如下:
|字段名称|数据类型|意义|
|-|-|-|
|text|str|文本内容|
|confidence|float| 文本识别置信度|
|text_region|list|文本位置坐标|
不同模块返回的字段不同,如,文本识别服务模块返回结果不含`text_region`字段,具体信息如下:
|字段名/模块名|ocr_det|ocr_rec|ocr_system|
|-|-|-|-|
|text||✔|✔|
|confidence||✔|✔|
|text_region|✔||✔|
**说明:** 如果需要增加、删除、修改返回字段,可在相应模块的`module.py`文件中进行修改,完整流程参考下一节自定义修改服务模块。
## 自定义修改服务模块
如果需要修改服务逻辑,你一般需要操作以下步骤(以修改`ocr_system`为例):
......
## 中文OCR训练预测技巧
这里整理了一些中文OCR训练预测技巧,持续更新中,欢迎各位小伙伴贡献OCR炼丹秘籍~
- [更换骨干网络](#更换骨干网络)
- [中文长文本识别](#中文长文本识别)
- [空格识别](#空格识别)
<a name="更换骨干网络"></a>
#### 1、更换骨干网络
- **问题描述**
目前PaddleOCR中使用的骨干网络有ResNet_vd系列和MobileNetV3系列,更换骨干网络是否有助于效果提升?更换时需要注意什么?
- **炼丹建议**
- 无论是文字检测,还是文字识别,骨干网络的选择是预测效果和预测效率的权衡。一般,选择更大规模的骨干网络,例如ResNet101_vd,则检测或识别更准确,但预测耗时相应也会增加。而选择更小规模的骨干网络,例如MobileNetV3_small_x0_35,则预测更快,但检测或识别的准确率会大打折扣。幸运的是不同骨干网络的检测或识别效果与在ImageNet数据集图像1000分类任务效果正相关。[**飞桨图像分类套件PaddleClas**](https://github.com/PaddlePaddle/PaddleClas)汇总了ResNet_vd、Res2Net、HRNet、MobileNetV3、GhostNet等23种系列的分类网络结构,在上述图像分类任务的top1识别准确率,GPU(V100和T4)和CPU(骁龙855)的预测耗时以及相应的[**117个预训练模型下载地址**](https://paddleclas.readthedocs.io/zh_CN/latest/models/models_intro.html)
- 文字检测骨干网络的替换,主要是确定类似于ResNet的4个stages,以方便集成后续的类似FPN的检测头。此外,对于文字检测问题,使用ImageNet训练的分类预训练模型,可以加速收敛和效果提升。
- 文字识别的骨干网络的替换,需要注意网络宽高stride的下降位置。由于文本识别一般宽高比例很大,因此高度下降频率少一些,宽度下降频率多一些。可以参考PaddleOCR中[MobileNetV3骨干网络](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/ppocr/modeling/backbones/rec_mobilenet_v3.py)的改动。
<a name="中文长文本识别"></a>
#### 2、中文长文本识别
- **问题描述**
中文识别模型训练时分辨率最大是[3,32,320],如果待识别的文本图像太长,如下图所示,该如何适配?
<div align="center">
<img src="../tricks/long_text_examples.jpg" width="600">
</div>
- **炼丹建议**
在中文识别模型训练时,并不是采用直接将训练样本缩放到[3,32,320]进行训练,而是先等比例缩放图像,保证图像高度为32,宽度不足320的部分补0,宽高比大于10的样本直接丢弃。预测时,如果是单张图像预测,则按上述操作直接对图像缩放,不做宽度320的限制。如果是多张图预测,则采用batch方式预测,每个batch的宽度动态变换,采用这个batch中最长宽度。[参考代码如下](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/tools/infer/predict_rec.py)
```
def resize_norm_img(self, img, max_wh_ratio):
imgC, imgH, imgW = self.rec_image_shape
assert imgC == img.shape[2]
if self.character_type == "ch":
imgW = int((32 * max_wh_ratio))
h, w = img.shape[:2]
ratio = w / float(h)
if math.ceil(imgH * ratio) > imgW:
resized_w = imgW
else:
resized_w = int(math.ceil(imgH * ratio))
resized_image = cv2.resize(img, (resized_w, imgH))
resized_image = resized_image.astype('float32')
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:resized_w] = resized_image
return padding_im
```
<a name="空格识别"></a>
#### 3、空格识别
- **问题描述**
如下图所示,对于中英文混合场景,为了便于阅读和使用识别结果,往往需要将单词之间的空格识别出来,这种情况如何适配?
<div align="center">
<img src="../imgs_results/chinese_db_crnn_server/en_paper.jpg" width="600">
</div>
- **炼丹建议**
空格识别可以考虑以下两种方案:(1)优化文本检测算法。检测结果在空格处将文本断开。这种方案在检测数据标注时,需要将含有空格的文本行分成好多段。(2)优化文本识别算法。在识别字典里面引入空格字符,然后在识别的训练数据中,如果文本中有空格,就将空格一并标注出来。此外,合成数据时,通过拼接训练数据,生成含有空格的文本。PaddleOCR目前采用的是第二种方案。
\ No newline at end of file
# 垂类多语言OCR数据集
这里整理了常用垂类和多语言OCR数据集,持续更新中,欢迎各位小伙伴贡献数据集~
- [中国城市车牌数据集](#中国城市车牌数据集)
- [银行信用卡数据集](#银行信用卡数据集)
- [验证码数据集-Captcha](#验证码数据集-Captcha)
- [多语言数据集](#多语言数据集)
<a name="中国城市车牌数据集"></a>
## 中国城市车牌数据集
- **数据来源**:[https://github.com/detectRecog/CCPD](https://github.com/detectRecog/CCPD)
- **数据简介**: 包含超过25万张中国城市车牌图片及车牌检测、识别信息的标注。包含以下几种不同场景中的车牌图片信息。
* CCPD-Base: 通用车牌图片
* CCPD-DB: 车牌区域亮度较亮、较暗或者不均匀
* CCPD-FN: 车牌离摄像头拍摄位置相对更远或者更近
* CCPD-Rotate: 车牌包含旋转(水平20\~50度,竖直-10\~10度)
* CCPD-Tilt: 车牌包含旋转(水平15\~45度,竖直15\~45度)
* CCPD-Blur: 车牌包含由于摄像机镜头抖动导致的模糊情况
* CCPD-Weather: 车牌在雨天、雪天或者雾天拍摄得到
* CCPD-Challenge: 至今在车牌检测识别任务中最有挑战性的一些图片
* CCPD-NP: 没有安装车牌的新车图片。
![](../datasets/ccpd_demo.png)
- **下载地址**
* 百度云下载地址(提取码是hm0U): [https://pan.baidu.com/s/1i5AOjAbtkwb17Zy-NQGqkw](https://pan.baidu.com/s/1i5AOjAbtkwb17Zy-NQGqkw)
* Google drive下载地址:[https://drive.google.com/file/d/1rdEsCUcIUaYOVRkx5IMTRNA7PcGMmSgc/view](https://drive.google.com/file/d/1rdEsCUcIUaYOVRkx5IMTRNA7PcGMmSgc/view)
<a name="银行信用卡数据集"></a>
## 银行信用卡数据集
- **数据来源**: [https://www.kesci.com/home/dataset/5954cf1372ead054a5e25870](https://www.kesci.com/home/dataset/5954cf1372ead054a5e25870)
- **数据简介**: 训练数据共提供了三类数据
* 1.招行样卡数据: 包括卡面图片数据及标注数据,总共618张图片
* 2.单字符数据: 包括图片及标注数据,总共37张图片。
* 3.仅包含其他银行卡面,不具有更细致的信息,总共50张图片。
* demo图片展示如下,标注信息存储在excel表格中,下面的demo图片标注为
* 前8位卡号:62257583
* 卡片种类:本行卡
* 有效期结束:07/41
* 卡用户拼音:MICHAEL
![](../datasets/cmb_demo.jpg)
- **下载地址**: [https://cdn.kesci.com/cmb2017-2.zip](https://cdn.kesci.com/cmb2017-2.zip)
<a name="验证码数据集-Captcha"></a>
## 验证码数据集-Captcha
- **数据来源**: [https://github.com/lepture/captcha](https://github.com/lepture/captcha)
- **数据简介**: 这是一个数据合成的工具包,可以根据输入的文本,输出验证码图片,使用该工具包生成几张demo图片如下。
![](../datasets/captcha_demo.png)
- **下载地址**: 该数据集是生成得到,无下载地址。
<a name="多语言数据集"></a>
## 多语言数据集(Multi-lingual scene text detection and recognition)
- **数据来源**: [https://rrc.cvc.uab.es/?ch=15&com=downloads](https://rrc.cvc.uab.es/?ch=15&com=downloads)
- **数据简介**: 多语言检测数据集MLT同时包含了语种识别和检测任务。
* 在检测任务中,训练集包含10000张图片,共有10种语言,每种语言包含1000张训练图片。测试集包含10000张图片。
* 在识别任务中,训练集包含111998个样本。
- **下载地址**: 训练集较大,分2部分下载,需要在网站上注册之后才能下载:
[https://rrc.cvc.uab.es/?ch=15&com=downloads](https://rrc.cvc.uab.es/?ch=15&com=downloads)
# BENCHMARK
This document reports inference-time benchmarks of the PaddleOCR ultra-lightweight Chinese model (8.6M) on several platforms.
## TEST DATA
* 500 images were randomly sampled from the Chinese public data set [ICDAR2017-RCTW](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/datasets.md#ICDAR2017-RCTW-17).
Most of the pictures in the set were collected in the wild through mobile phone cameras.
Some are screenshots.
These pictures show various scenes, including street scenes, posters, menus, indoor scenes and screenshots of mobile applications.
## MEASUREMENT
The inference times on the four platforms are as follows:
| Long size(px) | T4(s) | V100(s) | Intel Xeon 6148(s) | Snapdragon 855(s) |
| :---------: | :-----: | :-------: | :------------------: | :-----------------: |
| 960 | 0.092 | 0.057 | 0.319 | 0.354 |
| 640 | 0.067 | 0.045 | 0.198 | 0.236 |
| 480 | 0.057 | 0.043 | 0.151 | 0.175 |
Explanation:
* The measured time covers the complete pipeline from image input to result output, including image
pre-processing and post-processing.
* ```Intel Xeon 6148``` is the server-side CPU model. Intel MKL-DNN is used in the test to accelerate the CPU prediction speed.
To use this operation, you need to:
* Update to the latest version of PaddlePaddle: https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-dev
Please select the corresponding mkl version wheel package according to the CUDA version and Python version of your environment,
for example, CUDA10, Python3.7 environment, you should:
```
# Obtain the installation package
wget https://paddle-wheel.bj.bcebos.com/0.0.0-gpu-cuda10-cudnn7-mkl/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl
# Installation
pip3.7 install paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl
```
* Use parameters ```--enable_mkldnn True``` to turn on the acceleration switch when making predictions
* ```Snapdragon 855``` is a mobile processing platform model.
# DATA ANNOTATION TOOLS
Here are the commonly used data annotation tools, which will be continuously updated. Contributions of additional tools are welcome~
### 1. labelImg
- Tool description: Rectangular label
- Tool address: https://github.com/tzutalin/labelImg
- Sketch diagram:
![labelimg](../datasets/labelimg.jpg)
### 2. roLabelImg
- Tool description: Label tool rewritten based on labelImg, supporting rotating rectangular label
- Tool address: https://github.com/cgvict/roLabelImg
- Sketch diagram:
![roLabelImg](../datasets/roLabelImg.png)
### 3. labelme
- Tool description: Support four points, polygons, circles and other labels
- Tool address: https://github.com/wkentaro/labelme
- Sketch diagram:
![labelme](../datasets/labelme.jpg)
# DATA SYNTHESIS TOOLS
In addition to open source data, users can also use synthesis tools to synthesize data.
Here are the commonly used data synthesis tools, which will be continuously updated. Contributions of additional tools are welcome~
* [Text_renderer](https://github.com/Sanster/text_renderer)
* [SynthText](https://github.com/ankush-me/SynthText)
* [SynthText_Chinese_version](https://github.com/JarveeLee/SynthText_Chinese_version)
* [TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator)
* [SynthText3D](https://github.com/MhLiao/SynthText3D)
* [UnrealText](https://github.com/Jyouhou/UnrealText/)
......@@ -22,7 +22,7 @@ After decompressing the data set and downloading the annotation file, PaddleOCR/
└─ test_icdar2015_label.txt Test annotation of icdar dataset
```
The provided annotation file format is as follow:
The provided annotation file format is as follows, with fields separated by "\t":
```
" Image file name Image annotation information encoded by json.dumps"
ch4_test_images/img_61.jpg [{"transcription": "MASA", "points": [[310, 104], [416, 141], [418, 216], [312, 179]], ...}]
......@@ -56,28 +56,29 @@ tar xf ./pretrain_models/MobileNetV3_large_x0_5_pretrained.tar ./pretrain_models
```
**START TRAINING**
**START TRAINING**
*If CPU version installed, please set the parameter `use_gpu` to `false` in the configuration.*
```
python3 tools/train.py -c configs/det/det_mv3_db.yml
```
In the above instruction, use `-c` to select the training to use the configs/det/det_db_mv3.yml configuration file.
For a detailed explanation of the configuration file, please refer to [link](./config_en.md).
In the above instruction, `-c` selects the `configs/det/det_mv3_db.yml` configuration file for training.
For a detailed explanation of the configuration file, please refer to [config](./config_en.md).
You can also use the `-o` parameter to change the training parameters without modifying the yml file. For example, adjust the training learning rate to 0.0001
You can also use `-o` to change the training parameters without modifying the yml file. For example, adjust the training learning rate to 0.0001
```
python3 tools/train.py -c configs/det/det_mv3_db.yml -o Optimizer.base_lr=0.0001
```
**Load trained model and continue training**
If you expect to load trained model and continue the training again, you can specify the `Global.checkpoints` parameter as the model path to be loaded.
If you expect to load trained model and continue the training again, you can specify the parameter `Global.checkpoints` as the model path to be loaded.
For example:
```
python3 tools/train.py -c configs/det/det_mv3_db.yml -o Global.checkpoints=./your/trained/model
```
**Note**:The priority of Global.checkpoints is higher than the priority of Global.pretrain_weights, that is, when two parameters are specified at the same time, the model specified by Global.checkpoints will be loaded first. If the model path specified by Global.checkpoints is wrong, the one specified by Global.pretrain_weights will be loaded.
**Note**:The priority of `Global.checkpoints` is higher than that of `Global.pretrain_weights`, that is, when two parameters are specified at the same time, the model specified by Global.checkpoints will be loaded first. If the model path specified by `Global.checkpoints` is wrong, the one specified by `Global.pretrain_weights` will be loaded.
## EVALUATION
......@@ -86,34 +87,34 @@ PaddleOCR calculates three indicators for evaluating performance of OCR detectio
Run the following code to calculate the evaluation indicators. The result will be saved in the test result file specified by `save_res_path` in the configuration file `det_mv3_db.yml`.
When evaluating, set post-processing parameters box_thresh=0.6, unclip_ratio=1.5. If you use different datasets, different models for training, these two parameters should be adjusted for better result.
When evaluating, set post-processing parameters `box_thresh=0.6`, `unclip_ratio=1.5`. If you use different datasets, different models for training, these two parameters should be adjusted for better result.
```
python3 tools/eval.py -c configs/det/det_mv3_db.yml -o Global.checkpoints="{path/to/weights}/best_accuracy" PostProcess.box_thresh=0.6 PostProcess.unclip_ratio=1.5
```
The model parameters during training are saved in the `Global.save_model_dir` directory by default. When evaluating indicators, you need to set Global.checkpoints to point to the saved parameter file.
The model parameters during training are saved in the `Global.save_model_dir` directory by default. When evaluating indicators, you need to set `Global.checkpoints` to point to the saved parameter file.
Such as:
```
```shell
python3 tools/eval.py -c configs/det/det_mv3_db.yml -o Global.checkpoints="./output/det_db/best_accuracy" PostProcess.box_thresh=0.6 PostProcess.unclip_ratio=1.5
```
* Note: box_thresh and unclip_ratio are parameters required for DB post-processing, and not need to be set when evaluating the EAST model.
* Note: `box_thresh` and `unclip_ratio` are parameters required for DB post-processing, and do not need to be set when evaluating the EAST model.
## TEST DETECTION RESULT
## TEST
Test the detection result on a single image:
```
```shell
python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o TestReader.infer_img="./doc/imgs_en/img_10.jpg" Global.checkpoints="./output/det_db/best_accuracy"
```
When testing the DB model, adjust the post-processing threshold:
```
```shell
python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o TestReader.infer_img="./doc/imgs_en/img_10.jpg" Global.checkpoints="./output/det_db/best_accuracy" PostProcess.box_thresh=0.6 PostProcess.unclip_ratio=1.5
```
Test the detection result on all images in the folder:
```
```shell
python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o TestReader.infer_img="./doc/imgs_en/" Global.checkpoints="./output/det_db/best_accuracy"
```
# Handwritten OCR dataset
Here we have sorted out the commonly used handwritten OCR datasets, which are being updated continuously. We welcome you to contribute datasets ~
- [Institute of automation, Chinese Academy of Sciences - handwritten Chinese dataset](#Institute of automation, Chinese Academy of Sciences - handwritten Chinese dataset)
- [NIST handwritten single character dataset - English](#NIST handwritten single character dataset - English)
<a name="Institute of automation, Chinese Academy of Sciences - handwritten Chinese dataset"></a>
## Institute of automation, Chinese Academy of Sciences - handwritten Chinese dataset
- **Data source**:http://www.nlpr.ia.ac.cn/databases/handwriting/Download.html
- **Data introduction**:
* It includes online and offline handwritten data,`HWDB1.0~1.2` has totally 3895135 handwritten single character samples, which belong to 7356 categories (7185 Chinese characters and 171 English letters, numbers and symbols);`HWDB2.0~2.2` has totally 5091 pages of images, which are divided into 52230 text lines and 1349414 words. All text and text samples are stored as grayscale images. Some sample words are shown below.
![](../datasets/CASIA_0.jpg)
- **Download address**:http://www.nlpr.ia.ac.cn/databases/handwriting/Download.html
- **Suggested usage**:The data consists of single characters on a white background, so large numbers of text lines can be synthesized from it for training. The white background can be made transparent, which makes it easy to add various backgrounds. When semantics matter, it is recommended to pick the characters according to a real corpus when composing text lines.
<a name="NIST handwritten single character dataset - English"></a>
## NIST handwritten single character dataset - English(NIST Handprinted Forms and Characters Database)
- **Data source**: [https://www.nist.gov/srd/nist-special-database-19](https://www.nist.gov/srd/nist-special-database-19)
- **Data introduction**: NIST19 dataset is suitable for handwritten document and character recognition model training. It is extracted from the handwritten sample form of 3600 authors and contains 810000 character images in total. Nine of them are shown below.
![](../datasets/nist_demo.png)
- **Download address**: [https://www.nist.gov/srd/nist-special-database-19](https://www.nist.gov/srd/nist-special-database-19)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment