// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "paddle_api.h"  // NOLINT

#include "crnn_process.h"
#include "db_post_process.h"

using namespace paddle::lite_api;  // NOLINT
using namespace std;

// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up
void neon_mean_scale(const float* din,
                     float* dout,
                     int size,
                     const std::vector<float> mean,
                     const std::vector<float> scale) {
  if (mean.size() != 3 || scale.size() != 3) {
    std::cerr << "[ERROR] mean or scale size must equal to 3\n";
    exit(1);
  }
  float32x4_t vmean0 = vdupq_n_f32(mean[0]);
  float32x4_t vmean1 = vdupq_n_f32(mean[1]);
  float32x4_t vmean2 = vdupq_n_f32(mean[2]);
  float32x4_t vscale0 = vdupq_n_f32(scale[0]);
  float32x4_t vscale1 = vdupq_n_f32(scale[1]);
  float32x4_t vscale2 = vdupq_n_f32(scale[2]);

  float* dout_c0 = dout;
  float* dout_c1 = dout + size;
  float* dout_c2 = dout + size * 2;

  int i = 0;
  for (; i < size - 3; i += 4) {
    float32x4x3_t vin3 = vld3q_f32(din);
    float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
    float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
    float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
    float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
    float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
    float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
    vst1q_f32(dout_c0, vs0);
    vst1q_f32(dout_c1, vs1);
    vst1q_f32(dout_c2, vs2);

    din += 12;
    dout_c0 += 4;
    dout_c1 += 4;
    dout_c2 += 4;
  }
  for (; i < size; i++) {
    *(dout_c0++) = (*(din++) - mean[0]) * scale[0];
    *(dout_c1++) = (*(din++) - mean[1]) * scale[1];
    *(dout_c2++) = (*(din++) - mean[2]) * scale[2];
  }
}

// resize image to a size multiple of 32 which is required by the network
LDOUVLEV's avatar
LDOUVLEV committed
71
72
73
cv::Mat DetResizeImg(const cv::Mat img,
                     int max_size_len,
                     std::vector<float>& ratio_hw) {
LDOUBLEV's avatar
LDOUBLEV committed
74
75
76
77
  int w = img.cols;
  int h = img.rows;

  float ratio = 1.f;
LDOUVLEV's avatar
LDOUVLEV committed
78
79
80
  int max_wh = w >= h ? w : h;
  if (max_wh > max_size_len) {
    if (h > w) {
LDOUBLEV's avatar
LDOUBLEV committed
81
82
83
84
85
86
87
88
89
90
      ratio = float(max_size_len) / float(h);
    } else {
      ratio = float(max_size_len) / float(w);
    }
  }

  int resize_h = int(float(h) * ratio);
  int resize_w = int(float(w) * ratio);
  if (resize_h % 32 == 0)
    resize_h = resize_h;
LDOUVLEV's avatar
LDOUVLEV committed
91
  else if (resize_h / 32 < 1 + 1e-5)
LDOUBLEV's avatar
LDOUBLEV committed
92
93
94
95
96
97
    resize_h = 32;
  else
    resize_h = (resize_h / 32 - 1) * 32;

  if (resize_w % 32 == 0)
    resize_w = resize_w;
LDOUVLEV's avatar
LDOUVLEV committed
98
  else if (resize_w / 32 < 1 + 1e-5)
LDOUBLEV's avatar
LDOUBLEV committed
99
100
    resize_w = 32;
  else
LDOUVLEV's avatar
LDOUVLEV committed
101
    resize_w = (resize_w / 32 - 1) * 32;
LDOUBLEV's avatar
LDOUBLEV committed
102
103
104
105

  cv::Mat resize_img;
  cv::resize(img, resize_img, cv::Size(resize_w, resize_h));

LDOUVLEV's avatar
LDOUVLEV committed
106
107
  ratio_hw.push_back(float(resize_h) / float(h));
  ratio_hw.push_back(float(resize_w) / float(w));
LDOUBLEV's avatar
LDOUBLEV committed
108
109
110
  return resize_img;
}

LDOUVLEV's avatar
LDOUVLEV committed
111
112
113
114
115
116
void RunRecModel(std::vector<std::vector<std::vector<int>>> boxes,
                 cv::Mat img,
                 std::shared_ptr<PaddlePredictor> predictor_crnn,
                 std::string dict_path,
                 std::vector<std::string>& rec_text,
                 std::vector<float>& rec_text_score) {
LDOUBLEV's avatar
LDOUBLEV committed
117
118
119
120
121
122
123
124
  std::vector<float> mean = {0.5f, 0.5f, 0.5f};
  std::vector<float> scale = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f};

  cv::Mat srcimg;
  img.copyTo(srcimg);
  cv::Mat crop_img;
  cv::Mat resize_img;

LDOUVLEV's avatar
LDOUVLEV committed
125
  auto charactor_dict = ReadDict(dict_path);
LDOUBLEV's avatar
LDOUBLEV committed
126
127

  int index = 0;
LDOUVLEV's avatar
LDOUVLEV committed
128
129
  for (int i = boxes.size() - 1; i >= 0; i--) {
    crop_img = GetRotateCropImage(srcimg, boxes[i]);
LDOUBLEV's avatar
LDOUBLEV committed
130
131
    float wh_ratio = float(crop_img.cols) / float(crop_img.rows);

LDOUVLEV's avatar
LDOUVLEV committed
132
    resize_img = CrnnResizeImg(crop_img, wh_ratio);
LDOUBLEV's avatar
LDOUBLEV committed
133
134
    resize_img.convertTo(resize_img, CV_32FC3, 1 / 255.f);

LDOUVLEV's avatar
LDOUVLEV committed
135
    const float* dimg = reinterpret_cast<const float*>(resize_img.data);
LDOUBLEV's avatar
LDOUBLEV committed
136

LDOUVLEV's avatar
LDOUVLEV committed
137
138
    std::unique_ptr<Tensor> input_tensor0(
        std::move(predictor_crnn->GetInput(0)));
LDOUBLEV's avatar
LDOUBLEV committed
139
    input_tensor0->Resize({1, 3, resize_img.rows, resize_img.cols});
LDOUVLEV's avatar
LDOUVLEV committed
140
    auto* data0 = input_tensor0->mutable_data<float>();
LDOUBLEV's avatar
LDOUBLEV committed
141

LDOUVLEV's avatar
LDOUVLEV committed
142
143
    neon_mean_scale(
        dimg, data0, resize_img.rows * resize_img.cols, mean, scale);
LDOUBLEV's avatar
LDOUBLEV committed
144
145
146
147
148
    //// Run CRNN predictor
    predictor_crnn->Run();

    // Get output and run postprocess
    std::unique_ptr<const Tensor> output_tensor0(
LDOUVLEV's avatar
LDOUVLEV committed
149
150
        std::move(predictor_crnn->GetOutput(0)));
    auto* rec_idx = output_tensor0->data<int>();
LDOUBLEV's avatar
LDOUBLEV committed
151
152
153
154
155

    auto rec_idx_lod = output_tensor0->lod();
    auto shape_out = output_tensor0->shape();

    std::vector<int> pred_idx;
LDOUVLEV's avatar
LDOUVLEV committed
156
157
    for (int n = int(rec_idx_lod[0][0]); n < int(rec_idx_lod[0][1] * 2);
         n += 2) {
LDOUBLEV's avatar
LDOUBLEV committed
158
159
160
      pred_idx.push_back(int(rec_idx[n]));
    }

LDOUVLEV's avatar
LDOUVLEV committed
161
    if (pred_idx.size() < 1e-3) continue;
LDOUBLEV's avatar
LDOUBLEV committed
162
163

    index += 1;
LDOUVLEV's avatar
LDOUVLEV committed
164
    std::string pred_txt = "";
LDOUBLEV's avatar
LDOUBLEV committed
165
    for (int n = 0; n < pred_idx.size(); n++) {
LDOUVLEV's avatar
LDOUVLEV committed
166
      pred_txt += charactor_dict[pred_idx[n]];
LDOUBLEV's avatar
LDOUBLEV committed
167
    }
LDOUVLEV's avatar
LDOUVLEV committed
168
    rec_text.push_back(pred_txt);
LDOUBLEV's avatar
LDOUBLEV committed
169
170

    ////get score
LDOUVLEV's avatar
LDOUVLEV committed
171
172
173
    std::unique_ptr<const Tensor> output_tensor1(
        std::move(predictor_crnn->GetOutput(1)));
    auto* predict_batch = output_tensor1->data<float>();
LDOUBLEV's avatar
LDOUBLEV committed
174
175
176
177
178
179
180
181
182
183
184
    auto predict_shape = output_tensor1->shape();

    auto predict_lod = output_tensor1->lod();

    int argmax_idx;
    int blank = predict_shape[1];
    float score = 0.f;
    int count = 0;
    float max_value = 0.0f;

    for (int n = predict_lod[0][0]; n < predict_lod[0][1] - 1; n++) {
LDOUVLEV's avatar
LDOUVLEV committed
185
186
187
188
189
      argmax_idx = int(Argmax(&predict_batch[n * predict_shape[1]],
                              &predict_batch[(n + 1) * predict_shape[1]]));
      max_value =
          float(*std::max_element(&predict_batch[n * predict_shape[1]],
                                  &predict_batch[(n + 1) * predict_shape[1]]));
LDOUBLEV's avatar
LDOUBLEV committed
190
191
192
193
194
195
196

      if (blank - 1 - argmax_idx > 1e-5) {
        score += max_value;
        count += 1;
      }
    }
    score /= count;
LDOUVLEV's avatar
LDOUVLEV committed
197
    rec_text_score.push_back(score);
LDOUBLEV's avatar
LDOUBLEV committed
198
199
200
  }
}

LDOUVLEV's avatar
LDOUVLEV committed
201
202
203
204
std::vector<std::vector<std::vector<int>>> RunDetModel(
    std::shared_ptr<PaddlePredictor> predictor,
    cv::Mat img,
    std::map<std::string, double> Config) {
LDOUBLEV's avatar
LDOUBLEV committed
205
  // Read img
LDOUVLEV's avatar
LDOUVLEV committed
206
  int max_side_len = int(Config["max_side_len"]);
LDOUBLEV's avatar
LDOUBLEV committed
207
208
209
210

  cv::Mat srcimg;
  img.copyTo(srcimg);

LDOUVLEV's avatar
LDOUVLEV committed
211
212
  std::vector<float> ratio_hw;
  img = DetResizeImg(img, max_side_len, ratio_hw);
LDOUBLEV's avatar
LDOUBLEV committed
213
214
215
216
217
218
219
220
221
  cv::Mat img_fp;
  img.convertTo(img_fp, CV_32FC3, 1.0 / 255.f);

  // Prepare input data from image
  std::unique_ptr<Tensor> input_tensor0(std::move(predictor->GetInput(0)));
  input_tensor0->Resize({1, 3, img_fp.rows, img_fp.cols});
  auto* data0 = input_tensor0->mutable_data<float>();

  std::vector<float> mean = {0.485f, 0.456f, 0.406f};
LDOUVLEV's avatar
LDOUVLEV committed
222
  std::vector<float> scale = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f};
LDOUBLEV's avatar
LDOUBLEV committed
223
224
225
226
227
228
229
  const float* dimg = reinterpret_cast<const float*>(img_fp.data);
  neon_mean_scale(dimg, data0, img_fp.rows * img_fp.cols, mean, scale);

  // Run predictor
  predictor->Run();

  // Get output and post process
LDOUVLEV's avatar
LDOUVLEV committed
230
231
  std::unique_ptr<const Tensor> output_tensor(
      std::move(predictor->GetOutput(0)));
LDOUBLEV's avatar
LDOUBLEV committed
232
233
234
235
236
237
238
239
240
241
242
243
244
  auto* outptr = output_tensor->data<float>();
  auto shape_out = output_tensor->shape();

  int64_t out_numl = 1;
  double sum = 0;
  for (auto i : shape_out) {
    out_numl *= i;
  }

  // Save output
  float pred[shape_out[2]][shape_out[3]];
  unsigned char cbuf[shape_out[2]][shape_out[3]];

LDOUVLEV's avatar
LDOUVLEV committed
245
246
247
248
  for (int i = 0; i < int(shape_out[2] * shape_out[3]); i++) {
    pred[int(i / int(shape_out[3]))][int(i % shape_out[3])] = float(outptr[i]);
    cbuf[int(i / int(shape_out[3]))][int(i % shape_out[3])] =
        (unsigned char)((outptr[i]) * 255);
LDOUBLEV's avatar
LDOUBLEV committed
249
250
251
  }

  cv::Mat cbuf_map(shape_out[2], shape_out[3], CV_8UC1, (unsigned char*)cbuf);
LDOUVLEV's avatar
LDOUVLEV committed
252
  cv::Mat pred_map(shape_out[2], shape_out[3], CV_32F, (float*)pred);
LDOUBLEV's avatar
LDOUBLEV committed
253

LDOUVLEV's avatar
LDOUVLEV committed
254
  const double threshold = double(Config["det_db_thresh"]) * 255;
LDOUBLEV's avatar
LDOUBLEV committed
255
256
257
258
  const double maxvalue = 255;
  cv::Mat bit_map;
  cv::threshold(cbuf_map, bit_map, threshold, maxvalue, cv::THRESH_BINARY);

LDOUVLEV's avatar
LDOUVLEV committed
259
  auto boxes = BoxesFromBitmap(pred_map, bit_map, Config);
LDOUBLEV's avatar
LDOUBLEV committed
260

LDOUVLEV's avatar
LDOUVLEV committed
261
262
  std::vector<std::vector<std::vector<int>>> filter_boxes =
      FilterTagDetRes(boxes, ratio_hw[0], ratio_hw[1], srcimg);
LDOUBLEV's avatar
LDOUBLEV committed
263

LDOUVLEV's avatar
LDOUVLEV committed
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
  return filter_boxes;
}

std::shared_ptr<PaddlePredictor> loadModel(std::string model_file) {
  MobileConfig config;
  config.set_model_from_file(model_file);

  std::shared_ptr<PaddlePredictor> predictor =
      CreatePaddlePredictor<MobileConfig>(config);
  return predictor;
}

// Draws every box as a closed green polygon (thickness 2) on a copy of
// `srcimg`, writes the result to ./imgs/vis.jpg and returns it.
//
// srcimg : image to draw on; it is copied, not modified.
// boxes  : one entry per box; each box is a list of points whose first two
//          elements are used as x and y.
cv::Mat Visualization(cv::Mat srcimg,
                      std::vector<std::vector<std::vector<int>>> boxes) {
  cv::Mat img_vis;
  srcimg.copyTo(img_vis);

  // Reused for each box. Sizing the polygon from the box itself avoids a
  // fixed-size stack array that a box with more than 4 points would overrun.
  std::vector<cv::Point> polygon;
  for (const auto& box : boxes) {
    polygon.clear();
    for (const auto& pt : box) {
      polygon.emplace_back(pt[0], pt[1]);
    }
    if (polygon.empty()) continue;

    const cv::Point* ppt[1] = {polygon.data()};
    int npt[] = {static_cast<int>(polygon.size())};
    cv::polylines(img_vis, ppt, npt, 1, 1, CV_RGB(0, 255, 0), 2, 8, 0);
  }

  cv::imwrite("./imgs/vis.jpg", img_vis);
  std::cout << "The detection visualized image saved in ./imgs/vis.jpg"
            << std::endl;
  return img_vis;
}

// Splits `str` into tokens separated by any character contained in `delim`.
//
// Runs of delimiter characters count as a single separator and empty tokens
// are never produced, so leading/trailing delimiters are ignored. An empty
// `str` yields an empty vector; an empty `delim` yields `str` as one token.
std::vector<std::string> split(const std::string& str,
                               const std::string& delim) {
  std::vector<std::string> res;

  // Position of the first character of the next token, if any.
  std::string::size_type start = str.find_first_not_of(delim);
  while (start != std::string::npos) {
    const std::string::size_type stop = str.find_first_of(delim, start);
    if (stop == std::string::npos) {
      // Last token runs to the end of the string.
      res.push_back(str.substr(start));
      break;
    }
    res.push_back(str.substr(start, stop - start));
    start = str.find_first_not_of(delim, stop);
  }

  return res;
}

LDOUVLEV's avatar
LDOUVLEV committed
318
319
320
321
322
323
324
325
326
327
std::map<std::string, double> LoadConfigTxt(std::string config_path) {
  auto config = ReadDict(config_path);

  std::map<std::string, double> dict;
  for (int i = 0; i < config.size(); i++) {
    std::vector<std::string> res = split(config[i], " ");
    dict[res[0]] = stod(res[1]);
  }
  return dict;
}
LDOUBLEV's avatar
LDOUBLEV committed
328
329

int main(int argc, char** argv) {
LDOUVLEV's avatar
LDOUVLEV committed
330
331
332
  if (argc < 5) {
    std::cerr << "[ERROR] usage: " << argv[0]
              << " det_model_file rec_model_file image_path\n";
LDOUBLEV's avatar
LDOUBLEV committed
333
334
335
336
337
    exit(1);
  }
  std::string det_model_file = argv[1];
  std::string rec_model_file = argv[2];
  std::string img_path = argv[3];
LDOUVLEV's avatar
LDOUVLEV committed
338
339
340
341
  std::string dict_path = argv[4];

  //// load config from txt file
  auto Config = LoadConfigTxt("./config.txt");
LDOUBLEV's avatar
LDOUBLEV committed
342
343
344

  auto start = std::chrono::system_clock::now();

LDOUVLEV's avatar
LDOUVLEV committed
345
346
347
  auto det_predictor = loadModel(det_model_file);
  auto rec_predictor = loadModel(rec_model_file);

LDOUBLEV's avatar
LDOUBLEV committed
348
  cv::Mat srcimg = cv::imread(img_path, cv::IMREAD_COLOR);
LDOUVLEV's avatar
LDOUVLEV committed
349
  auto boxes = RunDetModel(det_predictor, srcimg, Config);
LDOUBLEV's avatar
LDOUBLEV committed
350

LDOUVLEV's avatar
LDOUVLEV committed
351
352
353
354
  std::vector<std::string> rec_text;
  std::vector<float> rec_text_score;
  RunRecModel(
      boxes, srcimg, rec_predictor, dict_path, rec_text, rec_text_score);
LDOUBLEV's avatar
LDOUBLEV committed
355

LDOUVLEV's avatar
LDOUVLEV committed
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
  auto end = std::chrono::system_clock::now();
  auto duration =
      std::chrono::duration_cast<std::chrono::microseconds>(end - start);

  //// visualization
  auto img_vis = Visualization(srcimg, boxes);

  //// print recognized text
  for (int i = 0; i < rec_text.size(); i++) {
    std::cout << i << "\t" << rec_text[i] << "\t" << rec_text_score[i]
              << std::endl;
  }

  std::cout << "花费了"
            << double(duration.count()) *
                   std::chrono::microseconds::period::num /
                   std::chrono::microseconds::period::den
LDOUBLEV's avatar
LDOUBLEV committed
373
374
375
376
            << "秒" << std::endl;

  return 0;
}