// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle_api.h" // NOLINT

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "crnn_process.h"
#include "db_post_process.h"

using namespace paddle::lite_api; // NOLINT
using namespace std;

// Normalizes an interleaved 3-channel float image and rewrites it in planar
// layout (NHWC -> NCHW):
//   dout[c * size + i] = (din[i * 3 + c] - mean[c]) * scale[c]
//
// din:   size * 3 interleaved input values.
// dout:  destination buffer; three consecutive planes of `size` values each
//        are written.
// size:  number of pixels per channel.
// mean:  per-channel value subtracted from the input; must hold 3 elements.
// scale: per-channel multiplier applied after the subtraction; must hold 3
//        elements.
//
// Prints an error and terminates the process with exit code 1 when mean or
// scale does not contain exactly 3 elements.
//
// When NEON is available, four pixels are processed per iteration; the scalar
// loop handles the remaining pixels, or every pixel on targets without NEON.
void neon_mean_scale(const float *din, float *dout, int size,
                     const std::vector<float> mean,
                     const std::vector<float> scale) {
  if (mean.size() != 3 || scale.size() != 3) {
    std::cerr << "[ERROR] mean or scale size must equal to 3\n";
    std::exit(1);
  }

  // One output plane per channel.
  float *dout_c0 = dout;
  float *dout_c1 = dout + size;
  float *dout_c2 = dout + size * 2;

  int i = 0;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
  const float32x4_t vmean0 = vdupq_n_f32(mean[0]);
  const float32x4_t vmean1 = vdupq_n_f32(mean[1]);
  const float32x4_t vmean2 = vdupq_n_f32(mean[2]);
  const float32x4_t vscale0 = vdupq_n_f32(scale[0]);
  const float32x4_t vscale1 = vdupq_n_f32(scale[1]);
  const float32x4_t vscale2 = vdupq_n_f32(scale[2]);

  for (; i < size - 3; i += 4) {
    // vld3q_f32 de-interleaves 12 floats into three 4-lane channel vectors.
    const float32x4x3_t vin3 = vld3q_f32(din);
    const float32x4_t vs0 = vmulq_f32(vsubq_f32(vin3.val[0], vmean0), vscale0);
    const float32x4_t vs1 = vmulq_f32(vsubq_f32(vin3.val[1], vmean1), vscale1);
    const float32x4_t vs2 = vmulq_f32(vsubq_f32(vin3.val[2], vmean2), vscale2);
    vst1q_f32(dout_c0, vs0);
    vst1q_f32(dout_c1, vs1);
    vst1q_f32(dout_c2, vs2);

    din += 12;
    dout_c0 += 4;
    dout_c1 += 4;
    dout_c2 += 4;
  }
#endif
  // Scalar path: leftover pixels after the vector loop, or all of them.
  for (; i < size; i++) {
    *(dout_c0++) = (*(din++) - mean[0]) * scale[0];
    *(dout_c1++) = (*(din++) - mean[1]) * scale[1];
    *(dout_c2++) = (*(din++) - mean[2]) * scale[2];
  }
}

// resize image to a size multiple of 32 which is required by the network
69
70
cv::Mat DetResizeImg(const cv::Mat img, int max_size_len,
                     std::vector<float> &ratio_hw) {
LDOUBLEV's avatar
LDOUBLEV committed
71
72
73
74
  int w = img.cols;
  int h = img.rows;

  float ratio = 1.f;
LDOUVLEV's avatar
LDOUVLEV committed
75
76
77
  int max_wh = w >= h ? w : h;
  if (max_wh > max_size_len) {
    if (h > w) {
LDOUBLEV's avatar
LDOUBLEV committed
78
79
80
81
82
83
84
85
86
87
      ratio = float(max_size_len) / float(h);
    } else {
      ratio = float(max_size_len) / float(w);
    }
  }

  int resize_h = int(float(h) * ratio);
  int resize_w = int(float(w) * ratio);
  if (resize_h % 32 == 0)
    resize_h = resize_h;
LDOUVLEV's avatar
LDOUVLEV committed
88
  else if (resize_h / 32 < 1 + 1e-5)
LDOUBLEV's avatar
LDOUBLEV committed
89
90
91
92
93
94
    resize_h = 32;
  else
    resize_h = (resize_h / 32 - 1) * 32;

  if (resize_w % 32 == 0)
    resize_w = resize_w;
LDOUVLEV's avatar
LDOUVLEV committed
95
  else if (resize_w / 32 < 1 + 1e-5)
LDOUBLEV's avatar
LDOUBLEV committed
96
97
    resize_w = 32;
  else
LDOUVLEV's avatar
LDOUVLEV committed
98
    resize_w = (resize_w / 32 - 1) * 32;
LDOUBLEV's avatar
LDOUBLEV committed
99
100
101
102

  cv::Mat resize_img;
  cv::resize(img, resize_img, cv::Size(resize_w, resize_h));

LDOUVLEV's avatar
LDOUVLEV committed
103
104
  ratio_hw.push_back(float(resize_h) / float(h));
  ratio_hw.push_back(float(resize_w) / float(w));
LDOUBLEV's avatar
LDOUBLEV committed
105
106
107
  return resize_img;
}

108
void RunRecModel(std::vector<std::vector<std::vector<int>>> boxes, cv::Mat img,
LDOUVLEV's avatar
LDOUVLEV committed
109
                 std::shared_ptr<PaddlePredictor> predictor_crnn,
LDOUBLEV's avatar
update  
LDOUBLEV committed
110
111
112
                 std::vector<std::string> &rec_text,
                 std::vector<float> &rec_text_score,
                 std::vector<std::string> charactor_dict) {
LDOUBLEV's avatar
LDOUBLEV committed
113
114
115
116
117
118
119
120
121
  std::vector<float> mean = {0.5f, 0.5f, 0.5f};
  std::vector<float> scale = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f};

  cv::Mat srcimg;
  img.copyTo(srcimg);
  cv::Mat crop_img;
  cv::Mat resize_img;

  int index = 0;
LDOUVLEV's avatar
LDOUVLEV committed
122
123
  for (int i = boxes.size() - 1; i >= 0; i--) {
    crop_img = GetRotateCropImage(srcimg, boxes[i]);
LDOUBLEV's avatar
LDOUBLEV committed
124
125
    float wh_ratio = float(crop_img.cols) / float(crop_img.rows);

LDOUBLEV's avatar
update  
LDOUBLEV committed
126
    resize_img = CrnnResizeNormImg(crop_img, wh_ratio, false);
LDOUBLEV's avatar
LDOUBLEV committed
127
128
    resize_img.convertTo(resize_img, CV_32FC3, 1 / 255.f);

129
    const float *dimg = reinterpret_cast<const float *>(resize_img.data);
LDOUBLEV's avatar
LDOUBLEV committed
130

LDOUVLEV's avatar
LDOUVLEV committed
131
132
    std::unique_ptr<Tensor> input_tensor0(
        std::move(predictor_crnn->GetInput(0)));
LDOUBLEV's avatar
LDOUBLEV committed
133
    input_tensor0->Resize({1, 3, resize_img.rows, resize_img.cols});
134
    auto *data0 = input_tensor0->mutable_data<float>();
LDOUBLEV's avatar
LDOUBLEV committed
135

136
137
    neon_mean_scale(dimg, data0, resize_img.rows * resize_img.cols, mean,
                    scale);
LDOUBLEV's avatar
LDOUBLEV committed
138
139
140
141
142
    //// Run CRNN predictor
    predictor_crnn->Run();

    // Get output and run postprocess
    std::unique_ptr<const Tensor> output_tensor0(
LDOUVLEV's avatar
LDOUVLEV committed
143
        std::move(predictor_crnn->GetOutput(0)));
LDOUBLEV's avatar
LDOUBLEV committed
144
    auto *rec_idx = output_tensor0->data<int64>();
LDOUBLEV's avatar
LDOUBLEV committed
145
146
147
148
149

    auto rec_idx_lod = output_tensor0->lod();
    auto shape_out = output_tensor0->shape();

    std::vector<int> pred_idx;
LDOUBLEV's avatar
LDOUBLEV committed
150
    for (int n = int(rec_idx_lod[0][0]); n < int(rec_idx_lod[0][1]); n += 1) {
LDOUBLEV's avatar
LDOUBLEV committed
151
152
153
      pred_idx.push_back(int(rec_idx[n]));
    }

154
155
    if (pred_idx.size() < 1e-3)
      continue;
LDOUBLEV's avatar
LDOUBLEV committed
156
157

    index += 1;
LDOUVLEV's avatar
LDOUVLEV committed
158
    std::string pred_txt = "";
LDOUBLEV's avatar
LDOUBLEV committed
159
    for (int n = 0; n < pred_idx.size(); n++) {
LDOUVLEV's avatar
LDOUVLEV committed
160
      pred_txt += charactor_dict[pred_idx[n]];
LDOUBLEV's avatar
LDOUBLEV committed
161
    }
LDOUVLEV's avatar
LDOUVLEV committed
162
    rec_text.push_back(pred_txt);
LDOUBLEV's avatar
LDOUBLEV committed
163
164

    ////get score
LDOUVLEV's avatar
LDOUVLEV committed
165
166
    std::unique_ptr<const Tensor> output_tensor1(
        std::move(predictor_crnn->GetOutput(1)));
167
    auto *predict_batch = output_tensor1->data<float>();
LDOUBLEV's avatar
LDOUBLEV committed
168
169
170
171
172
173
174
175
176
177
178
    auto predict_shape = output_tensor1->shape();

    auto predict_lod = output_tensor1->lod();

    int argmax_idx;
    int blank = predict_shape[1];
    float score = 0.f;
    int count = 0;
    float max_value = 0.0f;

    for (int n = predict_lod[0][0]; n < predict_lod[0][1] - 1; n++) {
LDOUVLEV's avatar
LDOUVLEV committed
179
180
181
182
183
      argmax_idx = int(Argmax(&predict_batch[n * predict_shape[1]],
                              &predict_batch[(n + 1) * predict_shape[1]]));
      max_value =
          float(*std::max_element(&predict_batch[n * predict_shape[1]],
                                  &predict_batch[(n + 1) * predict_shape[1]]));
LDOUBLEV's avatar
LDOUBLEV committed
184
185
186
187
188
189
190

      if (blank - 1 - argmax_idx > 1e-5) {
        score += max_value;
        count += 1;
      }
    }
    score /= count;
LDOUVLEV's avatar
LDOUVLEV committed
191
    rec_text_score.push_back(score);
LDOUBLEV's avatar
LDOUBLEV committed
192
193
194
  }
}

195
196
197
std::vector<std::vector<std::vector<int>>>
RunDetModel(std::shared_ptr<PaddlePredictor> predictor, cv::Mat img,
            std::map<std::string, double> Config) {
LDOUBLEV's avatar
LDOUBLEV committed
198
  // Read img
LDOUVLEV's avatar
LDOUVLEV committed
199
  int max_side_len = int(Config["max_side_len"]);
LDOUBLEV's avatar
LDOUBLEV committed
200
201
202
203

  cv::Mat srcimg;
  img.copyTo(srcimg);

LDOUVLEV's avatar
LDOUVLEV committed
204
205
  std::vector<float> ratio_hw;
  img = DetResizeImg(img, max_side_len, ratio_hw);
LDOUBLEV's avatar
LDOUBLEV committed
206
207
208
209
210
211
  cv::Mat img_fp;
  img.convertTo(img_fp, CV_32FC3, 1.0 / 255.f);

  // Prepare input data from image
  std::unique_ptr<Tensor> input_tensor0(std::move(predictor->GetInput(0)));
  input_tensor0->Resize({1, 3, img_fp.rows, img_fp.cols});
212
  auto *data0 = input_tensor0->mutable_data<float>();
LDOUBLEV's avatar
LDOUBLEV committed
213
214

  std::vector<float> mean = {0.485f, 0.456f, 0.406f};
LDOUVLEV's avatar
LDOUVLEV committed
215
  std::vector<float> scale = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f};
216
  const float *dimg = reinterpret_cast<const float *>(img_fp.data);
LDOUBLEV's avatar
LDOUBLEV committed
217
218
219
220
221
222
  neon_mean_scale(dimg, data0, img_fp.rows * img_fp.cols, mean, scale);

  // Run predictor
  predictor->Run();

  // Get output and post process
LDOUVLEV's avatar
LDOUVLEV committed
223
224
  std::unique_ptr<const Tensor> output_tensor(
      std::move(predictor->GetOutput(0)));
225
  auto *outptr = output_tensor->data<float>();
LDOUBLEV's avatar
LDOUBLEV committed
226
227
228
  auto shape_out = output_tensor->shape();

  // Save output
LDOUBLEV's avatar
update  
LDOUBLEV committed
229
230
  float pred[shape_out[2] * shape_out[3]];
  unsigned char cbuf[shape_out[2] * shape_out[3]];
LDOUBLEV's avatar
LDOUBLEV committed
231

LDOUVLEV's avatar
LDOUVLEV committed
232
  for (int i = 0; i < int(shape_out[2] * shape_out[3]); i++) {
LDOUBLEV's avatar
update  
LDOUBLEV committed
233
234
    pred[i] = float(outptr[i]);
    cbuf[i] = (unsigned char)((outptr[i]) * 255);
LDOUBLEV's avatar
LDOUBLEV committed
235
236
  }

237
238
  cv::Mat cbuf_map(shape_out[2], shape_out[3], CV_8UC1, (unsigned char *)cbuf);
  cv::Mat pred_map(shape_out[2], shape_out[3], CV_32F, (float *)pred);
LDOUBLEV's avatar
LDOUBLEV committed
239

LDOUVLEV's avatar
LDOUVLEV committed
240
  const double threshold = double(Config["det_db_thresh"]) * 255;
LDOUBLEV's avatar
LDOUBLEV committed
241
242
243
244
  const double maxvalue = 255;
  cv::Mat bit_map;
  cv::threshold(cbuf_map, bit_map, threshold, maxvalue, cv::THRESH_BINARY);

LDOUVLEV's avatar
LDOUVLEV committed
245
  auto boxes = BoxesFromBitmap(pred_map, bit_map, Config);
LDOUBLEV's avatar
LDOUBLEV committed
246

LDOUVLEV's avatar
LDOUVLEV committed
247
248
  std::vector<std::vector<std::vector<int>>> filter_boxes =
      FilterTagDetRes(boxes, ratio_hw[0], ratio_hw[1], srcimg);
LDOUBLEV's avatar
LDOUBLEV committed
249

LDOUVLEV's avatar
LDOUVLEV committed
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
  return filter_boxes;
}

std::shared_ptr<PaddlePredictor> loadModel(std::string model_file) {
  MobileConfig config;
  config.set_model_from_file(model_file);

  std::shared_ptr<PaddlePredictor> predictor =
      CreatePaddlePredictor<MobileConfig>(config);
  return predictor;
}

// Draws every box as a closed green polyline (thickness 2) on a copy of
// `srcimg`, writes the result to ./vis.jpg and returns it.
//
// srcimg: image to draw on; it is copied, not modified.
// boxes:  boxes to draw; for each point, element 0 is used as x and element 1
//         as y.
//
// Each box is drawn with however many points it contains; boxes without
// points are skipped.
cv::Mat Visualization(cv::Mat srcimg,
                      std::vector<std::vector<std::vector<int>>> boxes) {
  cv::Mat img_vis;
  srcimg.copyTo(img_vis);

  // One reusable point buffer instead of a variable-length stack array, which
  // is not standard C++ and was sized for exactly four points per box.
  std::vector<cv::Point> outline;
  for (const auto &box : boxes) {
    outline.clear();
    for (const auto &pt : box) {
      outline.emplace_back(pt[0], pt[1]);
    }
    if (outline.empty()) {
      continue;
    }

    const cv::Point *ppt[1] = {outline.data()};
    int npt[] = {static_cast<int>(outline.size())};
    cv::polylines(img_vis, ppt, npt, 1, true, CV_RGB(0, 255, 0), 2, 8, 0);
  }

  cv::imwrite("./vis.jpg", img_vis);
  std::cout << "The detection visualized image saved in ./vis.jpg" << std::endl;
  return img_vis;
}
// Splits `str` into tokens separated by any of the characters in `delim`.
//
// Every character of `delim` is a separator on its own. Runs of separators
// count as one, and leading or trailing separators are ignored, so no empty
// token is ever produced. An empty `str`, or one made only of separators,
// yields an empty vector. An empty `delim` yields the whole string as a
// single token.
//
// Uses std::string searches rather than strtok on heap copies, so nothing is
// leaked and no global tokenizer state is touched.
std::vector<std::string> split(const std::string &str,
                               const std::string &delim) {
  std::vector<std::string> res;

  std::string::size_type token_begin = str.find_first_not_of(delim);
  while (token_begin != std::string::npos) {
    const std::string::size_type token_end =
        str.find_first_of(delim, token_begin);
    if (token_end == std::string::npos) {
      // Last token runs to the end of the string.
      res.push_back(str.substr(token_begin));
      break;
    }
    res.push_back(str.substr(token_begin, token_end - token_begin));
    token_begin = str.find_first_not_of(delim, token_end);
  }

  return res;
}
// Loads "key value" settings from the text file at `config_path`.
//
// Every line returned by ReadDict is split on spaces; the first token becomes
// the key and the second is parsed as a double. Further tokens are ignored,
// and a key that appears more than once keeps its last value.
//
// Blank lines are skipped silently. Lines with a key but no value are skipped
// with a warning on stderr rather than read out of bounds.
//
// Throws whatever std::stod throws (std::invalid_argument or
// std::out_of_range) when a value is not a valid number.
std::map<std::string, double> LoadConfigTxt(std::string config_path) {
  auto config = ReadDict(config_path);

  std::map<std::string, double> dict;
  for (size_t i = 0; i < config.size(); i++) {
    std::vector<std::string> res = split(config[i], " ");
    if (res.size() < 2) {
      if (!res.empty()) {
        std::cerr << "[WARN] config line " << (i + 1)
                  << " has no value, skipped: " << config[i] << "\n";
      }
      continue;
    }
    dict[res[0]] = std::stod(res[1]);
  }
  return dict;
}
int main(int argc, char **argv) {
LDOUVLEV's avatar
LDOUVLEV committed
316
317
318
  if (argc < 5) {
    std::cerr << "[ERROR] usage: " << argv[0]
              << " det_model_file rec_model_file image_path\n";
LDOUBLEV's avatar
LDOUBLEV committed
319
320
321
322
323
    exit(1);
  }
  std::string det_model_file = argv[1];
  std::string rec_model_file = argv[2];
  std::string img_path = argv[3];
LDOUVLEV's avatar
LDOUVLEV committed
324
325
326
327
  std::string dict_path = argv[4];

  //// load config from txt file
  auto Config = LoadConfigTxt("./config.txt");
LDOUBLEV's avatar
LDOUBLEV committed
328
329
330

  auto start = std::chrono::system_clock::now();

LDOUVLEV's avatar
LDOUVLEV committed
331
332
333
  auto det_predictor = loadModel(det_model_file);
  auto rec_predictor = loadModel(rec_model_file);

LDOUBLEV's avatar
update  
LDOUBLEV committed
334
335
  auto charactor_dict = ReadDict(dict_path);

LDOUBLEV's avatar
LDOUBLEV committed
336
  cv::Mat srcimg = cv::imread(img_path, cv::IMREAD_COLOR);
LDOUVLEV's avatar
LDOUVLEV committed
337
  auto boxes = RunDetModel(det_predictor, srcimg, Config);
LDOUBLEV's avatar
LDOUBLEV committed
338

LDOUVLEV's avatar
LDOUVLEV committed
339
340
  std::vector<std::string> rec_text;
  std::vector<float> rec_text_score;
LDOUBLEV's avatar
update  
LDOUBLEV committed
341
342
  RunRecModel(boxes, srcimg, rec_predictor, rec_text, rec_text_score,
              charactor_dict);
LDOUBLEV's avatar
LDOUBLEV committed
343

LDOUVLEV's avatar
LDOUVLEV committed
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
  auto end = std::chrono::system_clock::now();
  auto duration =
      std::chrono::duration_cast<std::chrono::microseconds>(end - start);

  //// visualization
  auto img_vis = Visualization(srcimg, boxes);

  //// print recognized text
  for (int i = 0; i < rec_text.size(); i++) {
    std::cout << i << "\t" << rec_text[i] << "\t" << rec_text_score[i]
              << std::endl;
  }

  std::cout << "花费了"
            << double(duration.count()) *
                   std::chrono::microseconds::period::num /
                   std::chrono::microseconds::period::den
LDOUBLEV's avatar
LDOUBLEV committed
361
362
363
364
            << "秒" << std::endl;

  return 0;
}