// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle_api.h" // NOLINT

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "crnn_process.h"
#include "db_post_process.h"

using namespace paddle::lite_api; // NOLINT
using namespace std;

// Normalizes an interleaved 3-channel float image and writes it out in planar
// (channel-major) order, i.e. layout nhwc -> nchw.
//
// For every pixel p and channel c:
//   dout[c * size + p] = (din[p * 3 + c] - mean[c]) * scale[c]
//
// din   : input buffer holding size * 3 interleaved floats.
// dout  : output buffer with room for size * 3 floats.
// size  : number of pixels (height * width).
// mean  : per-channel value subtracted from the input; must have 3 entries.
// scale : per-channel multiplier applied after the subtraction; must have 3
//         entries.
//
// Prints an error and terminates the process when mean or scale does not have
// exactly three entries. Four pixels at a time are processed with NEON
// intrinsics when the target provides them; the scalar loop handles the
// remaining pixels and is the only path on targets without NEON.
void neon_mean_scale(const float *din, float *dout, int size,
                     const std::vector<float> mean,
                     const std::vector<float> scale) {
  if (mean.size() != 3 || scale.size() != 3) {
    std::cerr << "[ERROR] mean or scale size must equal to 3\n";
    std::exit(1);
  }

  // One output plane per channel, each `size` floats long.
  float *dout_c0 = dout;
  float *dout_c1 = dout + size;
  float *dout_c2 = dout + size * 2;

  int i = 0;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
  const float32x4_t vmean0 = vdupq_n_f32(mean[0]);
  const float32x4_t vmean1 = vdupq_n_f32(mean[1]);
  const float32x4_t vmean2 = vdupq_n_f32(mean[2]);
  const float32x4_t vscale0 = vdupq_n_f32(scale[0]);
  const float32x4_t vscale1 = vdupq_n_f32(scale[1]);
  const float32x4_t vscale2 = vdupq_n_f32(scale[2]);

  for (; i < size - 3; i += 4) {
    // vld3q_f32 de-interleaves 12 floats into three 4-lane channel vectors.
    const float32x4x3_t vin3 = vld3q_f32(din);
    const float32x4_t vs0 =
        vmulq_f32(vsubq_f32(vin3.val[0], vmean0), vscale0);
    const float32x4_t vs1 =
        vmulq_f32(vsubq_f32(vin3.val[1], vmean1), vscale1);
    const float32x4_t vs2 =
        vmulq_f32(vsubq_f32(vin3.val[2], vmean2), vscale2);
    vst1q_f32(dout_c0, vs0);
    vst1q_f32(dout_c1, vs1);
    vst1q_f32(dout_c2, vs2);

    din += 12;
    dout_c0 += 4;
    dout_c1 += 4;
    dout_c2 += 4;
  }
#endif
  // Scalar path: tail pixels after the vector loop, or every pixel when the
  // vector loop is not compiled in.
  for (; i < size; i++) {
    *(dout_c0++) = (*(din++) - mean[0]) * scale[0];
    *(dout_c1++) = (*(din++) - mean[1]) * scale[1];
    *(dout_c2++) = (*(din++) - mean[2]) * scale[2];
  }
}

// resize image to a size multiple of 32 which is required by the network
69
70
cv::Mat DetResizeImg(const cv::Mat img, int max_size_len,
                     std::vector<float> &ratio_hw) {
LDOUBLEV's avatar
LDOUBLEV committed
71
72
73
74
  int w = img.cols;
  int h = img.rows;

  float ratio = 1.f;
LDOUVLEV's avatar
LDOUVLEV committed
75
76
77
  int max_wh = w >= h ? w : h;
  if (max_wh > max_size_len) {
    if (h > w) {
LDOUBLEV's avatar
LDOUBLEV committed
78
79
80
81
82
83
84
85
86
87
      ratio = float(max_size_len) / float(h);
    } else {
      ratio = float(max_size_len) / float(w);
    }
  }

  int resize_h = int(float(h) * ratio);
  int resize_w = int(float(w) * ratio);
  if (resize_h % 32 == 0)
    resize_h = resize_h;
LDOUVLEV's avatar
LDOUVLEV committed
88
  else if (resize_h / 32 < 1 + 1e-5)
LDOUBLEV's avatar
LDOUBLEV committed
89
90
91
92
93
94
    resize_h = 32;
  else
    resize_h = (resize_h / 32 - 1) * 32;

  if (resize_w % 32 == 0)
    resize_w = resize_w;
LDOUVLEV's avatar
LDOUVLEV committed
95
  else if (resize_w / 32 < 1 + 1e-5)
LDOUBLEV's avatar
LDOUBLEV committed
96
97
    resize_w = 32;
  else
LDOUVLEV's avatar
LDOUVLEV committed
98
    resize_w = (resize_w / 32 - 1) * 32;
LDOUBLEV's avatar
LDOUBLEV committed
99
100
101
102

  cv::Mat resize_img;
  cv::resize(img, resize_img, cv::Size(resize_w, resize_h));

LDOUVLEV's avatar
LDOUVLEV committed
103
104
  ratio_hw.push_back(float(resize_h) / float(h));
  ratio_hw.push_back(float(resize_w) / float(w));
LDOUBLEV's avatar
LDOUBLEV committed
105
106
107
  return resize_img;
}

108
void RunRecModel(std::vector<std::vector<std::vector<int>>> boxes, cv::Mat img,
LDOUVLEV's avatar
LDOUVLEV committed
109
                 std::shared_ptr<PaddlePredictor> predictor_crnn,
110
111
                 std::string dict_path, std::vector<std::string> &rec_text,
                 std::vector<float> &rec_text_score) {
LDOUBLEV's avatar
LDOUBLEV committed
112
113
114
115
116
117
118
119
  std::vector<float> mean = {0.5f, 0.5f, 0.5f};
  std::vector<float> scale = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f};

  cv::Mat srcimg;
  img.copyTo(srcimg);
  cv::Mat crop_img;
  cv::Mat resize_img;

LDOUVLEV's avatar
LDOUVLEV committed
120
  auto charactor_dict = ReadDict(dict_path);
LDOUBLEV's avatar
LDOUBLEV committed
121
122

  int index = 0;
LDOUVLEV's avatar
LDOUVLEV committed
123
124
  for (int i = boxes.size() - 1; i >= 0; i--) {
    crop_img = GetRotateCropImage(srcimg, boxes[i]);
LDOUBLEV's avatar
LDOUBLEV committed
125
126
    float wh_ratio = float(crop_img.cols) / float(crop_img.rows);

LDOUVLEV's avatar
LDOUVLEV committed
127
    resize_img = CrnnResizeImg(crop_img, wh_ratio);
LDOUBLEV's avatar
LDOUBLEV committed
128
129
    resize_img.convertTo(resize_img, CV_32FC3, 1 / 255.f);

130
    const float *dimg = reinterpret_cast<const float *>(resize_img.data);
LDOUBLEV's avatar
LDOUBLEV committed
131

LDOUVLEV's avatar
LDOUVLEV committed
132
133
    std::unique_ptr<Tensor> input_tensor0(
        std::move(predictor_crnn->GetInput(0)));
LDOUBLEV's avatar
LDOUBLEV committed
134
    input_tensor0->Resize({1, 3, resize_img.rows, resize_img.cols});
135
    auto *data0 = input_tensor0->mutable_data<float>();
LDOUBLEV's avatar
LDOUBLEV committed
136

137
138
    neon_mean_scale(dimg, data0, resize_img.rows * resize_img.cols, mean,
                    scale);
LDOUBLEV's avatar
LDOUBLEV committed
139
140
141
142
143
    //// Run CRNN predictor
    predictor_crnn->Run();

    // Get output and run postprocess
    std::unique_ptr<const Tensor> output_tensor0(
LDOUVLEV's avatar
LDOUVLEV committed
144
        std::move(predictor_crnn->GetOutput(0)));
145
    auto *rec_idx = output_tensor0->data<int>();
LDOUBLEV's avatar
LDOUBLEV committed
146
147
148
149
150

    auto rec_idx_lod = output_tensor0->lod();
    auto shape_out = output_tensor0->shape();

    std::vector<int> pred_idx;
LDOUVLEV's avatar
LDOUVLEV committed
151
152
    for (int n = int(rec_idx_lod[0][0]); n < int(rec_idx_lod[0][1] * 2);
         n += 2) {
LDOUBLEV's avatar
LDOUBLEV committed
153
154
155
      pred_idx.push_back(int(rec_idx[n]));
    }

156
157
    if (pred_idx.size() < 1e-3)
      continue;
LDOUBLEV's avatar
LDOUBLEV committed
158
159

    index += 1;
LDOUVLEV's avatar
LDOUVLEV committed
160
    std::string pred_txt = "";
LDOUBLEV's avatar
LDOUBLEV committed
161
    for (int n = 0; n < pred_idx.size(); n++) {
LDOUVLEV's avatar
LDOUVLEV committed
162
      pred_txt += charactor_dict[pred_idx[n]];
LDOUBLEV's avatar
LDOUBLEV committed
163
    }
LDOUVLEV's avatar
LDOUVLEV committed
164
    rec_text.push_back(pred_txt);
LDOUBLEV's avatar
LDOUBLEV committed
165
166

    ////get score
LDOUVLEV's avatar
LDOUVLEV committed
167
168
    std::unique_ptr<const Tensor> output_tensor1(
        std::move(predictor_crnn->GetOutput(1)));
169
    auto *predict_batch = output_tensor1->data<float>();
LDOUBLEV's avatar
LDOUBLEV committed
170
171
172
173
174
175
176
177
178
179
180
    auto predict_shape = output_tensor1->shape();

    auto predict_lod = output_tensor1->lod();

    int argmax_idx;
    int blank = predict_shape[1];
    float score = 0.f;
    int count = 0;
    float max_value = 0.0f;

    for (int n = predict_lod[0][0]; n < predict_lod[0][1] - 1; n++) {
LDOUVLEV's avatar
LDOUVLEV committed
181
182
183
184
185
      argmax_idx = int(Argmax(&predict_batch[n * predict_shape[1]],
                              &predict_batch[(n + 1) * predict_shape[1]]));
      max_value =
          float(*std::max_element(&predict_batch[n * predict_shape[1]],
                                  &predict_batch[(n + 1) * predict_shape[1]]));
LDOUBLEV's avatar
LDOUBLEV committed
186
187
188
189
190
191
192

      if (blank - 1 - argmax_idx > 1e-5) {
        score += max_value;
        count += 1;
      }
    }
    score /= count;
LDOUVLEV's avatar
LDOUVLEV committed
193
    rec_text_score.push_back(score);
LDOUBLEV's avatar
LDOUBLEV committed
194
195
196
  }
}

197
198
199
std::vector<std::vector<std::vector<int>>>
RunDetModel(std::shared_ptr<PaddlePredictor> predictor, cv::Mat img,
            std::map<std::string, double> Config) {
LDOUBLEV's avatar
LDOUBLEV committed
200
  // Read img
LDOUVLEV's avatar
LDOUVLEV committed
201
  int max_side_len = int(Config["max_side_len"]);
LDOUBLEV's avatar
LDOUBLEV committed
202
203
204
205

  cv::Mat srcimg;
  img.copyTo(srcimg);

LDOUVLEV's avatar
LDOUVLEV committed
206
207
  std::vector<float> ratio_hw;
  img = DetResizeImg(img, max_side_len, ratio_hw);
LDOUBLEV's avatar
LDOUBLEV committed
208
209
210
211
212
213
  cv::Mat img_fp;
  img.convertTo(img_fp, CV_32FC3, 1.0 / 255.f);

  // Prepare input data from image
  std::unique_ptr<Tensor> input_tensor0(std::move(predictor->GetInput(0)));
  input_tensor0->Resize({1, 3, img_fp.rows, img_fp.cols});
214
  auto *data0 = input_tensor0->mutable_data<float>();
LDOUBLEV's avatar
LDOUBLEV committed
215
216

  std::vector<float> mean = {0.485f, 0.456f, 0.406f};
LDOUVLEV's avatar
LDOUVLEV committed
217
  std::vector<float> scale = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f};
218
  const float *dimg = reinterpret_cast<const float *>(img_fp.data);
LDOUBLEV's avatar
LDOUBLEV committed
219
220
221
222
223
224
  neon_mean_scale(dimg, data0, img_fp.rows * img_fp.cols, mean, scale);

  // Run predictor
  predictor->Run();

  // Get output and post process
LDOUVLEV's avatar
LDOUVLEV committed
225
226
  std::unique_ptr<const Tensor> output_tensor(
      std::move(predictor->GetOutput(0)));
227
  auto *outptr = output_tensor->data<float>();
LDOUBLEV's avatar
LDOUBLEV committed
228
229
230
231
232
233
  auto shape_out = output_tensor->shape();

  // Save output
  float pred[shape_out[2]][shape_out[3]];
  unsigned char cbuf[shape_out[2]][shape_out[3]];

LDOUVLEV's avatar
LDOUVLEV committed
234
235
236
237
  for (int i = 0; i < int(shape_out[2] * shape_out[3]); i++) {
    pred[int(i / int(shape_out[3]))][int(i % shape_out[3])] = float(outptr[i]);
    cbuf[int(i / int(shape_out[3]))][int(i % shape_out[3])] =
        (unsigned char)((outptr[i]) * 255);
LDOUBLEV's avatar
LDOUBLEV committed
238
239
  }

240
241
  cv::Mat cbuf_map(shape_out[2], shape_out[3], CV_8UC1, (unsigned char *)cbuf);
  cv::Mat pred_map(shape_out[2], shape_out[3], CV_32F, (float *)pred);
LDOUBLEV's avatar
LDOUBLEV committed
242

LDOUVLEV's avatar
LDOUVLEV committed
243
  const double threshold = double(Config["det_db_thresh"]) * 255;
LDOUBLEV's avatar
LDOUBLEV committed
244
245
246
247
  const double maxvalue = 255;
  cv::Mat bit_map;
  cv::threshold(cbuf_map, bit_map, threshold, maxvalue, cv::THRESH_BINARY);

LDOUVLEV's avatar
LDOUVLEV committed
248
  auto boxes = BoxesFromBitmap(pred_map, bit_map, Config);
LDOUBLEV's avatar
LDOUBLEV committed
249

LDOUVLEV's avatar
LDOUVLEV committed
250
251
  std::vector<std::vector<std::vector<int>>> filter_boxes =
      FilterTagDetRes(boxes, ratio_hw[0], ratio_hw[1], srcimg);
LDOUBLEV's avatar
LDOUBLEV committed
252

LDOUVLEV's avatar
LDOUVLEV committed
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
  return filter_boxes;
}

std::shared_ptr<PaddlePredictor> loadModel(std::string model_file) {
  MobileConfig config;
  config.set_model_from_file(model_file);

  std::shared_ptr<PaddlePredictor> predictor =
      CreatePaddlePredictor<MobileConfig>(config);
  return predictor;
}

// Draws every box as a closed green polyline on a copy of the image, writes
// the result to ./vis.jpg and returns it.
//
// srcimg : image to draw on; it is copied, not modified.
// boxes  : one entry per box; each box is a list of points and each point
//          holds x at index 0 and y at index 1.
//
// Returns the annotated copy.
cv::Mat Visualization(cv::Mat srcimg,
                      std::vector<std::vector<std::vector<int>>> boxes) {
  cv::Mat img_vis;
  srcimg.copyTo(img_vis);

  for (const auto &box : boxes) {
    // Build the polygon from this box's own points, so boxes of any size are
    // handled without a fixed-size buffer.
    std::vector<cv::Point> polygon;
    polygon.reserve(box.size());
    for (const auto &point : box) {
      if (point.size() < 2) {
        continue; // no x/y pair to read
      }
      polygon.emplace_back(point[0], point[1]);
    }
    if (polygon.empty()) {
      continue;
    }

    const cv::Point *ppt[1] = {polygon.data()};
    const int npt[] = {static_cast<int>(polygon.size())};
    cv::polylines(img_vis, ppt, npt, 1, true, CV_RGB(0, 255, 0), 2, 8, 0);
  }

  if (cv::imwrite("./vis.jpg", img_vis)) {
    std::cout << "The detection visualized image saved in ./vis.jpg"
              << std::endl;
  } else {
    std::cerr << "[ERROR] failed to write ./vis.jpg\n";
  }
  return img_vis;
}

// Splits `str` into tokens separated by any of the characters in `delim`.
//
// Every character of `delim` is an individual separator, runs of separators
// count as one, and empty tokens are never produced. An empty `str`, or one
// made up only of separators, yields an empty vector; an empty `delim` yields
// the whole string as a single token.
//
// Works directly on the std::string, so no temporary buffers are allocated
// and no shared tokenizer state is involved.
std::vector<std::string> split(const std::string &str,
                               const std::string &delim) {
  std::vector<std::string> res;

  std::string::size_type token_begin = str.find_first_not_of(delim);
  while (token_begin != std::string::npos) {
    const std::string::size_type token_end =
        str.find_first_of(delim, token_begin);
    if (token_end == std::string::npos) {
      // Last token runs to the end of the string.
      res.push_back(str.substr(token_begin));
      break;
    }
    res.push_back(str.substr(token_begin, token_end - token_begin));
    token_begin = str.find_first_not_of(delim, token_end);
  }

  return res;
}

// Loads "key value" settings from a text file.
//
// The lines come from ReadDict(config_path). Each line is split on spaces;
// the first token becomes the key and the second token is parsed as a double
// with std::stod. Lines with fewer than two tokens (for example blank lines)
// are skipped. A later line with the same key overwrites the earlier value.
//
// Exceptions thrown by std::stod for a non-numeric or out-of-range value are
// not caught here.
std::map<std::string, double> LoadConfigTxt(std::string config_path) {
  auto config = ReadDict(config_path);

  std::map<std::string, double> dict;
  for (std::size_t i = 0; i < config.size(); i++) {
    const std::vector<std::string> res = split(config[i], " ");
    if (res.size() < 2) {
      continue; // no key/value pair on this line
    }
    dict[res[0]] = std::stod(res[1]);
  }
  return dict;
}

int main(int argc, char **argv) {
LDOUVLEV's avatar
LDOUVLEV committed
319
320
321
  if (argc < 5) {
    std::cerr << "[ERROR] usage: " << argv[0]
              << " det_model_file rec_model_file image_path\n";
LDOUBLEV's avatar
LDOUBLEV committed
322
323
324
325
326
    exit(1);
  }
  std::string det_model_file = argv[1];
  std::string rec_model_file = argv[2];
  std::string img_path = argv[3];
LDOUVLEV's avatar
LDOUVLEV committed
327
328
329
330
  std::string dict_path = argv[4];

  //// load config from txt file
  auto Config = LoadConfigTxt("./config.txt");
LDOUBLEV's avatar
LDOUBLEV committed
331
332
333

  auto start = std::chrono::system_clock::now();

LDOUVLEV's avatar
LDOUVLEV committed
334
335
336
  auto det_predictor = loadModel(det_model_file);
  auto rec_predictor = loadModel(rec_model_file);

LDOUBLEV's avatar
LDOUBLEV committed
337
  cv::Mat srcimg = cv::imread(img_path, cv::IMREAD_COLOR);
LDOUVLEV's avatar
LDOUVLEV committed
338
  auto boxes = RunDetModel(det_predictor, srcimg, Config);
LDOUBLEV's avatar
LDOUBLEV committed
339

LDOUVLEV's avatar
LDOUVLEV committed
340
341
  std::vector<std::string> rec_text;
  std::vector<float> rec_text_score;
342
343
  RunRecModel(boxes, srcimg, rec_predictor, dict_path, rec_text,
              rec_text_score);
LDOUBLEV's avatar
LDOUBLEV committed
344

LDOUVLEV's avatar
LDOUVLEV committed
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
  auto end = std::chrono::system_clock::now();
  auto duration =
      std::chrono::duration_cast<std::chrono::microseconds>(end - start);

  //// visualization
  auto img_vis = Visualization(srcimg, boxes);

  //// print recognized text
  for (int i = 0; i < rec_text.size(); i++) {
    std::cout << i << "\t" << rec_text[i] << "\t" << rec_text_score[i]
              << std::endl;
  }

  std::cout << "花费了"
            << double(duration.count()) *
                   std::chrono::microseconds::period::num /
                   std::chrono::microseconds::period::den
LDOUBLEV's avatar
LDOUBLEV committed
362
363
364
365
            << "秒" << std::endl;

  return 0;
}