// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

15
#include "paddle_api.h" // NOLINT
LDOUBLEV's avatar
LDOUBLEV committed
16
17
#include <chrono>

LDOUVLEV's avatar
LDOUVLEV committed
18
19
#include "crnn_process.h"
#include "db_post_process.h"
LDOUBLEV's avatar
LDOUBLEV committed
20

21
using namespace paddle::lite_api; // NOLINT
LDOUVLEV's avatar
LDOUVLEV committed
22
using namespace std;
LDOUBLEV's avatar
LDOUBLEV committed
23
24

// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up
LDOUBLEV's avatar
LDOUBLEV committed
25
26
27
void NeonMeanScale(const float *din, float *dout, int size,
                   const std::vector<float> mean,
                   const std::vector<float> scale) {
LDOUBLEV's avatar
LDOUBLEV committed
28
29
30
31
32
33
34
35
36
37
38
  if (mean.size() != 3 || scale.size() != 3) {
    std::cerr << "[ERROR] mean or scale size must equal to 3\n";
    exit(1);
  }
  float32x4_t vmean0 = vdupq_n_f32(mean[0]);
  float32x4_t vmean1 = vdupq_n_f32(mean[1]);
  float32x4_t vmean2 = vdupq_n_f32(mean[2]);
  float32x4_t vscale0 = vdupq_n_f32(scale[0]);
  float32x4_t vscale1 = vdupq_n_f32(scale[1]);
  float32x4_t vscale2 = vdupq_n_f32(scale[2]);

39
40
41
  float *dout_c0 = dout;
  float *dout_c1 = dout + size;
  float *dout_c2 = dout + size * 2;
LDOUBLEV's avatar
LDOUBLEV committed
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68

  int i = 0;
  for (; i < size - 3; i += 4) {
    float32x4x3_t vin3 = vld3q_f32(din);
    float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
    float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
    float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
    float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
    float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
    float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
    vst1q_f32(dout_c0, vs0);
    vst1q_f32(dout_c1, vs1);
    vst1q_f32(dout_c2, vs2);

    din += 12;
    dout_c0 += 4;
    dout_c1 += 4;
    dout_c2 += 4;
  }
  for (; i < size; i++) {
    *(dout_c0++) = (*(din++) - mean[0]) * scale[0];
    *(dout_c1++) = (*(din++) - mean[1]) * scale[1];
    *(dout_c2++) = (*(din++) - mean[2]) * scale[2];
  }
}

// resize image to a size multiple of 32 which is required by the network
69
70
cv::Mat DetResizeImg(const cv::Mat img, int max_size_len,
                     std::vector<float> &ratio_hw) {
LDOUBLEV's avatar
LDOUBLEV committed
71
72
73
74
  int w = img.cols;
  int h = img.rows;

  float ratio = 1.f;
LDOUVLEV's avatar
LDOUVLEV committed
75
76
77
  int max_wh = w >= h ? w : h;
  if (max_wh > max_size_len) {
    if (h > w) {
LDOUBLEV's avatar
LDOUBLEV committed
78
      ratio = static_cast<float>(max_size_len) / static_cast<float>(h);
LDOUBLEV's avatar
LDOUBLEV committed
79
    } else {
LDOUBLEV's avatar
LDOUBLEV committed
80
      ratio = static_cast<float>(max_size_len) / static_cast<float>(w);
LDOUBLEV's avatar
LDOUBLEV committed
81
82
83
    }
  }

LDOUBLEV's avatar
LDOUBLEV committed
84
85
  int resize_h = static_cast<int>(float(h) * ratio);
  int resize_w = static_cast<int>(float(w) * ratio);
LDOUBLEV's avatar
LDOUBLEV committed
86
87
  if (resize_h % 32 == 0)
    resize_h = resize_h;
LDOUVLEV's avatar
LDOUVLEV committed
88
  else if (resize_h / 32 < 1 + 1e-5)
LDOUBLEV's avatar
LDOUBLEV committed
89
90
91
92
93
94
    resize_h = 32;
  else
    resize_h = (resize_h / 32 - 1) * 32;

  if (resize_w % 32 == 0)
    resize_w = resize_w;
LDOUVLEV's avatar
LDOUVLEV committed
95
  else if (resize_w / 32 < 1 + 1e-5)
LDOUBLEV's avatar
LDOUBLEV committed
96
97
    resize_w = 32;
  else
LDOUVLEV's avatar
LDOUVLEV committed
98
    resize_w = (resize_w / 32 - 1) * 32;
LDOUBLEV's avatar
LDOUBLEV committed
99
100
101
102

  cv::Mat resize_img;
  cv::resize(img, resize_img, cv::Size(resize_w, resize_h));

LDOUBLEV's avatar
LDOUBLEV committed
103
104
  ratio_hw.push_back(static_cast<float>(resize_h) / static_cast<float>(h));
  ratio_hw.push_back(static_cast<float>(resize_w) / static_cast<float>(w));
LDOUBLEV's avatar
LDOUBLEV committed
105
106
107
  return resize_img;
}

108
void RunRecModel(std::vector<std::vector<std::vector<int>>> boxes, cv::Mat img,
LDOUVLEV's avatar
LDOUVLEV committed
109
                 std::shared_ptr<PaddlePredictor> predictor_crnn,
LDOUBLEV's avatar
update  
LDOUBLEV committed
110
111
112
                 std::vector<std::string> &rec_text,
                 std::vector<float> &rec_text_score,
                 std::vector<std::string> charactor_dict) {
LDOUBLEV's avatar
LDOUBLEV committed
113
114
115
116
117
118
119
120
121
  std::vector<float> mean = {0.5f, 0.5f, 0.5f};
  std::vector<float> scale = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f};

  cv::Mat srcimg;
  img.copyTo(srcimg);
  cv::Mat crop_img;
  cv::Mat resize_img;

  int index = 0;
LDOUVLEV's avatar
LDOUVLEV committed
122
123
  for (int i = boxes.size() - 1; i >= 0; i--) {
    crop_img = GetRotateCropImage(srcimg, boxes[i]);
LDOUBLEV's avatar
LDOUBLEV committed
124
125
    float wh_ratio =
        static_cast<float>(crop_img.cols) / static_cast<float>(crop_img.rows);
LDOUBLEV's avatar
LDOUBLEV committed
126

127
    resize_img = CrnnResizeImg(crop_img, wh_ratio);
LDOUBLEV's avatar
LDOUBLEV committed
128
129
    resize_img.convertTo(resize_img, CV_32FC3, 1 / 255.f);

130
    const float *dimg = reinterpret_cast<const float *>(resize_img.data);
LDOUBLEV's avatar
LDOUBLEV committed
131

LDOUVLEV's avatar
LDOUVLEV committed
132
133
    std::unique_ptr<Tensor> input_tensor0(
        std::move(predictor_crnn->GetInput(0)));
LDOUBLEV's avatar
LDOUBLEV committed
134
    input_tensor0->Resize({1, 3, resize_img.rows, resize_img.cols});
135
    auto *data0 = input_tensor0->mutable_data<float>();
LDOUBLEV's avatar
LDOUBLEV committed
136

LDOUBLEV's avatar
LDOUBLEV committed
137
    NeonMeanScale(dimg, data0, resize_img.rows * resize_img.cols, mean, scale);
LDOUBLEV's avatar
LDOUBLEV committed
138
139
140
141
142
    //// Run CRNN predictor
    predictor_crnn->Run();

    // Get output and run postprocess
    std::unique_ptr<const Tensor> output_tensor0(
LDOUVLEV's avatar
LDOUVLEV committed
143
        std::move(predictor_crnn->GetOutput(0)));
LDOUBLEV's avatar
LDOUBLEV committed
144
    auto *rec_idx = output_tensor0->data<int64>();
LDOUBLEV's avatar
LDOUBLEV committed
145
146
147
148
149

    auto rec_idx_lod = output_tensor0->lod();
    auto shape_out = output_tensor0->shape();

    std::vector<int> pred_idx;
LDOUBLEV's avatar
LDOUBLEV committed
150
151
152
    for (int n = static_cast<int>(rec_idx_lod[0][0]);
         n < static_cast<int>(rec_idx_lod[0][1]); n += 1) {
      pred_idx.push_back(static_cast<int>(rec_idx[n]));
LDOUBLEV's avatar
LDOUBLEV committed
153
154
    }

155
156
    if (pred_idx.size() < 1e-3)
      continue;
LDOUBLEV's avatar
LDOUBLEV committed
157
158

    index += 1;
LDOUVLEV's avatar
LDOUVLEV committed
159
    std::string pred_txt = "";
LDOUBLEV's avatar
LDOUBLEV committed
160
    for (int n = 0; n < pred_idx.size(); n++) {
LDOUVLEV's avatar
LDOUVLEV committed
161
      pred_txt += charactor_dict[pred_idx[n]];
LDOUBLEV's avatar
LDOUBLEV committed
162
    }
LDOUVLEV's avatar
LDOUVLEV committed
163
    rec_text.push_back(pred_txt);
LDOUBLEV's avatar
LDOUBLEV committed
164
165

    ////get score
LDOUVLEV's avatar
LDOUVLEV committed
166
167
    std::unique_ptr<const Tensor> output_tensor1(
        std::move(predictor_crnn->GetOutput(1)));
168
    auto *predict_batch = output_tensor1->data<float>();
LDOUBLEV's avatar
LDOUBLEV committed
169
170
171
172
173
174
175
176
177
    auto predict_shape = output_tensor1->shape();

    auto predict_lod = output_tensor1->lod();

    int blank = predict_shape[1];
    float score = 0.f;
    int count = 0;

    for (int n = predict_lod[0][0]; n < predict_lod[0][1] - 1; n++) {
LDOUBLEV's avatar
LDOUBLEV committed
178
179
180
181
      int argmax_idx =
          static_cast<int>(Argmax(&predict_batch[n * predict_shape[1]],
                                  &predict_batch[(n + 1) * predict_shape[1]]));
      float max_value =
LDOUVLEV's avatar
LDOUVLEV committed
182
183
          float(*std::max_element(&predict_batch[n * predict_shape[1]],
                                  &predict_batch[(n + 1) * predict_shape[1]]));
LDOUBLEV's avatar
LDOUBLEV committed
184
185
186
187
188
189
190

      if (blank - 1 - argmax_idx > 1e-5) {
        score += max_value;
        count += 1;
      }
    }
    score /= count;
LDOUVLEV's avatar
LDOUVLEV committed
191
    rec_text_score.push_back(score);
LDOUBLEV's avatar
LDOUBLEV committed
192
193
194
  }
}

195
196
197
std::vector<std::vector<std::vector<int>>>
RunDetModel(std::shared_ptr<PaddlePredictor> predictor, cv::Mat img,
            std::map<std::string, double> Config) {
LDOUBLEV's avatar
LDOUBLEV committed
198
  // Read img
LDOUVLEV's avatar
LDOUVLEV committed
199
  int max_side_len = int(Config["max_side_len"]);
LDOUBLEV's avatar
LDOUBLEV committed
200
201
202
203

  cv::Mat srcimg;
  img.copyTo(srcimg);

LDOUVLEV's avatar
LDOUVLEV committed
204
205
  std::vector<float> ratio_hw;
  img = DetResizeImg(img, max_side_len, ratio_hw);
LDOUBLEV's avatar
LDOUBLEV committed
206
207
208
209
210
211
  cv::Mat img_fp;
  img.convertTo(img_fp, CV_32FC3, 1.0 / 255.f);

  // Prepare input data from image
  std::unique_ptr<Tensor> input_tensor0(std::move(predictor->GetInput(0)));
  input_tensor0->Resize({1, 3, img_fp.rows, img_fp.cols});
212
  auto *data0 = input_tensor0->mutable_data<float>();
LDOUBLEV's avatar
LDOUBLEV committed
213
214

  std::vector<float> mean = {0.485f, 0.456f, 0.406f};
LDOUVLEV's avatar
LDOUVLEV committed
215
  std::vector<float> scale = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f};
216
  const float *dimg = reinterpret_cast<const float *>(img_fp.data);
LDOUBLEV's avatar
LDOUBLEV committed
217
  NeonMeanScale(dimg, data0, img_fp.rows * img_fp.cols, mean, scale);
LDOUBLEV's avatar
LDOUBLEV committed
218
219
220
221
222

  // Run predictor
  predictor->Run();

  // Get output and post process
LDOUVLEV's avatar
LDOUVLEV committed
223
224
  std::unique_ptr<const Tensor> output_tensor(
      std::move(predictor->GetOutput(0)));
225
  auto *outptr = output_tensor->data<float>();
LDOUBLEV's avatar
LDOUBLEV committed
226
227
228
  auto shape_out = output_tensor->shape();

  // Save output
LDOUBLEV's avatar
update  
LDOUBLEV committed
229
230
  float pred[shape_out[2] * shape_out[3]];
  unsigned char cbuf[shape_out[2] * shape_out[3]];
LDOUBLEV's avatar
LDOUBLEV committed
231

LDOUVLEV's avatar
LDOUVLEV committed
232
  for (int i = 0; i < int(shape_out[2] * shape_out[3]); i++) {
LDOUBLEV's avatar
LDOUBLEV committed
233
234
    pred[i] = static_cast<float>(outptr[i]);
    cbuf[i] = static_cast<unsigned char>((outptr[i]) * 255);
LDOUBLEV's avatar
LDOUBLEV committed
235
236
  }

LDOUBLEV's avatar
LDOUBLEV committed
237
  cv::Mat cbuf_map(shape_out[2], shape_out[3], CV_8UC1,
LDOUBLEV's avatar
LDOUBLEV committed
238
                   reinterpret_cast<unsigned char *>(cbuf));
LDOUBLEV's avatar
LDOUBLEV committed
239
  cv::Mat pred_map(shape_out[2], shape_out[3], CV_32F,
LDOUBLEV's avatar
LDOUBLEV committed
240
                   reinterpret_cast<float *>(pred));
LDOUBLEV's avatar
LDOUBLEV committed
241

LDOUVLEV's avatar
LDOUVLEV committed
242
  const double threshold = double(Config["det_db_thresh"]) * 255;
LDOUBLEV's avatar
LDOUBLEV committed
243
244
245
246
  const double maxvalue = 255;
  cv::Mat bit_map;
  cv::threshold(cbuf_map, bit_map, threshold, maxvalue, cv::THRESH_BINARY);

LDOUVLEV's avatar
LDOUVLEV committed
247
  auto boxes = BoxesFromBitmap(pred_map, bit_map, Config);
LDOUBLEV's avatar
LDOUBLEV committed
248

LDOUVLEV's avatar
LDOUVLEV committed
249
250
  std::vector<std::vector<std::vector<int>>> filter_boxes =
      FilterTagDetRes(boxes, ratio_hw[0], ratio_hw[1], srcimg);
LDOUBLEV's avatar
LDOUBLEV committed
251

LDOUVLEV's avatar
LDOUVLEV committed
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
  return filter_boxes;
}

std::shared_ptr<PaddlePredictor> loadModel(std::string model_file) {
  MobileConfig config;
  config.set_model_from_file(model_file);

  std::shared_ptr<PaddlePredictor> predictor =
      CreatePaddlePredictor<MobileConfig>(config);
  return predictor;
}

// Draws every box as a closed green polygon (thickness 2) on a copy of
// `srcimg`, writes the result to ./vis.jpg and returns it.
//
//   srcimg - image to draw on (left unmodified; a copy is drawn on)
//   boxes  - boxes[n][m] is point m of box n, stored as {x, y}
//
// Each box is drawn with its own number of points, so an empty box list or a
// box that does not have exactly four points is handled without overrunning a
// fixed-size array.
cv::Mat Visualization(cv::Mat srcimg,
                      std::vector<std::vector<std::vector<int>>> boxes) {
  cv::Mat img_vis;
  srcimg.copyTo(img_vis);

  std::vector<cv::Point> polygon;
  for (const auto &box : boxes) {
    polygon.clear();
    for (const auto &pt : box) {
      polygon.push_back(cv::Point(pt[0], pt[1]));
    }
    if (polygon.empty()) {
      continue;
    }

    const cv::Point *ppt[1] = {polygon.data()};
    int npt[] = {static_cast<int>(polygon.size())};
    cv::polylines(img_vis, ppt, npt, 1, 1, CV_RGB(0, 255, 0), 2, 8, 0);
  }

  cv::imwrite("./vis.jpg", img_vis);
  std::cout << "The detection visualized image saved in ./vis.jpg" << std::endl;
  return img_vis;
}

286
287
// Splits `str` into tokens.
//
// Every character of `delim` acts as a separator (same rule as strtok):
// consecutive separators are collapsed and leading/trailing separators are
// ignored, so no empty token is ever produced. An empty `delim` yields the
// whole string as a single token; an empty `str` yields an empty vector.
//
// Implemented with std::string searches, so nothing is heap-allocated by hand
// (the previous strtok-based version leaked both of its scratch buffers) and
// no global tokenizer state is touched.
std::vector<std::string> split(const std::string &str,
                               const std::string &delim) {
  std::vector<std::string> res;

  std::string::size_type token_begin = str.find_first_not_of(delim);
  while (token_begin != std::string::npos) {
    const std::string::size_type token_end =
        str.find_first_of(delim, token_begin);
    // When token_end is npos, substr clamps the count to the end of the string.
    res.push_back(str.substr(token_begin, token_end - token_begin));
    token_begin = str.find_first_not_of(delim, token_end);
  }

  return res;
}

LDOUVLEV's avatar
LDOUVLEV committed
307
308
309
310
311
312
313
314
315
316
// Loads numeric settings from the text file at `config_path`.
//
// The file is read with ReadDict; each entry is split on spaces, the first
// token becomes the key and the second is parsed as a double with std::stod
// (which throws if the token is not a number). A later entry with the same
// key overwrites an earlier one.
//
// Entries with fewer than two tokens cannot form a key/value pair and are
// skipped: blank ones silently, key-only ones with a warning on stderr.
std::map<std::string, double> LoadConfigTxt(std::string config_path) {
  auto config = ReadDict(config_path);

  std::map<std::string, double> dict;
  for (const auto &line : config) {
    std::vector<std::string> res = split(line, " ");
    if (res.size() < 2) {
      if (!res.empty()) {
        std::cerr << "[WARNING] config entry without value ignored: "
                  << res[0] << "\n";
      }
      continue;
    }
    dict[res[0]] = std::stod(res[1]);
  }
  return dict;
}
LDOUBLEV's avatar
LDOUBLEV committed
317

318
int main(int argc, char **argv) {
LDOUVLEV's avatar
LDOUVLEV committed
319
320
321
  if (argc < 5) {
    std::cerr << "[ERROR] usage: " << argv[0]
              << " det_model_file rec_model_file image_path\n";
LDOUBLEV's avatar
LDOUBLEV committed
322
323
324
325
326
    exit(1);
  }
  std::string det_model_file = argv[1];
  std::string rec_model_file = argv[2];
  std::string img_path = argv[3];
LDOUVLEV's avatar
LDOUVLEV committed
327
328
329
330
  std::string dict_path = argv[4];

  //// load config from txt file
  auto Config = LoadConfigTxt("./config.txt");
LDOUBLEV's avatar
LDOUBLEV committed
331
332
333

  auto start = std::chrono::system_clock::now();

LDOUVLEV's avatar
LDOUVLEV committed
334
335
336
  auto det_predictor = loadModel(det_model_file);
  auto rec_predictor = loadModel(rec_model_file);

LDOUBLEV's avatar
update  
LDOUBLEV committed
337
  auto charactor_dict = ReadDict(dict_path);
LDOUBLEV's avatar
LDOUBLEV committed
338
  charactor_dict.push_back(" ");
LDOUBLEV's avatar
update  
LDOUBLEV committed
339

LDOUBLEV's avatar
LDOUBLEV committed
340
  cv::Mat srcimg = cv::imread(img_path, cv::IMREAD_COLOR);
LDOUVLEV's avatar
LDOUVLEV committed
341
  auto boxes = RunDetModel(det_predictor, srcimg, Config);
LDOUBLEV's avatar
LDOUBLEV committed
342

LDOUVLEV's avatar
LDOUVLEV committed
343
344
  std::vector<std::string> rec_text;
  std::vector<float> rec_text_score;
LDOUBLEV's avatar
update  
LDOUBLEV committed
345
346
  RunRecModel(boxes, srcimg, rec_predictor, rec_text, rec_text_score,
              charactor_dict);
LDOUBLEV's avatar
LDOUBLEV committed
347

LDOUVLEV's avatar
LDOUVLEV committed
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
  auto end = std::chrono::system_clock::now();
  auto duration =
      std::chrono::duration_cast<std::chrono::microseconds>(end - start);

  //// visualization
  auto img_vis = Visualization(srcimg, boxes);

  //// print recognized text
  for (int i = 0; i < rec_text.size(); i++) {
    std::cout << i << "\t" << rec_text[i] << "\t" << rec_text_score[i]
              << std::endl;
  }

  std::cout << "花费了"
            << double(duration.count()) *
                   std::chrono::microseconds::period::num /
                   std::chrono::microseconds::period::den
LDOUBLEV's avatar
LDOUBLEV committed
365
366
367
368
            << "秒" << std::endl;

  return 0;
}