Unverified Commit 26219d5f authored by shaohua.zhang's avatar shaohua.zhang Committed by GitHub
Browse files

Merge pull request #1 from PaddlePaddle/develop

update-2020-7-17
parents 0e8a3417 311c5997
......@@ -13,6 +13,7 @@
#limitations under the License.
import os
import sys
import math
import random
import functools
......@@ -42,6 +43,10 @@ class TrainReader(object):
img_num = len(label_infor_list)
img_id_list = list(range(img_num))
random.shuffle(img_id_list)
if sys.platform == "win32":
print("multiprocess is not fully compatible with Windows."
"num_workers will be 1.")
self.num_workers = 1
for img_id in range(process_id, img_num, self.num_workers):
label_infor = label_infor_list[img_id_list[img_id]]
outs = self.process(label_infor)
......@@ -56,8 +61,6 @@ class TrainReader(object):
if len(batch_outs) == self.batch_size:
yield batch_outs
batch_outs = []
if len(batch_outs) != 0:
yield batch_outs
return batch_iter_reader
......@@ -92,8 +95,10 @@ class EvalTestReader(object):
for img_path in img_list:
img = cv2.imread(img_path)
if img is None:
logger.info("load image error:" + img_path)
logger.info("{} does not exist!".format(img_path))
continue
elif len(list(img.shape)) == 2 or img.shape[2] == 1:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
outs = process_function(img)
outs.append(img_path)
batch_outs.append(outs)
......
......@@ -17,6 +17,8 @@ import cv2
import numpy as np
import json
import sys
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from .data_augment import AugmentData
from .random_crop_data import RandomCropData
......@@ -25,6 +27,10 @@ from .make_border_map import MakeBorderMap
class DBProcessTrain(object):
"""
DB pre-process for Train mode
"""
def __init__(self, params):
self.img_set_dir = params['img_set_dir']
self.image_shape = params['image_shape']
......@@ -96,7 +102,10 @@ class DBProcessTrain(object):
img_path, gt_label = self.convert_label_infor(label_infor)
imgvalue = cv2.imread(img_path)
if imgvalue is None:
logger.info("{} does not exist!".format(img_path))
return None
if len(list(imgvalue.shape)) == 2 or imgvalue.shape[2] == 1:
imgvalue = cv2.cvtColor(imgvalue, cv2.COLOR_GRAY2BGR)
data = self.make_data_dict(imgvalue, gt_label)
data = AugmentData(data)
data = RandomCropData(data, self.image_shape[1:])
......@@ -109,11 +118,15 @@ class DBProcessTrain(object):
class DBProcessTest(object):
"""
DB pre-process for Test mode
"""
def __init__(self, params):
super(DBProcessTest, self).__init__()
self.resize_type = 0
if 'det_image_shape' in params:
self.image_shape = params['det_image_shape']
if 'test_image_shape' in params:
self.image_shape = params['test_image_shape']
# print(self.image_shape)
self.resize_type = 1
if 'max_side_len' in params:
......@@ -124,6 +137,10 @@ class DBProcessTest(object):
def resize_image_type0(self, im):
"""
resize image to a size multiple of 32 which is required by the network
args:
img(array): array with shape [h, w, c]
return(tuple):
img, (ratio_h, ratio_w)
"""
max_side_len = self.max_side_len
h, w, _ = im.shape
......@@ -177,8 +194,12 @@ class DBProcessTest(object):
img_std = [0.229, 0.224, 0.225]
im = im.astype(np.float32, copy=False)
im = im / 255
im -= img_mean
im /= img_std
im[:, :, 0] -= img_mean[0]
im[:, :, 1] -= img_mean[1]
im[:, :, 2] -= img_mean[2]
im[:, :, 0] /= img_std[0]
im[:, :, 1] /= img_std[1]
im[:, :, 2] /= img_std[2]
channel_swap = (2, 0, 1)
im = im.transpose(channel_swap)
return im
......
......@@ -455,17 +455,23 @@ class EASTProcessTrain(object):
class EASTProcessTest(object):
def __init__(self, params):
super(EASTProcessTest, self).__init__()
self.resize_type = 0
if 'test_image_shape' in params:
self.image_shape = params['test_image_shape']
# print(self.image_shape)
self.resize_type = 1
if 'max_side_len' in params:
self.max_side_len = params['max_side_len']
else:
self.max_side_len = 2400
def resize_image(self, im):
def resize_image_type0(self, im):
"""
resize image to a size multiple of 32 which is required by the network
:param im: the resized image
:param max_side_len: limit of max image size to avoid out of memory in gpu
:return: the resized image and the resize ratio
args:
img(array): array with shape [h, w, c]
return(tuple):
img, (ratio_h, ratio_w)
"""
max_side_len = self.max_side_len
h, w, _ = im.shape
......@@ -495,13 +501,30 @@ class EASTProcessTest(object):
resize_w = 32
else:
resize_w = (resize_w // 32 - 1) * 32
im = cv2.resize(im, (int(resize_w), int(resize_h)))
try:
if int(resize_w) <= 0 or int(resize_h) <= 0:
return None, (None, None)
im = cv2.resize(im, (int(resize_w), int(resize_h)))
except:
print(im.shape, resize_w, resize_h)
sys.exit(0)
ratio_h = resize_h / float(h)
ratio_w = resize_w / float(w)
return im, (ratio_h, ratio_w)
def resize_image_type1(self, im):
resize_h, resize_w = self.image_shape
ori_h, ori_w = im.shape[:2] # (h, w, c)
im = cv2.resize(im, (int(resize_w), int(resize_h)))
ratio_h = float(resize_h) / ori_h
ratio_w = float(resize_w) / ori_w
return im, (ratio_h, ratio_w)
def __call__(self, im):
im, (ratio_h, ratio_w) = self.resize_image(im)
if self.resize_type == 0:
im, (ratio_h, ratio_w) = self.resize_image_type0(im)
else:
im, (ratio_h, ratio_w) = self.resize_image_type1(im)
img_mean = [0.485, 0.456, 0.406]
img_std = [0.229, 0.224, 0.225]
im = im[:, :, ::-1].astype(np.float32)
......
......@@ -66,6 +66,8 @@ def reader_main(config=None, mode=None):
reader_function = params['reader_function']
function = create_module(reader_function)(params)
if mode == "train":
if sys.platform == "win32":
return function(0)
readers = []
num_workers = params['num_workers']
for process_id in range(num_workers):
......
......@@ -13,6 +13,7 @@
#limitations under the License.
import os
import sys
import math
import random
import numpy as np
......@@ -40,10 +41,25 @@ class LMDBReader(object):
self.loss_type = params['loss_type']
self.max_text_length = params['max_text_length']
self.mode = params['mode']
self.drop_last = False
self.use_tps = False
if "tps" in params:
self.ues_tps = True
self.use_distort = False
if "distort" in params:
self.use_distort = params['distort'] and params['use_gpu']
if not params['use_gpu']:
logger.info(
"Distort operation can only support in GPU. Distort will be set to False."
)
if params['mode'] == 'train':
self.batch_size = params['train_batch_size_per_card']
self.drop_last = True
else:
self.batch_size = params['test_batch_size_per_card']
self.drop_last = False
self.use_distort = False
self.infer_img = params['infer_img']
def load_hierarchical_lmdb_dataset(self):
lmdb_sets = {}
......@@ -97,33 +113,52 @@ class LMDBReader(object):
process_id = 0
def sample_iter_reader():
lmdb_sets = self.load_hierarchical_lmdb_dataset()
if process_id == 0:
self.print_lmdb_sets_info(lmdb_sets)
cur_index_sets = [1 + process_id] * len(lmdb_sets)
while True:
finish_read_num = 0
for dataset_idx in range(len(lmdb_sets)):
cur_index = cur_index_sets[dataset_idx]
if cur_index > lmdb_sets[dataset_idx]['num_samples']:
finish_read_num += 1
else:
sample_info = self.get_lmdb_sample_info(
lmdb_sets[dataset_idx]['txn'], cur_index)
cur_index_sets[dataset_idx] += self.num_workers
if sample_info is None:
continue
img, label = sample_info
outs = process_image(img, self.image_shape, label,
self.char_ops, self.loss_type,
self.max_text_length)
if outs is None:
continue
yield outs
if finish_read_num == len(lmdb_sets):
break
self.close_lmdb_dataset(lmdb_sets)
if self.mode != 'train' and self.infer_img is not None:
image_file_list = get_image_file_list(self.infer_img)
for single_img in image_file_list:
img = cv2.imread(single_img)
if img.shape[-1] == 1 or len(list(img.shape)) == 2:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
norm_img = process_image(
img=img,
image_shape=self.image_shape,
char_ops=self.char_ops,
tps=self.use_tps,
infer_mode=True)
yield norm_img
else:
lmdb_sets = self.load_hierarchical_lmdb_dataset()
if process_id == 0:
self.print_lmdb_sets_info(lmdb_sets)
cur_index_sets = [1 + process_id] * len(lmdb_sets)
while True:
finish_read_num = 0
for dataset_idx in range(len(lmdb_sets)):
cur_index = cur_index_sets[dataset_idx]
if cur_index > lmdb_sets[dataset_idx]['num_samples']:
finish_read_num += 1
else:
sample_info = self.get_lmdb_sample_info(
lmdb_sets[dataset_idx]['txn'], cur_index)
cur_index_sets[dataset_idx] += self.num_workers
if sample_info is None:
continue
img, label = sample_info
outs = process_image(
img=img,
image_shape=self.image_shape,
label=label,
char_ops=self.char_ops,
loss_type=self.loss_type,
max_text_length=self.max_text_length,
distort=self.use_distort)
if outs is None:
continue
yield outs
if finish_read_num == len(lmdb_sets):
break
self.close_lmdb_dataset(lmdb_sets)
def batch_iter_reader():
batch_outs = []
......@@ -132,10 +167,13 @@ class LMDBReader(object):
if len(batch_outs) == self.batch_size:
yield batch_outs
batch_outs = []
if len(batch_outs) != 0:
yield batch_outs
if not self.drop_last:
if len(batch_outs) != 0:
yield batch_outs
return batch_iter_reader
if self.infer_img is None:
return batch_iter_reader
return sample_iter_reader
class SimpleReader(object):
......@@ -152,26 +190,42 @@ class SimpleReader(object):
self.loss_type = params['loss_type']
self.max_text_length = params['max_text_length']
self.mode = params['mode']
self.infer_img = params['infer_img']
self.use_tps = False
if "tps" in params:
self.use_tps = True
self.use_distort = False
if "distort" in params:
self.use_distort = params['distort'] and params['use_gpu']
if not params['use_gpu']:
logger.info(
"Distort operation can only support in GPU.Distort will be set to False."
)
if params['mode'] == 'train':
self.batch_size = params['train_batch_size_per_card']
elif params['mode'] == 'eval':
self.batch_size = params['test_batch_size_per_card']
self.drop_last = True
else:
self.batch_size = 1
self.infer_img = params['infer_img']
self.batch_size = params['test_batch_size_per_card']
self.drop_last = False
self.use_distort = False
def __call__(self, process_id):
if self.mode != 'train':
process_id = 0
def sample_iter_reader():
if self.mode == 'test':
if self.mode != 'train' and self.infer_img is not None:
image_file_list = get_image_file_list(self.infer_img)
for single_img in image_file_list:
img = cv2.imread(single_img)
if img.shape[-1]==1 or len(list(img.shape))==2:
if img.shape[-1] == 1 or len(list(img.shape)) == 2:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
norm_img = process_image(img, self.image_shape)
norm_img = process_image(
img=img,
image_shape=self.image_shape,
char_ops=self.char_ops,
tps=self.use_tps,
infer_mode=True)
yield norm_img
else:
with open(self.label_file_path, "rb") as fin:
......@@ -179,20 +233,30 @@ class SimpleReader(object):
img_num = len(label_infor_list)
img_id_list = list(range(img_num))
random.shuffle(img_id_list)
if sys.platform == "win32":
print("multiprocess is not fully compatible with Windows."
"num_workers will be 1.")
self.num_workers = 1
for img_id in range(process_id, img_num, self.num_workers):
label_infor = label_infor_list[img_id_list[img_id]]
substr = label_infor.decode('utf-8').strip("\n").split("\t")
img_path = self.img_set_dir + "/" + substr[0]
img = cv2.imread(img_path)
if img.shape[-1]==1 or len(list(img.shape))==2:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
if img is None:
logger.info("{} does not exist!".format(img_path))
continue
if img.shape[-1] == 1 or len(list(img.shape)) == 2:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
label = substr[1]
outs = process_image(img, self.image_shape, label,
self.char_ops, self.loss_type,
self.max_text_length)
outs = process_image(
img=img,
image_shape=self.image_shape,
label=label,
char_ops=self.char_ops,
loss_type=self.loss_type,
max_text_length=self.max_text_length,
distort=self.use_distort)
if outs is None:
continue
yield outs
......@@ -204,9 +268,10 @@ class SimpleReader(object):
if len(batch_outs) == self.batch_size:
yield batch_outs
batch_outs = []
if len(batch_outs) != 0:
yield batch_outs
if not self.drop_last:
if len(batch_outs) != 0:
yield batch_outs
if self.mode != 'test':
if self.infer_img is None:
return batch_iter_reader
return sample_iter_reader
......@@ -15,6 +15,9 @@
import math
import cv2
import numpy as np
import random
from ppocr.utils.utility import initial_logger
logger = initial_logger()
def get_bounding_box_rect(pos):
......@@ -48,6 +51,32 @@ def resize_norm_img(img, image_shape):
return padding_im
def resize_norm_img_chinese(img, image_shape):
imgC, imgH, imgW = image_shape
# todo: change to 0 and modified image shape
max_wh_ratio = 0
h, w = img.shape[0], img.shape[1]
ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, ratio)
imgW = int(32 * max_wh_ratio)
if math.ceil(imgH * ratio) > imgW:
resized_w = imgW
else:
resized_w = int(math.ceil(imgH * ratio))
resized_image = cv2.resize(img, (resized_w, imgH))
resized_image = resized_image.astype('float32')
if image_shape[0] == 1:
resized_image = resized_image / 255
resized_image = resized_image[np.newaxis, :]
else:
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:resized_w] = resized_image
return padding_im
def get_img_data(value):
"""get_img_data"""
if not value:
......@@ -61,18 +90,280 @@ def get_img_data(value):
return imgori
def flag():
"""
flag
"""
return 1 if random.random() > 0.5000001 else -1
def cvtColor(img):
"""
cvtColor
"""
hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
delta = 0.001 * random.random() * flag()
hsv[:, :, 2] = hsv[:, :, 2] * (1 + delta)
new_img = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
return new_img
def blur(img):
"""
blur
"""
h, w, _ = img.shape
if h > 10 and w > 10:
return cv2.GaussianBlur(img, (5, 5), 1)
else:
return img
def jitter(img):
"""
jitter
"""
w, h, _ = img.shape
if h > 10 and w > 10:
thres = min(w, h)
s = int(random.random() * thres * 0.01)
src_img = img.copy()
for i in range(s):
img[i:, i:, :] = src_img[:w - i, :h - i, :]
return img
else:
return img
def add_gasuss_noise(image, mean=0, var=0.1):
"""
Gasuss noise
"""
noise = np.random.normal(mean, var**0.5, image.shape)
out = image + 0.5 * noise
out = np.clip(out, 0, 255)
out = np.uint8(out)
return out
def get_crop(image):
"""
random crop
"""
h, w, _ = image.shape
top_min = 1
top_max = 8
top_crop = int(random.randint(top_min, top_max))
top_crop = min(top_crop, h - 1)
crop_img = image.copy()
ratio = random.randint(0, 1)
if ratio:
crop_img = crop_img[top_crop:h, :, :]
else:
crop_img = crop_img[0:h - top_crop, :, :]
return crop_img
class Config:
"""
Config
"""
def __init__(self, ):
self.anglex = random.random() * 30
self.angley = random.random() * 15
self.anglez = random.random() * 10
self.fov = 42
self.r = 0
self.shearx = random.random() * 0.3
self.sheary = random.random() * 0.05
self.borderMode = cv2.BORDER_REPLICATE
def make(self, w, h, ang):
"""
make
"""
self.anglex = random.random() * 5 * flag()
self.angley = random.random() * 5 * flag()
self.anglez = -1 * random.random() * int(ang) * flag()
self.fov = 42
self.r = 0
self.shearx = 0
self.sheary = 0
self.borderMode = cv2.BORDER_REPLICATE
self.w = w
self.h = h
self.perspective = True
self.crop = True
self.affine = False
self.reverse = True
self.noise = True
self.jitter = True
self.blur = True
self.color = True
def rad(x):
"""
rad
"""
return x * np.pi / 180
def get_warpR(config):
"""
get_warpR
"""
anglex, angley, anglez, fov, w, h, r = \
config.anglex, config.angley, config.anglez, config.fov, config.w, config.h, config.r
if w > 69 and w < 112:
anglex = anglex * 1.5
z = np.sqrt(w**2 + h**2) / 2 / np.tan(rad(fov / 2))
# Homogeneous coordinate transformation matrix
rx = np.array([[1, 0, 0, 0],
[0, np.cos(rad(anglex)), -np.sin(rad(anglex)), 0], [
0,
-np.sin(rad(anglex)),
np.cos(rad(anglex)),
0,
], [0, 0, 0, 1]], np.float32)
ry = np.array([[np.cos(rad(angley)), 0, np.sin(rad(angley)), 0],
[0, 1, 0, 0], [
-np.sin(rad(angley)),
0,
np.cos(rad(angley)),
0,
], [0, 0, 0, 1]], np.float32)
rz = np.array([[np.cos(rad(anglez)), np.sin(rad(anglez)), 0, 0],
[-np.sin(rad(anglez)), np.cos(rad(anglez)), 0, 0],
[0, 0, 1, 0], [0, 0, 0, 1]], np.float32)
r = rx.dot(ry).dot(rz)
# generate 4 points
pcenter = np.array([h / 2, w / 2, 0, 0], np.float32)
p1 = np.array([0, 0, 0, 0], np.float32) - pcenter
p2 = np.array([w, 0, 0, 0], np.float32) - pcenter
p3 = np.array([0, h, 0, 0], np.float32) - pcenter
p4 = np.array([w, h, 0, 0], np.float32) - pcenter
dst1 = r.dot(p1)
dst2 = r.dot(p2)
dst3 = r.dot(p3)
dst4 = r.dot(p4)
list_dst = np.array([dst1, dst2, dst3, dst4])
org = np.array([[0, 0], [w, 0], [0, h], [w, h]], np.float32)
dst = np.zeros((4, 2), np.float32)
# Project onto the image plane
dst[:, 0] = list_dst[:, 0] * z / (z - list_dst[:, 2]) + pcenter[0]
dst[:, 1] = list_dst[:, 1] * z / (z - list_dst[:, 2]) + pcenter[1]
warpR = cv2.getPerspectiveTransform(org, dst)
dst1, dst2, dst3, dst4 = dst
r1 = int(min(dst1[1], dst2[1]))
r2 = int(max(dst3[1], dst4[1]))
c1 = int(min(dst1[0], dst3[0]))
c2 = int(max(dst2[0], dst4[0]))
try:
ratio = min(1.0 * h / (r2 - r1), 1.0 * w / (c2 - c1))
dx = -c1
dy = -r1
T1 = np.float32([[1., 0, dx], [0, 1., dy], [0, 0, 1.0 / ratio]])
ret = T1.dot(warpR)
except:
ratio = 1.0
T1 = np.float32([[1., 0, 0], [0, 1., 0], [0, 0, 1.]])
ret = T1
return ret, (-r1, -c1), ratio, dst
def get_warpAffine(config):
"""
get_warpAffine
"""
anglez = config.anglez
rz = np.array([[np.cos(rad(anglez)), np.sin(rad(anglez)), 0],
[-np.sin(rad(anglez)), np.cos(rad(anglez)), 0]], np.float32)
return rz
def warp(img, ang):
"""
warp
"""
h, w, _ = img.shape
config = Config()
config.make(w, h, ang)
new_img = img
if config.perspective:
tp = random.randint(1, 100)
if tp >= 50:
warpR, (r1, c1), ratio, dst = get_warpR(config)
new_w = int(np.max(dst[:, 0])) - int(np.min(dst[:, 0]))
new_img = cv2.warpPerspective(
new_img,
warpR, (int(new_w * ratio), h),
borderMode=config.borderMode)
if config.crop:
img_height, img_width = img.shape[0:2]
tp = random.randint(1, 100)
if tp >= 50 and img_height >= 20 and img_width >= 20:
new_img = get_crop(new_img)
if config.affine:
warpT = get_warpAffine(config)
new_img = cv2.warpAffine(
new_img, warpT, (w, h), borderMode=config.borderMode)
if config.blur:
tp = random.randint(1, 100)
if tp >= 50:
new_img = blur(new_img)
if config.color:
tp = random.randint(1, 100)
if tp >= 50:
new_img = cvtColor(new_img)
if config.jitter:
new_img = jitter(new_img)
if config.noise:
tp = random.randint(1, 100)
if tp >= 50:
new_img = add_gasuss_noise(new_img)
if config.reverse:
tp = random.randint(1, 100)
if tp >= 50:
new_img = 255 - new_img
return new_img
def process_image(img,
image_shape,
label=None,
char_ops=None,
loss_type=None,
max_text_length=None):
norm_img = resize_norm_img(img, image_shape)
max_text_length=None,
tps=None,
infer_mode=False,
distort=False):
if distort:
img = warp(img, 10)
if infer_mode and char_ops.character_type == "ch" and not tps:
norm_img = resize_norm_img_chinese(img, image_shape)
else:
norm_img = resize_norm_img(img, image_shape)
norm_img = norm_img[np.newaxis, :]
if label is not None:
char_num = char_ops.get_char_num()
# char_num = char_ops.get_char_num()
text = char_ops.encode(label)
if len(text) == 0 or len(text) > max_text_length:
logger.info(
"Warning in ppocr/data/rec/img_tools.py:line362: Wrong data type."
"Excepted string with length between 1 and {}, but "
"got '{}'. Label is '{}'".format(max_text_length,
len(text), label))
return None
else:
if loss_type == "ctc":
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -59,16 +59,23 @@ class DetModel(object):
return: (image, corresponding label, dataloader)
"""
image_shape = deepcopy(self.image_shape)
if image_shape[1] % 4 != 0 or image_shape[2] % 4 != 0:
raise Exception("The size of the image must be divisible by 4, "
"received image shape is {}, please reset the "
"Global.image_shape in the yml file".format(
image_shape))
image = fluid.layers.data(
name='image', shape=image_shape, dtype='float32')
if mode == "train":
if self.algorithm == "EAST":
h, w = int(image_shape[1] // 4), int(image_shape[2] // 4)
score = fluid.layers.data(
name='score', shape=[1, 128, 128], dtype='float32')
name='score', shape=[1, h, w], dtype='float32')
geo = fluid.layers.data(
name='geo', shape=[9, 128, 128], dtype='float32')
name='geo', shape=[9, h, w], dtype='float32')
mask = fluid.layers.data(
name='mask', shape=[1, 128, 128], dtype='float32')
name='mask', shape=[1, h, w], dtype='float32')
feed_list = [image, score, geo, mask]
labels = {'score': score, 'geo': geo, 'mask': mask}
elif self.algorithm == "DB":
......@@ -109,7 +116,10 @@ class DetModel(object):
"""
image, labels, loader = self.create_feed(mode)
conv_feas = self.backbone(image)
predicts = self.head(conv_feas)
if self.algorithm == "DB":
predicts = self.head(conv_feas, mode)
else:
predicts = self.head(conv_feas)
if mode == "train":
losses = self.loss(predicts, labels)
return loader, losses
......
......@@ -30,6 +30,8 @@ class RecModel(object):
global_params = params['Global']
char_num = global_params['char_ops'].get_char_num()
global_params['char_num'] = char_num
self.char_type = global_params['character_type']
self.infer_img = global_params['infer_img']
if "TPS" in params:
tps_params = deepcopy(params["TPS"])
tps_params.update(global_params)
......@@ -60,8 +62,8 @@ class RecModel(object):
def create_feed(self, mode):
image_shape = deepcopy(self.image_shape)
image_shape.insert(0, -1)
image = fluid.data(name='image', shape=image_shape, dtype='float32')
if mode == "train":
image = fluid.data(name='image', shape=image_shape, dtype='float32')
if self.loss_type == "attention":
label_in = fluid.data(
name='label_in',
......@@ -86,6 +88,16 @@ class RecModel(object):
use_double_buffer=True,
iterable=False)
else:
if self.char_type == "ch" and self.infer_img:
image_shape[-1] = -1
if self.tps != None:
logger.info(
"WARNRNG!!!\n"
"TPS does not support variable shape in chinese!"
"We set img_shape to be the same , it may affect the inference effect"
)
image_shape = deepcopy(self.image_shape)
image = fluid.data(name='image', shape=image_shape, dtype='float32')
labels = None
loader = None
return image, labels, loader
......@@ -110,7 +122,11 @@ class RecModel(object):
return loader, outputs
elif mode == "export":
predict = predicts['predict']
predict = fluid.layers.softmax(predict)
if self.loss_type == "ctc":
predict = fluid.layers.softmax(predict)
return [image, {'decoded_out': decoded_out, 'predicts': predict}]
else:
return loader, {'decoded_out': decoded_out}
predict = predicts['predict']
if self.loss_type == "ctc":
predict = fluid.layers.softmax(predict)
return loader, {'decoded_out': decoded_out, 'predicts': predict}
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -115,7 +115,6 @@ class DBHead(object):
initializer = fluid.initializer.Uniform(-stdv, stdv)
bias_attr = fluid.ParamAttr(
regularizer=regularizer,
gradient_clip=gradient_clip,
initializer=initializer,
name=name + "_b_attr")
return bias_attr
......@@ -196,7 +195,7 @@ class DBHead(object):
fuse = fluid.layers.concat(input=[p5, p4, p3, p2], axis=1)
shrink_maps = self.binarize(fuse)
if mode != "train":
return {"maps", shrink_maps}
return {"maps": shrink_maps}
threshold_maps = self.thresh(fuse)
binary_maps = self.step_function(shrink_maps, threshold_maps)
y = fluid.layers.concat(
......
......@@ -18,6 +18,7 @@ from __future__ import print_function
import paddle.fluid as fluid
from ..common_functions import conv_bn_layer, deconv_bn_layer
from collections import OrderedDict
class EASTHead(object):
......@@ -110,7 +111,7 @@ class EASTHead(object):
def __call__(self, inputs):
f_common = self.unet_fusion(inputs)
f_score, f_geo = self.detector_header(f_common)
predicts = {}
predicts = OrderedDict()
predicts['f_score'] = f_score
predicts['f_geo'] = f_geo
return predicts
......@@ -123,6 +123,8 @@ class AttentionPredict(object):
full_ids = fluid.layers.fill_constant_batch_size_like(
input=init_state, shape=[-1, 1], dtype='int64', value=1)
full_scores = fluid.layers.fill_constant_batch_size_like(
input=init_state, shape=[-1, 1], dtype='float32', value=1)
cond = layers.less_than(x=counter, y=array_len)
while_op = layers.While(cond=cond)
......@@ -171,6 +173,9 @@ class AttentionPredict(object):
new_ids = fluid.layers.concat([full_ids, topk_indices], axis=1)
fluid.layers.assign(new_ids, full_ids)
new_scores = fluid.layers.concat([full_scores, topk_scores], axis=1)
fluid.layers.assign(new_scores, full_scores)
layers.increment(x=counter, value=1, in_place=True)
# update the memories
......@@ -184,7 +189,7 @@ class AttentionPredict(object):
length_cond = layers.less_than(x=counter, y=array_len)
finish_cond = layers.logical_not(layers.is_empty(x=topk_indices))
layers.logical_and(x=length_cond, y=finish_cond, out=cond)
return full_ids
return full_ids, full_scores
def __call__(self, inputs, labels=None, mode=None):
encoder_features = self.encoder(inputs)
......@@ -223,10 +228,10 @@ class AttentionPredict(object):
decoder_size, char_num)
_, decoded_out = layers.topk(input=predict, k=1)
decoded_out = layers.lod_reset(decoded_out, y=label_out)
predicts = {'predict': predict, 'decoded_out': decoded_out}
predicts = {'predict':predict, 'decoded_out':decoded_out}
else:
ids = self.gru_attention_infer(
ids, predict = self.gru_attention_infer(
decoder_boot, self.max_length, char_num, word_vector_dim,
encoded_vector, encoded_proj, decoder_size)
predicts = {'decoded_out': ids}
predicts = {'predict':predict, 'decoded_out':ids}
return predicts
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -15,6 +15,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.regularizer import L2Decay
from ppocr.utils.utility import initial_logger
logger = initial_logger()
def AdamDecay(params, parameter_list=None):
......@@ -28,9 +33,24 @@ def AdamDecay(params, parameter_list=None):
base_lr = params['base_lr']
beta1 = params['beta1']
beta2 = params['beta2']
l2_decay = params.get("l2_decay", 0.0)
if 'decay' in params:
params = params['decay']
decay_mode = params['function']
step_each_epoch = params['step_each_epoch']
total_epoch = params['total_epoch']
if decay_mode == "cosine_decay":
base_lr = fluid.layers.cosine_decay(
learning_rate=base_lr,
step_each_epoch=step_each_epoch,
epochs=total_epoch)
else:
logger.info("Only support Cosine decay currently")
optimizer = fluid.optimizer.Adam(
learning_rate=base_lr,
beta1=beta1,
beta2=beta2,
regularization=L2Decay(regularization_coeff=l2_decay),
parameter_list=parameter_list)
return optimizer
......@@ -35,6 +35,7 @@ class DBPostProcess(object):
self.thresh = params['thresh']
self.box_thresh = params['box_thresh']
self.max_candidates = params['max_candidates']
self.unclip_ratio = params['unclip_ratio']
self.min_size = 3
def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
......@@ -46,7 +47,8 @@ class DBPostProcess(object):
bitmap = _bitmap
height, width = bitmap.shape
outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST,
cv2.CHAIN_APPROX_SIMPLE)
if len(outs) == 3:
img, contours, _ = outs[0], outs[1], outs[2]
elif len(outs) == 2:
......@@ -83,7 +85,8 @@ class DBPostProcess(object):
scores[index] = score
return boxes, scores
def unclip(self, box, unclip_ratio=2.0):
def unclip(self, box):
unclip_ratio = self.unclip_ratio
poly = Polygon(box)
distance = poly.area * unclip_ratio / poly.length
offset = pyclipper.PyclipperOffset()
......
......@@ -20,6 +20,12 @@ import numpy as np
from .locality_aware_nms import nms_locality
import cv2
import os
import sys
__dir__ = os.path.dirname(__file__)
sys.path.append(__dir__)
sys.path.append(os.path.join(__dir__, '..'))
class EASTPostPocess(object):
"""
......@@ -30,6 +36,11 @@ class EASTPostPocess(object):
self.score_thresh = params['score_thresh']
self.cover_thresh = params['cover_thresh']
self.nms_thresh = params['nms_thresh']
# c++ la-nms is faster, but only support python 3.5
self.is_python35 = False
if sys.version_info.major == 3 and sys.version_info.minor == 5:
self.is_python35 = True
def restore_rectangle_quad(self, origin, geometry):
"""
......@@ -66,7 +77,11 @@ class EASTPostPocess(object):
boxes = np.zeros((text_box_restored.shape[0], 9), dtype=np.float32)
boxes[:, :8] = text_box_restored.reshape((-1, 8))
boxes[:, 8] = score_map[xy_text[:, 0], xy_text[:, 1]]
boxes = nms_locality(boxes.astype(np.float64), nms_thresh)
if self.is_python35:
import lanms
boxes = lanms.merge_quadrangle_n9(boxes, nms_thresh)
else:
boxes = nms_locality(boxes.astype(np.float64), nms_thresh)
if boxes.shape[0] == 0:
return []
# Here we filter some low score boxes by the average score map,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment