"ppstructure/vscode:/vscode.git/clone" did not exist on "0d7ee968077096cb010896e28061bc4bcb2eb40a"
Commit aad3093a authored by WenmuZhou's avatar WenmuZhou
Browse files

dygraph first commit

parent 10f7e519
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import sys
import six
import cv2
import numpy as np
class DecodeImage(object):
    """Decode a raw image byte string into an ndarray.

    Args:
        img_mode (str): output color mode — 'BGR' (raw decode), 'RGB'
            (channels reversed) or 'GRAY' (converted to one channel).
        channel_first (bool): if True, transpose the result HWC -> CHW.
    """

    def __init__(self, img_mode='RGB', channel_first=False, **kwargs):
        self.img_mode = img_mode
        self.channel_first = channel_first

    def __call__(self, data):
        img = data['image']
        if six.PY2:
            assert type(img) is str and len(
                img) > 0, "invalid input 'img' in DecodeImage"
        else:
            assert type(img) is bytes and len(
                img) > 0, "invalid input 'img' in DecodeImage"
        img = np.frombuffer(img, dtype='uint8')
        # Always decode as a 3-channel BGR image first (flag 1).
        img = cv2.imdecode(img, 1)
        if self.img_mode == 'GRAY':
            # BUGFIX: the decoded image is 3-channel BGR, so the correct
            # conversion is BGR -> GRAY; COLOR_GRAY2BGR expects a
            # single-channel input and would raise here.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        elif self.img_mode == 'RGB':
            assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape)
            img = img[:, :, ::-1]

        if self.channel_first:
            img = img.transpose((2, 0, 1))
        data['image'] = img
        return data
class NormalizeImage(object):
    """Normalize an image: scale pixels, subtract mean, divide by std."""

    def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs):
        if isinstance(scale, str):
            scale = eval(scale)
        if scale is None:
            scale = 1.0 / 255.0
        if mean is None:
            mean = [0.485, 0.456, 0.406]
        if std is None:
            std = [0.229, 0.224, 0.225]

        # Broadcast shape depends on the channel layout (CHW vs HWC).
        if order == 'chw':
            shape = (3, 1, 1)
        else:
            shape = (1, 1, 3)
        self.scale = np.float32(scale)
        self.mean = np.array(mean).reshape(shape).astype('float32')
        self.std = np.array(std).reshape(shape).astype('float32')

    def __call__(self, data):
        img = data['image']
        from PIL import Image
        if isinstance(img, Image.Image):
            img = np.array(img)
        assert isinstance(img,
                          np.ndarray), "invalid input 'img' in NormalizeImage"
        scaled = img.astype('float32') * self.scale - self.mean
        data['image'] = scaled / self.std
        return data
class ToCHWImage(object):
    """Transpose an HWC image to CHW layout."""

    def __init__(self, **kwargs):
        pass

    def __call__(self, data):
        from PIL import Image
        img = data['image']
        if isinstance(img, Image.Image):
            img = np.array(img)
        data['image'] = img.transpose((2, 0, 1))
        return data
class keepKeys(object):
    """Reduce the data dict to an ordered list of the configured keys."""

    def __init__(self, keep_keys, **kwargs):
        self.keep_keys = keep_keys

    def __call__(self, data):
        # Preserve the configured key order; missing keys raise KeyError.
        return [data[key] for key in self.keep_keys]
class DetResizeForTest(object):
    """Resize an input image for text-detection inference.

    Two modes:
      * resize_type 1 (when 'image_shape' is passed): resize to the fixed
        (h, w) given by ``image_shape``.
      * resize_type 0 (default): scale so the limiting side respects
        ``limit_side_len`` and round both sides to multiples of 32.
    """

    def __init__(self, **kwargs):
        super(DetResizeForTest, self).__init__()
        self.resize_type = 0
        if 'image_shape' in kwargs:
            self.image_shape = kwargs['image_shape']
            self.resize_type = 1
        if 'limit_side_len' in kwargs:
            self.limit_side_len = kwargs['limit_side_len']
            # 'min' limits the shorter side, 'max' the longer one.
            self.limit_type = kwargs.get('limit_type', 'min')
        else:
            self.limit_side_len = 736
            self.limit_type = 'min'

    def __call__(self, data):
        img = data['image']
        if self.resize_type == 0:
            img, shape = self.resize_image_type0(img)
        else:
            img, shape = self.resize_image_type1(img)
        data['image'] = img
        data['shape'] = shape
        return data

    def resize_image_type1(self, img):
        """Resize to the fixed shape; return the image and original (h, w)."""
        resize_h, resize_w = self.image_shape
        ori_h, ori_w = img.shape[:2]  # (h, w, c)
        img = cv2.resize(img, (int(resize_w), int(resize_h)))
        return img, np.array([ori_h, ori_w])

    def resize_image_type0(self, img):
        """
        resize image to a size multiple of 32 which is required by the network
        args:
            img(array): array with shape [h, w, c]
        return(tuple):
            img, np.array([h, w]) -- the original height and width
        """
        limit_side_len = self.limit_side_len
        h, w, _ = img.shape

        # Pick a scaling ratio so the limited side fits limit_side_len.
        if self.limit_type == 'max':
            if max(h, w) > limit_side_len:
                if h > w:
                    ratio = float(limit_side_len) / h
                else:
                    ratio = float(limit_side_len) / w
            else:
                ratio = 1.
        else:
            if min(h, w) < limit_side_len:
                if h < w:
                    ratio = float(limit_side_len) / h
                else:
                    ratio = float(limit_side_len) / w
            else:
                ratio = 1.
        resize_h = int(h * ratio)
        resize_w = int(w * ratio)

        # Round to the nearest multiple of 32 (network stride requirement).
        resize_h = int(round(resize_h / 32) * 32)
        resize_w = int(round(resize_w / 32) * 32)

        if int(resize_w) <= 0 or int(resize_h) <= 0:
            return None, (None, None)
        try:
            img = cv2.resize(img, (int(resize_w), int(resize_h)))
        except cv2.error:
            # BUGFIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; only an OpenCV resize failure
            # should trigger this diagnostic exit.
            print(img.shape, resize_w, resize_h)
            sys.exit(0)

        return img, np.array([h, w])
...@@ -108,48 +108,103 @@ def crop_area(im, text_polys, min_crop_side_ratio, max_tries): ...@@ -108,48 +108,103 @@ def crop_area(im, text_polys, min_crop_side_ratio, max_tries):
return 0, 0, w, h return 0, 0, w, h
def RandomCropData(data, size): class EastRandomCropData(object):
max_tries = 10 def __init__(self,
min_crop_side_ratio = 0.1 size=(640, 640),
require_original_image = False max_tries=10,
keep_ratio = True min_crop_side_ratio=0.1,
keep_ratio=True,
im = data['image'] **kwargs):
text_polys = data['polys'] self.size = size
ignore_tags = data['ignore_tags'] self.max_tries = max_tries
texts = data['texts'] self.min_crop_side_ratio = min_crop_side_ratio
all_care_polys = [ self.keep_ratio = keep_ratio
text_polys[i] for i, tag in enumerate(ignore_tags) if not tag
] def __call__(self, data):
# 计算crop区域 img = data['image']
crop_x, crop_y, crop_w, crop_h = crop_area(im, all_care_polys, text_polys = data['polys']
min_crop_side_ratio, max_tries) ignore_tags = data['ignore_tags']
# crop 图片 保持比例填充 texts = data['texts']
scale_w = size[0] / crop_w all_care_polys = [
scale_h = size[1] / crop_h text_polys[i] for i, tag in enumerate(ignore_tags) if not tag
scale = min(scale_w, scale_h) ]
h = int(crop_h * scale) # 计算crop区域
w = int(crop_w * scale) crop_x, crop_y, crop_w, crop_h = crop_area(
if keep_ratio: img, all_care_polys, self.min_crop_side_ratio, self.max_tries)
padimg = np.zeros((size[1], size[0], im.shape[2]), im.dtype) # crop 图片 保持比例填充
padimg[:h, :w] = cv2.resize( scale_w = self.size[0] / crop_w
im[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], (w, h)) scale_h = self.size[1] / crop_h
img = padimg scale = min(scale_w, scale_h)
else: h = int(crop_h * scale)
img = cv2.resize(im[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], w = int(crop_w * scale)
tuple(size)) if self.keep_ratio:
# crop 文本框 padimg = np.zeros((self.size[1], self.size[0], img.shape[2]),
text_polys_crop = [] img.dtype)
ignore_tags_crop = [] padimg[:h, :w] = cv2.resize(
texts_crop = [] img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], (w, h))
for poly, text, tag in zip(text_polys, texts, ignore_tags): img = padimg
poly = ((poly - (crop_x, crop_y)) * scale).tolist() else:
if not is_poly_outside_rect(poly, 0, 0, w, h): img = cv2.resize(
text_polys_crop.append(poly) img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w],
ignore_tags_crop.append(tag) tuple(self.size))
texts_crop.append(text) # crop 文本框
data['image'] = img text_polys_crop = []
data['polys'] = np.array(text_polys_crop) ignore_tags_crop = []
data['ignore_tags'] = ignore_tags_crop texts_crop = []
data['texts'] = texts_crop for poly, text, tag in zip(text_polys, texts, ignore_tags):
return data poly = ((poly - (crop_x, crop_y)) * scale).tolist()
if not is_poly_outside_rect(poly, 0, 0, w, h):
text_polys_crop.append(poly)
ignore_tags_crop.append(tag)
texts_crop.append(text)
data['image'] = img
data['polys'] = np.array(text_polys_crop)
data['ignore_tags'] = ignore_tags_crop
data['texts'] = texts_crop
return data
class PSERandomCrop(object):
    """Randomly crop all maps in data['imgs'] to ``size`` (PSENet style).

    imgs[0] is the image, imgs[1] the shrink label map and imgs[2] the
    threshold label map; all maps are cropped with the same window.
    """

    def __init__(self, size, **kwargs):
        self.size = size

    def __call__(self, data):
        imgs = data['imgs']

        h, w = imgs[0].shape[0:2]
        th, tw = self.size
        if w == tw and h == th:
            # BUGFIX: must return the data dict like every other path,
            # not the raw list of images.
            return data

        # If the threshold label map contains text, crop near the text
        # region with probability 5/8.
        if np.max(imgs[2]) > 0 and random.random() > 3 / 8:
            # Top-left corner of the text region, shifted so a crop of
            # ``size`` can still cover text; clamped to the image.
            tl = np.min(np.where(imgs[2] > 0), axis=1) - self.size
            tl[tl < 0] = 0
            # Bottom-right corner of the text region.
            br = np.max(np.where(imgs[2] > 0), axis=1) - self.size
            br[br < 0] = 0
            # Keep enough room to crop when starting near the bottom-right.
            br[0] = min(br[0], h - th)
            br[1] = min(br[1], w - tw)

            for _ in range(50000):
                i = random.randint(tl[0], br[0])
                j = random.randint(tl[1], br[1])
                # Ensure the shrink label map contains some text.
                if imgs[1][i:i + th, j:j + tw].sum() <= 0:
                    continue
                else:
                    break
        else:
            i = random.randint(0, h - th)
            j = random.randint(0, w - tw)

        # Apply the same crop window to every map.
        for idx in range(len(imgs)):
            if len(imgs[idx].shape) == 3:
                imgs[idx] = imgs[idx][i:i + th, j:j + tw, :]
            else:
                imgs[idx] = imgs[idx][i:i + th, j:j + tw]
        data['imgs'] = imgs
        return data
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. # copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
# #
#Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
#You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
#Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
#limitations under the License. # limitations under the License.
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math import math
import cv2 import cv2
import numpy as np import numpy as np
import random import random
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from .text_image_aug import tia_perspective, tia_stretch, tia_distort
def get_bounding_box_rect(pos):
left = min(pos[0]) class RecAug(object):
right = max(pos[0]) def __init__(self, **kwargsz):
top = min(pos[1]) pass
bottom = max(pos[1])
return [left, top, right, bottom] def __call__(self, data):
img = data['image']
img = warp(img, 10)
data['image'] = img
return data
class RecResizeImg(object):
    """Resize and normalize a recognition image.

    Chinese inference without TPS uses the width-adaptive resize
    (resize_norm_img_chinese); every other case uses the fixed-shape
    resize (resize_norm_img).
    """

    def __init__(self,
                 image_shape,
                 infer_mode=False,
                 character_type='ch',
                 use_tps=False,
                 **kwargs):
        self.image_shape = image_shape
        self.infer_mode = infer_mode
        self.character_type = character_type
        self.use_tps = use_tps

    def __call__(self, data):
        img = data['image']
        chinese_infer = (self.infer_mode and self.character_type == "ch" and
                         not self.use_tps)
        if chinese_infer:
            data['image'] = resize_norm_img_chinese(img, self.image_shape)
        else:
            data['image'] = resize_norm_img(img, self.image_shape)
        return data
def resize_norm_img(img, image_shape): def resize_norm_img(img, image_shape):
...@@ -77,19 +116,6 @@ def resize_norm_img_chinese(img, image_shape): ...@@ -77,19 +116,6 @@ def resize_norm_img_chinese(img, image_shape):
return padding_im return padding_im
def get_img_data(value):
    """Decode a raw byte buffer into a BGR image, or None on failure."""
    if not value:
        return None
    buf = np.frombuffer(value, dtype='uint8')
    if buf is None:
        return None
    decoded = cv2.imdecode(buf, 1)
    if decoded is None:
        return None
    return decoded
def flag(): def flag():
""" """
flag flag
...@@ -196,6 +222,9 @@ class Config: ...@@ -196,6 +222,9 @@ class Config:
self.h = h self.h = h
self.perspective = True self.perspective = True
self.stretch = True
self.distort = True
self.crop = True self.crop = True
self.affine = False self.affine = False
self.reverse = True self.reverse = True
...@@ -299,168 +328,39 @@ def warp(img, ang): ...@@ -299,168 +328,39 @@ def warp(img, ang):
config.make(w, h, ang) config.make(w, h, ang)
new_img = img new_img = img
prob = 0.4
if config.distort:
img_height, img_width = img.shape[0:2]
if random.random() <= prob and img_height >= 20 and img_width >= 20:
new_img = tia_distort(new_img, random.randint(3, 6))
if config.stretch:
img_height, img_width = img.shape[0:2]
if random.random() <= prob and img_height >= 20 and img_width >= 20:
new_img = tia_stretch(new_img, random.randint(3, 6))
if config.perspective: if config.perspective:
tp = random.randint(1, 100) if random.random() <= prob:
if tp >= 50: new_img = tia_perspective(new_img)
warpR, (r1, c1), ratio, dst = get_warpR(config)
new_w = int(np.max(dst[:, 0])) - int(np.min(dst[:, 0]))
new_img = cv2.warpPerspective(
new_img,
warpR, (int(new_w * ratio), h),
borderMode=config.borderMode)
if config.crop: if config.crop:
img_height, img_width = img.shape[0:2] img_height, img_width = img.shape[0:2]
tp = random.randint(1, 100) if random.random() <= prob and img_height >= 20 and img_width >= 20:
if tp >= 50 and img_height >= 20 and img_width >= 20:
new_img = get_crop(new_img) new_img = get_crop(new_img)
if config.affine:
warpT = get_warpAffine(config)
new_img = cv2.warpAffine(
new_img, warpT, (w, h), borderMode=config.borderMode)
if config.blur: if config.blur:
tp = random.randint(1, 100) if random.random() <= prob:
if tp >= 50:
new_img = blur(new_img) new_img = blur(new_img)
if config.color: if config.color:
tp = random.randint(1, 100) if random.random() <= prob:
if tp >= 50:
new_img = cvtColor(new_img) new_img = cvtColor(new_img)
if config.jitter: if config.jitter:
new_img = jitter(new_img) new_img = jitter(new_img)
if config.noise: if config.noise:
tp = random.randint(1, 100) if random.random() <= prob:
if tp >= 50:
new_img = add_gasuss_noise(new_img) new_img = add_gasuss_noise(new_img)
if config.reverse: if config.reverse:
tp = random.randint(1, 100) if random.random() <= prob:
if tp >= 50:
new_img = 255 - new_img new_img = 255 - new_img
return new_img return new_img
def process_image(img,
                  image_shape,
                  label=None,
                  char_ops=None,
                  loss_type=None,
                  max_text_length=None,
                  tps=None,
                  infer_mode=False,
                  distort=False):
    """Preprocess a recognition image and, optionally, encode its label.

    Returns the normalized image alone, a (image, label...) tuple
    depending on ``loss_type``, or None when the encoded label is empty
    or longer than ``max_text_length``.
    """
    if distort:
        img = warp(img, 10)
    if infer_mode and char_ops.character_type == "ch" and not tps:
        norm_img = resize_norm_img_chinese(img, image_shape)
    else:
        norm_img = resize_norm_img(img, image_shape)
    norm_img = norm_img[np.newaxis, :]

    if label is not None:
        text = char_ops.encode(label)
        if len(text) == 0 or len(text) > max_text_length:
            logger.info(
                "Warning in ppocr/data/rec/img_tools.py: Wrong data type."
                "Excepted string with length between 1 and {}, but "
                "got '{}'. Label is '{}'".format(max_text_length,
                                                 len(text), label))
            return None
        if loss_type == "ctc":
            return (norm_img, text.reshape(-1, 1))
        if loss_type == "attention":
            beg_idx = char_ops.get_beg_end_flag_idx("beg")
            end_idx = char_ops.get_beg_end_flag_idx("end")
            beg_text = np.append(beg_idx, text).reshape(-1, 1)
            end_text = np.append(text, end_idx).reshape(-1, 1)
            return (norm_img, beg_text, end_text)
        assert False, "Unsupport loss_type %s in process_image" \
            % loss_type
    return (norm_img)
def resize_norm_img_srn(img, image_shape):
    """Resize for SRN: grayscale, width zero-padded to imgW, CHW float32."""
    imgC, imgH, imgW = image_shape

    img_black = np.zeros((imgH, imgW))
    im_hei = img.shape[0]
    im_wid = img.shape[1]

    # Snap the resized width to 1x/2x/3x of the target height, falling
    # back to the full target width for very wide inputs.
    for mult in (1, 2, 3):
        if im_wid <= im_hei * mult:
            img_new = cv2.resize(img, (imgH * mult, imgH))
            break
    else:
        img_new = cv2.resize(img, (imgW, imgH))

    img_np = np.asarray(img_new)
    img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
    # Left-align the resized image on the black canvas.
    img_black[:, 0:img_np.shape[1]] = img_np
    img_black = img_black[:, :, np.newaxis]

    row, col, c = img_black.shape
    c = 1  # single gray channel

    return np.reshape(img_black, (c, row, col)).astype(np.float32)
def srn_other_inputs(image_shape,
                     num_heads,
                     max_text_length,
                     char_num):
    """Build the auxiliary SRN inputs.

    Returns [lbl_weight, encoder_word_pos, gsrm_word_pos,
    gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] where bias1 masks the
    upper triangle and bias2 the lower triangle with -1e9.
    """
    imgC, imgH, imgW = image_shape
    feature_dim = int((imgH / 8) * (imgW / 8))

    encoder_word_pos = np.arange(feature_dim).reshape(
        (feature_dim, 1)).astype('int64')
    gsrm_word_pos = np.arange(max_text_length).reshape(
        (max_text_length, 1)).astype('int64')

    # Every slot starts as padding (char_num - 1) with unit weight.
    lbl_weight = np.full((max_text_length, 1), int(char_num - 1),
                         dtype='int64')

    ones = np.ones((1, max_text_length, max_text_length))
    gsrm_slf_attn_bias1 = np.triu(ones, 1).reshape(
        [-1, 1, max_text_length, max_text_length])
    gsrm_slf_attn_bias1 = np.tile(gsrm_slf_attn_bias1,
                                  [1, num_heads, 1, 1]) * [-1e9]

    gsrm_slf_attn_bias2 = np.tril(ones, -1).reshape(
        [-1, 1, max_text_length, max_text_length])
    gsrm_slf_attn_bias2 = np.tile(gsrm_slf_attn_bias2,
                                  [1, num_heads, 1, 1]) * [-1e9]

    encoder_word_pos = encoder_word_pos[np.newaxis, :]
    gsrm_word_pos = gsrm_word_pos[np.newaxis, :]

    return [
        lbl_weight, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
        gsrm_slf_attn_bias2
    ]
def process_image_srn(img,
                      image_shape,
                      num_heads,
                      max_text_length,
                      label=None,
                      char_ops=None,
                      loss_type=None):
    """Preprocess an image (and optional label) for the SRN head."""
    norm_img = resize_norm_img_srn(img, image_shape)
    norm_img = norm_img[np.newaxis, :]
    char_num = char_ops.get_char_num()

    [lbl_weight, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
     gsrm_slf_attn_bias2] = srn_other_inputs(image_shape, num_heads,
                                             max_text_length, char_num)

    if label is not None:
        text = char_ops.encode(label)
        if len(text) == 0 or len(text) > max_text_length:
            return None
        if loss_type == "srn":
            # Pad the label to max_text_length and mark real characters
            # with weight 1.0 in lbl_weight.
            text_padded = [int(char_num - 1)] * max_text_length
            for i in range(len(text)):
                text_padded[i] = text[i]
                lbl_weight[i] = [1.0]
            text = np.array(text_padded).reshape(-1, 1)
            return (norm_img, text, encoder_word_pos, gsrm_word_pos,
                    gsrm_slf_attn_bias1, gsrm_slf_attn_bias2, lbl_weight)
        assert False, "Unsupport loss_type %s in process_image" \
            % loss_type
    return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
            gsrm_slf_attn_bias2)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from .augment import tia_perspective, tia_distort, tia_stretch
__all__ = ['tia_distort', 'tia_stretch', 'tia_perspective']
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from .warp_mls import WarpMLS
def tia_distort(src, segment=4):
    """Randomly jitter the corners and segment boundaries of ``src``."""
    img_h, img_w = src.shape[:2]

    cut = img_w // segment
    thresh = cut // 3
    half_thresh = thresh * 0.5

    # Corners: top-left, top-right, bottom-right, bottom-left.
    src_pts = [[0, 0], [img_w, 0], [img_w, img_h], [0, img_h]]
    dst_pts = [
        [np.random.randint(thresh), np.random.randint(thresh)],
        [img_w - np.random.randint(thresh), np.random.randint(thresh)],
        [img_w - np.random.randint(thresh),
         img_h - np.random.randint(thresh)],
        [np.random.randint(thresh), img_h - np.random.randint(thresh)],
    ]

    # Jitter interior segment boundaries on both edges.
    for cut_idx in np.arange(1, segment, 1):
        src_pts.append([cut * cut_idx, 0])
        src_pts.append([cut * cut_idx, img_h])
        dst_pts.append([
            cut * cut_idx + np.random.randint(thresh) - half_thresh,
            np.random.randint(thresh) - half_thresh
        ])
        dst_pts.append([
            cut * cut_idx + np.random.randint(thresh) - half_thresh,
            img_h + np.random.randint(thresh) - half_thresh
        ])

    return WarpMLS(src, src_pts, dst_pts, img_w, img_h).generate()
def tia_stretch(src, segment=4):
    """Horizontally stretch/squeeze ``src`` by moving segment boundaries."""
    img_h, img_w = src.shape[:2]

    cut = img_w // segment
    thresh = cut * 4 // 5
    half_thresh = thresh * 0.5

    # Corners are fixed; only interior boundaries move horizontally.
    src_pts = [[0, 0], [img_w, 0], [img_w, img_h], [0, img_h]]
    dst_pts = [[0, 0], [img_w, 0], [img_w, img_h], [0, img_h]]

    for cut_idx in np.arange(1, segment, 1):
        move = np.random.randint(thresh) - half_thresh
        src_pts.append([cut * cut_idx, 0])
        src_pts.append([cut * cut_idx, img_h])
        dst_pts.append([cut * cut_idx + move, 0])
        dst_pts.append([cut * cut_idx + move, img_h])

    return WarpMLS(src, src_pts, dst_pts, img_w, img_h).generate()
def tia_perspective(src):
    """Apply a mild random vertical perspective warp to ``src``."""
    img_h, img_w = src.shape[:2]

    thresh = img_h // 2

    src_pts = [[0, 0], [img_w, 0], [img_w, img_h], [0, img_h]]
    dst_pts = [
        [0, np.random.randint(thresh)],
        [img_w, np.random.randint(thresh)],
        [img_w, img_h - np.random.randint(thresh)],
        [0, img_h - np.random.randint(thresh)],
    ]

    return WarpMLS(src, src_pts, dst_pts, img_w, img_h).generate()
\ No newline at end of file
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
class WarpMLS:
    """Deform an image so control points dst_pts map back to src_pts.

    Displacements are evaluated on a coarse grid (step ``grid_size``)
    and bilinearly interpolated per pixel. NOTE(review): the class name
    and the weighting scheme suggest moving-least-squares deformation —
    confirm against the original TIA reference implementation.
    """

    def __init__(self, src, src_pts, dst_pts, dst_w, dst_h, trans_ratio=1.):
        # src: source image; src_pts/dst_pts: matching control points;
        # dst_w/dst_h: output size.
        self.src = src
        self.src_pts = src_pts
        self.dst_pts = dst_pts
        self.pt_count = len(self.dst_pts)
        self.dst_w = dst_w
        self.dst_h = dst_h
        # Scales the interpolated displacement field in gen_img.
        self.trans_ratio = trans_ratio
        # Displacements are computed every grid_size pixels.
        self.grid_size = 100
        # Per-pixel displacement fields; calc_delta fills the grid nodes.
        self.rdx = np.zeros((self.dst_h, self.dst_w))
        self.rdy = np.zeros((self.dst_h, self.dst_w))

    @staticmethod
    def __bilinear_interp(x, y, v11, v12, v21, v22):
        # Standard bilinear interpolation of four corner values at (x, y).
        return (v11 * (1 - y) + v12 * y) * (1 - x) + (v21 *
                                                      (1 - y) + v22 * y) * x

    def generate(self):
        """Compute grid displacements and return the warped image."""
        self.calc_delta()
        return self.gen_img()

    def calc_delta(self):
        """Fill rdx/rdy at grid nodes with the per-node displacement."""
        # w[k]: inverse-square-distance weight of control point k.
        w = np.zeros(self.pt_count, dtype=np.float32)

        if self.pt_count < 2:
            return

        i = 0
        while 1:
            # Clamp the final column of grid nodes to the image edge.
            if self.dst_w <= i < self.dst_w + self.grid_size - 1:
                i = self.dst_w - 1
            elif i >= self.dst_w:
                break

            j = 0
            while 1:
                # Clamp the final row of grid nodes to the image edge.
                if self.dst_h <= j < self.dst_h + self.grid_size - 1:
                    j = self.dst_h - 1
                elif j >= self.dst_h:
                    break

                sw = 0
                swp = np.zeros(2, dtype=np.float32)
                swq = np.zeros(2, dtype=np.float32)
                new_pt = np.zeros(2, dtype=np.float32)
                cur_pt = np.array([i, j], dtype=np.float32)

                k = 0
                for k in range(self.pt_count):
                    # Node coincides with control point k: break and map it
                    # directly to src_pts[k] in the else branch below.
                    if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]:
                        break

                    w[k] = 1. / (
                        (i - self.dst_pts[k][0]) * (i - self.dst_pts[k][0]) +
                        (j - self.dst_pts[k][1]) * (j - self.dst_pts[k][1]))

                    sw += w[k]
                    swp = swp + w[k] * np.array(self.dst_pts[k])
                    swq = swq + w[k] * np.array(self.src_pts[k])

                if k == self.pt_count - 1:
                    # Weighted centroids of dst and src control points.
                    pstar = 1 / sw * swp
                    qstar = 1 / sw * swq

                    # Normalization term built from the dst-point spread.
                    miu_s = 0
                    for k in range(self.pt_count):
                        if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]:
                            continue
                        pt_i = self.dst_pts[k] - pstar
                        miu_s += w[k] * np.sum(pt_i * pt_i)

                    cur_pt -= pstar
                    # Perpendicular (rotated 90 degrees) of the node offset.
                    cur_pt_j = np.array([-cur_pt[1], cur_pt[0]])

                    for k in range(self.pt_count):
                        if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]:
                            continue

                        pt_i = self.dst_pts[k] - pstar
                        pt_j = np.array([-pt_i[1], pt_i[0]])

                        # Contribution of control point k to the mapped point.
                        tmp_pt = np.zeros(2, dtype=np.float32)
                        tmp_pt[0] = np.sum(pt_i * cur_pt) * self.src_pts[k][0] - \
                                    np.sum(pt_j * cur_pt) * self.src_pts[k][1]
                        tmp_pt[1] = -np.sum(pt_i * cur_pt_j) * self.src_pts[k][0] + \
                                    np.sum(pt_j * cur_pt_j) * self.src_pts[k][1]
                        tmp_pt *= (w[k] / miu_s)
                        new_pt += tmp_pt

                    new_pt += qstar
                else:
                    # Early break above: node is exactly on control point k.
                    new_pt = self.src_pts[k]

                # Store the displacement of this grid node.
                self.rdx[j, i] = new_pt[0] - i
                self.rdy[j, i] = new_pt[1] - j

                j += self.grid_size
            i += self.grid_size

    def gen_img(self):
        """Interpolate displacements per pixel and sample src bilinearly."""
        src_h, src_w = self.src.shape[:2]
        dst = np.zeros_like(self.src, dtype=np.float32)

        # Walk output grid cells; each cell interpolates its 4 node deltas.
        for i in np.arange(0, self.dst_h, self.grid_size):
            for j in np.arange(0, self.dst_w, self.grid_size):
                ni = i + self.grid_size
                nj = j + self.grid_size
                w = h = self.grid_size
                # Shrink the last cell so it ends on the image border.
                if ni >= self.dst_h:
                    ni = self.dst_h - 1
                    h = ni - i + 1
                if nj >= self.dst_w:
                    nj = self.dst_w - 1
                    w = nj - j + 1

                di = np.reshape(np.arange(h), (-1, 1))
                dj = np.reshape(np.arange(w), (1, -1))
                # Interpolate the node displacements across the cell.
                delta_x = self.__bilinear_interp(
                    di / h, dj / w, self.rdx[i, j], self.rdx[i, nj],
                    self.rdx[ni, j], self.rdx[ni, nj])
                delta_y = self.__bilinear_interp(
                    di / h, dj / w, self.rdy[i, j], self.rdy[i, nj],
                    self.rdy[ni, j], self.rdy[ni, nj])
                # Source sampling coordinates, clipped to the source image.
                nx = j + dj + delta_x * self.trans_ratio
                ny = i + di + delta_y * self.trans_ratio
                nx = np.clip(nx, 0, src_w - 1)
                ny = np.clip(ny, 0, src_h - 1)
                nxi = np.array(np.floor(nx), dtype=np.int32)
                nyi = np.array(np.floor(ny), dtype=np.int32)
                nxi1 = np.array(np.ceil(nx), dtype=np.int32)
                nyi1 = np.array(np.ceil(ny), dtype=np.int32)

                # Fractional parts, broadcast over channels when present.
                if len(self.src.shape) == 3:
                    x = np.tile(np.expand_dims(ny - nyi, axis=-1), (1, 1, 3))
                    y = np.tile(np.expand_dims(nx - nxi, axis=-1), (1, 1, 3))
                else:
                    x = ny - nyi
                    y = nx - nxi
                dst[i:i + h, j:j + w] = self.__bilinear_interp(
                    x, y, self.src[nyi, nxi], self.src[nyi, nxi1],
                    self.src[nyi1, nxi], self.src[nyi1, nxi1])

        dst = np.clip(dst, 0, 255)
        dst = np.array(dst, dtype=np.uint8)

        return dst
\ No newline at end of file
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import os
import random
import numpy as np
import paddle
from ppocr.utils.utility import create_module
from copy import deepcopy
from .rec.img_tools import process_image
import cv2
import sys
import signal
# handle terminate reader process, do not print stack frame
def _reader_quit(signum, frame):
print("Reader process exit.")
sys.exit()
def _term_group(sig_num, frame):
    """SIGINT handler: kill the whole process group (reader workers too)."""
    print('pid {} terminated, terminate group '
          '{}...'.format(os.getpid(), os.getpgrp()))
    os.killpg(os.getpgid(os.getpid()), signal.SIGKILL)
# Registered at import time: SIGTERM exits the reader quietly, SIGINT
# kills the whole process group so worker readers die with the parent.
signal.signal(signal.SIGTERM, _reader_quit)
signal.signal(signal.SIGINT, _term_group)
def reader_main(config=None, mode=None):
    """Create a data reader for the given mode.

    Args:
        config: full config dict containing Global plus
            TrainReader/EvalReader/TestReader sections.
        mode: one of "train", "eval", "test".

    Returns:
        A reader callable; for training (except on Windows) a
        multiprocess reader over ``num_workers`` workers.
    """
    assert mode in ["train", "eval", "test"],\
        "Nonsupport mode:{}".format(mode)

    section_by_mode = {
        "train": 'TrainReader',
        "eval": 'EvalReader',
        "test": 'TestReader',
    }
    params = deepcopy(config[section_by_mode[mode]])
    params['mode'] = mode
    params.update(config['Global'])
    function = create_module(params['reader_function'])(params)

    if mode != "train":
        return function(mode)
    # Multiprocess readers are unsupported on Windows: single worker.
    if sys.platform == "win32":
        return function(0)
    readers = [function(process_id)
               for process_id in range(params['num_workers'])]
    return paddle.reader.multiprocess_reader(readers, False)
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import os
import sys
import math
import random
import numpy as np
import cv2
import string
import lmdb
from ppocr.utils.utility import initial_logger
from ppocr.utils.utility import get_image_file_list
logger = initial_logger()
from .img_tools import process_image, process_image_srn, get_img_data
class LMDBReader(object):
def __init__(self, params):
if params['mode'] != 'train':
self.num_workers = 1
else:
self.num_workers = params['num_workers']
self.lmdb_sets_dir = params['lmdb_sets_dir']
self.char_ops = params['char_ops']
self.image_shape = params['image_shape']
self.loss_type = params['loss_type']
self.max_text_length = params['max_text_length']
self.mode = params['mode']
self.drop_last = False
self.use_tps = False
self.num_heads = None
if "num_heads" in params:
self.num_heads = params['num_heads']
if "tps" in params:
self.ues_tps = True
self.use_distort = False
if "distort" in params:
self.use_distort = params['distort'] and params['use_gpu']
if not params['use_gpu']:
logger.info(
"Distort operation can only support in GPU. Distort will be set to False."
)
if params['mode'] == 'train':
self.batch_size = params['train_batch_size_per_card']
self.drop_last = True
else:
self.batch_size = params['test_batch_size_per_card']
self.drop_last = False
self.use_distort = False
self.infer_img = params['infer_img']
    def load_hierarchical_lmdb_dataset(self):
        """Open every leaf directory under lmdb_sets_dir as an LMDB dataset.

        Returns:
            dict: dataset_idx -> {dirpath, env, txn, num_samples}.
        """
        lmdb_sets = {}
        dataset_idx = 0
        for dirpath, dirnames, filenames in os.walk(self.lmdb_sets_dir + '/'):
            # A directory with no subdirectories is treated as one LMDB env.
            if not dirnames:
                env = lmdb.open(
                    dirpath,
                    max_readers=32,
                    readonly=True,
                    lock=False,
                    readahead=False,
                    meminit=False)
                txn = env.begin(write=False)
                # 'num-samples' is a metadata key stored in the LMDB itself.
                num_samples = int(txn.get('num-samples'.encode()))
                lmdb_sets[dataset_idx] = {"dirpath":dirpath, "env":env, \
                    "txn":txn, "num_samples":num_samples}
                dataset_idx += 1
        return lmdb_sets
    def print_lmdb_sets_info(self, lmdb_sets):
        """Log a one-line summary (dirpath:num_samples per dataset)."""
        lmdb_info_strs = []
        for dataset_idx in range(len(lmdb_sets)):
            tmp_str = " %s:%d," % (lmdb_sets[dataset_idx]['dirpath'],
                                   lmdb_sets[dataset_idx]['num_samples'])
            lmdb_info_strs.append(tmp_str)
        lmdb_info_strs = ''.join(lmdb_info_strs)
        logger.info("DataSummary:" + lmdb_info_strs)
        return
def close_lmdb_dataset(self, lmdb_sets):
for dataset_idx in lmdb_sets:
lmdb_sets[dataset_idx]['env'].close()
return
def get_lmdb_sample_info(self, txn, index):
label_key = 'label-%09d'.encode() % index
label = txn.get(label_key)
if label is None:
return None
label = label.decode('utf-8')
img_key = 'image-%09d'.encode() % index
imgbuf = txn.get(img_key)
img = get_img_data(imgbuf)
if img is None:
return None
return img, label
def __call__(self, process_id):
if self.mode != 'train':
process_id = 0
def sample_iter_reader():
if self.mode != 'train' and self.infer_img is not None:
image_file_list = get_image_file_list(self.infer_img)
for single_img in image_file_list:
img = cv2.imread(single_img)
if img.shape[-1] == 1 or len(list(img.shape)) == 2:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
if self.loss_type == 'srn':
norm_img = process_image_srn(
img=img,
image_shape=self.image_shape,
num_heads=self.num_heads,
max_text_length=self.max_text_length)
else:
norm_img = process_image(
img=img,
image_shape=self.image_shape,
char_ops=self.char_ops,
tps=self.use_tps,
infer_mode=True)
yield norm_img
else:
lmdb_sets = self.load_hierarchical_lmdb_dataset()
if process_id == 0:
self.print_lmdb_sets_info(lmdb_sets)
cur_index_sets = [1 + process_id] * len(lmdb_sets)
while True:
finish_read_num = 0
for dataset_idx in range(len(lmdb_sets)):
cur_index = cur_index_sets[dataset_idx]
if cur_index > lmdb_sets[dataset_idx]['num_samples']:
finish_read_num += 1
else:
sample_info = self.get_lmdb_sample_info(
lmdb_sets[dataset_idx]['txn'], cur_index)
cur_index_sets[dataset_idx] += self.num_workers
if sample_info is None:
continue
img, label = sample_info
outs = []
if self.loss_type == "srn":
outs = process_image_srn(
img=img,
image_shape=self.image_shape,
num_heads=self.num_heads,
max_text_length=self.max_text_length,
label=label,
char_ops=self.char_ops,
loss_type=self.loss_type)
else:
outs = process_image(
img=img,
image_shape=self.image_shape,
label=label,
char_ops=self.char_ops,
loss_type=self.loss_type,
max_text_length=self.max_text_length)
if outs is None:
continue
yield outs
if finish_read_num == len(lmdb_sets):
break
self.close_lmdb_dataset(lmdb_sets)
def batch_iter_reader():
batch_outs = []
for outs in sample_iter_reader():
batch_outs.append(outs)
if len(batch_outs) == self.batch_size:
yield batch_outs
batch_outs = []
if not self.drop_last:
if len(batch_outs) != 0:
yield batch_outs
if self.infer_img is None:
return batch_iter_reader
return sample_iter_reader
class SimpleReader(object):
    """Data reader for recognition datasets described by a plain label file
    (one "<image path>\t<label>" entry per line).

    Calling the instance with a worker process id returns a sample-level
    generator for inference (infer_img given) or a batch-level generator
    for train/eval.
    """

    def __init__(self, params):
        # Only training uses multiple reader workers.
        if params['mode'] != 'train':
            self.num_workers = 1
        else:
            self.num_workers = params['num_workers']
        # Test/infer mode has no label file or image root directory.
        if params['mode'] != 'test':
            self.img_set_dir = params['img_set_dir']
            self.label_file_path = params['label_file_path']
        self.use_gpu = params['use_gpu']
        self.char_ops = params['char_ops']
        self.image_shape = params['image_shape']
        self.loss_type = params['loss_type']
        self.max_text_length = params['max_text_length']
        self.mode = params['mode']
        self.infer_img = params['infer_img']
        self.use_tps = False
        # num_heads is only required by the SRN loss.
        if "num_heads" in params:
            self.num_heads = params['num_heads']
        if "tps" in params:
            self.use_tps = True
        self.use_distort = False
        if "distort" in params:
            # Distortion augmentation is supported on GPU only.
            self.use_distort = params['distort'] and params['use_gpu']
            if not params['use_gpu']:
                logger.info(
                    "Distort operation can only support in GPU.Distort will be set to False."
                )
        if params['mode'] == 'train':
            self.batch_size = params['train_batch_size_per_card']
            self.drop_last = True
        else:
            self.batch_size = params['test_batch_size_per_card']
            self.drop_last = False
            self.use_distort = False

    def __call__(self, process_id):
        # Outside training there is a single worker, so force id 0.
        if self.mode != 'train':
            process_id = 0

        def get_device_num():
            # Device count decides how many samples one training step consumes.
            if self.use_gpu:
                gpus = os.environ.get("CUDA_VISIBLE_DEVICES", '1')
                gpu_num = len(gpus.split(','))
                return gpu_num
            else:
                cpu_num = os.environ.get("CPU_NUM", 1)
                return int(cpu_num)

        def sample_iter_reader():
            # Inference path: iterate the images under infer_img directly.
            if self.mode != 'train' and self.infer_img is not None:
                image_file_list = get_image_file_list(self.infer_img)
                for single_img in image_file_list:
                    img = cv2.imread(single_img)
                    # Promote grayscale images to 3-channel BGR.
                    if img.shape[-1] == 1 or len(list(img.shape)) == 2:
                        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
                    if self.loss_type == 'srn':
                        norm_img = process_image_srn(
                            img=img,
                            image_shape=self.image_shape,
                            char_ops=self.char_ops,
                            num_heads=self.num_heads,
                            max_text_length=self.max_text_length)
                    else:
                        norm_img = process_image(
                            img=img,
                            image_shape=self.image_shape,
                            char_ops=self.char_ops,
                            tps=self.use_tps,
                            infer_mode=True)
                    yield norm_img
            else:
                # Train/eval path: read and shuffle the label file, then
                # stride over it so each worker gets a disjoint subset.
                with open(self.label_file_path, "rb") as fin:
                    label_infor_list = fin.readlines()
                img_num = len(label_infor_list)
                img_id_list = list(range(img_num))
                random.shuffle(img_id_list)
                if sys.platform == "win32" and self.num_workers != 1:
                    print("multiprocess is not fully compatible with Windows."
                          "num_workers will be 1.")
                    self.num_workers = 1
                # Each global step needs batch_size * devices * workers samples.
                if self.batch_size * get_device_num(
                ) * self.num_workers > img_num:
                    raise Exception(
                        "The number of the whole data ({}) is smaller than the batch_size * devices_num * num_workers ({})".
                        format(img_num, self.batch_size * get_device_num() *
                               self.num_workers))
                for img_id in range(process_id, img_num, self.num_workers):
                    label_infor = label_infor_list[img_id_list[img_id]]
                    substr = label_infor.decode('utf-8').strip("\n").split("\t")
                    img_path = self.img_set_dir + "/" + substr[0]
                    img = cv2.imread(img_path)
                    # Skip missing images instead of crashing.
                    if img is None:
                        logger.info("{} does not exist!".format(img_path))
                        continue
                    if img.shape[-1] == 1 or len(list(img.shape)) == 2:
                        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
                    label = substr[1]
                    if self.loss_type == "srn":
                        outs = process_image_srn(
                            img=img,
                            image_shape=self.image_shape,
                            num_heads=self.num_heads,
                            max_text_length=self.max_text_length,
                            label=label,
                            char_ops=self.char_ops,
                            loss_type=self.loss_type)
                    else:
                        outs = process_image(
                            img=img,
                            image_shape=self.image_shape,
                            label=label,
                            char_ops=self.char_ops,
                            loss_type=self.loss_type,
                            max_text_length=self.max_text_length,
                            distort=self.use_distort)
                    # process_image* returns None for samples it rejects.
                    if outs is None:
                        continue
                    yield outs

        def batch_iter_reader():
            # Group samples into batches; a trailing partial batch is kept
            # unless drop_last is set (training).
            batch_outs = []
            for outs in sample_iter_reader():
                batch_outs.append(outs)
                if len(batch_outs) == self.batch_size:
                    yield batch_outs
                    batch_outs = []
            if not self.drop_last:
                if len(batch_outs) != 0:
                    yield batch_outs

        # Inference consumes raw samples; train/eval consume batches.
        if self.infer_img is None:
            return batch_iter_reader
        return sample_iter_reader
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
__all__ = ['DetMetric']
from .eval_det_iou import DetectionIoUEvaluator
class DetMetric(object):
    """Accumulate per-image detection IoU results and report precision,
    recall and hmean via DetectionIoUEvaluator."""

    def __init__(self, main_indicator='hmean', **kwargs):
        self.evaluator = DetectionIoUEvaluator()
        self.main_indicator = main_indicator
        self.reset()

    def __call__(self, preds, batch, **kwargs):
        """Evaluate one batch and store the per-image results.

        batch: list from the dataloader; batch[2] holds the ground-truth
            polygons (N, K, 4, 2) and batch[3] the per-region ignore flags
            (N, K).
        preds: list of post-process dicts, each with a 'points' entry of
            shape (K, 4, 2).
        """
        gt_polygons_batch = batch[2]
        ignore_flags_batch = batch[3]
        zipped = zip(preds, gt_polygons_batch, ignore_flags_batch)
        for pred, gt_polygons, ignore_flags in zipped:
            # Ground-truth regions carry an ignore flag; text is unused.
            gt_info_list = [
                {'points': polygon, 'text': '', 'ignore': flag}
                for polygon, flag in zip(gt_polygons, ignore_flags)
            ]
            # Detections only need their polygons.
            det_info_list = [
                {'points': polygon, 'text': ''}
                for polygon in pred['points']
            ]
            image_result = self.evaluator.evaluate_image(gt_info_list,
                                                         det_info_list)
            self.results.append(image_result)

    def get_metric(self):
        """Combine accumulated results into {'precision', 'recall', 'hmean'}
        and clear the accumulator."""
        combined = self.evaluator.combine_results(self.results)
        self.reset()
        return combined

    def reset(self):
        # Drop all accumulated per-image results.
        self.results = []
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import Levenshtein
class RecMetric(object):
    """Accumulate text-recognition accuracy and normalized edit distance."""

    def __init__(self, main_indicator='acc', **kwargs):
        self.main_indicator = main_indicator
        self.reset()

    def __call__(self, pred_label, *args, **kwargs):
        """Update counters with one batch.

        Args:
            pred_label: tuple (preds, labels); each element is a sequence
                of (text, confidence) pairs.
        Returns:
            dict with the batch-level 'acc' and 'norm_edit_dis'.
        """
        preds, labels = pred_label
        correct_num = 0
        all_num = 0
        norm_edit_dis = 0.0
        for (pred, pred_conf), (target, _) in zip(preds, labels):
            # The extra `1` guards against both strings being empty, which
            # made max(...) == 0 and raised ZeroDivisionError before.
            norm_edit_dis += Levenshtein.distance(pred, target) / max(
                len(pred), len(target), 1)
            if pred == target:
                correct_num += 1
            all_num += 1
        self.correct_num += correct_num
        self.all_num += all_num
        self.norm_edit_dis += norm_edit_dis
        # Guard against an empty batch (would divide by zero below).
        if all_num == 0:
            return {'acc': 0, 'norm_edit_dis': 0}
        return {
            'acc': correct_num / all_num,
            'norm_edit_dis': 1 - norm_edit_dis / all_num
        }

    def get_metric(self):
        """
        return metrics {
            'acc': 0,
            'norm_edit_dis': 0,
        }
        """
        acc = self.correct_num / self.all_num
        norm_edit_dis = 1 - self.norm_edit_dis / self.all_num
        self.reset()
        return {'acc': acc, 'norm_edit_dis': norm_edit_dis}

    def reset(self):
        # Clear accumulated counters.
        self.correct_num = 0
        self.all_num = 0
        self.norm_edit_dis = 0
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import copy
__all__ = ['build_metric']
def build_metric(config):
    """Create a metric instance from a config dict.

    Args:
        config (dict): must contain 'name' (one of the supported metric
            class names); remaining keys are passed to the constructor.
    Returns:
        the instantiated metric object.
    """
    from .DetMetric import DetMetric
    from .RecMetric import RecMetric

    # Explicit registry instead of eval() on the config-supplied name.
    support_dict = {'DetMetric': DetMetric, 'RecMetric': RecMetric}

    config = copy.deepcopy(config)
    module_name = config.pop('name')
    assert module_name in support_dict, Exception(
        'metric only support {}'.format(list(support_dict)))
    return support_dict[module_name](**config)
...@@ -88,8 +88,8 @@ class DetectionIoUEvaluator(object): ...@@ -88,8 +88,8 @@ class DetectionIoUEvaluator(object):
points = gt[n]['points'] points = gt[n]['points']
# transcription = gt[n]['text'] # transcription = gt[n]['text']
dontCare = gt[n]['ignore'] dontCare = gt[n]['ignore']
# points = Polygon(points) # points = Polygon(points)
# points = points.buffer(0) # points = points.buffer(0)
if not Polygon(points).is_valid or not Polygon(points).is_simple: if not Polygon(points).is_valid or not Polygon(points).is_simple:
continue continue
...@@ -105,8 +105,8 @@ class DetectionIoUEvaluator(object): ...@@ -105,8 +105,8 @@ class DetectionIoUEvaluator(object):
for n in range(len(pred)): for n in range(len(pred)):
points = pred[n]['points'] points = pred[n]['points']
# points = Polygon(points) # points = Polygon(points)
# points = points.buffer(0) # points = points.buffer(0)
if not Polygon(points).is_valid or not Polygon(points).is_simple: if not Polygon(points).is_valid or not Polygon(points).is_simple:
continue continue
......
...@@ -11,3 +11,16 @@ ...@@ -11,3 +11,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import copy
from .losses import build_loss
__all__ = ['build_model', 'build_loss']
def build_model(config):
    """Instantiate the top-level Model architecture from a config dict."""
    from .architectures import Model

    # Deep-copy so the caller's config is not mutated by model construction.
    model_config = copy.deepcopy(config)
    return Model(model_config)
...@@ -11,3 +11,6 @@ ...@@ -11,3 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from .model import Model
__all__ = ['Model']
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddle import fluid
from ppocr.utils.utility import create_module
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from copy import deepcopy
class DetModel(object):
    # Legacy static-graph (fluid) detection model: backbone + head + loss
    # assembled from a params dict via create_module.
    def __init__(self, params):
        """
        Detection module for OCR text detection.
        args:
            params (dict): the super parameters for detection module.
        """
        global_params = params['Global']
        self.algorithm = global_params['algorithm']
        # Each component's own params are merged with the global section
        # before instantiation.
        backbone_params = deepcopy(params["Backbone"])
        backbone_params.update(global_params)
        self.backbone = create_module(backbone_params['function'])\
            (params=backbone_params)
        head_params = deepcopy(params["Head"])
        head_params.update(global_params)
        self.head = create_module(head_params['function'])\
            (params=head_params)
        loss_params = deepcopy(params["Loss"])
        loss_params.update(global_params)
        self.loss = create_module(loss_params['function'])\
            (params=loss_params)
        self.image_shape = global_params['image_shape']

    def create_feed(self, mode):
        """
        create Dataloader feeds
        args:
            mode (str): 'train' for training or else for evaluation
        return: (image, corresponding label, dataloader)
        """
        image_shape = deepcopy(self.image_shape)
        # Downsampling in the network requires H and W divisible by 4.
        if image_shape[1] % 4 != 0 or image_shape[2] % 4 != 0:
            raise Exception("The size of the image must be divisible by 4, "
                            "received image shape is {}, please reset the "
                            "Global.image_shape in the yml file".format(
                                image_shape))
        image = fluid.layers.data(
            name='image', shape=image_shape, dtype='float32')
        if mode == "train":
            # Each algorithm consumes a different set of label maps.
            if self.algorithm == "EAST":
                # EAST labels are predicted at 1/4 resolution.
                h, w = int(image_shape[1] // 4), int(image_shape[2] // 4)
                score = fluid.layers.data(
                    name='score', shape=[1, h, w], dtype='float32')
                geo = fluid.layers.data(
                    name='geo', shape=[9, h, w], dtype='float32')
                mask = fluid.layers.data(
                    name='mask', shape=[1, h, w], dtype='float32')
                feed_list = [image, score, geo, mask]
                labels = {'score': score, 'geo': geo, 'mask': mask}
            elif self.algorithm == "DB":
                shrink_map = fluid.layers.data(
                    name='shrink_map', shape=image_shape[1:], dtype='float32')
                shrink_mask = fluid.layers.data(
                    name='shrink_mask', shape=image_shape[1:], dtype='float32')
                threshold_map = fluid.layers.data(
                    name='threshold_map',
                    shape=image_shape[1:],
                    dtype='float32')
                threshold_mask = fluid.layers.data(
                    name='threshold_mask',
                    shape=image_shape[1:],
                    dtype='float32')
                feed_list=[image, shrink_map, shrink_mask,\
                           threshold_map, threshold_mask]
                labels = {'shrink_map':shrink_map,\
                          'shrink_mask':shrink_mask,\
                          'threshold_map':threshold_map,\
                          'threshold_mask':threshold_mask}
            elif self.algorithm == "SAST":
                # NOTE(review): SAST label maps use a hard-coded 128x128
                # resolution — confirm this matches the configured
                # image_shape before changing input sizes.
                input_score = fluid.layers.data(
                    name='score', shape=[1, 128, 128], dtype='float32')
                input_border = fluid.layers.data(
                    name='border', shape=[5, 128, 128], dtype='float32')
                input_mask = fluid.layers.data(
                    name='mask', shape=[1, 128, 128], dtype='float32')
                input_tvo = fluid.layers.data(
                    name='tvo', shape=[9, 128, 128], dtype='float32')
                input_tco = fluid.layers.data(
                    name='tco', shape=[3, 128, 128], dtype='float32')
                feed_list = [image, input_score, input_border, input_mask, input_tvo, input_tco]
                labels = {'input_score': input_score,\
                          'input_border': input_border,\
                          'input_mask': input_mask,\
                          'input_tvo': input_tvo,\
                          'input_tco': input_tco}
            loader = fluid.io.DataLoader.from_generator(
                feed_list=feed_list,
                capacity=64,
                use_double_buffer=True,
                iterable=False)
        else:
            # Evaluation/export: no labels and no dataloader needed.
            labels = None
            loader = None
        return image, labels, loader

    def __call__(self, mode):
        """
        run forward of defined module
        args:
            mode (str): 'train' for training; 'export' for inference,
                others for evaluation]
        """
        image, labels, loader = self.create_feed(mode)
        conv_feas = self.backbone(image)
        # The DB head behaves differently in train vs eval, so it needs mode.
        if self.algorithm == "DB":
            predicts = self.head(conv_feas, mode)
        else:
            predicts = self.head(conv_feas)
        if mode == "train":
            losses = self.loss(predicts, labels)
            return loader, losses
        elif mode == "export":
            # Export returns the input placeholder and fetch targets.
            return [image, predicts]
        else:
            return loader, predicts
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os, sys
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append('/home/zhoujun20/PaddleOCR')
import paddle
from paddle import nn
from ppocr.modeling.transform import build_transform
from ppocr.modeling.backbones import build_backbone
from ppocr.modeling.necks import build_neck
from ppocr.modeling.heads import build_head
__all__ = ['Model']
class Model(nn.Layer):
    def __init__(self, config):
        """
        Composite OCR network: optional transform -> backbone ->
        optional neck -> head, each stage built from the config dict.

        args:
            config (dict): the super parameters for module.
        """
        super(Model, self).__init__()
        algorithm = config['algorithm']
        self.type = config['type']
        self.model_name = '{}_{}'.format(self.type, algorithm)
        in_channels = config.get('in_channels', 3)

        # Transform stage (e.g. TPS for rec). Absent or None means the
        # stage is skipped; det and cls normally have no transform.
        transform_config = config.get('Transform')
        self.use_transform = transform_config is not None
        if self.use_transform:
            transform_config['in_channels'] = in_channels
            self.transform = build_transform(transform_config)
            in_channels = self.transform.out_channels

        # Backbone is mandatory for det, rec and cls.
        config["Backbone"]['in_channels'] = in_channels
        self.backbone = build_backbone(config["Backbone"], self.type)
        in_channels = self.backbone.out_channels

        # Neck stage: FPN/BiFPN for det, cnn/rnn/reshape for rec; cls has
        # none. Absent or None means the stage is skipped.
        neck_config = config.get('Neck')
        self.use_neck = neck_config is not None
        if self.use_neck:
            neck_config['in_channels'] = in_channels
            self.neck = build_neck(neck_config)
            in_channels = self.neck.out_channels

        # Head produces the task-specific prediction.
        config["Head"]['in_channels'] = in_channels
        self.head = build_head(config["Head"])

    def forward(self, x):
        # Run the stages in order, skipping the optional ones.
        if self.use_transform:
            x = self.transform(x)
        features = self.backbone(x)
        if self.use_neck:
            features = self.neck(features)
        return self.head(features)
def check_static():
    """Ad-hoc debug harness: run the dygraph Model on a zero tensor and
    diff the output against a saved static-graph result.

    NOTE(review): relies on hard-coded local paths and a developer config;
    intended for manual use during the dygraph port, not as library API.
    """
    import numpy as np
    from ppocr.utils.save_load import load_dygraph_pretrain
    from ppocr.utils.logging import get_logger
    from tools import program
    config = program.load_config('configs/det/det_r50_vd_db.yml')
    # import cv2
    # data = cv2.imread('doc/imgs/1.jpg')
    # data = normalize(data)
    logger = get_logger()
    # Dummy input in (N, C, H, W) layout matching the detection config.
    data = np.zeros((1, 3, 640, 640), dtype=np.float32)
    paddle.disable_static()
    config['Architecture']['in_channels'] = 3
    config['Architecture']["Head"]['out_channels'] = 6624
    model = Model(config['Architecture'])
    model.eval()
    # Load static-graph weights converted for the dygraph model.
    load_dygraph_pretrain(
        model,
        logger,
        '/Users/zhoujun20/Desktop/code/PaddleOCR/db/db',
        load_static_weights=True)
    x = paddle.to_variable(data)
    y = model(x)
    for y1 in y:
        print(y1.shape)
    #
    # # from matplotlib import pyplot as plt
    # # plt.imshow(y.numpy())
    # # plt.show()
    # Compare against the saved static-graph output; mean diff should be ~0.
    static_out = np.load('/Users/zhoujun20/Desktop/code/PaddleOCR/db/db.npy')
    diff = y.numpy() - static_out
    print(y.shape, static_out.shape, diff.mean())
if __name__ == '__main__':
    check_static()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddle import fluid
from ppocr.utils.utility import create_module
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from copy import deepcopy
class RecModel(object):
    """Legacy static-graph (fluid) text recognition model: optional TPS
    transform, backbone, head and loss assembled from a params dict."""

    def __init__(self, params):
        super(RecModel, self).__init__()
        global_params = params['Global']
        char_num = global_params['char_ops'].get_char_num()
        global_params['char_num'] = char_num
        self.char_type = global_params['character_type']
        self.infer_img = global_params['infer_img']
        # Optional TPS spatial transformer in front of the backbone.
        if "TPS" in params:
            tps_params = deepcopy(params["TPS"])
            tps_params.update(global_params)
            self.tps = create_module(tps_params['function'])\
                (params=tps_params)
        else:
            self.tps = None
        # Each component's own params are merged with the global section.
        backbone_params = deepcopy(params["Backbone"])
        backbone_params.update(global_params)
        self.backbone = create_module(backbone_params['function'])\
            (params=backbone_params)
        head_params = deepcopy(params["Head"])
        head_params.update(global_params)
        self.head = create_module(head_params['function'])\
            (params=head_params)
        loss_params = deepcopy(params["Loss"])
        loss_params.update(global_params)
        self.loss = create_module(loss_params['function'])\
            (params=loss_params)
        self.loss_type = global_params['loss_type']
        self.image_shape = global_params['image_shape']
        self.max_text_length = global_params['max_text_length']
        # num_heads is only required by the SRN loss.
        if "num_heads" in global_params:
            self.num_heads = global_params["num_heads"]
        else:
            self.num_heads = None

    def create_feed(self, mode):
        """Create input placeholders, the labels dict and (for train) a
        DataLoader.

        args:
            mode (str): 'train' for training, others for eval/inference
        return: (image placeholder, labels dict or None, loader or None)
        """
        image_shape = deepcopy(self.image_shape)
        image_shape.insert(0, -1)  # prepend batch dimension
        if mode == "train":
            image = fluid.data(name='image', shape=image_shape, dtype='float32')
            if self.loss_type == "attention":
                # Attention needs shifted-in and shifted-out label sequences.
                label_in = fluid.data(
                    name='label_in',
                    shape=[None, 1],
                    dtype='int32',
                    lod_level=1)
                label_out = fluid.data(
                    name='label_out',
                    shape=[None, 1],
                    dtype='int32',
                    lod_level=1)
                feed_list = [image, label_in, label_out]
                labels = {'label_in': label_in, 'label_out': label_out}
            elif self.loss_type == "srn":
                # SRN needs positional encodings and self-attention biases.
                encoder_word_pos = fluid.data(
                    name="encoder_word_pos",
                    shape=[
                        -1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)),
                        1
                    ],
                    dtype="int64")
                gsrm_word_pos = fluid.data(
                    name="gsrm_word_pos",
                    shape=[-1, self.max_text_length, 1],
                    dtype="int64")
                gsrm_slf_attn_bias1 = fluid.data(
                    name="gsrm_slf_attn_bias1",
                    shape=[
                        -1, self.num_heads, self.max_text_length,
                        self.max_text_length
                    ],
                    dtype="float32")
                gsrm_slf_attn_bias2 = fluid.data(
                    name="gsrm_slf_attn_bias2",
                    shape=[
                        -1, self.num_heads, self.max_text_length,
                        self.max_text_length
                    ],
                    dtype="float32")
                lbl_weight = fluid.layers.data(
                    name="lbl_weight", shape=[-1, 1], dtype='int64')
                label = fluid.data(
                    name='label', shape=[-1, 1], dtype='int32', lod_level=1)
                feed_list = [
                    image, label, encoder_word_pos, gsrm_word_pos,
                    gsrm_slf_attn_bias1, gsrm_slf_attn_bias2, lbl_weight
                ]
                labels = {
                    'label': label,
                    'encoder_word_pos': encoder_word_pos,
                    'gsrm_word_pos': gsrm_word_pos,
                    'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1,
                    'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2,
                    'lbl_weight': lbl_weight
                }
            else:
                # CTC only needs the label sequence.
                label = fluid.data(
                    name='label', shape=[None, 1], dtype='int32', lod_level=1)
                feed_list = [image, label]
                labels = {'label': label}
            loader = fluid.io.DataLoader.from_generator(
                feed_list=feed_list,
                capacity=64,
                use_double_buffer=True,
                iterable=False)
        else:
            labels = None
            loader = None
            # Chinese inference uses variable-width images unless TPS is on.
            if self.char_type == "ch" and self.infer_img:
                image_shape[-1] = -1
                if self.tps is not None:
                    logger.info(
                        "WARNING!!!\n"
                        "TPS does not support variable shape in chinese!"
                        "We set img_shape to be the same , it may affect the inference effect"
                    )
                    # NOTE(review): this re-copy drops the batch dim inserted
                    # above — confirm fluid.data tolerates the shorter shape.
                    image_shape = deepcopy(self.image_shape)
            image = fluid.data(name='image', shape=image_shape, dtype='float32')
            if self.loss_type == "srn":
                # SRN inference still needs positional/attention inputs.
                encoder_word_pos = fluid.data(
                    name="encoder_word_pos",
                    shape=[
                        -1, int((image_shape[-2] / 8) * (image_shape[-1] / 8)),
                        1
                    ],
                    dtype="int64")
                gsrm_word_pos = fluid.data(
                    name="gsrm_word_pos",
                    shape=[-1, self.max_text_length, 1],
                    dtype="int64")
                gsrm_slf_attn_bias1 = fluid.data(
                    name="gsrm_slf_attn_bias1",
                    shape=[
                        -1, self.num_heads, self.max_text_length,
                        self.max_text_length
                    ],
                    dtype="float32")
                gsrm_slf_attn_bias2 = fluid.data(
                    name="gsrm_slf_attn_bias2",
                    shape=[
                        -1, self.num_heads, self.max_text_length,
                        self.max_text_length
                    ],
                    dtype="float32")
                feed_list = [
                    image, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
                    gsrm_slf_attn_bias2
                ]
                labels = {
                    'encoder_word_pos': encoder_word_pos,
                    'gsrm_word_pos': gsrm_word_pos,
                    'gsrm_slf_attn_bias1': gsrm_slf_attn_bias1,
                    'gsrm_slf_attn_bias2': gsrm_slf_attn_bias2
                }
        return image, labels, loader

    def __call__(self, mode):
        """Run forward; returns (loader, outputs) for train/eval or
        [image, fetch_targets] for export."""
        image, labels, loader = self.create_feed(mode)
        if self.tps is None:
            inputs = image
        else:
            inputs = self.tps(image)
        conv_feas = self.backbone(inputs)
        predicts = self.head(conv_feas, labels, mode)
        decoded_out = predicts['decoded_out']
        if mode == "train":
            # Build the loss graph once; SRN's loss returns a 3-tuple.
            # (The original called self.loss twice for SRN, adding duplicate
            # ops to the static graph.)
            loss = self.loss(predicts, labels)
            if self.loss_type == "attention":
                label = labels['label_out']
            else:
                label = labels['label']
            if self.loss_type == 'srn':
                total_loss, img_loss, word_loss = loss
                outputs = {
                    'total_loss': total_loss,
                    'img_loss': img_loss,
                    'word_loss': word_loss,
                    'decoded_out': decoded_out,
                    'label': label
                }
            else:
                outputs = {'total_loss':loss, 'decoded_out':\
                    decoded_out, 'label':label}
            return loader, outputs
        elif mode == "export":
            predict = predicts['predict']
            if self.loss_type == "ctc":
                predict = fluid.layers.softmax(predict)
            if self.loss_type == "srn":
                raise Exception(
                    "Warning! SRN does not support export model currently")
            return [image, {'decoded_out': decoded_out, 'predicts': predict}]
        else:
            predict = predicts['predict']
            if self.loss_type == "ctc":
                predict = fluid.layers.softmax(predict)
            return loader, {'decoded_out': decoded_out, 'predicts': predict}
...@@ -11,3 +11,26 @@ ...@@ -11,3 +11,26 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
__all__ = ['build_backbone']
def build_backbone(config, model_type):
    """Create a backbone network for the given model type.

    Args:
        config (dict): must contain 'name' (a supported backbone class
            name); remaining keys are passed to the constructor.
        model_type (str): 'det' or 'rec'.
    Returns:
        the instantiated backbone.
    """
    if model_type == 'det':
        from .det_mobilenet_v3 import MobileNetV3
        from .det_resnet_vd import ResNet
        # NOTE(review): 'ResNet_SAST' is accepted but not imported here —
        # selecting it would raise NameError; confirm the intended module.
        support_dict = ['MobileNetV3', 'ResNet', 'ResNet_SAST']
    elif model_type == 'rec':
        from .rec_mobilenet_v3 import MobileNetV3
        from .rec_resnet_vd import ResNet
        # NOTE(review): 'ResNet_FPN' is accepted but not imported here.
        support_dict = ['MobileNetV3', 'ResNet', 'ResNet_FPN']
    else:
        raise NotImplementedError
    module_name = config.pop('name')
    # Typo fixed in the error message: "model typs" -> "model type".
    assert module_name in support_dict, Exception(
        'when model type is {}, backbone only support {}'.format(model_type,
                                                                 support_dict))
    module_class = eval(module_name)(**config)
    return module_class
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. # copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
# #
#Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
#You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
#Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
#limitations under the License. # limitations under the License.
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import paddle.fluid as fluid import paddle
from paddle.fluid.initializer import MSRA from paddle import nn
from paddle.fluid.param_attr import ParamAttr import paddle.nn.functional as F
from paddle import ParamAttr
__all__ = ['MobileNetV3'] __all__ = ['MobileNetV3']
class MobileNetV3(): def make_divisible(v, divisor=8, min_value=None):
def __init__(self, params): if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
class MobileNetV3(nn.Layer):
def __init__(self, in_channels=3, model_name='large', scale=0.5, **kwargs):
""" """
the MobilenetV3 backbone network for detection module. the MobilenetV3 backbone network for detection module.
Args: Args:
params(dict): the super parameters for build network params(dict): the super parameters for build network
""" """
self.scale = params['scale'] super(MobileNetV3, self).__init__()
model_name = params['model_name']
self.inplanes = 16
if model_name == "large": if model_name == "large":
self.cfg = [ cfg = [
# k, exp, c, se, nl, s, # k, exp, c, se, nl, s,
[3, 16, 16, False, 'relu', 1], [3, 16, 16, False, 'relu', 1],
[3, 64, 24, False, 'relu', 2], [3, 64, 24, False, 'relu', 2],
...@@ -52,10 +60,9 @@ class MobileNetV3(): ...@@ -52,10 +60,9 @@ class MobileNetV3():
[5, 960, 160, True, 'hard_swish', 1], [5, 960, 160, True, 'hard_swish', 1],
[5, 960, 160, True, 'hard_swish', 1], [5, 960, 160, True, 'hard_swish', 1],
] ]
self.cls_ch_squeeze = 960 cls_ch_squeeze = 960
self.cls_ch_expand = 1280
elif model_name == "small": elif model_name == "small":
self.cfg = [ cfg = [
# k, exp, c, se, nl, s, # k, exp, c, se, nl, s,
[3, 16, 16, True, 'relu', 2], [3, 16, 16, True, 'relu', 2],
[3, 72, 24, False, 'relu', 2], [3, 72, 24, False, 'relu', 2],
...@@ -69,183 +76,203 @@ class MobileNetV3(): ...@@ -69,183 +76,203 @@ class MobileNetV3():
[5, 576, 96, True, 'hard_swish', 1], [5, 576, 96, True, 'hard_swish', 1],
[5, 576, 96, True, 'hard_swish', 1], [5, 576, 96, True, 'hard_swish', 1],
] ]
self.cls_ch_squeeze = 576 cls_ch_squeeze = 576
self.cls_ch_expand = 1280
else: else:
raise NotImplementedError("mode[" + model_name + raise NotImplementedError("mode[" + model_name +
"_model] is not implemented!") "_model] is not implemented!")
supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25] supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
assert self.scale in supported_scale, \ assert scale in supported_scale, \
"supported scale are {} but input scale is {}".format(supported_scale, self.scale) "supported scale are {} but input scale is {}".format(supported_scale, scale)
inplanes = 16
def __call__(self, input): # conv1
scale = self.scale self.conv = ConvBNLayer(
inplanes = self.inplanes in_channels=in_channels,
cfg = self.cfg out_channels=make_divisible(inplanes * scale),
cls_ch_squeeze = self.cls_ch_squeeze kernel_size=3,
cls_ch_expand = self.cls_ch_expand
#conv1
conv = self.conv_bn_layer(
input,
filter_size=3,
num_filters=self.make_divisible(inplanes * scale),
stride=2, stride=2,
padding=1, padding=1,
num_groups=1, groups=1,
if_act=True, if_act=True,
act='hard_swish', act='hard_swish',
name='conv1') name='conv1')
self.stages = []
self.out_channels = []
block_list = []
i = 0 i = 0
inplanes = self.make_divisible(inplanes * scale) inplanes = make_divisible(inplanes * scale)
outs = [] for (k, exp, c, se, nl, s) in cfg:
for layer_cfg in cfg: if s == 2 and i > 2:
if layer_cfg[5] == 2 and i > 2: self.out_channels.append(inplanes)
outs.append(conv) self.stages.append(nn.Sequential(*block_list))
conv = self.residual_unit( block_list = []
input=conv, block_list.append(
num_in_filter=inplanes, ResidualUnit(
num_mid_filter=self.make_divisible(scale * layer_cfg[1]), in_channels=inplanes,
num_out_filter=self.make_divisible(scale * layer_cfg[2]), mid_channels=make_divisible(scale * exp),
act=layer_cfg[4], out_channels=make_divisible(scale * c),
stride=layer_cfg[5], kernel_size=k,
filter_size=layer_cfg[0], stride=s,
use_se=layer_cfg[3], use_se=se,
name='conv' + str(i + 2)) act=nl,
inplanes = self.make_divisible(scale * layer_cfg[2]) name="conv" + str(i + 2)))
inplanes = make_divisible(scale * c)
i += 1 i += 1
block_list.append(
ConvBNLayer(
in_channels=inplanes,
out_channels=make_divisible(scale * cls_ch_squeeze),
kernel_size=1,
stride=1,
padding=0,
groups=1,
if_act=True,
act='hard_swish',
name='conv_last'))
conv = self.conv_bn_layer( self.stages.append(nn.Sequential(*block_list))
input=conv, self.out_channels.append(make_divisible(scale * cls_ch_squeeze))
filter_size=1, for i, stage in enumerate(self.stages):
num_filters=self.make_divisible(scale * cls_ch_squeeze), self.add_sublayer(sublayer=stage, name="stage{}".format(i))
stride=1,
padding=0, def forward(self, x):
num_groups=1, x = self.conv(x)
if_act=True, out_list = []
act='hard_swish', for stage in self.stages:
name='conv_last') x = stage(x)
outs.append(conv) out_list.append(x)
return outs return out_list
def conv_bn_layer(self,
input, class ConvBNLayer(nn.Layer):
filter_size, def __init__(self,
num_filters, in_channels,
stride, out_channels,
padding, kernel_size,
num_groups=1, stride,
if_act=True, padding,
act=None, groups=1,
name=None, if_act=True,
use_cudnn=True, act=None,
res_last_bn_init=False): name=None):
conv = fluid.layers.conv2d( super(ConvBNLayer, self).__init__()
input=input, self.if_act = if_act
num_filters=num_filters, self.act = act
filter_size=filter_size, self.conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride, stride=stride,
padding=padding, padding=padding,
groups=num_groups, groups=groups,
act=None, weight_attr=ParamAttr(name=name + '_weights'),
use_cudnn=use_cudnn,
param_attr=ParamAttr(name=name + '_weights'),
bias_attr=False) bias_attr=False)
bn_name = name + '_bn'
bn = fluid.layers.batch_norm( self.bn = nn.BatchNorm(
input=conv, num_channels=out_channels,
param_attr=ParamAttr( act=None,
name=bn_name + "_scale", param_attr=ParamAttr(name=name + "_bn_scale"),
regularizer=fluid.regularizer.L2DecayRegularizer( bias_attr=ParamAttr(name=name + "_bn_offset"),
regularization_coeff=0.0)), moving_mean_name=name + "_bn_mean",
bias_attr=ParamAttr( moving_variance_name=name + "_bn_variance")
name=bn_name + "_offset",
regularizer=fluid.regularizer.L2DecayRegularizer( def forward(self, x):
regularization_coeff=0.0)), x = self.conv(x)
moving_mean_name=bn_name + '_mean', x = self.bn(x)
moving_variance_name=bn_name + '_variance') if self.if_act:
if if_act: if self.act == "relu":
if act == 'relu': x = F.relu(x)
bn = fluid.layers.relu(bn) elif self.act == "hard_swish":
elif act == 'hard_swish': x = F.hard_swish(x)
bn = fluid.layers.hard_swish(bn) else:
return bn print("The activation function is selected incorrectly.")
exit()
def make_divisible(self, v, divisor=8, min_value=None): return x
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) class ResidualUnit(nn.Layer):
if new_v < 0.9 * v: def __init__(self,
new_v += divisor in_channels,
return new_v mid_channels,
out_channels,
def se_block(self, input, num_out_filter, ratio=4, name=None): kernel_size,
num_mid_filter = num_out_filter // ratio stride,
pool = fluid.layers.pool2d( use_se,
input=input, pool_type='avg', global_pooling=True, use_cudnn=False) act=None,
conv1 = fluid.layers.conv2d( name=''):
input=pool, super(ResidualUnit, self).__init__()
filter_size=1, self.if_shortcut = stride == 1 and in_channels == out_channels
num_filters=num_mid_filter, self.if_se = use_se
act='relu',
param_attr=ParamAttr(name=name + '_1_weights'), self.expand_conv = ConvBNLayer(
bias_attr=ParamAttr(name=name + '_1_offset')) in_channels=in_channels,
conv2 = fluid.layers.conv2d( out_channels=mid_channels,
input=conv1, kernel_size=1,
filter_size=1,
num_filters=num_out_filter,
act='hard_sigmoid',
param_attr=ParamAttr(name=name + '_2_weights'),
bias_attr=ParamAttr(name=name + '_2_offset'))
scale = fluid.layers.elementwise_mul(x=input, y=conv2, axis=0)
return scale
def residual_unit(self,
input,
num_in_filter,
num_mid_filter,
num_out_filter,
stride,
filter_size,
act=None,
use_se=False,
name=None):
conv0 = self.conv_bn_layer(
input=input,
filter_size=1,
num_filters=num_mid_filter,
stride=1, stride=1,
padding=0, padding=0,
if_act=True, if_act=True,
act=act, act=act,
name=name + '_expand') name=name + "_expand")
self.bottleneck_conv = ConvBNLayer(
conv1 = self.conv_bn_layer( in_channels=mid_channels,
input=conv0, out_channels=mid_channels,
filter_size=filter_size, kernel_size=kernel_size,
num_filters=num_mid_filter,
stride=stride, stride=stride,
padding=int((filter_size - 1) // 2), padding=int((kernel_size - 1) // 2),
groups=mid_channels,
if_act=True, if_act=True,
act=act, act=act,
num_groups=num_mid_filter, name=name + "_depthwise")
use_cudnn=False, if self.if_se:
name=name + '_depthwise') self.mid_se = SEModule(mid_channels, name=name + "_se")
if use_se: self.linear_conv = ConvBNLayer(
conv1 = self.se_block( in_channels=mid_channels,
input=conv1, num_out_filter=num_mid_filter, name=name + '_se') out_channels=out_channels,
kernel_size=1,
conv2 = self.conv_bn_layer(
input=conv1,
filter_size=1,
num_filters=num_out_filter,
stride=1, stride=1,
padding=0, padding=0,
if_act=False, if_act=False,
name=name + '_linear', act=None,
res_last_bn_init=True) name=name + "_linear")
if num_in_filter != num_out_filter or stride != 1:
return conv2 def forward(self, inputs):
else: x = self.expand_conv(inputs)
return fluid.layers.elementwise_add(x=input, y=conv2, act=None) x = self.bottleneck_conv(x)
if self.if_se:
x = self.mid_se(x)
x = self.linear_conv(x)
if self.if_shortcut:
x = paddle.elementwise_add(inputs, x)
return x
class SEModule(nn.Layer):
def __init__(self, in_channels, reduction=4, name=""):
super(SEModule, self).__init__()
self.avg_pool = nn.Pool2D(
pool_type="avg", global_pooling=True, use_cudnn=False)
self.conv1 = nn.Conv2d(
in_channels=in_channels,
out_channels=in_channels // reduction,
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(name=name + "_1_weights"),
bias_attr=ParamAttr(name=name + "_1_offset"))
self.conv2 = nn.Conv2d(
in_channels=in_channels // reduction,
out_channels=in_channels,
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(name + "_2_weights"),
bias_attr=ParamAttr(name=name + "_2_offset"))
def forward(self, inputs):
outputs = self.avg_pool(inputs)
outputs = self.conv1(outputs)
outputs = F.relu(outputs)
outputs = self.conv2(outputs)
outputs = F.hard_sigmoid(outputs)
return inputs * outputs
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. # copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
# #
#Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
#You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
#Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
#limitations under the License. # limitations under the License.
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import paddle.fluid as fluid from paddle import nn
from paddle.fluid.param_attr import ParamAttr from paddle.nn import functional as F
from paddle import ParamAttr
__all__ = ["ResNet"] __all__ = ["ResNet"]
class ResNet(object): class ResNet(nn.Layer):
def __init__(self, params): def __init__(self, in_channels=3, layers=50, **kwargs):
""" """
the Resnet backbone network for detection module. the Resnet backbone network for detection module.
Args: Args:
params(dict): the super parameters for network build params(dict): the super parameters for network build
""" """
self.layers = params['layers'] super(ResNet, self).__init__()
supported_layers = [18, 34, 50, 101, 152] supported_layers = {
assert self.layers in supported_layers, \ 18: {
"supported layers are {} but input layer is {}".format(supported_layers, self.layers) 'depth': [2, 2, 2, 2],
self.is_3x3 = True 'block_class': BasicBlock
},
def __call__(self, input): 34: {
layers = self.layers 'depth': [3, 4, 6, 3],
is_3x3 = self.is_3x3 'block_class': BasicBlock
if layers == 18: },
depth = [2, 2, 2, 2] 50: {
elif layers == 34 or layers == 50: 'depth': [3, 4, 6, 3],
depth = [3, 4, 6, 3] 'block_class': BottleneckBlock
elif layers == 101: },
depth = [3, 4, 23, 3] 101: {
elif layers == 152: 'depth': [3, 4, 23, 3],
depth = [3, 8, 36, 3] 'block_class': BottleneckBlock
elif layers == 200: },
depth = [3, 12, 48, 3] 152: {
'depth': [3, 8, 36, 3],
'block_class': BottleneckBlock
},
200: {
'depth': [3, 12, 48, 3],
'block_class': BottleneckBlock
}
}
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers.keys(), layers)
is_3x3 = True
depth = supported_layers[layers]['depth']
block_class = supported_layers[layers]['block_class']
num_filters = [64, 128, 256, 512] num_filters = [64, 128, 256, 512]
outs = []
conv = []
if is_3x3 == False: if is_3x3 == False:
conv = self.conv_bn_layer( conv.append(
input=input, ConvBNLayer(
num_filters=64, in_channels=in_channels,
filter_size=7, out_channels=64,
stride=2, kernel_size=7,
act='relu') stride=2,
act='relu'))
else: else:
conv = self.conv_bn_layer( conv.append(
input=input, ConvBNLayer(
num_filters=32, in_channels=3,
filter_size=3, out_channels=32,
stride=2, kernel_size=3,
act='relu', stride=2,
name='conv1_1') act='relu',
conv = self.conv_bn_layer( name='conv1_1'))
input=conv, conv.append(
num_filters=32, ConvBNLayer(
filter_size=3, in_channels=32,
stride=1, out_channels=32,
act='relu', kernel_size=3,
name='conv1_2') stride=1,
conv = self.conv_bn_layer( act='relu',
input=conv, name='conv1_2'))
num_filters=64, conv.append(
filter_size=3, ConvBNLayer(
stride=1, in_channels=32,
act='relu', out_channels=64,
name='conv1_3') kernel_size=3,
stride=1,
conv = fluid.layers.pool2d( act='relu',
input=conv, name='conv1_3'))
pool_size=3, self.conv1 = nn.Sequential(*conv)
pool_stride=2, self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
pool_padding=1, self.stages = []
pool_type='max') self.out_channels = []
in_ch = 64
if layers >= 50: for block_index in range(len(depth)):
for block in range(len(depth)): block_list = []
for i in range(depth[block]): for i in range(depth[block_index]):
if layers in [101, 152, 200] and block == 2: if layers >= 50:
if layers in [101, 152, 200] and block_index == 2:
if i == 0: if i == 0:
conv_name = "res" + str(block + 2) + "a" conv_name = "res" + str(block_index + 2) + "a"
else: else:
conv_name = "res" + str(block + 2) + "b" + str(i) conv_name = "res" + str(block_index +
2) + "b" + str(i)
else: else:
conv_name = "res" + str(block + 2) + chr(97 + i) conv_name = "res" + str(block_index + 2) + chr(97 + i)
conv = self.bottleneck_block( else:
input=conv, conv_name = "res" + str(block_index + 2) + chr(97 + i)
num_filters=num_filters[block], block_list.append(
stride=2 if i == 0 and block != 0 else 1, block_class(
if_first=block == i == 0, in_channels=in_ch,
name=conv_name) out_channels=num_filters[block_index],
outs.append(conv) stride=2 if i == 0 and block_index != 0 else 1,
else: if_first=block_index == i == 0,
for block in range(len(depth)): name=conv_name))
for i in range(depth[block]): in_ch = block_list[-1].out_channels
conv_name = "res" + str(block + 2) + chr(97 + i) self.out_channels.append(in_ch)
conv = self.basic_block( self.stages.append(nn.Sequential(*block_list))
input=conv, for i, stage in enumerate(self.stages):
num_filters=num_filters[block], self.add_sublayer(sublayer=stage, name="stage{}".format(i))
stride=2 if i == 0 and block != 0 else 1,
if_first=block == i == 0, def forward(self, x):
name=conv_name) x = self.conv1(x)
outs.append(conv) x = self.pool(x)
return outs out_list = []
for stage in self.stages:
def conv_bn_layer(self, x = stage(x)
input, out_list.append(x)
num_filters, return out_list
filter_size,
stride=1,
groups=1, class ConvBNLayer(nn.Layer):
act=None, def __init__(self,
name=None): in_channels,
conv = fluid.layers.conv2d( out_channels,
input=input, kernel_size,
num_filters=num_filters, stride=1,
filter_size=filter_size, groups=1,
act=None,
name=None):
super(ConvBNLayer, self).__init__()
self.conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride, stride=stride,
padding=(filter_size - 1) // 2, padding=(kernel_size - 1) // 2,
groups=groups, groups=groups,
act=None, weight_attr=ParamAttr(name=name + "_weights"),
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False) bias_attr=False)
if name == "conv1": if name == "conv1":
bn_name = "bn_" + name bn_name = "bn_" + name
else: else:
bn_name = "bn" + name[3:] bn_name = "bn" + name[3:]
return fluid.layers.batch_norm( self.bn = nn.BatchNorm(
input=conv, num_channels=out_channels,
act=act, act=act,
param_attr=ParamAttr(name=bn_name + '_scale'), param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(bn_name + '_offset'), bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean', moving_mean_name=bn_name + "_mean",
moving_variance_name=bn_name + '_variance') moving_variance_name=bn_name + "_variance")
def conv_bn_layer_new(self, def __call__(self, x):
input, x = self.conv(x)
num_filters, x = self.bn(x)
filter_size, return x
stride=1,
groups=1,
act=None, class ConvBNLayerNew(nn.Layer):
name=None): def __init__(self,
pool = fluid.layers.pool2d( in_channels,
input=input, out_channels,
pool_size=2, kernel_size,
pool_stride=2, stride=1,
pool_padding=0, groups=1,
pool_type='avg', act=None,
ceil_mode=True) name=None):
super(ConvBNLayerNew, self).__init__()
conv = fluid.layers.conv2d( self.pool = nn.AvgPool2d(
input=pool, kernel_size=2, stride=2, padding=0, ceil_mode=True)
num_filters=num_filters,
filter_size=filter_size, self.conv = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=1, stride=1,
padding=(filter_size - 1) // 2, padding=(kernel_size - 1) // 2,
groups=groups, groups=groups,
act=None, weight_attr=ParamAttr(name=name + "_weights"),
param_attr=ParamAttr(name=name + "_weights"),
bias_attr=False) bias_attr=False)
if name == "conv1": if name == "conv1":
bn_name = "bn_" + name bn_name = "bn_" + name
else: else:
bn_name = "bn" + name[3:] bn_name = "bn" + name[3:]
return fluid.layers.batch_norm( self.bn = nn.BatchNorm(
input=conv, num_channels=out_channels,
act=act, act=act,
param_attr=ParamAttr(name=bn_name + '_scale'), param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(bn_name + '_offset'), bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + '_mean', moving_mean_name=bn_name + "_mean",
moving_variance_name=bn_name + '_variance') moving_variance_name=bn_name + "_variance")
def shortcut(self, input, ch_out, stride, name, if_first=False): def __call__(self, x):
ch_in = input.shape[1] x = self.pool(x)
if ch_in != ch_out or stride != 1: x = self.conv(x)
x = self.bn(x)
return x
class ShortCut(nn.Layer):
def __init__(self, in_channels, out_channels, stride, name, if_first=False):
super(ShortCut, self).__init__()
self.use_conv = True
if in_channels != out_channels or stride != 1:
if if_first: if if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name) self.conv = ConvBNLayer(
in_channels, out_channels, 1, stride, name=name)
else: else:
return self.conv_bn_layer_new( self.conv = ConvBNLayerNew(
input, ch_out, 1, stride, name=name) in_channels, out_channels, 1, stride, name=name)
elif if_first: elif if_first:
return self.conv_bn_layer(input, ch_out, 1, stride, name=name) self.conv = ConvBNLayer(
in_channels, out_channels, 1, stride, name=name)
else: else:
return input self.use_conv = False
def forward(self, x):
if self.use_conv:
x = self.conv(x)
return x
def bottleneck_block(self, input, num_filters, stride, name, if_first):
conv0 = self.conv_bn_layer( class BottleneckBlock(nn.Layer):
input=input, def __init__(self, in_channels, out_channels, stride, name, if_first):
num_filters=num_filters, super(BottleneckBlock, self).__init__()
filter_size=1, self.conv0 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
act='relu', act='relu',
name=name + "_branch2a") name=name + "_branch2a")
conv1 = self.conv_bn_layer( self.conv1 = ConvBNLayer(
input=conv0, in_channels=out_channels,
num_filters=num_filters, out_channels=out_channels,
filter_size=3, kernel_size=3,
stride=stride, stride=stride,
act='relu', act='relu',
name=name + "_branch2b") name=name + "_branch2b")
conv2 = self.conv_bn_layer( self.conv2 = ConvBNLayer(
input=conv1, in_channels=out_channels,
num_filters=num_filters * 4, out_channels=out_channels * 4,
filter_size=1, kernel_size=1,
act=None, act=None,
name=name + "_branch2c") name=name + "_branch2c")
short = self.shortcut( self.short = ShortCut(
input, in_channels=in_channels,
num_filters * 4, out_channels=out_channels * 4,
stride, stride=stride,
if_first=if_first, if_first=if_first,
name=name + "_branch1") name=name + "_branch1")
self.out_channels = out_channels * 4
def forward(self, x):
y = self.conv0(x)
y = self.conv1(y)
y = self.conv2(y)
y = y + self.short(x)
y = F.relu(y)
return y
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def basic_block(self, input, num_filters, stride, name, if_first): class BasicBlock(nn.Layer):
conv0 = self.conv_bn_layer( def __init__(self, in_channels, out_channels, stride, name, if_first):
input=input, super(BasicBlock, self).__init__()
num_filters=num_filters, self.conv0 = ConvBNLayer(
filter_size=3, in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
act='relu', act='relu',
stride=stride, stride=stride,
name=name + "_branch2a") name=name + "_branch2a")
conv1 = self.conv_bn_layer( self.conv1 = ConvBNLayer(
input=conv0, in_channels=out_channels,
num_filters=num_filters, out_channels=out_channels,
filter_size=3, kernel_size=3,
act=None, act=None,
name=name + "_branch2b") name=name + "_branch2b")
short = self.shortcut( self.short = ShortCut(
input, in_channels=in_channels,
num_filters, out_channels=out_channels,
stride, stride=stride,
if_first=if_first, if_first=if_first,
name=name + "_branch1") name=name + "_branch1")
return fluid.layers.elementwise_add(x=short, y=conv1, act='relu') self.out_channels = out_channels
def forward(self, x):
y = self.conv0(x)
y = self.conv1(y)
y = y + self.short(x)
return F.relu(y)
if __name__ == '__main__':
import paddle
paddle.disable_static()
x = paddle.zeros([1, 3, 640, 640])
x = paddle.to_variable(x)
print(x.shape)
net = ResNet(layers=18)
y = net(x)
for stage in y:
print(stage.shape)
# paddle.save(net.state_dict(),'1.pth')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment