Commit 17e1e2c5 authored by LDOUBLEV's avatar LDOUBLEV

Merge branch 'dygraph' of https://github.com/PaddlePaddle/PaddleOCR into fix_doc

parents cf35373b f14b79b4
#!/usr/bin/env bash
set -xe
# Usage: CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
# Parameters
function _set_params(){
run_mode=${1:-"sp"} # sp: single GPU | mp: multiple GPUs
batch_size=${2:-"64"}
fp_item=${3:-"fp32"} # fp32|fp16
max_iter=${4:-"500"} # optional; modify the code to stop training early if needed
model_name=${5:-"model_name"}
run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR will be set by QA later
# No need to modify anything below
device=${CUDA_VISIBLE_DEVICES//,/ }
arr=(${device})
num_gpu_devices=${#arr[*]}
log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
}
function _train(){
echo "Train on ${num_gpu_devices} GPUs"
echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
train_cmd="-c configs/det/${model_name}.yml
-o Train.loader.batch_size_per_card=${batch_size}
-o Global.epoch_num=${max_iter} "
case ${run_mode} in
sp)
train_cmd="python3.7 tools/train.py "${train_cmd}""
;;
mp)
train_cmd="python3.7 -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py ${train_cmd}"
;;
*) echo "choose run_mode(sp or mp)"; exit 1;
esac
# No need to modify anything below
timeout 15m ${train_cmd} > ${log_file} 2>&1
if [ $? -ne 0 ];then
echo -e "${model_name}, FAIL"
export job_fail_flag=1
else
echo -e "${model_name}, SUCCESS"
export job_fail_flag=0
fi
kill -9 $(ps -ef | grep 'python3.7' | grep -v grep | awk '{print $2}')
if [ "$run_mode" = "mp" -a -d mylog ]; then
rm ${log_file}
cp mylog/workerlog.0 ${log_file}
fi
}
_set_params "$@"
_train
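For example, `CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh sp 256 fp32 10 det_mv3_db` trains on one GPU and, following the log_file pattern above, writes its output to `${TRAIN_LOG_DIR}/det_mv3_db_sp_bs256_fp32_1`.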
# This script provides stable, reproducible performance benchmarks. By default it runs with py37 in the standard docker image: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7 paddle=2.1.2 py=37
# Working directory: to be documented
cd PaddleOCR
# 1. Install the dependencies required by this model (note any optimization strategies that are enabled)
python3.7 -m pip install -r requirements.txt
# 2. Download the data and pretrained models required by this model
wget -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015.tar && cd train_data && tar xf icdar2015.tar && cd ../
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_pretrained.pdparams
# 3. Run the models in batch (if batching is inconvenient, steps 1 and 2 must be moved into each individual model script)
model_mode_list=(det_mv3_db det_r50_vd_east)
fp_item_list=(fp32)
bs_list=(256 128)
for model_mode in ${model_mode_list[@]}; do
for fp_item in ${fp_item_list[@]}; do
for bs_item in ${bs_list[@]}; do
echo "index is speed, 1gpus, begin, ${model_name}"
run_mode=sp
CUDA_VISIBLE_DEVICES=0 bash benchmark/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 10 ${model_mode} # (5min)
sleep 60
echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}"
run_mode=mp
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 10 ${model_mode}
sleep 60
done
done
done
@@ -8,7 +8,7 @@ Global:
  # evaluation is run every 5000 iterations after the 4000th iteration
  eval_batch_step: [4000, 5000]
  cal_metric_during_train: False
  pretrained_model: ./pretrain_models/ResNet50_vd_pretrained
  checkpoints:
  save_inference_dir:
  use_visualdl: False
...
Global:
  debug: false
  use_gpu: true
  epoch_num: 800
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec_mobile_pp-OCRv2_enhanced_ctc_loss
  save_epoch_step: 3
  eval_batch_step: [0, 2000]
  cal_metric_during_train: true
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: false
  infer_img: doc/imgs_words/ch/word_1.jpg
  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
  character_type: ch
  max_text_length: 25
  infer_mode: false
  use_space_char: true
  distributed: true
  save_res_path: ./output/rec/predicts_mobile_pp-OCRv2_enhanced_ctc_loss.txt

Optimizer:
  name: Adam
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Piecewise
    decay_epochs: [700, 800]
    values: [0.001, 0.0001]
    warmup_epoch: 5
  regularizer:
    name: L2
    factor: 2.0e-05

Architecture:
  model_type: rec
  algorithm: CRNN
  Transform:
  Backbone:
    name: MobileNetV1Enhance
    scale: 0.5
  Neck:
    name: SequenceEncoder
    encoder_type: rnn
    hidden_size: 64
  Head:
    name: CTCHead
    mid_channels: 96
    fc_decay: 0.00002
    return_feats: true

Loss:
  name: CombinedLoss
  loss_config_list:
    - CTCLoss:
        use_focal_loss: false
        weight: 1.0
    - CenterLoss:
        weight: 0.05
        num_classes: 6625
        feat_dim: 96
        init_center: false
        center_file_path: "./train_center.pkl"
    # you can also try to add ace loss on your own dataset
    # - ACELoss:
    #     weight: 0.1

PostProcess:
  name: CTCLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc

Train:
  dataset:
    name: SimpleDataSet
    data_dir: ./train_data/
    label_file_list:
      - ./train_data/train_list.txt
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: false
      - RecAug:
      - CTCLabelEncode:
      - RecResizeImg:
          image_shape: [3, 32, 320]
      - KeepKeys:
          keep_keys:
            - image
            - label
            - length
            - label_ace
  loader:
    shuffle: true
    batch_size_per_card: 128
    drop_last: true
    num_workers: 8

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ./train_data
    label_file_list:
      - ./train_data/val_list.txt
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: false
      - CTCLabelEncode:
      - RecResizeImg:
          image_shape: [3, 32, 320]
      - KeepKeys:
          keep_keys:
            - image
            - label
            - length
  loader:
    shuffle: false
    drop_last: false
    batch_size_per_card: 128
    num_workers: 8
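As a quick illustration of how CombinedLoss mixes the sub-losses configured above, here is a minimal weighted-sum sketch (the function name and dict layout are illustrative, not PaddleOCR's actual implementation):

# Minimal sketch: each sub-loss returns a dict of scalars (as the loss
# classes later in this commit do), and CombinedLoss-style mixing scales
# each dict by its configured weight before summing.
def combine(loss_dicts, weights):
    total = 0.0
    for d, w in zip(loss_dicts, weights):
        for v in d.values():
            total += w * v
    return total

# e.g. CTC loss 1.2 with weight 1.0 plus center loss 0.8 with weight 0.05:
print(combine([{'loss': 1.2}, {'loss_center': 0.8}], [1.0, 0.05]))  # 1.24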
Global:
  use_gpu: True
  epoch_num: 400
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec/seed
  save_epoch_step: 3
  # evaluation is run every 2000 iterations
  eval_batch_step: [0, 2000]
  cal_metric_during_train: True
  pretrained_model:
  checkpoints:
  save_inference_dir:
  use_visualdl: False
  infer_img: doc/imgs_words_en/word_10.png
  # for data or label process
  character_dict_path:
  character_type: EN_symbol
  max_text_length: 100
  infer_mode: False
  use_space_char: False
  save_res_path: ./output/rec/predicts_seed.txt

Optimizer:
  name: Adadelta
  weight_decay: 0.0
  momentum: 0.9
  lr:
    name: Piecewise
    decay_epochs: [4, 5, 8]
    values: [1.0, 0.1, 0.01]
  regularizer:
    name: 'L2'
    factor: 2.0e-05

Architecture:
  model_type: rec
  algorithm: SEED
  Transform:
    name: STN_ON
    tps_inputsize: [32, 64]
    tps_outputsize: [32, 100]
    num_control_points: 20
    tps_margins: [0.05, 0.05]
    stn_activation: none
  Backbone:
    name: ResNet_ASTER
  Head:
    name: AsterHead # AttentionHead
    sDim: 512
    attDim: 512
    max_len_labels: 100

Loss:
  name: AsterLoss

PostProcess:
  name: SEEDLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/training/
    transforms:
      - Fasttext:
          path: "./cc.en.300.bin"
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - SEEDLabelEncode: # Class handling label
      - RecResizeImg:
          character_type: en
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'fast_label'] # dataloader will return list in this order
  loader:
    shuffle: True
    batch_size_per_card: 256
    drop_last: True
    num_workers: 6

Eval:
  dataset:
    name: LMDBDataSet
    data_dir: ./train_data/data_lmdb_release/evaluation/
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
          channel_first: False
      - SEEDLabelEncode: # Class handling label
      - RecResizeImg:
          character_type: en
          image_shape: [3, 64, 256]
          padding: False
      - KeepKeys:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: True
    batch_size_per_card: 256
    num_workers: 4
@@ -110,25 +110,42 @@ def main(config, device, logger, vdl_writer):
        logger.info("metric['hmean']: {}".format(metric['hmean']))
        return metric['hmean']

    run_sensitive_analysis = False
    """
    run_sensitive_analysis=True:
        Automatically compute the sensitivities of the convolutions in a model.
        The sensitivity of a convolution is its loss of accuracy on the test
        dataset at different pruning ratios. The sensitivities can be used to
        derive a group of best ratios under some condition.

    run_sensitive_analysis=False:
        Set the prune ratio to a fixed value, such as 10%. The larger the
        value, the more convolution weights will be pruned.
    """

    if run_sensitive_analysis:
        params_sensitive = pruner.sensitive(
            eval_func=eval_fn,
            sen_file="./deploy/slim/prune/sen.pickle",
            skip_vars=[
                "conv2d_57.w_0", "conv2d_transpose_2.w_0",
                "conv2d_transpose_3.w_0"
            ])
        logger.info(
            "The sensitivity analysis results of model parameters are saved in sen.pickle"
        )
        # calculate the pruned ratio of each parameter
        params_sensitive = pruner._get_ratios_by_loss(
            params_sensitive, loss=0.02)
        for key in params_sensitive.keys():
            logger.info("{}, {}".format(key, params_sensitive[key]))
    else:
        params_sensitive = {}
        for param in model.parameters():
            if 'transpose' not in param.name and 'linear' not in param.name:
                # set the prune ratio to 10%; the larger the value, the more
                # convolution weights will be pruned
                params_sensitive[param.name] = 0.1

    plan = pruner.prune_vars(params_sensitive, [0])
...
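In the fixed-ratio branch, params_sensitive is just a name-to-ratio map; a sketch of its shape (the parameter names below are illustrative):

# Illustrative contents of params_sensitive in the fixed-ratio branch.
params_sensitive = {
    'conv2d_1.w_0': 0.1,  # prune 10% of this convolution's weights
    'conv2d_2.w_0': 0.1,
}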
@@ -50,6 +50,7 @@ Text recognition algorithms open-sourced in PaddleOCR (dynamic graph):
- [x] SRN([paper](https://arxiv.org/abs/2003.12294))
- [x] NRTR([paper](https://arxiv.org/abs/1806.00926v2))
- [x] SAR([paper](https://arxiv.org/abs/1811.00751v2))
- [x] SEED([paper](https://arxiv.org/pdf/2005.10977.pdf))

Following the [DTRB](https://arxiv.org/abs/1904.01906) text recognition training and evaluation pipeline, the models are trained on the MJSynth and SynthText datasets and evaluated on IIIT, SVT, IC03, IC13, IC15, SVTP, and CUTE. Results:
@@ -66,5 +67,5 @@ Text recognition algorithms open-sourced in PaddleOCR (dynamic graph):
|SRN|Resnet50_vd_fpn| 88.52% | rec_r50fpn_vd_none_srn | [Download](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r50_vd_srn_train.tar) |
|NRTR|NRTR_MTB| 84.3% | rec_mtb_nrtr | [Download](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar) |
|SAR|Resnet31| 87.2% | rec_r31_sar | [Download](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) |
|SEED| Aster_Resnet | 85.2% | rec_resnet_stn_bilstm_att | [Download](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar)|

For training and using PaddleOCR text recognition algorithms, refer to the [text recognition section of the model training/evaluation tutorial](./recognition.md).
@@ -234,6 +234,9 @@ PaddleOCR supports alternating training and evaluation; in `configs/rec/rec_icdar15_t
| rec_r50fpn_vd_none_srn.yml | SRN | Resnet50_fpn_vd | None | rnn | srn |
| rec_mtb_nrtr.yml | NRTR | nrtr_mtb | None | transformer encoder | transformer decoder |
| rec_r31_sar.yml | SAR | ResNet31 | None | LSTM encoder | LSTM decoder |
| rec_resnet_stn_bilstm_att.yml | SEED | Aster_Resnet | STN | BiLSTM | att |

*The SEED model additionally requires a [language model](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz) pretrained with FastText.

For training on Chinese data, [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) is recommended; to try other algorithms on Chinese datasets, modify the configuration file as described below:
@@ -460,5 +463,3 @@ python3 tools/export_model.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_trai
```
python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./your inference model" --rec_image_shape="3, 32, 100" --rec_char_type="ch" --rec_char_dict_path="your text dict path"
```
doc/joinus.PNG: binary image replaced (191 KB → 184 KB)
@@ -215,6 +215,11 @@ class CTCLabelEncode(BaseRecLabelEncode):
        data['length'] = np.array(len(text))
        text = text + [0] * (self.max_text_len - len(text))
        data['label'] = np.array(text)

        # per-class character counts, used as the ACE loss target
        label = [0] * len(self.character)
        for x in text:
            label[x] += 1
        data['label_ace'] = np.array(label)
        return data

    def add_special_char(self, dict_character):
@@ -342,6 +347,38 @@ class AttnLabelEncode(BaseRecLabelEncode):
        return idx


class SEEDLabelEncode(BaseRecLabelEncode):
    """ Convert between text-label and text-index """

    def __init__(self,
                 max_text_length,
                 character_dict_path=None,
                 character_type='ch',
                 use_space_char=False,
                 **kwargs):
        super(SEEDLabelEncode,
              self).__init__(max_text_length, character_dict_path,
                             character_type, use_space_char)

    def add_special_char(self, dict_character):
        self.end_str = "eos"
        dict_character = dict_character + [self.end_str]
        return dict_character

    def __call__(self, data):
        text = data['label']
        text = self.encode(text)
        if text is None:
            return None
        if len(text) >= self.max_text_len:
            return None
        data['length'] = np.array(len(text)) + 1  # include eos
        text = text + [len(self.character) - 1] * (self.max_text_len -
                                                   len(text))
        data['label'] = np.array(text)
        return data
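A small worked example of the SEED padding logic above, assuming a hypothetical three-character dictionary:

# Hypothetical dictionary: ['a', 'b', 'c'] plus the appended "eos" token.
character = ['a', 'b', 'c', 'eos']
max_text_len = 5
text = [0, 1]                    # encoded "ab"
length = len(text) + 1           # 3: two characters plus one for eos
label = text + [len(character) - 1] * (max_text_len - len(text))
assert label == [0, 1, 3, 3, 3]  # padded with the eos index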
class SRNLabelEncode(BaseRecLabelEncode):
    """ Convert between text-label and text-index """
@@ -421,7 +458,6 @@ class TableLabelEncode(object):
        substr = lines[0].decode('utf-8').strip("\r\n").split("\t")
        character_num = int(substr[0])
        elem_num = int(substr[1])
        for cno in range(1, 1 + character_num):
            character = lines[cno].decode('utf-8').strip("\r\n")
            list_character.append(character)
...
@@ -23,6 +23,7 @@ import sys
import six
import cv2
import numpy as np
import fasttext


class DecodeImage(object):
@@ -83,12 +84,13 @@ class NRTRDecodeImage(object):
        elif self.img_mode == 'RGB':
            assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape)
            img = img[:, :, ::-1]
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        if self.channel_first:
            img = img.transpose((2, 0, 1))
        data['image'] = img
        return data


class NormalizeImage(object):
    """ normalize image, e.g. subtract mean, divide std
    """
@@ -133,6 +135,17 @@ class ToCHWImage(object):
        return data


class Fasttext(object):
    def __init__(self, path="None", **kwargs):
        self.fast_model = fasttext.load_model(path)

    def __call__(self, data):
        label = data['label']
        fast_label = self.fast_model[label]
        data['fast_label'] = fast_label
        return data


class KeepKeys(object):
    def __init__(self, keep_keys, **kwargs):
        self.keep_keys = keep_keys
...
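The Fasttext operator above simply looks up a word vector for the label; a hedged sketch of what it does, assuming the cc.en.300.bin model from the SEED config has been downloaded locally:

# Sketch of the Fasttext transform's behavior (the path is an assumption).
import fasttext

fast_model = fasttext.load_model("./cc.en.300.bin")  # FastText binary model
data = {'label': 'hello'}
data['fast_label'] = fast_model[data['label']]  # 300-dim word embedding
print(data['fast_label'].shape)  # (300,)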
@@ -88,17 +88,19 @@ class RecResizeImg(object):
                 image_shape,
                 infer_mode=False,
                 character_type='ch',
                 padding=True,
                 **kwargs):
        self.image_shape = image_shape
        self.infer_mode = infer_mode
        self.character_type = character_type
        self.padding = padding

    def __call__(self, data):
        img = data['image']
        if self.infer_mode and self.character_type == "ch":
            norm_img = resize_norm_img_chinese(img, self.image_shape)
        else:
            norm_img = resize_norm_img(img, self.image_shape, self.padding)
        data['image'] = norm_img
        return data
@@ -174,16 +176,21 @@ def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25):
    return padding_im, resize_shape, pad_shape, valid_ratio


def resize_norm_img(img, image_shape, padding=True):
    imgC, imgH, imgW = image_shape
    h = img.shape[0]
    w = img.shape[1]
    if not padding:
        # resize directly to the target size, ignoring the aspect ratio
        resized_image = cv2.resize(
            img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
        resized_w = imgW
    else:
        # keep the aspect ratio; the width is padded afterwards
        ratio = w / float(h)
        if math.ceil(imgH * ratio) > imgW:
            resized_w = imgW
        else:
            resized_w = int(math.ceil(imgH * ratio))
        resized_image = cv2.resize(img, (resized_w, imgH))
    resized_image = resized_image.astype('float32')
    if image_shape[0] == 1:
        resized_image = resized_image / 255
...
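To make the two branches of resize_norm_img concrete, a small sketch with an illustrative 48×100 input and the SEED shape [3, 64, 256]:

# Illustrative check of the two resize branches (input size is an assumption).
import math
import cv2
import numpy as np

img = np.zeros((48, 100, 3), dtype=np.uint8)  # h=48, w=100
imgC, imgH, imgW = 3, 64, 256

# padding=False (SEED config): stretch directly to the target size
stretched = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
print(stretched.shape)  # (64, 256, 3)

# padding=True (default): keep the aspect ratio, pad the width afterwards
resized_w = int(math.ceil(imgH * (100 / 48.0)))  # 134 <= 256
print(cv2.resize(img, (resized_w, imgH)).shape)  # (64, 134, 3)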
@@ -28,6 +28,8 @@ from .rec_att_loss import AttentionLoss
from .rec_srn_loss import SRNLoss
from .rec_nrtr_loss import NRTRLoss
from .rec_sar_loss import SARLoss
from .rec_aster_loss import AsterLoss

# cls loss
from .cls_loss import ClsLoss
@@ -48,9 +50,8 @@ def build_loss(config):
    support_dict = [
        'DBLoss', 'PSELoss', 'EASTLoss', 'SASTLoss', 'CTCLoss', 'ClsLoss',
        'AttentionLoss', 'SRNLoss', 'PGLoss', 'CombinedLoss', 'NRTRLoss',
        'TableAttentionLoss', 'SARLoss', 'AsterLoss'
    ]
    config = copy.deepcopy(config)
    module_name = config.pop('name')
    assert module_name in support_dict, Exception('loss only support {}'.format(
...
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
import paddle.nn as nn


class ACELoss(nn.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.loss_func = nn.CrossEntropyLoss(
            weight=None,
            ignore_index=0,
            reduction='none',
            soft_label=True,
            axis=-1)

    def __call__(self, predicts, batch):
        if isinstance(predicts, (list, tuple)):
            predicts = predicts[-1]

        B, N = predicts.shape[:2]
        div = paddle.to_tensor([N]).astype('float32')

        # aggregate per-class probabilities over the sequence dimension
        predicts = nn.functional.softmax(predicts, axis=-1)
        aggregation_preds = paddle.sum(predicts, axis=1)
        aggregation_preds = paddle.divide(aggregation_preds, div)

        # build the soft target from the per-class counts (label_ace)
        length = batch[2].astype("float32")
        batch = batch[3].astype("float32")
        batch[:, 0] = paddle.subtract(div, length)
        batch = paddle.divide(batch, div)

        loss = self.loss_func(aggregation_preds, batch)
        return {"loss_ace": loss}
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import pickle

import paddle
import paddle.nn as nn
import paddle.nn.functional as F


class CenterLoss(nn.Layer):
    """
    Reference: Wen et al. A Discriminative Feature Learning Approach for Deep Face Recognition. ECCV 2016.
    """

    def __init__(self,
                 num_classes=6625,
                 feat_dim=96,
                 init_center=False,
                 center_file_path=None):
        super().__init__()
        self.num_classes = num_classes
        self.feat_dim = feat_dim
        self.centers = paddle.randn(
            shape=[self.num_classes, self.feat_dim]).astype(
                "float64")  # random centers

        if init_center:
            assert os.path.exists(
                center_file_path
            ), f"center path({center_file_path}) must exist when init_center is set as True."
            with open(center_file_path, 'rb') as f:
                char_dict = pickle.load(f)
                for key in char_dict.keys():
                    self.centers[key] = paddle.to_tensor(char_dict[key])

    def __call__(self, predicts, batch):
        assert isinstance(predicts, (list, tuple))
        features, predicts = predicts

        feats_reshape = paddle.reshape(
            features, [-1, features.shape[-1]]).astype("float64")
        label = paddle.argmax(predicts, axis=2)
        label = paddle.reshape(label, [label.shape[0] * label.shape[1]])

        batch_size = feats_reshape.shape[0]

        # calc feat * feat
        dist1 = paddle.sum(paddle.square(feats_reshape), axis=1, keepdim=True)
        dist1 = paddle.expand(dist1, [batch_size, self.num_classes])

        # dist2 of centers
        dist2 = paddle.sum(paddle.square(self.centers), axis=1,
                           keepdim=True)  # num_classes
        dist2 = paddle.expand(dist2,
                              [self.num_classes, batch_size]).astype("float64")
        dist2 = paddle.transpose(dist2, [1, 0])

        # first x * x + y * y
        distmat = paddle.add(dist1, dist2)
        tmp = paddle.matmul(feats_reshape,
                            paddle.transpose(self.centers, [1, 0]))
        distmat = distmat - 2.0 * tmp

        # generate the mask
        classes = paddle.arange(self.num_classes).astype("int64")
        label = paddle.expand(
            paddle.unsqueeze(label, 1), (batch_size, self.num_classes))
        mask = paddle.equal(
            paddle.expand(classes, [batch_size, self.num_classes]),
            label).astype("float64")  # get mask

        dist = paddle.multiply(distmat, mask)
        loss = paddle.sum(paddle.clip(dist, min=1e-12, max=1e+12)) / batch_size
        return {'loss_center': loss}
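A hedged usage sketch for CenterLoss: it expects the (features, predicts) tuple that CTCHead emits when return_feats is true in the config above (all tensors below are dummies):

# Dummy forward pass through CenterLoss (shapes follow the config above).
import paddle

center = CenterLoss(num_classes=6625, feat_dim=96)
feats = paddle.rand([2, 80, 96])     # (B, T, feat_dim) from CTCHead
logits = paddle.rand([2, 80, 6625])  # (B, T, num_classes)
out = center((feats, logits), batch=None)  # labels come from argmax(logits)
print(float(out['loss_center']))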
@@ -15,6 +15,10 @@
import paddle
import paddle.nn as nn

from .rec_ctc_loss import CTCLoss
from .center_loss import CenterLoss
from .ace_loss import ACELoss

from .distillation_loss import DistillationCTCLoss
from .distillation_loss import DistillationDMLLoss
from .distillation_loss import DistillationDistanceLoss, DistillationDBLoss, DistillationDilaDBLoss
...
@@ -112,7 +112,7 @@ class DistillationDMLLoss(DMLLoss):
                if isinstance(loss, dict):
                    for key in loss:
                        loss_dict["{}_{}_{}_{}_{}".format(key, pair[
                            0], pair[1], self.maps_name, idx)] = loss[key]
                else:
                    loss_dict["{}_{}_{}".format(self.name, self.maps_name[
                        _c], idx)] = loss
...
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
from paddle import nn


class CosineEmbeddingLoss(nn.Layer):
    def __init__(self, margin=0.):
        super(CosineEmbeddingLoss, self).__init__()
        self.margin = margin
        self.epsilon = 1e-12

    def forward(self, x1, x2, target):
        similarity = paddle.fluid.layers.reduce_sum(
            x1 * x2, dim=-1) / (paddle.norm(
                x1, axis=-1) * paddle.norm(
                    x2, axis=-1) + self.epsilon)
        one_list = paddle.full_like(target, fill_value=1)
        out = paddle.fluid.layers.reduce_mean(
            paddle.where(
                paddle.equal(target, one_list), 1. - similarity,
                paddle.maximum(
                    paddle.zeros_like(similarity), similarity - self.margin)))
        return out


class AsterLoss(nn.Layer):
    def __init__(self,
                 weight=None,
                 size_average=True,
                 ignore_index=-100,
                 sequence_normalize=False,
                 sample_normalize=True,
                 **kwargs):
        super(AsterLoss, self).__init__()
        self.weight = weight
        self.size_average = size_average
        self.ignore_index = ignore_index
        self.sequence_normalize = sequence_normalize
        self.sample_normalize = sample_normalize
        self.loss_sem = CosineEmbeddingLoss()
        self.is_cosin_loss = True
        self.loss_func_rec = nn.CrossEntropyLoss(weight=None, reduction='none')

    def forward(self, predicts, batch):
        targets = batch[1].astype("int64")
        label_lengths = batch[2].astype('int64')
        sem_target = batch[3].astype('float32')
        embedding_vectors = predicts['embedding_vectors']
        rec_pred = predicts['rec_pred']

        # semantic loss
        if not self.is_cosin_loss:
            sem_loss = paddle.sum(self.loss_sem(embedding_vectors, sem_target))
        else:
            label_target = paddle.ones([embedding_vectors.shape[0]])
            sem_loss = paddle.sum(
                self.loss_sem(embedding_vectors, sem_target, label_target))

        # rec loss
        batch_size, def_max_length = targets.shape[0], targets.shape[1]

        # mask out padded positions beyond each sample's label length
        mask = paddle.zeros([batch_size, def_max_length])
        for i in range(batch_size):
            mask[i, :label_lengths[i]] = 1
        mask = paddle.cast(mask, "float32")
        max_length = max(label_lengths)
        assert max_length == rec_pred.shape[1]
        targets = targets[:, :max_length]
        mask = mask[:, :max_length]
        rec_pred = paddle.reshape(rec_pred, [-1, rec_pred.shape[2]])
        input = nn.functional.log_softmax(rec_pred, axis=1)
        targets = paddle.reshape(targets, [-1, 1])
        mask = paddle.reshape(mask, [-1, 1])
        output = -paddle.index_sample(input, index=targets) * mask
        output = paddle.sum(output)
        if self.sequence_normalize:
            output = output / paddle.sum(mask)
        if self.sample_normalize:
            output = output / batch_size
        loss = output + sem_loss * 0.1
        return {'loss': loss}
@@ -21,16 +21,24 @@ from paddle import nn

class CTCLoss(nn.Layer):
    def __init__(self, use_focal_loss=False, **kwargs):
        super(CTCLoss, self).__init__()
        self.loss_func = nn.CTCLoss(blank=0, reduction='none')
        self.use_focal_loss = use_focal_loss
        # scaling factor used by the focal branch below; not defined in the
        # original snippet, so a default of 1.0 is assumed here
        self.focal_loss_alpha = kwargs.get('focal_loss_alpha', 1.0)

    def forward(self, predicts, batch):
        if isinstance(predicts, (list, tuple)):
            predicts = predicts[-1]
        predicts = predicts.transpose((1, 0, 2))
        N, B, _ = predicts.shape
        preds_lengths = paddle.to_tensor([N] * B, dtype='int64')
        labels = batch[1].astype("int32")
        label_lengths = batch[2].astype('int64')
        loss = self.loss_func(predicts, labels, preds_lengths, label_lengths)
        if self.use_focal_loss:
            # down-weight easy (low-loss) samples: weight = (1 - e^{-loss})^2
            weight = paddle.exp(-loss)
            weight = paddle.subtract(paddle.to_tensor([1.0]), weight)
            weight = paddle.square(weight) * self.focal_loss_alpha
            loss = paddle.multiply(loss, weight)
        loss = loss.mean()
        return {'loss': loss}
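The focal branch above scales each sample's CTC loss by (1 - e^{-loss})^2, so easy (low-loss) samples contribute less; a quick numeric check:

# Numeric check of the focal weight (alpha assumed 1.0 as in the code above).
import paddle

loss = paddle.to_tensor([0.1, 2.0])
weight = paddle.square(1.0 - paddle.exp(-loss))
print(weight.numpy())  # approx [0.009, 0.748]: hard samples dominate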
@@ -13,13 +13,20 @@
# limitations under the License.

import Levenshtein
import string


class RecMetric(object):
    def __init__(self, main_indicator='acc', is_filter=False, **kwargs):
        self.main_indicator = main_indicator
        self.is_filter = is_filter
        self.reset()

    def _normalize_text(self, text):
        text = ''.join(
            filter(lambda x: x in (string.digits + string.ascii_letters), text))
        return text.lower()

    def __call__(self, pred_label, *args, **kwargs):
        preds, labels = pred_label
        correct_num = 0
@@ -28,6 +35,9 @@ class RecMetric(object):
        for (pred, pred_conf), (target, _) in zip(preds, labels):
            pred = pred.replace(" ", "")
            target = target.replace(" ", "")
            if self.is_filter:
                pred = self._normalize_text(pred)
                target = self._normalize_text(target)
            norm_edit_dis += Levenshtein.distance(pred, target) / max(
                len(pred), len(target), 1)
            if pred == target:
@@ -57,4 +67,3 @@ class RecMetric(object):
        self.correct_num = 0
        self.all_num = 0
        self.norm_edit_dis = 0
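A hedged usage sketch for RecMetric with is_filter=True: case and punctuation are stripped before comparison, so "Hello!" matches "hello" (the exact return format is assumed from the class's main_indicator):

# Dummy evaluation with text filtering enabled (output format assumed).
metric = RecMetric(main_indicator='acc', is_filter=True)
preds = [('Hello!', 0.95)]  # (text, confidence) pairs
labels = [('hello', 1.0)]
print(metric((preds, labels)))  # counts this prediction as correct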