Commit 41f1fb8a authored by myhloli
Browse files

refactor(ocr): remove unused OCR dictionaries and update model configurations

- Remove unused OCR dictionaries for Arabic, Belarusian, Bulgarian and Armenian languages
- Update model configurations in arch_config.yaml:
  - Comment out 'out_channels' for various language models
  - Rename Arabic, Korean, Japanese, Tamil, Telugu (te), Kannada (ka) and Devanagari model configurations to use 'v3' instead of 'v4'
- Delete ar_dict.txt, be_dict.txt and bg_dict.txt files
- Update arabic_dict.txt to remove blank line at the start
parent b3d6785d
lang:
ch:
det: ch_PP-OCRv4_det_infer.pth
rec: ch_PP-OCRv4_rec_infer.pth
dict: ppocr_keys_v1.txt
en:
det: en_PP-OCRv3_det_infer.pth
rec: en_PP-OCRv4_rec_infer.pth
dict: en_dict.txt
korean:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: korean_PP-OCRv3_rec_infer.pth
dict: korean_dict.txt
japan:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: japan_PP-OCRv3_rec_infer.pth
dict: japan_dict.txt
chinese_cht:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: chinese_cht_PP-OCRv3_rec_infer.pth
dict: chinese_cht_dict.txt
ta:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: ta_PP-OCRv3_rec_infer.pth
dict: ta_dict.txt
te:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: te_PP-OCRv3_rec_infer.pth
dict: te_dict.txt
ka:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: ka_PP-OCRv3_rec_infer.pth
dict: ka_dict.txt
latin:
det: en_PP-OCRv3_det_infer.pth
rec: latin_PP-OCRv3_rec_infer.pth
dict: latin_dict.txt
arabic:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: arabic_PP-OCRv3_rec_infer.pth
dict: arabic_dict.txt
cyrillic:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: cyrillic_PP-OCRv3_rec_infer.pth
dict: cyrillic_dict.txt
devanagari:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: devanagari_PP-OCRv3_rec_infer.pth
dict: devanagari_dict.txt
\ No newline at end of file
# Copyright (c) Opendatalab. All rights reserved.
import copy
import os.path
from pathlib import Path
import cv2
import numpy as np
import yaml
from loguru import logger
from magic_pdf.libs.config_reader import get_device
from magic_pdf.libs.config_reader import get_device, get_local_models_dir
from .ocr_utils import check_img, preprocess_image, sorted_boxes, merge_det_boxes, update_det_boxes, get_rotate_crop_image
from .tools.infer.predict_system import TextSystem
from .tools.infer import pytorchocr_utility as utility
import argparse
# Language-code groups. PytorchPaddleOCR.__init__ normalizes any `lang`
# value found in one of these lists to the group's shared key
# ('latin', 'arabic', 'cyrillic' or 'devanagari') before looking up models.

# Codes normalized to 'latin'.
latin_lang = [
'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr', # noqa: E126
'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german'
]
# Codes normalized to 'arabic'.
arabic_lang = ['ar', 'fa', 'ug', 'ur']
# Codes normalized to 'cyrillic'.
cyrillic_lang = [
'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava', # noqa: E126
'dar', 'inh', 'che', 'lbe', 'lez', 'tab'
]
# Codes normalized to 'devanagari'.
devanagari_lang = [
'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom', # noqa: E126
'sa', 'bgc'
]
def get_model_params(lang, config):
    """Return the model file names configured for an OCR language.

    Args:
        lang: Language key to look up under ``config['lang']``.
        config: Parsed models configuration. It must contain a ``'lang'``
            mapping from language key to a mapping that may define the
            ``'det'``, ``'rec'`` and ``'dict'`` entries.

    Returns:
        A ``(det, rec, dict_file)`` tuple. An entry that is absent from the
        language's configuration is returned as ``None``.

    Raises:
        ValueError: If ``lang`` is not a key of ``config['lang']``.
            ``ValueError`` subclasses ``Exception``, so callers that catch
            ``Exception`` keep working.
    """
    languages = config['lang']
    # Guard clause: reject unknown languages before touching their params.
    if lang not in languages:
        raise ValueError(f'Language {lang} not supported')
    params = languages[lang]
    return params.get('det'), params.get('rec'), params.get('dict')
# Directory containing this module; models_config.yml and the recognition
# character dictionaries are resolved relative to it.
root_dir = Path(__file__).resolve().parent
class PytorchPaddleOCR(TextSystem):
def __init__(self, *args, **kwargs):
parser = utility.init_args()
args = parser.parse_args(args)
self.lang = kwargs.get('lang', 'ch')
if self.lang in latin_lang:
self.lang = 'latin'
elif self.lang in arabic_lang:
self.lang = 'arabic'
elif self.lang in cyrillic_lang:
self.lang = 'cyrillic'
elif self.lang in devanagari_lang:
self.lang = 'devanagari'
else:
pass
if self.lang == 'ch':
kwargs['det_model_path'] = "/Users/myhloli/Downloads/ch_ptocr_v4_det_infer.pth"
kwargs['rec_model_path'] = "/Users/myhloli/Downloads/ch_ptocr_v4_rec_infer.pth"
models_config_path = os.path.join(root_dir, 'models_config.yml')
with open(models_config_path) as file:
config = yaml.safe_load(file)
det, rec, dict_file = get_model_params(self.lang, config)
ocr_models_dir = os.path.join(get_local_models_dir(), 'OCR', 'paddleocr_torch')
kwargs['det_model_path'] = os.path.join(ocr_models_dir, det)
kwargs['rec_model_path'] = os.path.join(ocr_models_dir, rec)
kwargs['rec_char_dict_path'] = os.path.join(root_dir, 'pytorchocr', 'utils', 'dict', dict_file)
kwargs['device'] = get_device()
......
......@@ -172,7 +172,7 @@ chinese_cht_PP-OCRv3_rec_infer:
use_guide: True
Head:
name: CTCHead
out_channels: 8423
# out_channels: 8423
fc_decay: 0.00001
latin_PP-OCRv3_rec_infer:
......@@ -193,7 +193,7 @@ latin_PP-OCRv3_rec_infer:
use_guide: True
Head:
name: CTCHead
out_channels: 187
# out_channels: 187
fc_decay: 0.00001
cyrillic_PP-OCRv3_rec_infer:
......@@ -214,10 +214,10 @@ cyrillic_PP-OCRv3_rec_infer:
use_guide: True
Head:
name: CTCHead
out_channels: 165
# out_channels: 165
fc_decay: 0.00001
arabic_PP-OCRv4_rec_infer:
arabic_PP-OCRv3_rec_infer:
model_type: rec
algorithm: SVTR
Transform:
......@@ -235,10 +235,10 @@ arabic_PP-OCRv4_rec_infer:
use_guide: True
Head:
name: CTCHead
out_channels: 164
# out_channels: 164
fc_decay: 0.00001
korean_PP-OCRv4_rec_infer:
korean_PP-OCRv3_rec_infer:
model_type: rec
algorithm: SVTR
Transform:
......@@ -256,10 +256,10 @@ korean_PP-OCRv4_rec_infer:
use_guide: True
Head:
name: CTCHead
out_channels: 3690
# out_channels: 3690
fc_decay: 0.00001
japan_PP-OCRv4_rec_infer:
japan_PP-OCRv3_rec_infer:
model_type: rec
algorithm: SVTR
Transform:
......@@ -277,10 +277,10 @@ japan_PP-OCRv4_rec_infer:
use_guide: True
Head:
name: CTCHead
out_channels: 4401
# out_channels: 4401
fc_decay: 0.00001
ta_PP-OCRv4_rec_infer:
ta_PP-OCRv3_rec_infer:
model_type: rec
algorithm: SVTR
Transform:
......@@ -298,10 +298,10 @@ ta_PP-OCRv4_rec_infer:
use_guide: True
Head:
name: CTCHead
out_channels: 130
# out_channels: 130
fc_decay: 0.00001
te_PP-OCRv4_rec_infer:
te_PP-OCRv3_rec_infer:
model_type: rec
algorithm: SVTR
Transform:
......@@ -319,10 +319,10 @@ te_PP-OCRv4_rec_infer:
use_guide: True
Head:
name: CTCHead
out_channels: 153
# out_channels: 153
fc_decay: 0.00001
ka_PP-OCRv4_rec_infer:
ka_PP-OCRv3_rec_infer:
model_type: rec
algorithm: SVTR
Transform:
......@@ -340,10 +340,10 @@ ka_PP-OCRv4_rec_infer:
use_guide: True
Head:
name: CTCHead
out_channels: 155
# out_channels: 155
fc_decay: 0.00001
devanagari_PP-OCRv4_rec_infer:
devanagari_PP-OCRv3_rec_infer:
model_type: rec
algorithm: SVTR
Transform:
......@@ -361,6 +361,6 @@ devanagari_PP-OCRv4_rec_infer:
use_guide: True
Head:
name: CTCHead
out_channels: 169
# out_channels: 169
fc_decay: 0.00001
0
1
2
3
4
5
6
7
8
9
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
:
;
<
=
>
?
@
[
\
]
^
_
`
{
|
}
~
\ No newline at end of file
a
r
b
i
c
_
m
g
/
1
0
I
L
S
V
R
C
2
v
l
6
3
9
.
j
p
ا
ل
م
ر
ج
و
ح
ي
ة
5
8
7
أ
ب
ض
4
ك
س
ه
ث
ن
ط
ع
ت
غ
خ
ف
ئ
ز
إ
د
ص
ظ
ذ
ش
ى
ق
ؤ
آ
ء
s
e
n
w
t
u
z
d
A
N
G
h
o
E
T
H
O
B
y
F
U
J
X
W
P
Z
M
k
q
Y
Q
D
f
K
x
'
%
-
#
@
!
&
$
,
:
é
?
+
É
(
b
e
_
i
m
g
/
2
0
I
L
S
V
R
C
1
v
a
l
6
9
4
3
.
j
p
п
а
з
б
у
г
н
ц
ь
8
м
л
і
о
ў
ы
7
5
М
х
с
р
ф
я
е
д
ж
ю
ч
й
к
Д
в
Б
т
І
ш
ё
э
К
Л
Н
А
Ж
Г
В
П
З
Е
О
Р
С
У
Ё
Й
Т
Ч
Э
Ц
Ю
Ш
Ф
Х
Я
Ь
Ы
Ў
s
c
n
w
M
o
t
T
E
A
B
u
h
y
k
r
H
d
Y
O
U
F
f
x
D
G
N
K
P
z
J
X
W
Z
Q
%
-
q
@
'
!
#
&
,
:
$
(
?
é
+
É
!
#
$
%
&
'
(
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
_
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
É
é
А
Б
В
Г
Д
Е
Ж
З
И
Й
К
Л
М
Н
О
П
Р
С
Т
У
Ф
Х
Ц
Ч
Ш
Щ
Ъ
Ю
Я
а
б
в
г
д
е
ж
з
и
й
к
л
м
н
о
п
р
с
т
у
ф
х
ц
ч
ш
щ
ъ
ь
ю
я
......@@ -8,32 +8,13 @@
7
8
9
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
:
;
<
=
>
?
@
A
B
C
......@@ -60,4 +41,55 @@ W
X
Y
Z
[
\
]
^
_
`
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
{
|
}
~
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
x
i
_
m
g
/
1
0
I
L
S
V
R
C
2
v
a
l
3
6
4
5
.
j
p
Q
u
e
r
o
8
7
n
c
9
t
b
é
q
d
ó
y
F
s
,
O
í
T
f
"
U
M
h
:
P
H
A
E
D
z
N
á
ñ
ú
%
;
è
+
Y
-
B
G
(
)
¿
?
w
¡
!
X
É
K
k
Á
ü
Ú
«
»
J
'
ö
W
Z
º
Ö
­
[
]
Ç
ç
à
ä
û
ò
Í
ê
ô
ø
ª
f
a
_
i
m
g
/
1
3
I
L
S
V
R
C
2
0
v
l
6
8
5
.
j
p
و
د
ر
ك
ن
ش
ه
ا
4
9
ی
ج
ِ
7
غ
ل
س
ز
ّ
ت
ک
گ
ي
م
ب
ف
چ
خ
ق
ژ
آ
ص
پ
َ
ع
ئ
ح
ٔ
ض
ُ
ذ
أ
ى
ط
ظ
ث
ة
ً
ء
ؤ
ْ
ۀ
إ
ٍ
ٌ
ٰ
ٓ
ٱ
s
c
e
n
w
N
E
W
Y
D
O
H
A
d
z
r
T
G
o
t
x
h
b
B
M
Z
u
P
F
y
q
U
K
k
J
Q
'
X
#
?
%
$
,
:
&
!
-
(
É
@
é
+
f
e
n
c
h
_
i
m
g
/
r
v
a
l
t
w
o
d
6
1
.
p
B
u
2
à
3
R
y
4
U
E
A
5
P
O
S
T
D
7
Z
8
I
N
L
G
M
H
0
J
K
-
9
F
C
V
é
X
'
s
Q
:
è
x
b
Y
Œ
É
z
W
Ç
È
k
Ô
ô
À
Ê
q
ù
°
ê
î
*
Â
j
"
,
â
%
û
ç
ü
?
!
;
ö
(
)
ï
º
ó
ø
å
+
á
Ë
<
²
Á
Î
&
@
œ
ε
Ü
ë
[
]
í
ò
Ö
ä
ß
«
»
ú
ñ
æ
µ
³
Å
$
#
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
;
=
>
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
[
]
_
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
£
§
­
°
´
µ
·
º
¿
Á
Ä
Å
É
Ï
Ô
Ö
Ü
ß
à
á
â
ã
ä
å
æ
ç
è
é
ê
ë
í
ï
ñ
ò
ó
ô
ö
ø
ù
ú
û
ü
ō
Š
Ÿ
ʒ
β
δ
з
©
ª
«
¬
!
#
$
%
&
'
(
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
_
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
É
é
ि
i
t
_
m
g
/
5
I
L
S
V
R
C
2
0
1
v
a
l
7
8
9
6
.
j
p
e
r
o
d
s
n
3
4
P
u
c
A
-
,
"
z
h
f
b
q
ì
'
à
O
è
G
ù
é
ò
;
F
E
B
N
H
k
:
U
T
X
D
K
?
[
M
­
x
y
(
)
W
ö
º
w
]
Q
J
+
ü
!
È
á
%
=
»
ñ
Ö
Y
ä
í
Z
«
@
ó
ø
ï
ú
ê
ç
Á
É
Å
ß
{
}
&
`
û
î
#
$
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment