Commit 41f1fb8a authored by myhloli's avatar myhloli
Browse files

refactor(ocr): remove unused OCR dictionaries and update model configurations

- Remove unused OCR dictionaries for Arabic, Belarusian, Bulgarian and Armenian languages
- Update model configurations in arch_config.yaml:
  - Comment out 'out_channels' for various language models
  - Rename Arabic, Korean, Japanese, Tamil, Telugu, Kannada and Devanagari model configurations to use 'v3' instead of 'v4'
- Delete ar_dict.txt, be_dict.txt and bg_dict.txt files
- Update arabic_dict.txt to remove blank line at the start
parent b3d6785d
lang:
ch:
det: ch_PP-OCRv4_det_infer.pth
rec: ch_PP-OCRv4_rec_infer.pth
dict: ppocr_keys_v1.txt
en:
det: en_PP-OCRv3_det_infer.pth
rec: en_PP-OCRv4_rec_infer.pth
dict: en_dict.txt
korean:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: korean_PP-OCRv3_rec_infer.pth
dict: korean_dict.txt
japan:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: japan_PP-OCRv3_rec_infer.pth
dict: japan_dict.txt
chinese_cht:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: chinese_cht_PP-OCRv3_rec_infer.pth
dict: chinese_cht_dict.txt
ta:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: ta_PP-OCRv3_rec_infer.pth
dict: ta_dict.txt
te:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: te_PP-OCRv3_rec_infer.pth
dict: te_dict.txt
ka:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: ka_PP-OCRv3_rec_infer.pth
dict: ka_dict.txt
latin:
det: en_PP-OCRv3_det_infer.pth
rec: latin_PP-OCRv3_rec_infer.pth
dict: latin_dict.txt
arabic:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: arabic_PP-OCRv3_rec_infer.pth
dict: arabic_dict.txt
cyrillic:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: cyrillic_PP-OCRv3_rec_infer.pth
dict: cyrillic_dict.txt
devanagari:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: devanagari_PP-OCRv3_rec_infer.pth
dict: devanagari_dict.txt
\ No newline at end of file
# Copyright (c) Opendatalab. All rights reserved. # Copyright (c) Opendatalab. All rights reserved.
import copy import copy
import os.path
from pathlib import Path
import cv2 import cv2
import numpy as np import numpy as np
import yaml
from loguru import logger from loguru import logger
from magic_pdf.libs.config_reader import get_device from magic_pdf.libs.config_reader import get_device, get_local_models_dir
from .ocr_utils import check_img, preprocess_image, sorted_boxes, merge_det_boxes, update_det_boxes, get_rotate_crop_image from .ocr_utils import check_img, preprocess_image, sorted_boxes, merge_det_boxes, update_det_boxes, get_rotate_crop_image
from .tools.infer.predict_system import TextSystem from .tools.infer.predict_system import TextSystem
from .tools.infer import pytorchocr_utility as utility from .tools.infer import pytorchocr_utility as utility
import argparse import argparse
# Language codes grouped by script family. PytorchPaddleOCR.__init__ swaps a
# requested code found in one of these groups for the group's shared key
# ('latin', 'arabic', 'cyrillic' or 'devanagari').
latin_lang = (
    'af az bs cs cy da de es et fr ga hr '
    'hu id is it ku la lt lv mi ms mt nl '
    'no oc pi pl pt ro rs_latin sk sl sq sv '
    'sw tl tr uz vi french german'
).split()
arabic_lang = 'ar fa ug ur'.split()
cyrillic_lang = (
    'ru rs_cyrillic be bg uk mn abq ady kbd ava '
    'dar inh che lbe lez tab'
).split()
devanagari_lang = (
    'hi mr ne bh mai ang bho mah sck new gom '
    'sa bgc'
).split()
def get_model_params(lang, config):
    """Return the model file names configured for ``lang``.

    Args:
        lang: Language key to look up in the ``'lang'`` section of ``config``.
        config: Parsed configuration mapping. Its ``'lang'`` entry maps each
            language key to a mapping with ``'det'``, ``'rec'`` and ``'dict'``
            entries.

    Returns:
        A ``(det, rec, dict_file)`` tuple. An entry that is absent from the
        language's mapping is returned as ``None``.

    Raises:
        ValueError: If ``lang`` has no entry in the configuration. This is a
            subclass of ``Exception``, so callers catching the previously
            raised generic ``Exception`` keep working.
    """
    # A missing or empty 'lang' section is treated as "no languages" so the
    # caller gets the same "not supported" error instead of an unrelated
    # KeyError/TypeError.
    languages = config.get('lang') or {}
    try:
        params = languages[lang]
    except KeyError:
        raise ValueError(f'Language {lang} not supported') from None
    return params.get('det'), params.get('rec'), params.get('dict')
# Directory containing this module; models_config.yml and the bundled
# character dictionaries are resolved relative to it.
root_dir = Path(__file__).resolve().parent
class PytorchPaddleOCR(TextSystem): class PytorchPaddleOCR(TextSystem):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
parser = utility.init_args() parser = utility.init_args()
args = parser.parse_args(args) args = parser.parse_args(args)
self.lang = kwargs.get('lang', 'ch') self.lang = kwargs.get('lang', 'ch')
if self.lang in latin_lang:
self.lang = 'latin'
elif self.lang in arabic_lang:
self.lang = 'arabic'
elif self.lang in cyrillic_lang:
self.lang = 'cyrillic'
elif self.lang in devanagari_lang:
self.lang = 'devanagari'
else:
pass
if self.lang == 'ch': models_config_path = os.path.join(root_dir, 'models_config.yml')
kwargs['det_model_path'] = "/Users/myhloli/Downloads/ch_ptocr_v4_det_infer.pth" with open(models_config_path) as file:
kwargs['rec_model_path'] = "/Users/myhloli/Downloads/ch_ptocr_v4_rec_infer.pth" config = yaml.safe_load(file)
det, rec, dict_file = get_model_params(self.lang, config)
ocr_models_dir = os.path.join(get_local_models_dir(), 'OCR', 'paddleocr_torch')
kwargs['det_model_path'] = os.path.join(ocr_models_dir, det)
kwargs['rec_model_path'] = os.path.join(ocr_models_dir, rec)
kwargs['rec_char_dict_path'] = os.path.join(root_dir, 'pytorchocr', 'utils', 'dict', dict_file)
kwargs['device'] = get_device() kwargs['device'] = get_device()
......
...@@ -172,7 +172,7 @@ chinese_cht_PP-OCRv3_rec_infer: ...@@ -172,7 +172,7 @@ chinese_cht_PP-OCRv3_rec_infer:
use_guide: True use_guide: True
Head: Head:
name: CTCHead name: CTCHead
out_channels: 8423 # out_channels: 8423
fc_decay: 0.00001 fc_decay: 0.00001
latin_PP-OCRv3_rec_infer: latin_PP-OCRv3_rec_infer:
...@@ -193,7 +193,7 @@ latin_PP-OCRv3_rec_infer: ...@@ -193,7 +193,7 @@ latin_PP-OCRv3_rec_infer:
use_guide: True use_guide: True
Head: Head:
name: CTCHead name: CTCHead
out_channels: 187 # out_channels: 187
fc_decay: 0.00001 fc_decay: 0.00001
cyrillic_PP-OCRv3_rec_infer: cyrillic_PP-OCRv3_rec_infer:
...@@ -214,10 +214,10 @@ cyrillic_PP-OCRv3_rec_infer: ...@@ -214,10 +214,10 @@ cyrillic_PP-OCRv3_rec_infer:
use_guide: True use_guide: True
Head: Head:
name: CTCHead name: CTCHead
out_channels: 165 # out_channels: 165
fc_decay: 0.00001 fc_decay: 0.00001
arabic_PP-OCRv4_rec_infer: arabic_PP-OCRv3_rec_infer:
model_type: rec model_type: rec
algorithm: SVTR algorithm: SVTR
Transform: Transform:
...@@ -235,10 +235,10 @@ arabic_PP-OCRv4_rec_infer: ...@@ -235,10 +235,10 @@ arabic_PP-OCRv4_rec_infer:
use_guide: True use_guide: True
Head: Head:
name: CTCHead name: CTCHead
out_channels: 164 # out_channels: 164
fc_decay: 0.00001 fc_decay: 0.00001
korean_PP-OCRv4_rec_infer: korean_PP-OCRv3_rec_infer:
model_type: rec model_type: rec
algorithm: SVTR algorithm: SVTR
Transform: Transform:
...@@ -256,10 +256,10 @@ korean_PP-OCRv4_rec_infer: ...@@ -256,10 +256,10 @@ korean_PP-OCRv4_rec_infer:
use_guide: True use_guide: True
Head: Head:
name: CTCHead name: CTCHead
out_channels: 3690 # out_channels: 3690
fc_decay: 0.00001 fc_decay: 0.00001
japan_PP-OCRv4_rec_infer: japan_PP-OCRv3_rec_infer:
model_type: rec model_type: rec
algorithm: SVTR algorithm: SVTR
Transform: Transform:
...@@ -277,10 +277,10 @@ japan_PP-OCRv4_rec_infer: ...@@ -277,10 +277,10 @@ japan_PP-OCRv4_rec_infer:
use_guide: True use_guide: True
Head: Head:
name: CTCHead name: CTCHead
out_channels: 4401 # out_channels: 4401
fc_decay: 0.00001 fc_decay: 0.00001
ta_PP-OCRv4_rec_infer: ta_PP-OCRv3_rec_infer:
model_type: rec model_type: rec
algorithm: SVTR algorithm: SVTR
Transform: Transform:
...@@ -298,10 +298,10 @@ ta_PP-OCRv4_rec_infer: ...@@ -298,10 +298,10 @@ ta_PP-OCRv4_rec_infer:
use_guide: True use_guide: True
Head: Head:
name: CTCHead name: CTCHead
out_channels: 130 # out_channels: 130
fc_decay: 0.00001 fc_decay: 0.00001
te_PP-OCRv4_rec_infer: te_PP-OCRv3_rec_infer:
model_type: rec model_type: rec
algorithm: SVTR algorithm: SVTR
Transform: Transform:
...@@ -319,10 +319,10 @@ te_PP-OCRv4_rec_infer: ...@@ -319,10 +319,10 @@ te_PP-OCRv4_rec_infer:
use_guide: True use_guide: True
Head: Head:
name: CTCHead name: CTCHead
out_channels: 153 # out_channels: 153
fc_decay: 0.00001 fc_decay: 0.00001
ka_PP-OCRv4_rec_infer: ka_PP-OCRv3_rec_infer:
model_type: rec model_type: rec
algorithm: SVTR algorithm: SVTR
Transform: Transform:
...@@ -340,10 +340,10 @@ ka_PP-OCRv4_rec_infer: ...@@ -340,10 +340,10 @@ ka_PP-OCRv4_rec_infer:
use_guide: True use_guide: True
Head: Head:
name: CTCHead name: CTCHead
out_channels: 155 # out_channels: 155
fc_decay: 0.00001 fc_decay: 0.00001
devanagari_PP-OCRv4_rec_infer: devanagari_PP-OCRv3_rec_infer:
model_type: rec model_type: rec
algorithm: SVTR algorithm: SVTR
Transform: Transform:
...@@ -361,6 +361,6 @@ devanagari_PP-OCRv4_rec_infer: ...@@ -361,6 +361,6 @@ devanagari_PP-OCRv4_rec_infer:
use_guide: True use_guide: True
Head: Head:
name: CTCHead name: CTCHead
out_channels: 169 # out_channels: 169
fc_decay: 0.00001 fc_decay: 0.00001
0
1
2
3
4
5
6
7
8
9
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
:
;
<
=
>
?
@
[
\
]
^
_
`
{
|
}
~
\ No newline at end of file
a
r
b
i
c
_
m
g
/
1
0
I
L
S
V
R
C
2
v
l
6
3
9
.
j
p
ا
ل
م
ر
ج
و
ح
ي
ة
5
8
7
أ
ب
ض
4
ك
س
ه
ث
ن
ط
ع
ت
غ
خ
ف
ئ
ز
إ
د
ص
ظ
ذ
ش
ى
ق
ؤ
آ
ء
s
e
n
w
t
u
z
d
A
N
G
h
o
E
T
H
O
B
y
F
U
J
X
W
P
Z
M
k
q
Y
Q
D
f
K
x
'
%
-
#
@
!
&
$
,
:
é
?
+
É
(
b
e
_
i
m
g
/
2
0
I
L
S
V
R
C
1
v
a
l
6
9
4
3
.
j
p
п
а
з
б
у
г
н
ц
ь
8
м
л
і
о
ў
ы
7
5
М
х
с
р
ф
я
е
д
ж
ю
ч
й
к
Д
в
Б
т
І
ш
ё
э
К
Л
Н
А
Ж
Г
В
П
З
Е
О
Р
С
У
Ё
Й
Т
Ч
Э
Ц
Ю
Ш
Ф
Х
Я
Ь
Ы
Ў
s
c
n
w
M
o
t
T
E
A
B
u
h
y
k
r
H
d
Y
O
U
F
f
x
D
G
N
K
P
z
J
X
W
Z
Q
%
-
q
@
'
!
#
&
,
:
$
(
?
é
+
É
!
#
$
%
&
'
(
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
_
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
É
é
А
Б
В
Г
Д
Е
Ж
З
И
Й
К
Л
М
Н
О
П
Р
С
Т
У
Ф
Х
Ц
Ч
Ш
Щ
Ъ
Ю
Я
а
б
в
г
д
е
ж
з
и
й
к
л
м
н
о
п
р
с
т
у
ф
х
ц
ч
ш
щ
ъ
ь
ю
я
...@@ -8,32 +8,13 @@ ...@@ -8,32 +8,13 @@
7 7
8 8
9 9
a :
b ;
c <
d =
e >
f ?
g @
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
A A
B B
C C
...@@ -60,4 +41,55 @@ W ...@@ -60,4 +41,55 @@ W
X X
Y Y
Z Z
[
\
]
^
_
`
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
{
|
}
~
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
x
i
_
m
g
/
1
0
I
L
S
V
R
C
2
v
a
l
3
6
4
5
.
j
p
Q
u
e
r
o
8
7
n
c
9
t
b
é
q
d
ó
y
F
s
,
O
í
T
f
"
U
M
h
:
P
H
A
E
D
z
N
á
ñ
ú
%
;
è
+
Y
-
B
G
(
)
¿
?
w
¡
!
X
É
K
k
Á
ü
Ú
«
»
J
'
ö
W
Z
º
Ö
­
[
]
Ç
ç
à
ä
û
ò
Í
ê
ô
ø
ª
f
a
_
i
m
g
/
1
3
I
L
S
V
R
C
2
0
v
l
6
8
5
.
j
p
و
د
ر
ك
ن
ش
ه
ا
4
9
ی
ج
ِ
7
غ
ل
س
ز
ّ
ت
ک
گ
ي
م
ب
ف
چ
خ
ق
ژ
آ
ص
پ
َ
ع
ئ
ح
ٔ
ض
ُ
ذ
أ
ى
ط
ظ
ث
ة
ً
ء
ؤ
ْ
ۀ
إ
ٍ
ٌ
ٰ
ٓ
ٱ
s
c
e
n
w
N
E
W
Y
D
O
H
A
d
z
r
T
G
o
t
x
h
b
B
M
Z
u
P
F
y
q
U
K
k
J
Q
'
X
#
?
%
$
,
:
&
!
-
(
É
@
é
+
f
e
n
c
h
_
i
m
g
/
r
v
a
l
t
w
o
d
6
1
.
p
B
u
2
à
3
R
y
4
U
E
A
5
P
O
S
T
D
7
Z
8
I
N
L
G
M
H
0
J
K
-
9
F
C
V
é
X
'
s
Q
:
è
x
b
Y
Œ
É
z
W
Ç
È
k
Ô
ô
À
Ê
q
ù
°
ê
î
*
Â
j
"
,
â
%
û
ç
ü
?
!
;
ö
(
)
ï
º
ó
ø
å
+
á
Ë
<
²
Á
Î
&
@
œ
ε
Ü
ë
[
]
í
ò
Ö
ä
ß
«
»
ú
ñ
æ
µ
³
Å
$
#
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
;
=
>
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
[
]
_
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
£
§
­
°
´
µ
·
º
¿
Á
Ä
Å
É
Ï
Ô
Ö
Ü
ß
à
á
â
ã
ä
å
æ
ç
è
é
ê
ë
í
ï
ñ
ò
ó
ô
ö
ø
ù
ú
û
ü
ō
Š
Ÿ
ʒ
β
δ
з
©
ª
«
¬
!
#
$
%
&
'
(
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
_
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
É
é
ि
i
t
_
m
g
/
5
I
L
S
V
R
C
2
0
1
v
a
l
7
8
9
6
.
j
p
e
r
o
d
s
n
3
4
P
u
c
A
-
,
"
z
h
f
b
q
ì
'
à
O
è
G
ù
é
ò
;
F
E
B
N
H
k
:
U
T
X
D
K
?
[
M
­
x
y
(
)
W
ö
º
w
]
Q
J
+
ü
!
È
á
%
=
»
ñ
Ö
Y
ä
í
Z
«
@
ó
ø
ï
ú
ê
ç
Á
É
Å
ß
{
}
&
`
û
î
#
$
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment