Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
43f21a77
Commit
43f21a77
authored
Jun 30, 2025
by
myhloli
Browse files
feat: add support for Korean and Latin configurations in OCR model processing
parent
86391acf
Changes
7
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
13061 additions
and
10 deletions
+13061
-10
mineru/cli/gradio_app.py
mineru/cli/gradio_app.py
+3
-2
mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py
mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py
+4
-1
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml
...leocr2pytorch/pytorchocr/utils/resources/arch_config.yaml
+79
-0
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_eslav_dict.txt
...ch/pytorchocr/utils/resources/dict/ppocrv5_eslav_dict.txt
+517
-0
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_korean_dict.txt
...h/pytorchocr/utils/resources/dict/ppocrv5_korean_dict.txt
+11945
-0
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_latin_dict.txt
...ch/pytorchocr/utils/resources/dict/ppocrv5_latin_dict.txt
+502
-0
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml
...eocr2pytorch/pytorchocr/utils/resources/models_config.yml
+11
-7
No files found.
mineru/cli/gradio_app.py
View file @
43f21a77
...
@@ -133,15 +133,16 @@ latin_lang = [
...
@@ -133,15 +133,16 @@ latin_lang = [
]
]
arabic_lang
=
[
'ar'
,
'fa'
,
'ug'
,
'ur'
]
arabic_lang
=
[
'ar'
,
'fa'
,
'ug'
,
'ur'
]
cyrillic_lang
=
[
cyrillic_lang
=
[
'ru'
,
'rs_cyrillic'
,
'b
e'
,
'bg'
,
'uk
'
,
'mn'
,
'abq'
,
'ady'
,
'kbd'
,
'ava'
,
# noqa: E126
'rs_cyrillic'
,
'b
g
'
,
'mn'
,
'abq'
,
'ady'
,
'kbd'
,
'ava'
,
# noqa: E126
'dar'
,
'inh'
,
'che'
,
'lbe'
,
'lez'
,
'tab'
'dar'
,
'inh'
,
'che'
,
'lbe'
,
'lez'
,
'tab'
]
]
east_slavic_lang
=
[
"ru"
,
"be"
,
"uk"
]
devanagari_lang
=
[
devanagari_lang
=
[
'hi'
,
'mr'
,
'ne'
,
'bh'
,
'mai'
,
'ang'
,
'bho'
,
'mah'
,
'sck'
,
'new'
,
'gom'
,
# noqa: E126
'hi'
,
'mr'
,
'ne'
,
'bh'
,
'mai'
,
'ang'
,
'bho'
,
'mah'
,
'sck'
,
'new'
,
'gom'
,
# noqa: E126
'sa'
,
'bgc'
'sa'
,
'bgc'
]
]
other_lang
=
[
'ch'
,
'ch_lite'
,
'ch_server'
,
'en'
,
'korean'
,
'japan'
,
'chinese_cht'
,
'ta'
,
'te'
,
'ka'
]
other_lang
=
[
'ch'
,
'ch_lite'
,
'ch_server'
,
'en'
,
'korean'
,
'japan'
,
'chinese_cht'
,
'ta'
,
'te'
,
'ka'
]
add_lang
=
[
'latin'
,
'arabic'
,
'cyrillic'
,
'devanagari'
]
add_lang
=
[
'latin'
,
'arabic'
,
'east_slavic'
,
'cyrillic'
,
'devanagari'
]
# all_lang = ['', 'auto']
# all_lang = ['', 'auto']
all_lang
=
[]
all_lang
=
[]
...
...
mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py
View file @
43f21a77
...
@@ -26,9 +26,10 @@ latin_lang = [
...
@@ -26,9 +26,10 @@ latin_lang = [
]
]
arabic_lang
=
[
'ar'
,
'fa'
,
'ug'
,
'ur'
]
arabic_lang
=
[
'ar'
,
'fa'
,
'ug'
,
'ur'
]
cyrillic_lang
=
[
cyrillic_lang
=
[
'ru'
,
'rs_cyrillic'
,
'b
e'
,
'bg'
,
'uk
'
,
'mn'
,
'abq'
,
'ady'
,
'kbd'
,
'ava'
,
# noqa: E126
'rs_cyrillic'
,
'b
g
'
,
'mn'
,
'abq'
,
'ady'
,
'kbd'
,
'ava'
,
# noqa: E126
'dar'
,
'inh'
,
'che'
,
'lbe'
,
'lez'
,
'tab'
'dar'
,
'inh'
,
'che'
,
'lbe'
,
'lez'
,
'tab'
]
]
east_slavic_lang
=
[
"ru"
,
"be"
,
"uk"
]
devanagari_lang
=
[
devanagari_lang
=
[
'hi'
,
'mr'
,
'ne'
,
'bh'
,
'mai'
,
'ang'
,
'bho'
,
'mah'
,
'sck'
,
'new'
,
'gom'
,
# noqa: E126
'hi'
,
'mr'
,
'ne'
,
'bh'
,
'mai'
,
'ang'
,
'bho'
,
'mah'
,
'sck'
,
'new'
,
'gom'
,
# noqa: E126
'sa'
,
'bgc'
'sa'
,
'bgc'
...
@@ -69,6 +70,8 @@ class PytorchPaddleOCR(TextSystem):
...
@@ -69,6 +70,8 @@ class PytorchPaddleOCR(TextSystem):
self
.
lang
=
'cyrillic'
self
.
lang
=
'cyrillic'
elif
self
.
lang
in
devanagari_lang
:
elif
self
.
lang
in
devanagari_lang
:
self
.
lang
=
'devanagari'
self
.
lang
=
'devanagari'
elif
self
.
lang
in
east_slavic_lang
:
self
.
lang
=
'east_slavic'
else
:
else
:
pass
pass
...
...
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml
View file @
43f21a77
...
@@ -490,3 +490,82 @@ devanagari_PP-OCRv3_rec_infer:
...
@@ -490,3 +490,82 @@ devanagari_PP-OCRv3_rec_infer:
# out_channels: 169
# out_channels: 169
fc_decay
:
0.00001
fc_decay
:
0.00001
korean_PP-OCRv5_rec_infer
:
model_type
:
rec
algorithm
:
SVTR_HGNet
Transform
:
Backbone
:
name
:
PPLCNetV3
scale
:
0.95
Head
:
name
:
MultiHead
out_channels_list
:
CTCLabelDecode
:
11947
head_list
:
-
CTCHead
:
Neck
:
name
:
svtr
dims
:
120
depth
:
2
hidden_dims
:
120
kernel_size
:
[
1
,
3
]
use_guide
:
True
Head
:
fc_decay
:
0.00001
-
NRTRHead
:
nrtr_dim
:
384
max_text_length
:
25
latin_PP-OCRv5_rec_infer
:
model_type
:
rec
algorithm
:
SVTR_HGNet
Transform
:
Backbone
:
name
:
PPLCNetV3
scale
:
0.95
Head
:
name
:
MultiHead
out_channels_list
:
CTCLabelDecode
:
504
head_list
:
-
CTCHead
:
Neck
:
name
:
svtr
dims
:
120
depth
:
2
hidden_dims
:
120
kernel_size
:
[
1
,
3
]
use_guide
:
True
Head
:
fc_decay
:
0.00001
-
NRTRHead
:
nrtr_dim
:
384
max_text_length
:
25
eslav_PP-OCRv5_rec_infer
:
model_type
:
rec
algorithm
:
SVTR_HGNet
Transform
:
Backbone
:
name
:
PPLCNetV3
scale
:
0.95
Head
:
name
:
MultiHead
out_channels_list
:
CTCLabelDecode
:
519
head_list
:
-
CTCHead
:
Neck
:
name
:
svtr
dims
:
120
depth
:
2
hidden_dims
:
120
kernel_size
:
[
1
,
3
]
use_guide
:
True
Head
:
fc_decay
:
0.00001
-
NRTRHead
:
nrtr_dim
:
384
max_text_length
:
25
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_eslav_dict.txt
0 → 100644
View file @
43f21a77
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
;
<
=
>
?
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
[
]
_
`
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
©
‥
{
}
\
|
@
^
~
÷
∕
∙
⋅
·
±
∓
∩
∪
□
←
↔
⇒
⇐
⇔
∀
∃
∄
∴
∵
∝
∞
⊥
∟
∠
∡
∢
′
″
∥
⊾
⊿
∂
∫
∬
∭
∮
∯
∰
∑
∏
√
∛
∜
∱
∲
∳
∶
∷
∼
®
℉
Ω
℧
Å
⌀
ℏ
⅀
⍺
⍵
¢
€
£
¥
₿
Ⅰ
Ⅱ
Ⅲ
Ⅳ
Ⅴ
Ⅵ
Ⅶ
Ⅷ
Ⅸ
Ⅹ
Ⅺ
Ⅻ
ⅰ
ⅱ
ⅲ
ⅳ
ⅴ
ⅵ
ⅶ
ⅷ
ⅸ
ⅹ
ⅺ
ⅻ
➀
➁
➂
➃
➄
➅
➆
➇
➈
➉
➊
➋
➌
➍
➎
➏
➐
➑
➒
➓
❶
❷
❸
❹
❺
❻
❼
❽
❾
❿
①
②
③
④
⑤
⑥
⑦
⑧
⑨
⑩
●
▶
𝑢
︽
–
﹥
𝜓
•
∋
ƒ
०
⬆
Ạ
◀
▫
︾
À
Á
Â
Ã
Ä
Å
Æ
Ç
È
É
Ê
Ë
Ì
Í
Î
Ï
Ð
Ñ
Ò
Ó
Ô
Õ
Ö
Ø
Ù
Ú
Û
Ü
Ý
Þ
à
á
â
ã
ä
å
æ
ç
è
é
ê
ë
ì
í
î
ï
ð
ñ
ò
ó
ô
õ
ö
ø
ù
ú
û
ü
ý
þ
ÿ
¡
¤
¦
§
¨
ª
«
¬
¯
°
²
³
´
µ
¶
¸
¹
º
»
¼
½
¾
¿
×
‐
‑
‒
—
―
‖
‗
‘
’
‚
‛
“
”
„
‟
†
‡
‣
․
…
‧
‰
‴
‵
‶
‷
‸
‹
›
※
‼
‽
‾
₤
₡
₹
−
∖
∗
≈
≠
≡
≤
≥
⊂
⊃
↑
→
↓
↕
™
Ω
℮
∆
✓
✗
✘
▪
◼
✔
✕
☑
☒
№
₽
₴
Α
α
Β
β
Γ
γ
Δ
δ
Ε
ε
Ζ
ζ
Η
η
Θ
θ
Ι
ι
Κ
κ
Λ
λ
Μ
μ
Ν
ν
Ξ
ξ
Ο
ο
Π
π
Ρ
ρ
Σ
σ
ς
Τ
τ
Υ
υ
Φ
φ
Χ
χ
Ψ
ψ
ω
А
Б
В
Г
Ґ
Д
Е
Ё
Є
Ж
З
И
І
Ї
Й
К
Л
М
Н
О
П
Р
С
Т
У
Ў
Ф
Х
Ц
Ч
Ш
Щ
Ъ
Ы
Ь
Э
Ю
Я
а
б
в
г
ґ
д
е
ё
є
ж
з
и
і
ї
й
к
л
м
н
о
п
р
с
т
у
ў
ф
х
ц
ч
ш
щ
ъ
ы
ь
э
ю
я
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_korean_dict.txt
0 → 100644
View file @
43f21a77
This diff is collapsed.
Click to expand it.
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_latin_dict.txt
0 → 100644
View file @
43f21a77
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
;
<
=
>
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
[
\
]
^
_
`
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
{
|
}
~
¡
¢
£
¤
¥
¦
§
¨
©
ª
«
¬
®
¯
°
±
²
³
´
µ
¶
·
¸
¹
º
»
¼
½
¾
¿
À
Á
Â
Ã
Ä
Å
Æ
Ç
È
É
Ê
Ë
Ì
Í
Î
Ï
Ð
Ñ
Ò
Ó
Ô
Õ
Ö
×
Ø
Ù
Ú
Û
Ü
Ý
Þ
ß
à
á
â
ã
ä
å
æ
ç
è
é
ê
ë
ì
í
î
ï
ð
ñ
ò
ó
ô
õ
ö
÷
ø
ù
ú
û
ü
ý
þ
ÿ
Ą
ą
Ć
ć
Č
č
Ď
ď
Đ
đ
Ė
ė
Ę
ę
Ě
ě
Ğ
ğ
Į
į
İ
ı
Ĺ
ĺ
Ľ
ľ
Ł
ł
Ń
ń
Ň
ň
ō
Ő
ő
Œ
œ
Ŕ
ŕ
Ř
ř
Ś
ś
Ş
ş
Š
š
Ť
ť
Ū
ū
Ů
ů
Ű
ű
Ų
ų
Ÿ
Ź
ź
Ż
ż
Ž
ž
ƒ
ʒ
Ω
α
β
γ
δ
ε
ζ
η
θ
ι
κ
λ
μ
ν
ξ
ο
π
ρ
ς
σ
τ
υ
φ
χ
ψ
ω
з
०
Ṡ
ẞ
Ạ
‐
‑
‒
–
—
―
‖
‗
‘
’
‚
‛
“
”
„
‟
†
‡
•
‣
․
‥
…
‧
‰
′
″
‴
‵
‶
‷
‸
‹
›
※
‼
‽
‾
⁄
₂
₃
₡
₤
€
₴
₹
₽
₿
℉
ℏ
№
™
Ω
℧
Å
℮
⅀
Ⅰ
Ⅱ
Ⅲ
Ⅳ
Ⅴ
Ⅵ
Ⅶ
Ⅷ
Ⅸ
Ⅹ
Ⅺ
Ⅻ
ⅰ
ⅱ
ⅲ
ⅳ
ⅴ
ⅵ
ⅶ
ⅷ
ⅸ
ⅹ
ⅺ
ⅻ
←
↑
→
↓
↔
↕
⇐
⇒
⇔
∀
∂
∃
∄
∅
∆
∋
∏
∑
−
∓
∕
∖
∗
∙
√
∛
∜
∝
∞
∟
∠
∡
∢
∥
∧
∨
∩
∪
∫
∬
∭
∮
∯
∰
∱
∲
∳
∴
∵
∶
∷
∼
≈
≠
≡
≤
≥
⊂
⊃
⊥
⊾
⊿
⋅
⌀
⍵
⍺
①
②
③
④
⑤
⑥
⑦
⑧
⑨
⑩
─
│
└
├
■
□
▪
▫
▶
◀
●
◼
☑
☒
✓
✔
✕
✗
✘
❶
❷
❸
❹
❺
❻
❼
❽
❾
❿
➀
➁
➂
➃
➄
➅
➆
➇
➈
➉
➊
➋
➌
➍
➎
➏
➐
➑
➒
➓
⬆
、
fi
fl
︽
︾
﹥
�
𝑢
𝜓
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml
View file @
43f21a77
...
@@ -24,9 +24,9 @@ lang:
...
@@ -24,9 +24,9 @@ lang:
rec
:
en_PP-OCRv4_rec_infer.pth
rec
:
en_PP-OCRv4_rec_infer.pth
dict
:
en_dict.txt
dict
:
en_dict.txt
korean
:
korean
:
det
:
Multilingual
_PP-OCRv
3
_det_infer.pth
det
:
ch
_PP-OCRv
5
_det_infer.pth
rec
:
korean_PP-OCRv
3
_rec_infer.pth
rec
:
korean_PP-OCRv
5
_rec_infer.pth
dict
:
korean_dict.txt
dict
:
ppocrv5_
korean_dict.txt
japan
:
japan
:
det
:
ch_PP-OCRv5_det_infer.pth
det
:
ch_PP-OCRv5_det_infer.pth
rec
:
ch_PP-OCRv5_rec_server_infer.pth
rec
:
ch_PP-OCRv5_rec_server_infer.pth
...
@@ -48,9 +48,9 @@ lang:
...
@@ -48,9 +48,9 @@ lang:
rec
:
ka_PP-OCRv3_rec_infer.pth
rec
:
ka_PP-OCRv3_rec_infer.pth
dict
:
ka_dict.txt
dict
:
ka_dict.txt
latin
:
latin
:
det
:
en
_PP-OCRv
3
_det_infer.pth
det
:
ch
_PP-OCRv
5
_det_infer.pth
rec
:
latin_PP-OCRv
3
_rec_infer.pth
rec
:
latin_PP-OCRv
5
_rec_infer.pth
dict
:
latin_dict.txt
dict
:
ppocrv5_
latin_dict.txt
arabic
:
arabic
:
det
:
Multilingual_PP-OCRv3_det_infer.pth
det
:
Multilingual_PP-OCRv3_det_infer.pth
rec
:
arabic_PP-OCRv3_rec_infer.pth
rec
:
arabic_PP-OCRv3_rec_infer.pth
...
@@ -62,4 +62,8 @@ lang:
...
@@ -62,4 +62,8 @@ lang:
devanagari
:
devanagari
:
det
:
Multilingual_PP-OCRv3_det_infer.pth
det
:
Multilingual_PP-OCRv3_det_infer.pth
rec
:
devanagari_PP-OCRv3_rec_infer.pth
rec
:
devanagari_PP-OCRv3_rec_infer.pth
dict
:
devanagari_dict.txt
dict
:
devanagari_dict.txt
\ No newline at end of file
east_slavic
:
det
:
ch_PP-OCRv5_det_infer.pth
rec
:
eslav_PP-OCRv5_rec_infer.pth
dict
:
ppocrv5_eslav_dict.txt
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment