Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
43f21a77
You need to sign in or sign up before continuing.
Commit
43f21a77
authored
Jun 30, 2025
by
myhloli
Browse files
feat: add support for Korean and Latin configurations in OCR model processing
parent
86391acf
Changes
7
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
13061 additions
and
10 deletions
+13061
-10
mineru/cli/gradio_app.py
mineru/cli/gradio_app.py
+3
-2
mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py
mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py
+4
-1
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml
...leocr2pytorch/pytorchocr/utils/resources/arch_config.yaml
+79
-0
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_eslav_dict.txt
...ch/pytorchocr/utils/resources/dict/ppocrv5_eslav_dict.txt
+517
-0
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_korean_dict.txt
...h/pytorchocr/utils/resources/dict/ppocrv5_korean_dict.txt
+11945
-0
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_latin_dict.txt
...ch/pytorchocr/utils/resources/dict/ppocrv5_latin_dict.txt
+502
-0
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml
...eocr2pytorch/pytorchocr/utils/resources/models_config.yml
+11
-7
No files found.
mineru/cli/gradio_app.py
View file @
43f21a77
...
@@ -133,15 +133,16 @@ latin_lang = [
...
@@ -133,15 +133,16 @@ latin_lang = [
]
]
arabic_lang
=
[
'ar'
,
'fa'
,
'ug'
,
'ur'
]
arabic_lang
=
[
'ar'
,
'fa'
,
'ug'
,
'ur'
]
cyrillic_lang
=
[
cyrillic_lang
=
[
'ru'
,
'rs_cyrillic'
,
'b
e'
,
'bg'
,
'uk
'
,
'mn'
,
'abq'
,
'ady'
,
'kbd'
,
'ava'
,
# noqa: E126
'rs_cyrillic'
,
'b
g
'
,
'mn'
,
'abq'
,
'ady'
,
'kbd'
,
'ava'
,
# noqa: E126
'dar'
,
'inh'
,
'che'
,
'lbe'
,
'lez'
,
'tab'
'dar'
,
'inh'
,
'che'
,
'lbe'
,
'lez'
,
'tab'
]
]
east_slavic_lang
=
[
"ru"
,
"be"
,
"uk"
]
devanagari_lang
=
[
devanagari_lang
=
[
'hi'
,
'mr'
,
'ne'
,
'bh'
,
'mai'
,
'ang'
,
'bho'
,
'mah'
,
'sck'
,
'new'
,
'gom'
,
# noqa: E126
'hi'
,
'mr'
,
'ne'
,
'bh'
,
'mai'
,
'ang'
,
'bho'
,
'mah'
,
'sck'
,
'new'
,
'gom'
,
# noqa: E126
'sa'
,
'bgc'
'sa'
,
'bgc'
]
]
other_lang
=
[
'ch'
,
'ch_lite'
,
'ch_server'
,
'en'
,
'korean'
,
'japan'
,
'chinese_cht'
,
'ta'
,
'te'
,
'ka'
]
other_lang
=
[
'ch'
,
'ch_lite'
,
'ch_server'
,
'en'
,
'korean'
,
'japan'
,
'chinese_cht'
,
'ta'
,
'te'
,
'ka'
]
add_lang
=
[
'latin'
,
'arabic'
,
'cyrillic'
,
'devanagari'
]
add_lang
=
[
'latin'
,
'arabic'
,
'east_slavic'
,
'cyrillic'
,
'devanagari'
]
# all_lang = ['', 'auto']
# all_lang = ['', 'auto']
all_lang
=
[]
all_lang
=
[]
...
...
mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py
View file @
43f21a77
...
@@ -26,9 +26,10 @@ latin_lang = [
...
@@ -26,9 +26,10 @@ latin_lang = [
]
]
arabic_lang
=
[
'ar'
,
'fa'
,
'ug'
,
'ur'
]
arabic_lang
=
[
'ar'
,
'fa'
,
'ug'
,
'ur'
]
cyrillic_lang
=
[
cyrillic_lang
=
[
'ru'
,
'rs_cyrillic'
,
'b
e'
,
'bg'
,
'uk
'
,
'mn'
,
'abq'
,
'ady'
,
'kbd'
,
'ava'
,
# noqa: E126
'rs_cyrillic'
,
'b
g
'
,
'mn'
,
'abq'
,
'ady'
,
'kbd'
,
'ava'
,
# noqa: E126
'dar'
,
'inh'
,
'che'
,
'lbe'
,
'lez'
,
'tab'
'dar'
,
'inh'
,
'che'
,
'lbe'
,
'lez'
,
'tab'
]
]
east_slavic_lang
=
[
"ru"
,
"be"
,
"uk"
]
devanagari_lang
=
[
devanagari_lang
=
[
'hi'
,
'mr'
,
'ne'
,
'bh'
,
'mai'
,
'ang'
,
'bho'
,
'mah'
,
'sck'
,
'new'
,
'gom'
,
# noqa: E126
'hi'
,
'mr'
,
'ne'
,
'bh'
,
'mai'
,
'ang'
,
'bho'
,
'mah'
,
'sck'
,
'new'
,
'gom'
,
# noqa: E126
'sa'
,
'bgc'
'sa'
,
'bgc'
...
@@ -69,6 +70,8 @@ class PytorchPaddleOCR(TextSystem):
...
@@ -69,6 +70,8 @@ class PytorchPaddleOCR(TextSystem):
self
.
lang
=
'cyrillic'
self
.
lang
=
'cyrillic'
elif
self
.
lang
in
devanagari_lang
:
elif
self
.
lang
in
devanagari_lang
:
self
.
lang
=
'devanagari'
self
.
lang
=
'devanagari'
elif
self
.
lang
in
east_slavic_lang
:
self
.
lang
=
'east_slavic'
else
:
else
:
pass
pass
...
...
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml
View file @
43f21a77
...
@@ -490,3 +490,82 @@ devanagari_PP-OCRv3_rec_infer:
...
@@ -490,3 +490,82 @@ devanagari_PP-OCRv3_rec_infer:
# out_channels: 169
# out_channels: 169
fc_decay
:
0.00001
fc_decay
:
0.00001
korean_PP-OCRv5_rec_infer
:
model_type
:
rec
algorithm
:
SVTR_HGNet
Transform
:
Backbone
:
name
:
PPLCNetV3
scale
:
0.95
Head
:
name
:
MultiHead
out_channels_list
:
CTCLabelDecode
:
11947
head_list
:
-
CTCHead
:
Neck
:
name
:
svtr
dims
:
120
depth
:
2
hidden_dims
:
120
kernel_size
:
[
1
,
3
]
use_guide
:
True
Head
:
fc_decay
:
0.00001
-
NRTRHead
:
nrtr_dim
:
384
max_text_length
:
25
latin_PP-OCRv5_rec_infer
:
model_type
:
rec
algorithm
:
SVTR_HGNet
Transform
:
Backbone
:
name
:
PPLCNetV3
scale
:
0.95
Head
:
name
:
MultiHead
out_channels_list
:
CTCLabelDecode
:
504
head_list
:
-
CTCHead
:
Neck
:
name
:
svtr
dims
:
120
depth
:
2
hidden_dims
:
120
kernel_size
:
[
1
,
3
]
use_guide
:
True
Head
:
fc_decay
:
0.00001
-
NRTRHead
:
nrtr_dim
:
384
max_text_length
:
25
eslav_PP-OCRv5_rec_infer
:
model_type
:
rec
algorithm
:
SVTR_HGNet
Transform
:
Backbone
:
name
:
PPLCNetV3
scale
:
0.95
Head
:
name
:
MultiHead
out_channels_list
:
CTCLabelDecode
:
519
head_list
:
-
CTCHead
:
Neck
:
name
:
svtr
dims
:
120
depth
:
2
hidden_dims
:
120
kernel_size
:
[
1
,
3
]
use_guide
:
True
Head
:
fc_decay
:
0.00001
-
NRTRHead
:
nrtr_dim
:
384
max_text_length
:
25
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_eslav_dict.txt
0 → 100644
View file @
43f21a77
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
;
<
=
>
?
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
[
]
_
`
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
©
‥
{
}
\
|
@
^
~
÷
∕
∙
⋅
·
±
∓
∩
∪
□
←
↔
⇒
⇐
⇔
∀
∃
∄
∴
∵
∝
∞
⊥
∟
∠
∡
∢
′
″
∥
⊾
⊿
∂
∫
∬
∭
∮
∯
∰
∑
∏
√
∛
∜
∱
∲
∳
∶
∷
∼
®
℉
Ω
℧
Å
⌀
ℏ
⅀
⍺
⍵
¢
€
£
¥
₿
Ⅰ
Ⅱ
Ⅲ
Ⅳ
Ⅴ
Ⅵ
Ⅶ
Ⅷ
Ⅸ
Ⅹ
Ⅺ
Ⅻ
ⅰ
ⅱ
ⅲ
ⅳ
ⅴ
ⅵ
ⅶ
ⅷ
ⅸ
ⅹ
ⅺ
ⅻ
➀
➁
➂
➃
➄
➅
➆
➇
➈
➉
➊
➋
➌
➍
➎
➏
➐
➑
➒
➓
❶
❷
❸
❹
❺
❻
❼
❽
❾
❿
①
②
③
④
⑤
⑥
⑦
⑧
⑨
⑩
●
▶
𝑢
︽
–
﹥
𝜓
•
∋
ƒ
०
⬆
Ạ
◀
▫
︾
À
Á
Â
Ã
Ä
Å
Æ
Ç
È
É
Ê
Ë
Ì
Í
Î
Ï
Ð
Ñ
Ò
Ó
Ô
Õ
Ö
Ø
Ù
Ú
Û
Ü
Ý
Þ
à
á
â
ã
ä
å
æ
ç
è
é
ê
ë
ì
í
î
ï
ð
ñ
ò
ó
ô
õ
ö
ø
ù
ú
û
ü
ý
þ
ÿ
¡
¤
¦
§
¨
ª
«
¬
¯
°
²
³
´
µ
¶
¸
¹
º
»
¼
½
¾
¿
×
‐
‑
‒
—
―
‖
‗
‘
’
‚
‛
“
”
„
‟
†
‡
‣
․
…
‧
‰
‴
‵
‶
‷
‸
‹
›
※
‼
‽
‾
₤
₡
₹
−
∖
∗
≈
≠
≡
≤
≥
⊂
⊃
↑
→
↓
↕
™
Ω
℮
∆
✓
✗
✘
▪
◼
✔
✕
☑
☒
№
₽
₴
Α
α
Β
β
Γ
γ
Δ
δ
Ε
ε
Ζ
ζ
Η
η
Θ
θ
Ι
ι
Κ
κ
Λ
λ
Μ
μ
Ν
ν
Ξ
ξ
Ο
ο
Π
π
Ρ
ρ
Σ
σ
ς
Τ
τ
Υ
υ
Φ
φ
Χ
χ
Ψ
ψ
ω
А
Б
В
Г
Ґ
Д
Е
Ё
Є
Ж
З
И
І
Ї
Й
К
Л
М
Н
О
П
Р
С
Т
У
Ў
Ф
Х
Ц
Ч
Ш
Щ
Ъ
Ы
Ь
Э
Ю
Я
а
б
в
г
ґ
д
е
ё
є
ж
з
и
і
ї
й
к
л
м
н
о
п
р
с
т
у
ў
ф
х
ц
ч
ш
щ
ъ
ы
ь
э
ю
я
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_korean_dict.txt
0 → 100644
View file @
43f21a77
This diff is collapsed.
Click to expand it.
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_latin_dict.txt
0 → 100644
View file @
43f21a77
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
;
<
=
>
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
[
\
]
^
_
`
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
{
|
}
~
¡
¢
£
¤
¥
¦
§
¨
©
ª
«
¬
®
¯
°
±
²
³
´
µ
¶
·
¸
¹
º
»
¼
½
¾
¿
À
Á
Â
Ã
Ä
Å
Æ
Ç
È
É
Ê
Ë
Ì
Í
Î
Ï
Ð
Ñ
Ò
Ó
Ô
Õ
Ö
×
Ø
Ù
Ú
Û
Ü
Ý
Þ
ß
à
á
â
ã
ä
å
æ
ç
è
é
ê
ë
ì
í
î
ï
ð
ñ
ò
ó
ô
õ
ö
÷
ø
ù
ú
û
ü
ý
þ
ÿ
Ą
ą
Ć
ć
Č
č
Ď
ď
Đ
đ
Ė
ė
Ę
ę
Ě
ě
Ğ
ğ
Į
į
İ
ı
Ĺ
ĺ
Ľ
ľ
Ł
ł
Ń
ń
Ň
ň
ō
Ő
ő
Œ
œ
Ŕ
ŕ
Ř
ř
Ś
ś
Ş
ş
Š
š
Ť
ť
Ū
ū
Ů
ů
Ű
ű
Ų
ų
Ÿ
Ź
ź
Ż
ż
Ž
ž
ƒ
ʒ
Ω
α
β
γ
δ
ε
ζ
η
θ
ι
κ
λ
μ
ν
ξ
ο
π
ρ
ς
σ
τ
υ
φ
χ
ψ
ω
з
०
Ṡ
ẞ
Ạ
‐
‑
‒
–
—
―
‖
‗
‘
’
‚
‛
“
”
„
‟
†
‡
•
‣
․
‥
…
‧
‰
′
″
‴
‵
‶
‷
‸
‹
›
※
‼
‽
‾
⁄
₂
₃
₡
₤
€
₴
₹
₽
₿
℉
ℏ
№
™
Ω
℧
Å
℮
⅀
Ⅰ
Ⅱ
Ⅲ
Ⅳ
Ⅴ
Ⅵ
Ⅶ
Ⅷ
Ⅸ
Ⅹ
Ⅺ
Ⅻ
ⅰ
ⅱ
ⅲ
ⅳ
ⅴ
ⅵ
ⅶ
ⅷ
ⅸ
ⅹ
ⅺ
ⅻ
←
↑
→
↓
↔
↕
⇐
⇒
⇔
∀
∂
∃
∄
∅
∆
∋
∏
∑
−
∓
∕
∖
∗
∙
√
∛
∜
∝
∞
∟
∠
∡
∢
∥
∧
∨
∩
∪
∫
∬
∭
∮
∯
∰
∱
∲
∳
∴
∵
∶
∷
∼
≈
≠
≡
≤
≥
⊂
⊃
⊥
⊾
⊿
⋅
⌀
⍵
⍺
①
②
③
④
⑤
⑥
⑦
⑧
⑨
⑩
─
│
└
├
■
□
▪
▫
▶
◀
●
◼
☑
☒
✓
✔
✕
✗
✘
❶
❷
❸
❹
❺
❻
❼
❽
❾
❿
➀
➁
➂
➃
➄
➅
➆
➇
➈
➉
➊
➋
➌
➍
➎
➏
➐
➑
➒
➓
⬆
、
fi
fl
︽
︾
﹥
�
𝑢
𝜓
mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml
View file @
43f21a77
...
@@ -24,9 +24,9 @@ lang:
...
@@ -24,9 +24,9 @@ lang:
rec
:
en_PP-OCRv4_rec_infer.pth
rec
:
en_PP-OCRv4_rec_infer.pth
dict
:
en_dict.txt
dict
:
en_dict.txt
korean
:
korean
:
det
:
Multilingual
_PP-OCRv
3
_det_infer.pth
det
:
ch
_PP-OCRv
5
_det_infer.pth
rec
:
korean_PP-OCRv
3
_rec_infer.pth
rec
:
korean_PP-OCRv
5
_rec_infer.pth
dict
:
korean_dict.txt
dict
:
ppocrv5_
korean_dict.txt
japan
:
japan
:
det
:
ch_PP-OCRv5_det_infer.pth
det
:
ch_PP-OCRv5_det_infer.pth
rec
:
ch_PP-OCRv5_rec_server_infer.pth
rec
:
ch_PP-OCRv5_rec_server_infer.pth
...
@@ -48,9 +48,9 @@ lang:
...
@@ -48,9 +48,9 @@ lang:
rec
:
ka_PP-OCRv3_rec_infer.pth
rec
:
ka_PP-OCRv3_rec_infer.pth
dict
:
ka_dict.txt
dict
:
ka_dict.txt
latin
:
latin
:
det
:
en
_PP-OCRv
3
_det_infer.pth
det
:
ch
_PP-OCRv
5
_det_infer.pth
rec
:
latin_PP-OCRv
3
_rec_infer.pth
rec
:
latin_PP-OCRv
5
_rec_infer.pth
dict
:
latin_dict.txt
dict
:
ppocrv5_
latin_dict.txt
arabic
:
arabic
:
det
:
Multilingual_PP-OCRv3_det_infer.pth
det
:
Multilingual_PP-OCRv3_det_infer.pth
rec
:
arabic_PP-OCRv3_rec_infer.pth
rec
:
arabic_PP-OCRv3_rec_infer.pth
...
@@ -62,4 +62,8 @@ lang:
...
@@ -62,4 +62,8 @@ lang:
devanagari
:
devanagari
:
det
:
Multilingual_PP-OCRv3_det_infer.pth
det
:
Multilingual_PP-OCRv3_det_infer.pth
rec
:
devanagari_PP-OCRv3_rec_infer.pth
rec
:
devanagari_PP-OCRv3_rec_infer.pth
dict
:
devanagari_dict.txt
dict
:
devanagari_dict.txt
\ No newline at end of file
east_slavic
:
det
:
ch_PP-OCRv5_det_infer.pth
rec
:
eslav_PP-OCRv5_rec_infer.pth
dict
:
ppocrv5_eslav_dict.txt
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment