Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
3a166bf1
Unverified
Commit
3a166bf1
authored
Oct 28, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Oct 28, 2024
Browse files
Merge pull request #802 from papayalove/dev-table-model-update
perf: table model update with PP OCRv4
parents
37c335ae
4949408c
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
23 additions
and
3 deletions
+23
-3
magic_pdf/libs/Constants.py
magic_pdf/libs/Constants.py
+8
-2
magic_pdf/model/pdf_extract_kit.py
magic_pdf/model/pdf_extract_kit.py
+15
-1
No files found.
magic_pdf/libs/Constants.py
View file @
3a166bf1
...
@@ -23,14 +23,20 @@ TABLE_MASTER_DICT = "table_master_structure_dict.txt"
...
@@ -23,14 +23,20 @@ TABLE_MASTER_DICT = "table_master_structure_dict.txt"
TABLE_MASTER_DIR
=
"table_structure_tablemaster_infer/"
TABLE_MASTER_DIR
=
"table_structure_tablemaster_infer/"
# pp detect model dir
# pp detect model dir
DETECT_MODEL_DIR
=
"ch_PP-OCRv
3
_det_infer"
DETECT_MODEL_DIR
=
"ch_PP-OCRv
4
_det_infer"
# pp rec model dir
# pp rec model dir
REC_MODEL_DIR
=
"ch_PP-OCRv
3
_rec_infer"
REC_MODEL_DIR
=
"ch_PP-OCRv
4
_rec_infer"
# pp rec char dict path
# pp rec char dict path
REC_CHAR_DICT
=
"ppocr_keys_v1.txt"
REC_CHAR_DICT
=
"ppocr_keys_v1.txt"
# pp rec copy rec directory
PP_REC_DIRECTORY
=
".paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer"
# pp rec copy det directory
PP_DET_DIRECTORY
=
".paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer"
class
MODEL_NAME
:
class
MODEL_NAME
:
# pp table structure algorithm
# pp table structure algorithm
...
...
magic_pdf/model/pdf_extract_kit.py
View file @
3a166bf1
from
loguru
import
logger
from
loguru
import
logger
import
os
import
os
import
time
import
time
from
pathlib
import
Path
import
shutil
from
magic_pdf.libs.Constants
import
*
from
magic_pdf.libs.Constants
import
*
from
magic_pdf.libs.clean_memory
import
clean_memory
from
magic_pdf.libs.clean_memory
import
clean_memory
from
magic_pdf.model.model_list
import
AtomicModel
from
magic_pdf.model.model_list
import
AtomicModel
...
@@ -297,6 +298,17 @@ class CustomPEKModel:
...
@@ -297,6 +298,17 @@ class CustomPEKModel:
device
=
self
.
device
device
=
self
.
device
)
)
home_directory
=
Path
.
home
()
det_source
=
os
.
path
.
join
(
models_dir
,
table_model_dir
,
DETECT_MODEL_DIR
)
rec_source
=
os
.
path
.
join
(
models_dir
,
table_model_dir
,
REC_MODEL_DIR
)
det_dest_dir
=
os
.
path
.
join
(
home_directory
,
PP_DET_DIRECTORY
)
rec_dest_dir
=
os
.
path
.
join
(
home_directory
,
PP_REC_DIRECTORY
)
if
not
os
.
path
.
exists
(
det_dest_dir
):
shutil
.
copytree
(
det_source
,
det_dest_dir
)
if
not
os
.
path
.
exists
(
rec_dest_dir
):
shutil
.
copytree
(
rec_source
,
rec_dest_dir
)
logger
.
info
(
'DocAnalysis init done!'
)
logger
.
info
(
'DocAnalysis init done!'
)
def
__call__
(
self
,
image
):
def
__call__
(
self
,
image
):
...
@@ -472,3 +484,5 @@ class CustomPEKModel:
...
@@ -472,3 +484,5 @@ class CustomPEKModel:
logger
.
info
(
f
"-----page total time:
{
round
(
time
.
time
()
-
page_start
,
2
)
}
-----"
)
logger
.
info
(
f
"-----page total time:
{
round
(
time
.
time
()
-
page_start
,
2
)
}
-----"
)
return
layout_res
return
layout_res
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment