wangsen / MinerU
Commit b50f742f, authored Mar 13, 2025 by icecraft
parent 3a2f86a1

feat: add parallel evaluation

Showing 7 changed files with 141 additions and 131 deletions (+141 / -131)
magic_pdf/data/batch_build_dataset.py            +33 / -32
magic_pdf/data/dataset.py                        +21 / -23
magic_pdf/data/utils.py                          +26 / -28
magic_pdf/model/doc_analyze_by_custom_model.py   +22 / -20
magic_pdf/model/sub_modules/model_init.py        +28 / -17
magic_pdf/tools/cli.py                           +8 / -8
magic_pdf/tools/common.py                        +3 / -3
magic_pdf/data/batch_build_dataset.py (View file @ b50f742f)

-import os
+import concurrent.futures
 import glob
+import os
 import threading
-import concurrent.futures
 import fitz
-from magic_pdf.data.utils import fitz_doc_to_image  # PyMuPDF
-from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.data.utils import fitz_doc_to_image  # PyMuPDF


 def partition_array_greedy(arr, k):
-    """
-    Partition an array into k parts using a simple greedy approach.
+    """Partition an array into k parts using a simple greedy approach.

     Parameters:
     -----------
     ...

@@ -24,7 +26,7 @@ def partition_array_greedy(arr, k):
     """
     # Handle edge cases
     if k <= 0:
-        raise ValueError("k must be a positive integer")
+        raise ValueError('k must be a positive integer')
     if k > len(arr):
         k = len(arr)  # Adjust k if it's too large
     if k == 1:
     ...

@@ -52,8 +54,7 @@ def partition_array_greedy(arr, k):
 def process_pdf_batch(pdf_jobs, idx):
-    """
-    Process a batch of PDF pages using multiple threads.
+    """Process a batch of PDF pages using multiple threads.

     Parameters:
     -----------
     ...

@@ -83,8 +84,8 @@ def process_pdf_batch(pdf_jobs, idx):
     return (idx, images)


 def batch_build_dataset(pdf_paths, k, lang=None):
     """
     Process multiple PDFs by partitioning them into k balanced parts and
     processing each part in parallel.

     Parameters:
     -----------
     ...

@@ -116,13 +117,13 @@ def batch_build_dataset(pdf_paths, k, lang=None):
             total_pages += num_pages
             doc.close()
         except Exception as e:
-            print(f"Error opening {pdf_path}: {e}")
+            print(f'Error opening {pdf_path}: {e}')

     # Partition the jobs based on page count. Each job has 1 page
     partitions = partition_array_greedy(pdf_info, k)
     for i, partition in enumerate(partitions):
-        print(f"Partition {i+1}: {len(partition)} pdfs")
+        print(f'Partition {i+1}: {len(partition)} pdfs')

     # Process each partition in parallel
     all_images_h = {}
     ...

@@ -145,15 +146,15 @@ def batch_build_dataset(pdf_paths, k, lang=None):
     for i, future in enumerate(concurrent.futures.as_completed(futures)):
         try:
             idx, images = future.result()
-            print(f"Partition {i+1} completed: processed {len(images)} images")
+            print(f'Partition {i+1} completed: processed {len(images)} images')
             all_images_h[idx] = images
         except Exception as e:
-            print(f"Error processing partition: {e}")
+            print(f'Error processing partition: {e}')

     results = [None] * len(pdf_paths)
     for i in range(len(partitions)):
         partition = partitions[i]
         for j in range(len(partition)):
-            with open(pdf_info[partition[j]][0], "rb") as f:
+            with open(pdf_info[partition[j]][0], 'rb') as f:
                 pdf_bytes = f.read()
             dataset = PymuDocDataset(pdf_bytes, lang=lang)
             dataset.set_images(all_images_h[i][j])
     ...
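Note: most of the function bodies above are collapsed in this view; only the docstrings and a few fragments are visible. A minimal, self-contained sketch of the idea they describe, assuming only what is shown (a greedy partition balanced by page count, and one worker process per partition); all names below are illustrative, not the committed implementation:

# Greedy "largest job to the lightest bucket" partitioning, then one
# process per partition, mirroring the shape described in the docstrings.
from concurrent.futures import ProcessPoolExecutor


def partition_greedy(page_counts, k):
    """Return k index lists whose total page counts are roughly balanced."""
    k = max(1, min(k, len(page_counts)))
    buckets = [[] for _ in range(k)]
    loads = [0] * k
    for idx in sorted(range(len(page_counts)), key=lambda i: -page_counts[i]):
        target = loads.index(min(loads))   # lightest bucket so far
        buckets[target].append(idx)
        loads[target] += page_counts[idx]
    return buckets


def process_partition(job):
    part_id, indices = job
    # Stand-in for the real per-partition work (rendering every page).
    return part_id, [i * 2 for i in indices]


if __name__ == '__main__':
    page_counts = [12, 3, 48, 7, 20, 5]            # pages per input PDF
    partitions = partition_greedy(page_counts, k=3)
    with ProcessPoolExecutor(max_workers=len(partitions)) as pool:
        for part_id, out in pool.map(process_partition, enumerate(partitions)):
            print(part_id, out)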
magic_pdf/data/dataset.py (View file @ b50f742f)

@@ -97,7 +97,7 @@ class Dataset(ABC):
     @abstractmethod
     def dump_to_file(self, file_path: str):
-        """Dump the file
+        """Dump the file.

         Args:
             file_path (str): the file path
     ...

@@ -119,7 +119,7 @@ class Dataset(ABC):
     @abstractmethod
     def classify(self) -> SupportedPdfParseMethod:
-        """classify the dataset
+        """classify the dataset.

         Returns:
             SupportedPdfParseMethod: _description_
     ...

@@ -128,8 +128,7 @@ class Dataset(ABC):
     @abstractmethod
     def clone(self):
-        """clone this dataset
-        """
+        """clone this dataset."""
         pass
     ...

@@ -148,12 +147,13 @@ class PymuDocDataset(Dataset):
         if lang == '':
             self._lang = None
         elif lang == 'auto':
-            from magic_pdf.model.sub_modules.language_detection.utils import auto_detect_lang
+            from magic_pdf.model.sub_modules.language_detection.utils import \
+                auto_detect_lang
             self._lang = auto_detect_lang(bits)
-            logger.info(f"lang: {lang}, detect_lang: {self._lang}")
+            logger.info(f'lang: {lang}, detect_lang: {self._lang}')
         else:
             self._lang = lang
-            logger.info(f"lang: {lang}")
+            logger.info(f'lang: {lang}')

     def __len__(self) -> int:
         """The page number of the pdf."""
     ...

@@ -187,7 +187,7 @@ class PymuDocDataset(Dataset):
         return self._records[page_id]

     def dump_to_file(self, file_path: str):
-        """Dump the file
+        """Dump the file.

         Args:
             file_path (str): the file path
     ...

@@ -213,7 +213,7 @@ class PymuDocDataset(Dataset):
         return proc(self, *args, **kwargs)

     def classify(self) -> SupportedPdfParseMethod:
-        """classify the dataset
+        """classify the dataset.

         Returns:
             SupportedPdfParseMethod: _description_
     ...

@@ -221,8 +221,7 @@ class PymuDocDataset(Dataset):
         return classify(self._data_bits)

     def clone(self):
-        """clone this dataset
-        """
+        """clone this dataset."""
         return PymuDocDataset(self._raw_data)

     def set_images(self, images):
     ...

@@ -274,7 +273,7 @@ class ImageDataset(Dataset):
         return self._records[page_id]

     def dump_to_file(self, file_path: str):
-        """Dump the file
+        """Dump the file.

         Args:
             file_path (str): the file path
     ...

@@ -297,7 +296,7 @@ class ImageDataset(Dataset):
         return proc(self, *args, **kwargs)

     def classify(self) -> SupportedPdfParseMethod:
-        """classify the dataset
+        """classify the dataset.

         Returns:
             SupportedPdfParseMethod: _description_
     ...

@@ -305,8 +304,7 @@ class ImageDataset(Dataset):
         return SupportedPdfParseMethod.OCR

     def clone(self):
-        """clone this dataset
-        """
+        """clone this dataset."""
         return ImageDataset(self._raw_data)

     def set_images(self, images):
     ...
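Note: together with batch_build_dataset.py above, the set_images hook lets a caller render pages once and hand the images to the dataset instead of re-rendering them later. A hedged usage sketch, assuming only the calls visible in this diff (PymuDocDataset(pdf_bytes, lang=...), fitz_doc_to_image(page), set_images(images)); the input path is a placeholder:

import fitz

from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.data.utils import fitz_doc_to_image

with open('/tmp/example.pdf', 'rb') as f:             # placeholder input
    pdf_bytes = f.read()

doc = fitz.open(stream=pdf_bytes, filetype='pdf')
images = [fitz_doc_to_image(page) for page in doc]    # render each page once
doc.close()

ds = PymuDocDataset(pdf_bytes, lang=None)
ds.set_images(images)   # reuse the pre-rendered images downstream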
magic_pdf/data/utils.py (View file @ b50f742f)

 import multiprocessing as mp
 import threading
+from concurrent.futures import (ProcessPoolExecutor, ThreadPoolExecutor,
+                                as_completed)

 import fitz
 import numpy as np
 from loguru import logger

 from magic_pdf.utils.annotations import ImportPIL
-from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed


 @ImportPIL
 ...

@@ -76,7 +78,7 @@ def convert_page(bytes_page):
     return fitz_doc_to_image(page)


 def parallel_process_pdf_safe(pages, num_workers=None, **kwargs):
-    """Process PDF pages in parallel with serialization-safe approach"""
+    """Process PDF pages in parallel with serialization-safe approach."""
     if num_workers is None:
         num_workers = mp.cpu_count()
     ...

@@ -92,8 +94,7 @@ def parallel_process_pdf_safe(pages, num_workers=None, **kwargs):
 def threaded_process_pdf(pdf_path, num_threads=4, **kwargs):
-    """
-    Process all pages of a PDF using multiple threads
+    """Process all pages of a PDF using multiple threads.

     Parameters:
     -----------
     ...

@@ -130,13 +131,13 @@ def threaded_process_pdf(pdf_path, num_threads=4, **kwargs):
         try:
             results[page_num] = future.result()
         except Exception as e:
-            print(f"Error processing page {page_num}: {e}")
+            print(f'Error processing page {page_num}: {e}')
             results[page_num] = None

     # Close the document
     doc.close()


-if __name__ == "__main__":
+if __name__ == '__main__':
     pdf = fitz.open('/tmp/[MS-DOC].pdf')
     ...

@@ -167,6 +168,3 @@ if __name__ == "__main__":
     8    7.900 sec
     16   7.984 sec
     """
magic_pdf/model/doc_analyze_by_custom_model.py (View file @ b50f742f)

+import concurrent.futures as fut
+import multiprocessing as mp
 import os
 import time
-import torch
 import numpy as np
-import multiprocessing as mp
-import concurrent.futures as fut
+import torch

 os.environ['FLAGS_npu_jit_compile'] = '0'  # disable Paddle's JIT compilation
 os.environ['FLAGS_use_stride_kernel'] = '0'
 os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # allow MPS to fall back
 ...

@@ -29,6 +31,7 @@ from magic_pdf.libs.config_reader import (get_device, get_formula_config,
                                           get_local_models_dir,
                                           get_table_recog_config)
 from magic_pdf.model.model_list import MODEL
 # from magic_pdf.operators.models import InferenceResult

 MIN_BATCH_INFERENCE_SIZE = 100
 ...

@@ -310,14 +313,14 @@ def may_batch_image_analyze(
     device = get_device()

     npu_support = False
-    if str(device).startswith("npu"):
+    if str(device).startswith('npu'):
         import torch_npu
         if torch_npu.npu.is_available():
             npu_support = True
             torch.npu.set_compile_mode(jit_compile=False)

     if torch.cuda.is_available() and device != 'cpu' or npu_support:
-        gpu_memory = int(os.getenv("VIRTUAL_VRAM_SIZE", round(get_vram(device))))
+        gpu_memory = int(os.getenv('VIRTUAL_VRAM_SIZE', round(get_vram(device))))
         if gpu_memory is not None and gpu_memory >= 8:
             if gpu_memory >= 20:
                 batch_ratio = 16
     ...

@@ -398,4 +401,3 @@ def may_batch_image_analyze(
         f' speed: {doc_analyze_speed} pages/second'
     )
     return (idx, results)
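Note: the hunk above only shows the top of the VRAM ladder (20 GB or more picks batch_ratio = 16), together with the VIRTUAL_VRAM_SIZE override and the new MIN_BATCH_INFERENCE_SIZE = 100 constant. A hedged sketch of that selection logic; the values below 20 GB and the exact use of MIN_BATCH_INFERENCE_SIZE are assumptions, not part of this diff:

import os

MIN_BATCH_INFERENCE_SIZE = 100   # batching only pays off above this many pages


def pick_batch_ratio(detected_vram_gb):
    # VIRTUAL_VRAM_SIZE lets the caller override the detected VRAM size.
    gpu_memory = int(os.getenv('VIRTUAL_VRAM_SIZE', round(detected_vram_gb)))
    if gpu_memory >= 20:
        return 16                # value shown in the diff
    if gpu_memory >= 8:
        return 4                 # assumed mid-range value
    return 1                     # assumed fallback: effectively unbatched


def should_batch(num_pages):
    return num_pages >= MIN_BATCH_INFERENCE_SIZE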
magic_pdf/model/sub_modules/model_init.py (View file @ b50f742f)

 import os
 import torch
 from loguru import logger

 from magic_pdf.config.constants import MODEL_NAME
 from magic_pdf.model.model_list import AtomicModel
-from magic_pdf.model.sub_modules.language_detection.yolov11.YOLOv11 import YOLOv11LangDetModel
-from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import DocLayoutYOLOModel
-from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import Layoutlmv3_Predictor
+from magic_pdf.model.sub_modules.language_detection.yolov11.YOLOv11 import \
+    YOLOv11LangDetModel
+from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import \
+    DocLayoutYOLOModel
+from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import \
+    Layoutlmv3_Predictor
 from magic_pdf.model.sub_modules.mfd.yolov8.YOLOv8 import YOLOv8MFDModel
 from magic_pdf.model.sub_modules.mfr.unimernet.Unimernet import UnimernetModel

 try:
-    from magic_pdf_ascend_plugin.libs.license_verifier import load_license, LicenseFormatError, LicenseSignatureError, LicenseExpiredError
-    from magic_pdf_ascend_plugin.model_plugin.ocr.paddleocr.ppocr_273_npu import ModifiedPaddleOCR
-    from magic_pdf_ascend_plugin.model_plugin.table.rapidtable.rapid_table_npu import RapidTableModel
+    from magic_pdf_ascend_plugin.libs.license_verifier import (
+        LicenseExpiredError, LicenseFormatError, LicenseSignatureError,
+        load_license)
+    from magic_pdf_ascend_plugin.model_plugin.ocr.paddleocr.ppocr_273_npu import \
+        ModifiedPaddleOCR
+    from magic_pdf_ascend_plugin.model_plugin.table.rapidtable.rapid_table_npu import \
+        RapidTableModel
     license_key = load_license()
     logger.info(f'Using Ascend Plugin Success, License id is {license_key["payload"]["id"]},'
                 f' License expired at {license_key["payload"]["date"]["end_date"]}')
 ...

@@ -21,21 +29,24 @@ except Exception as e:
     if isinstance(e, ImportError):
         pass
     elif isinstance(e, LicenseFormatError):
-        logger.error("Ascend Plugin: Invalid license format. Please check the license file.")
+        logger.error('Ascend Plugin: Invalid license format. Please check the license file.')
     elif isinstance(e, LicenseSignatureError):
-        logger.error("Ascend Plugin: Invalid signature. The license may be tampered with.")
+        logger.error('Ascend Plugin: Invalid signature. The license may be tampered with.')
     elif isinstance(e, LicenseExpiredError):
-        logger.error("Ascend Plugin: License has expired. Please renew your license.")
+        logger.error('Ascend Plugin: License has expired. Please renew your license.')
     elif isinstance(e, FileNotFoundError):
-        logger.error("Ascend Plugin: Not found License file.")
+        logger.error('Ascend Plugin: Not found License file.')
     else:
-        logger.error(f"Ascend Plugin: {e}")
+        logger.error(f'Ascend Plugin: {e}')
     from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_273_mod import ModifiedPaddleOCR
     # from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_291_mod import ModifiedPaddleOCR
     from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel
-    from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import StructTableModel
-    from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import TableMasterPaddleModel
+    from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import \
+        StructTableModel
+    from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import \
+        TableMasterPaddleModel


 def table_model_init(table_model_type, model_path, max_time, _device_='cpu', ocr_engine=None, table_sub_model_name=None):
     if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
     ...

@@ -56,7 +67,7 @@ def table_model_init(table_model_type, model_path, max_time, _device_='cpu', ocr
 def mfd_model_init(weight, device='cpu'):
-    if str(device).startswith("npu"):
+    if str(device).startswith('npu'):
         device = torch.device(device)
     mfd_model = YOLOv8MFDModel(weight, device)
     return mfd_model
 ...

@@ -73,14 +84,14 @@ def layout_model_init(weight, config_file, device):
 def doclayout_yolo_model_init(weight, device='cpu'):
-    if str(device).startswith("npu"):
+    if str(device).startswith('npu'):
         device = torch.device(device)
     model = DocLayoutYOLOModel(weight, device)
     return model


 def langdetect_model_init(langdetect_model_weight, device='cpu'):
-    if str(device).startswith("npu"):
+    if str(device).startswith('npu'):
         device = torch.device(device)
     model = YOLOv11LangDetModel(langdetect_model_weight, device)
     return model
 ...
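Note: apart from the import re-wrapping, the repeated change in the *_model_init helpers above is only the quote style around 'npu'. The surrounding pattern, normalising an Ascend device string such as 'npu:0' to a torch.device before building the model, is sketched below with a stand-in model class; this is an illustrative reduction, not the actual module:

import torch


class StubModel:
    """Stand-in for YOLOv8MFDModel / DocLayoutYOLOModel / YOLOv11LangDetModel."""

    def __init__(self, weight, device):
        self.weight = weight
        self.device = device


def stub_model_init(weight, device='cpu'):
    if str(device).startswith('npu'):
        # torch_npu must be installed for the 'npu' device type to resolve.
        device = torch.device(device)
    return StubModel(weight, device)


model = stub_model_init('weights.pt', device='cpu')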
magic_pdf/tools/cli.py (View file @ b50f742f)

 import os
 import shutil
 import tempfile
+from pathlib import Path

 import click
 import fitz
 from loguru import logger
-from pathlib import Path

 import magic_pdf.model as model_config
-from magic_pdf.data.data_reader_writer import FileBasedDataReader
 from magic_pdf.data.batch_build_dataset import batch_build_dataset
+from magic_pdf.data.data_reader_writer import FileBasedDataReader
 from magic_pdf.data.dataset import Dataset
 from magic_pdf.libs.version import __version__
-from magic_pdf.tools.common import do_parse, parse_pdf_methods, batch_do_parse
+from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
 from magic_pdf.utils.office_to_pdf import convert_file_to_pdf

 pdf_suffixes = ['.pdf']
 ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
 image_suffixes = ['.png', '.jpeg', '.jpg']
 ...

@@ -97,18 +97,18 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
 def read_fn(path: Path):
     if path.suffix in ms_office_suffixes:
         convert_file_to_pdf(str(path), temp_dir)
-        fn = os.path.join(temp_dir, f"{path.stem}.pdf")
+        fn = os.path.join(temp_dir, f'{path.stem}.pdf')
     elif path.suffix in image_suffixes:
         with open(str(path), 'rb') as f:
             bits = f.read()
         pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
-        fn = os.path.join(temp_dir, f"{path.stem}.pdf")
+        fn = os.path.join(temp_dir, f'{path.stem}.pdf')
         with open(fn, 'wb') as f:
             f.write(pdf_bytes)
     elif path.suffix in pdf_suffixes:
         fn = str(path)
     else:
-        raise Exception(f"Unknown file suffix: {path.suffix}")
+        raise Exception(f'Unknown file suffix: {path.suffix}')

     disk_rw = FileBasedDataReader(os.path.dirname(fn))
     return disk_rw.read(os.path.basename(fn))
 ...
magic_pdf/tools/common.py (View file @ b50f742f)

@@ -8,10 +8,10 @@ import magic_pdf.model as model_config
 from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter
-from magic_pdf.data.dataset import PymuDocDataset, Dataset
+from magic_pdf.data.dataset import Dataset, PymuDocDataset
 from magic_pdf.libs.draw_bbox import draw_char_bbox
-from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze, batch_doc_analyze
+from magic_pdf.model.doc_analyze_by_custom_model import (batch_doc_analyze,
+                                                         doc_analyze)
 # from io import BytesIO
 # from pypdf import PdfReader, PdfWriter
 ...
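Note: common.py now imports batch_doc_analyze alongside doc_analyze, and cli.py (above) imports batch_build_dataset and batch_do_parse, which is where the parallel evaluation path begins. A hedged usage sketch limited to the one signature this diff makes visible, batch_build_dataset(pdf_paths, k, lang=None); the input paths are placeholders and the returned datasets would then feed the batch analysis entry points:

from magic_pdf.data.batch_build_dataset import batch_build_dataset

pdf_paths = ['/data/a.pdf', '/data/b.pdf', '/data/c.pdf']   # placeholder inputs
datasets = batch_build_dataset(pdf_paths, k=4, lang=None)   # one dataset per input PDF
print(len(datasets))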