Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
4a82d6a0
Commit
4a82d6a0
authored
Nov 28, 2024
by
icecraft
Committed by
xu rui
Dec 03, 2024
Browse files
feat: add function definitions
parent
a3a720ea
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
353 additions
and
16 deletions
+353
-16
magic_pdf/data/dataset.py
magic_pdf/data/dataset.py
+36
-0
magic_pdf/model/doc_analyze_by_custom_model.py
magic_pdf/model/doc_analyze_by_custom_model.py
+1
-1
magic_pdf/model/operators.py
magic_pdf/model/operators.py
+177
-0
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+13
-12
magic_pdf/pipe/operators.py
magic_pdf/pipe/operators.py
+124
-0
magic_pdf/tools/common.py
magic_pdf/tools/common.py
+2
-3
No files found.
magic_pdf/data/dataset.py
View file @
4a82d6a0
...
@@ -32,10 +32,28 @@ class PageableData(ABC):
...
@@ -32,10 +32,28 @@ class PageableData(ABC):
@
abstractmethod
@
abstractmethod
def
draw_rect
(
self
,
rect_coords
,
color
,
fill
,
fill_opacity
,
width
,
overlay
):
def
draw_rect
(
self
,
rect_coords
,
color
,
fill
,
fill_opacity
,
width
,
overlay
):
"""draw rectangle.
Args:
rect_coords (list[float]): four-element array containing the top-left and bottom-right coordinates, [x0, y0, x1, y1]
color (list[float] | None): three-element tuple describing the RGB of the border line; None means no border line
fill (list[float] | None): RGB color used to fill the rectangle; None means no fill
fill_opacity (float): opacity of the fill, in the range [0, 1]
width (float): the width of the border line
overlay (bool): whether to draw in the foreground or the background. True means draw in the foreground (matches PyMuPDF's `overlay` semantics).
"""
pass
pass
@
abstractmethod
@
abstractmethod
def
insert_text
(
self
,
coord
,
content
,
fontsize
,
color
):
def
insert_text
(
self
,
coord
,
content
,
fontsize
,
color
):
"""insert text.
Args:
coord (list[float]): four-element array containing the top-left and bottom-right coordinates, [x0, y0, x1, y1]
content (str): the text content
fontsize (int): font size of the text
color (list[float] | None): three-element tuple describing the RGB of the text; None uses the default font color
"""
pass
pass
...
@@ -244,6 +262,16 @@ class Doc(PageableData):
...
@@ -244,6 +262,16 @@ class Doc(PageableData):
return
getattr
(
self
.
_doc
,
name
)
return
getattr
(
self
.
_doc
,
name
)
def
draw_rect
(
self
,
rect_coords
,
color
,
fill
,
fill_opacity
,
width
,
overlay
):
def
draw_rect
(
self
,
rect_coords
,
color
,
fill
,
fill_opacity
,
width
,
overlay
):
"""draw rectangle.
Args:
rect_coords (list[float]): four-element array containing the top-left and bottom-right coordinates, [x0, y0, x1, y1]
color (list[float] | None): three-element tuple describing the RGB of the border line; None means no border line
fill (list[float] | None): RGB color used to fill the rectangle; None means no fill
fill_opacity (float): opacity of the fill, in the range [0, 1]
width (float): the width of the border line
overlay (bool): whether to draw in the foreground or the background. True means draw in the foreground (matches PyMuPDF's `overlay` semantics).
"""
self
.
_doc
.
draw_rect
(
self
.
_doc
.
draw_rect
(
rect_coords
,
rect_coords
,
color
=
color
,
color
=
color
,
...
@@ -254,4 +282,12 @@ class Doc(PageableData):
...
@@ -254,4 +282,12 @@ class Doc(PageableData):
)
)
def
insert_text
(
self
,
coord
,
content
,
fontsize
,
color
):
def
insert_text
(
self
,
coord
,
content
,
fontsize
,
color
):
"""insert text.
Args:
coord (list[float]): four-element array containing the top-left and bottom-right coordinates, [x0, y0, x1, y1]
content (str): the text content
fontsize (int): font size of the text
color (list[float] | None): three-element tuple describing the RGB of the text; None uses the default font color
"""
self
.
_doc
.
insert_text
(
coord
,
content
,
fontsize
=
fontsize
,
color
=
color
)
self
.
_doc
.
insert_text
(
coord
,
content
,
fontsize
=
fontsize
,
color
=
color
)
magic_pdf/model/doc_analyze_by_custom_model.py
View file @
4a82d6a0
...
@@ -13,7 +13,7 @@ from magic_pdf.libs.config_reader import (get_device, get_formula_config,
...
@@ -13,7 +13,7 @@ from magic_pdf.libs.config_reader import (get_device, get_formula_config,
get_local_models_dir
,
get_local_models_dir
,
get_table_recog_config
)
get_table_recog_config
)
from
magic_pdf.model.model_list
import
MODEL
from
magic_pdf.model.model_list
import
MODEL
from
magic_pdf.model.
type
s
import
InferenceResult
from
magic_pdf.model.
operator
s
import
InferenceResult
def
dict_compare
(
d1
,
d2
):
def
dict_compare
(
d1
,
d2
):
...
...
magic_pdf/model/
type
s.py
→
magic_pdf/model/
operator
s.py
View file @
4a82d6a0
...
@@ -9,15 +9,26 @@ from magic_pdf.data.dataset import Dataset
...
@@ -9,15 +9,26 @@ from magic_pdf.data.dataset import Dataset
from
magic_pdf.filter
import
classify
from
magic_pdf.filter
import
classify
from
magic_pdf.libs.draw_bbox
import
draw_model_bbox
from
magic_pdf.libs.draw_bbox
import
draw_model_bbox
from
magic_pdf.pdf_parse_union_core_v2
import
pdf_parse_union
from
magic_pdf.pdf_parse_union_core_v2
import
pdf_parse_union
from
magic_pdf.pipe.
type
s
import
PipeResult
from
magic_pdf.pipe.
operator
s
import
PipeResult
class
InferenceResult
:
class
InferenceResult
:
def
__init__
(
self
,
inference_results
:
list
,
dataset
:
Dataset
):
def
__init__
(
self
,
inference_results
:
list
,
dataset
:
Dataset
):
"""Initialized method.
Args:
inference_results (list): the inference result generated by model
dataset (Dataset): the dataset related with model inference result
"""
self
.
_infer_res
=
inference_results
self
.
_infer_res
=
inference_results
self
.
_dataset
=
dataset
self
.
_dataset
=
dataset
def
draw_model
(
self
,
file_path
:
str
)
->
None
:
def
draw_model
(
self
,
file_path
:
str
)
->
None
:
"""Draw model inference result.
Args:
file_path (str): the output file path
"""
dir_name
=
os
.
path
.
dirname
(
file_path
)
dir_name
=
os
.
path
.
dirname
(
file_path
)
base_name
=
os
.
path
.
basename
(
file_path
)
base_name
=
os
.
path
.
basename
(
file_path
)
if
not
os
.
path
.
exists
(
dir_name
):
if
not
os
.
path
.
exists
(
dir_name
):
...
@@ -27,14 +38,34 @@ class InferenceResult:
...
@@ -27,14 +38,34 @@ class InferenceResult:
)
)
def
dump_model
(
self
,
writer
:
DataWriter
,
file_path
:
str
):
def
dump_model
(
self
,
writer
:
DataWriter
,
file_path
:
str
):
"""Dump model inference result to file.
Args:
writer (DataWriter): writer handle
file_path (str): the location of target file
"""
writer
.
write_string
(
writer
.
write_string
(
file_path
,
json
.
dumps
(
self
.
_infer_res
,
ensure_ascii
=
False
,
indent
=
4
)
file_path
,
json
.
dumps
(
self
.
_infer_res
,
ensure_ascii
=
False
,
indent
=
4
)
)
)
def
get_infer_res
(
self
):
def
get_infer_res
(
self
):
"""Get the inference result.
Returns:
list[dict]: the inference result generated by model
"""
return
self
.
_infer_res
return
self
.
_infer_res
def
apply
(
self
,
proc
:
Callable
,
*
args
,
**
kwargs
):
def
apply
(
self
,
proc
:
Callable
,
*
args
,
**
kwargs
):
"""Apply a callable to a deep copy of the inference result.
Args:
proc (Callable): invoke proc as follows:
proc(inference_result, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
return
proc
(
copy
.
deepcopy
(
self
.
_infer_res
),
*
args
,
**
kwargs
)
return
proc
(
copy
.
deepcopy
(
self
.
_infer_res
),
*
args
,
**
kwargs
)
def
pipe_auto_mode
(
def
pipe_auto_mode
(
...
@@ -45,33 +76,30 @@ class InferenceResult:
...
@@ -45,33 +76,30 @@ class InferenceResult:
debug_mode
=
False
,
debug_mode
=
False
,
lang
=
None
,
lang
=
None
,
)
->
PipeResult
:
)
->
PipeResult
:
def
proc
(
*
args
,
**
kwargs
)
->
PipeResult
:
"""Post-proc the model inference result.
res
=
pdf_parse_union
(
*
args
,
**
kwargs
)
step1: classify the dataset type
return
PipeResult
(
res
,
self
.
_dataset
)
step2: based on the result of step1, use `pipe_txt_mode` or `pipe_ocr_mode`
Args:
imageWriter (DataWriter): the image writer handle
start_page_id (int, optional): Defaults to 0. Lets the user select which pages to process
end_page_id (int | None, optional): Defaults to the last page index of the dataset. Lets the user select which pages to process
debug_mode (bool, optional): Defaults to False. Dumps more logs if enabled
lang (_type_, optional): Defaults to None.
Returns:
PipeResult: the result
"""
pdf_proc_method
=
classify
(
self
.
_dataset
.
data_bits
())
pdf_proc_method
=
classify
(
self
.
_dataset
.
data_bits
())
if
pdf_proc_method
==
SupportedPdfParseMethod
.
TXT
:
if
pdf_proc_method
==
SupportedPdfParseMethod
.
TXT
:
return
self
.
apply
(
return
self
.
pipe_txt_mode
(
proc
,
imageWriter
,
start_page_id
,
end_page_id
,
debug_mode
,
lang
self
.
_dataset
,
imageWriter
,
SupportedPdfParseMethod
.
TXT
,
start_page_id
=
0
,
end_page_id
=
None
,
debug_mode
=
False
,
lang
=
None
,
)
)
else
:
else
:
return
self
.
apply
(
return
self
.
pipe_ocr_mode
(
proc
,
imageWriter
,
start_page_id
,
end_page_id
,
debug_mode
,
lang
self
.
_dataset
,
imageWriter
,
SupportedPdfParseMethod
.
OCR
,
start_page_id
=
0
,
end_page_id
=
None
,
debug_mode
=
False
,
lang
=
None
,
)
)
def
pipe_txt_mode
(
def
pipe_txt_mode
(
...
@@ -82,6 +110,20 @@ class InferenceResult:
...
@@ -82,6 +110,20 @@ class InferenceResult:
debug_mode
=
False
,
debug_mode
=
False
,
lang
=
None
,
lang
=
None
,
)
->
PipeResult
:
)
->
PipeResult
:
"""Post-process the model inference result, extracting the text with a
third-party library such as `pymupdf`.
Args:
imageWriter (DataWriter): the image writer handle
start_page_id (int, optional): Defaults to 0. Lets the user select which pages to process
end_page_id (int | None, optional): Defaults to the last page index of the dataset. Lets the user select which pages to process
debug_mode (bool, optional): Defaults to False. Dumps more logs if enabled
lang (_type_, optional): Defaults to None.
Returns:
PipeResult: the result
"""
def
proc
(
*
args
,
**
kwargs
)
->
PipeResult
:
def
proc
(
*
args
,
**
kwargs
)
->
PipeResult
:
res
=
pdf_parse_union
(
*
args
,
**
kwargs
)
res
=
pdf_parse_union
(
*
args
,
**
kwargs
)
return
PipeResult
(
res
,
self
.
_dataset
)
return
PipeResult
(
res
,
self
.
_dataset
)
...
@@ -91,10 +133,10 @@ class InferenceResult:
...
@@ -91,10 +133,10 @@ class InferenceResult:
self
.
_dataset
,
self
.
_dataset
,
imageWriter
,
imageWriter
,
SupportedPdfParseMethod
.
TXT
,
SupportedPdfParseMethod
.
TXT
,
start_page_id
=
0
,
start_page_id
=
start_page_id
,
end_page_id
=
None
,
end_page_id
=
end_page_id
,
debug_mode
=
Fals
e
,
debug_mode
=
debug_mod
e
,
lang
=
None
,
lang
=
lang
,
)
)
def
pipe_ocr_mode
(
def
pipe_ocr_mode
(
...
@@ -105,6 +147,19 @@ class InferenceResult:
...
@@ -105,6 +147,19 @@ class InferenceResult:
debug_mode
=
False
,
debug_mode
=
False
,
lang
=
None
,
lang
=
None
,
)
->
PipeResult
:
)
->
PipeResult
:
"""Post-process the model inference result, extracting the text with the `OCR`
technique.
Args:
imageWriter (DataWriter): the image writer handle
start_page_id (int, optional): Defaults to 0. Lets the user select which pages to process
end_page_id (int | None, optional): Defaults to the last page index of the dataset. Lets the user select which pages to process
debug_mode (bool, optional): Defaults to False. Dumps more logs if enabled
lang (_type_, optional): Defaults to None.
Returns:
PipeResult: the result
"""
def
proc
(
*
args
,
**
kwargs
)
->
PipeResult
:
def
proc
(
*
args
,
**
kwargs
)
->
PipeResult
:
res
=
pdf_parse_union
(
*
args
,
**
kwargs
)
res
=
pdf_parse_union
(
*
args
,
**
kwargs
)
...
@@ -115,8 +170,8 @@ class InferenceResult:
...
@@ -115,8 +170,8 @@ class InferenceResult:
self
.
_dataset
,
self
.
_dataset
,
imageWriter
,
imageWriter
,
SupportedPdfParseMethod
.
TXT
,
SupportedPdfParseMethod
.
TXT
,
start_page_id
=
0
,
start_page_id
=
start_page_id
,
end_page_id
=
None
,
end_page_id
=
end_page_id
,
debug_mode
=
Fals
e
,
debug_mode
=
debug_mod
e
,
lang
=
None
,
lang
=
lang
,
)
)
magic_pdf/pdf_parse_union_core_v2.py
View file @
4a82d6a0
...
@@ -4,8 +4,8 @@ import statistics
...
@@ -4,8 +4,8 @@ import statistics
import
time
import
time
from
typing
import
List
from
typing
import
List
import
torch
import
fitz
import
fitz
import
torch
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.config.enums
import
SupportedPdfParseMethod
from
magic_pdf.config.enums
import
SupportedPdfParseMethod
...
@@ -16,17 +16,13 @@ from magic_pdf.libs.clean_memory import clean_memory
...
@@ -16,17 +16,13 @@ from magic_pdf.libs.clean_memory import clean_memory
from
magic_pdf.libs.config_reader
import
get_local_layoutreader_model_dir
from
magic_pdf.libs.config_reader
import
get_local_layoutreader_model_dir
from
magic_pdf.libs.convert_utils
import
dict_to_list
from
magic_pdf.libs.convert_utils
import
dict_to_list
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.pdf_image_tools
import
cut_image_to_pil_image
from
magic_pdf.libs.pdf_image_tools
import
cut_image_to_pil_image
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.model.magic_model
import
MagicModel
os
.
environ
[
'NO_ALBUMENTATIONS_UPDATE'
]
=
'1'
# 禁止albumentations检查更新
os
.
environ
[
'YOLO_VERBOSE'
]
=
'False'
# disable yolo logger
try
:
try
:
import
torchtext
import
torchtext
if
torchtext
.
__version__
>=
"
0.18.0
"
:
if
torchtext
.
__version__
>=
'
0.18.0
'
:
torchtext
.
disable_torchtext_deprecation_warning
()
torchtext
.
disable_torchtext_deprecation_warning
()
except
ImportError
:
except
ImportError
:
pass
pass
...
@@ -39,6 +35,9 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
...
@@ -39,6 +35,9 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
from
magic_pdf.pre_proc.ocr_dict_merge
import
fill_spans_in_blocks
,
fix_block_spans_v2
,
fix_discarded_block
from
magic_pdf.pre_proc.ocr_dict_merge
import
fill_spans_in_blocks
,
fix_block_spans_v2
,
fix_discarded_block
from
magic_pdf.pre_proc.ocr_span_list_modify
import
get_qa_need_list_v2
,
remove_overlaps_low_confidence_spans
,
remove_overlaps_min_spans
from
magic_pdf.pre_proc.ocr_span_list_modify
import
get_qa_need_list_v2
,
remove_overlaps_low_confidence_spans
,
remove_overlaps_min_spans
os
.
environ
[
'NO_ALBUMENTATIONS_UPDATE'
]
=
'1'
# 禁止albumentations检查更新
os
.
environ
[
'YOLO_VERBOSE'
]
=
'False'
# disable yolo logger
def
__replace_STX_ETX
(
text_str
:
str
):
def
__replace_STX_ETX
(
text_str
:
str
):
"""Replace
\u0002
and
\u0003
, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
"""Replace
\u0002
and
\u0003
, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
...
@@ -90,7 +89,10 @@ def chars_to_content(span):
...
@@ -90,7 +89,10 @@ def chars_to_content(span):
LINE_STOP_FLAG
=
(
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
')'
,
')'
,
'"'
,
'”'
,
':'
,
':'
,
';'
,
';'
,
']'
,
'】'
,
'}'
,
'}'
,
'>'
,
'》'
,
'、'
,
','
,
','
,
'-'
,
'—'
,
'–'
,)
LINE_STOP_FLAG
=
(
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
')'
,
')'
,
'"'
,
'”'
,
':'
,
':'
,
';'
,
';'
,
']'
,
'】'
,
'}'
,
'}'
,
'>'
,
'》'
,
'、'
,
','
,
','
,
'-'
,
'—'
,
'–'
,)
<<<<<<<
HEAD
LINE_START_FLAG
=
(
'('
,
'('
,
'"'
,
'“'
,
'【'
,
'{'
,
'《'
,
'<'
,
'「'
,
'『'
,
'【'
,
'['
,)
LINE_START_FLAG
=
(
'('
,
'('
,
'"'
,
'“'
,
'【'
,
'{'
,
'《'
,
'<'
,
'「'
,
'『'
,
'【'
,
'['
,)
=======
>>>>>>>
731
f4bf
(
feat
:
add
function
definitions
)
def
fill_char_in_spans
(
spans
,
all_chars
):
def
fill_char_in_spans
(
spans
,
all_chars
):
...
@@ -233,7 +235,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
...
@@ -233,7 +235,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
# 初始化ocr模型
# 初始化ocr模型
atom_model_manager
=
AtomModelSingleton
()
atom_model_manager
=
AtomModelSingleton
()
ocr_model
=
atom_model_manager
.
get_atom_model
(
ocr_model
=
atom_model_manager
.
get_atom_model
(
atom_model_name
=
"
ocr
"
,
atom_model_name
=
'
ocr
'
,
ocr_show_log
=
False
,
ocr_show_log
=
False
,
det_db_box_thresh
=
0.3
,
det_db_box_thresh
=
0.3
,
lang
=
lang
lang
=
lang
...
@@ -241,7 +243,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
...
@@ -241,7 +243,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
for
span
in
empty_spans
:
for
span
in
empty_spans
:
# 对span的bbox截图再ocr
# 对span的bbox截图再ocr
span_img
=
cut_image_to_pil_image
(
span
[
'bbox'
],
pdf_page
,
mode
=
"
cv2
"
)
span_img
=
cut_image_to_pil_image
(
span
[
'bbox'
],
pdf_page
,
mode
=
'
cv2
'
)
ocr_res
=
ocr_model
.
ocr
(
span_img
,
det
=
False
)
ocr_res
=
ocr_model
.
ocr
(
span_img
,
det
=
False
)
if
ocr_res
and
len
(
ocr_res
)
>
0
:
if
ocr_res
and
len
(
ocr_res
)
>
0
:
if
len
(
ocr_res
[
0
])
>
0
:
if
len
(
ocr_res
[
0
])
>
0
:
...
@@ -681,7 +683,7 @@ def parse_page_core(
...
@@ -681,7 +683,7 @@ def parse_page_core(
"""根据parse_mode,构造spans,主要是文本类的字符填充"""
"""根据parse_mode,构造spans,主要是文本类的字符填充"""
if
parse_mode
==
SupportedPdfParseMethod
.
TXT
:
if
parse_mode
==
SupportedPdfParseMethod
.
TXT
:
"""使用新版本的混合ocr方案"""
"""使用新版本的混合ocr方案
.
"""
spans
=
txt_spans_extract_v2
(
page_doc
,
spans
,
all_bboxes
,
all_discarded_blocks
,
lang
)
spans
=
txt_spans_extract_v2
(
page_doc
,
spans
,
all_bboxes
,
all_discarded_blocks
,
lang
)
elif
parse_mode
==
SupportedPdfParseMethod
.
OCR
:
elif
parse_mode
==
SupportedPdfParseMethod
.
OCR
:
...
@@ -689,7 +691,6 @@ def parse_page_core(
...
@@ -689,7 +691,6 @@ def parse_page_core(
else
:
else
:
raise
Exception
(
'parse_mode must be txt or ocr'
)
raise
Exception
(
'parse_mode must be txt or ocr'
)
"""先处理不需要排版的discarded_blocks"""
"""先处理不需要排版的discarded_blocks"""
discarded_block_with_spans
,
spans
=
fill_spans_in_blocks
(
discarded_block_with_spans
,
spans
=
fill_spans_in_blocks
(
all_discarded_blocks
,
spans
,
0.4
all_discarded_blocks
,
spans
,
0.4
...
@@ -762,8 +763,8 @@ def parse_page_core(
...
@@ -762,8 +763,8 @@ def parse_page_core(
def
pdf_parse_union
(
def
pdf_parse_union
(
dataset
:
Dataset
,
model_list
,
model_list
,
dataset
:
Dataset
,
imageWriter
,
imageWriter
,
parse_mode
,
parse_mode
,
start_page_id
=
0
,
start_page_id
=
0
,
...
@@ -832,4 +833,4 @@ def pdf_parse_union(
...
@@ -832,4 +833,4 @@ def pdf_parse_union(
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
pass
pass
\ No newline at end of file
magic_pdf/pipe/
type
s.py
→
magic_pdf/pipe/
operator
s.py
View file @
4a82d6a0
import
json
import
json
import
os
import
os
...
@@ -13,23 +12,76 @@ from magic_pdf.libs.json_compressor import JsonCompressor
...
@@ -13,23 +12,76 @@ from magic_pdf.libs.json_compressor import JsonCompressor
class
PipeResult
:
class
PipeResult
:
def
__init__
(
self
,
pipe_res
,
dataset
:
Dataset
):
def
__init__
(
self
,
pipe_res
,
dataset
:
Dataset
):
"""Initialized.
Args:
pipe_res (list[dict]): the pipeline processed result of model inference result
dataset (Dataset): the dataset associated with pipe_res
"""
self
.
_pipe_res
=
pipe_res
self
.
_pipe_res
=
pipe_res
self
.
_dataset
=
dataset
self
.
_dataset
=
dataset
def
dump_md
(
self
,
writer
:
DataWriter
,
file_path
:
str
,
img_dir_or_bucket_prefix
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
,
md_make_mode
=
MakeMode
.
MM_MD
):
def
dump_md
(
self
,
writer
:
DataWriter
,
file_path
:
str
,
img_dir_or_bucket_prefix
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
,
md_make_mode
=
MakeMode
.
MM_MD
,
):
"""Dump The Markdown.
Args:
writer (DataWriter): File writer handle
file_path (str): The file location of markdown
img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory used to store the figures
drop_mode (str, optional): Drop strategy applied when a page is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
md_make_mode (str, optional): The content type of the Markdown to be made. Defaults to MakeMode.MM_MD.
"""
pdf_info_list
=
self
.
_pipe_res
[
'pdf_info'
]
pdf_info_list
=
self
.
_pipe_res
[
'pdf_info'
]
md_content
=
union_make
(
pdf_info_list
,
md_make_mode
,
drop_mode
,
img_dir_or_bucket_prefix
)
md_content
=
union_make
(
pdf_info_list
,
md_make_mode
,
drop_mode
,
img_dir_or_bucket_prefix
)
writer
.
write_string
(
file_path
,
md_content
)
writer
.
write_string
(
file_path
,
md_content
)
def
dump_content_list
(
self
,
writer
:
DataWriter
,
file_path
:
str
,
image_dir_or_bucket_prefix
:
str
,
drop_mode
=
DropMode
.
NONE
):
def
dump_content_list
(
self
,
writer
:
DataWriter
,
file_path
:
str
,
image_dir_or_bucket_prefix
:
str
):
"""Dump Content List.
Args:
writer (DataWriter): File writer handle
file_path (str): The file location of content list
image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory used to store the figures
"""
pdf_info_list
=
self
.
_pipe_res
[
'pdf_info'
]
pdf_info_list
=
self
.
_pipe_res
[
'pdf_info'
]
content_list
=
union_make
(
pdf_info_list
,
MakeMode
.
STANDARD_FORMAT
,
drop_mode
,
image_dir_or_bucket_prefix
)
content_list
=
union_make
(
writer
.
write_string
(
file_path
,
json
.
dumps
(
content_list
,
ensure_ascii
=
False
,
indent
=
4
))
pdf_info_list
,
MakeMode
.
STANDARD_FORMAT
,
DropMode
.
NONE
,
image_dir_or_bucket_prefix
,
)
writer
.
write_string
(
file_path
,
json
.
dumps
(
content_list
,
ensure_ascii
=
False
,
indent
=
4
)
)
def
dump_middle_json
(
self
,
writer
:
DataWriter
,
file_path
:
str
):
def
dump_middle_json
(
self
,
writer
:
DataWriter
,
file_path
:
str
):
writer
.
write_string
(
file_path
,
json
.
dumps
(
self
.
_pipe_res
,
ensure_ascii
=
False
,
indent
=
4
))
"""Dump the result of pipeline.
Args:
writer (DataWriter): File writer handler
file_path (str): The file location of middle json
"""
writer
.
write_string
(
file_path
,
json
.
dumps
(
self
.
_pipe_res
,
ensure_ascii
=
False
,
indent
=
4
)
)
def
draw_layout
(
self
,
file_path
:
str
)
->
None
:
def
draw_layout
(
self
,
file_path
:
str
)
->
None
:
"""Draw the layout.
Args:
file_path (str): The file location of layout result file
"""
dir_name
=
os
.
path
.
dirname
(
file_path
)
dir_name
=
os
.
path
.
dirname
(
file_path
)
base_name
=
os
.
path
.
basename
(
file_path
)
base_name
=
os
.
path
.
basename
(
file_path
)
if
not
os
.
path
.
exists
(
dir_name
):
if
not
os
.
path
.
exists
(
dir_name
):
...
@@ -38,6 +90,11 @@ class PipeResult:
...
@@ -38,6 +90,11 @@ class PipeResult:
draw_layout_bbox
(
pdf_info
,
self
.
_dataset
.
data_bits
(),
dir_name
,
base_name
)
draw_layout_bbox
(
pdf_info
,
self
.
_dataset
.
data_bits
(),
dir_name
,
base_name
)
def
draw_span
(
self
,
file_path
:
str
):
def
draw_span
(
self
,
file_path
:
str
):
"""Draw the Span.
Args:
file_path (str): The file location of span result file
"""
dir_name
=
os
.
path
.
dirname
(
file_path
)
dir_name
=
os
.
path
.
dirname
(
file_path
)
base_name
=
os
.
path
.
basename
(
file_path
)
base_name
=
os
.
path
.
basename
(
file_path
)
if
not
os
.
path
.
exists
(
dir_name
):
if
not
os
.
path
.
exists
(
dir_name
):
...
@@ -46,6 +103,11 @@ class PipeResult:
...
@@ -46,6 +103,11 @@ class PipeResult:
draw_span_bbox
(
pdf_info
,
self
.
_dataset
.
data_bits
(),
dir_name
,
base_name
)
draw_span_bbox
(
pdf_info
,
self
.
_dataset
.
data_bits
(),
dir_name
,
base_name
)
def
draw_line_sort
(
self
,
file_path
:
str
):
def
draw_line_sort
(
self
,
file_path
:
str
):
"""Draw line sort.
Args:
file_path (str): The file location of line sort result file
"""
dir_name
=
os
.
path
.
dirname
(
file_path
)
dir_name
=
os
.
path
.
dirname
(
file_path
)
base_name
=
os
.
path
.
basename
(
file_path
)
base_name
=
os
.
path
.
basename
(
file_path
)
if
not
os
.
path
.
exists
(
dir_name
):
if
not
os
.
path
.
exists
(
dir_name
):
...
@@ -53,10 +115,10 @@ class PipeResult:
...
@@ -53,10 +115,10 @@ class PipeResult:
pdf_info
=
self
.
_pipe_res
[
'pdf_info'
]
pdf_info
=
self
.
_pipe_res
[
'pdf_info'
]
draw_line_sort_bbox
(
pdf_info
,
self
.
_dataset
.
data_bits
(),
dir_name
,
base_name
)
draw_line_sort_bbox
(
pdf_info
,
self
.
_dataset
.
data_bits
(),
dir_name
,
base_name
)
def
draw_content_list
(
self
,
writer
:
DataWriter
,
file_path
:
str
,
img_dir_or_bucket_prefix
:
str
,
drop_mode
=
DropMode
.
WHOLE_PDF
):
pdf_info_list
=
self
.
_pipe_res
[
'pdf_info'
]
content_list
=
union_make
(
pdf_info_list
,
MakeMode
.
STANDARD_FORMAT
,
drop_mode
,
img_dir_or_bucket_prefix
)
writer
.
write_string
(
file_path
,
json
.
dumps
(
content_list
,
ensure_ascii
=
False
,
indent
=
4
))
def
get_compress_pdf_mid_data
(
self
):
def
get_compress_pdf_mid_data
(
self
):
"""Compress the pipeline result.
Returns:
str: the compressed pipeline result
"""
return
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
return
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
magic_pdf/tools/common.py
View file @
4a82d6a0
...
@@ -10,7 +10,7 @@ from magic_pdf.config.make_content_config import DropMode, MakeMode
...
@@ -10,7 +10,7 @@ from magic_pdf.config.make_content_config import DropMode, MakeMode
from
magic_pdf.data.data_reader_writer
import
FileBasedDataWriter
from
magic_pdf.data.data_reader_writer
import
FileBasedDataWriter
from
magic_pdf.data.dataset
import
PymuDocDataset
from
magic_pdf.data.dataset
import
PymuDocDataset
from
magic_pdf.model.doc_analyze_by_custom_model
import
doc_analyze
from
magic_pdf.model.doc_analyze_by_custom_model
import
doc_analyze
from
magic_pdf.model.
type
s
import
InferenceResult
from
magic_pdf.model.
operator
s
import
InferenceResult
# from io import BytesIO
# from io import BytesIO
# from pypdf import PdfReader, PdfWriter
# from pypdf import PdfReader, PdfWriter
...
@@ -223,8 +223,7 @@ def do_parse(
...
@@ -223,8 +223,7 @@ def do_parse(
pipe_result
.
dump_content_list
(
pipe_result
.
dump_content_list
(
md_writer
,
md_writer
,
f
'
{
pdf_file_name
}
_content_list.json'
,
f
'
{
pdf_file_name
}
_content_list.json'
,
image_dir
,
image_dir
drop_mode
=
DropMode
.
NONE
,
)
)
logger
.
info
(
f
'local output dir is
{
local_md_dir
}
'
)
logger
.
info
(
f
'local output dir is
{
local_md_dir
}
'
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment