Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
e9d36221
Commit
e9d36221
authored
Dec 17, 2024
by
icecraft
Browse files
feat: add get_middle_json method
parent
51fd53d9
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
29 additions
and
14 deletions
+29
-14
magic_pdf/pipe/operators.py
magic_pdf/pipe/operators.py
+29
-14
No files found.
magic_pdf/pipe/operators.py
View file @
e9d36221
...
@@ -23,10 +23,12 @@ class PipeResult:
...
@@ -23,10 +23,12 @@ class PipeResult:
self
.
_pipe_res
=
pipe_res
self
.
_pipe_res
=
pipe_res
self
.
_dataset
=
dataset
self
.
_dataset
=
dataset
def
get_markdown
(
self
,
def
get_markdown
(
img_dir_or_bucket_prefix
:
str
,
self
,
drop_mode
=
DropMode
.
WHOLE_PDF
,
img_dir_or_bucket_prefix
:
str
,
md_make_mode
=
MakeMode
.
MM_MD
)
->
str
:
drop_mode
=
DropMode
.
WHOLE_PDF
,
md_make_mode
=
MakeMode
.
MM_MD
,
)
->
str
:
"""Get markdown content.
"""Get markdown content.
Args:
Args:
...
@@ -61,13 +63,17 @@ class PipeResult:
...
@@ -61,13 +63,17 @@ class PipeResult:
md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
"""
"""
md_content
=
self
.
get_markdown
(
img_dir_or_bucket_prefix
,
drop_mode
=
drop_mode
,
md_make_mode
=
md_make_mode
)
md_content
=
self
.
get_markdown
(
img_dir_or_bucket_prefix
,
drop_mode
=
drop_mode
,
md_make_mode
=
md_make_mode
)
writer
.
write_string
(
file_path
,
md_content
)
writer
.
write_string
(
file_path
,
md_content
)
def
get_content_list
(
self
,
def
get_content_list
(
image_dir_or_bucket_prefix
:
str
,
self
,
drop_mode
=
DropMode
.
NONE
,
image_dir_or_bucket_prefix
:
str
,
md_make_mode
=
MakeMode
.
STANDARD_FORMAT
)
->
str
:
drop_mode
=
DropMode
.
NONE
,
md_make_mode
=
MakeMode
.
STANDARD_FORMAT
,
)
->
str
:
"""Get Content List.
"""Get Content List.
Args:
Args:
...
@@ -93,7 +99,7 @@ class PipeResult:
...
@@ -93,7 +99,7 @@ class PipeResult:
file_path
:
str
,
file_path
:
str
,
image_dir_or_bucket_prefix
:
str
,
image_dir_or_bucket_prefix
:
str
,
drop_mode
=
DropMode
.
NONE
,
drop_mode
=
DropMode
.
NONE
,
md_make_mode
=
MakeMode
.
STANDARD_FORMAT
md_make_mode
=
MakeMode
.
STANDARD_FORMAT
,
):
):
"""Dump Content List.
"""Dump Content List.
...
@@ -104,11 +110,21 @@ class PipeResult:
...
@@ -104,11 +110,21 @@ class PipeResult:
drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
"""
"""
content_list
=
self
.
get_content_list
(
image_dir_or_bucket_prefix
,
drop_mode
=
drop_mode
,
md_make_mode
=
md_make_mode
)
content_list
=
self
.
get_content_list
(
image_dir_or_bucket_prefix
,
drop_mode
=
drop_mode
,
md_make_mode
=
md_make_mode
)
writer
.
write_string
(
writer
.
write_string
(
file_path
,
json
.
dumps
(
content_list
,
ensure_ascii
=
False
,
indent
=
4
)
file_path
,
json
.
dumps
(
content_list
,
ensure_ascii
=
False
,
indent
=
4
)
)
)
def
get_middle_json
(
self
)
->
str
:
"""Get middle json.
Returns:
str: The content of middle json
"""
return
json
.
dumps
(
self
.
_pipe_res
,
ensure_ascii
=
False
,
indent
=
4
)
def
dump_middle_json
(
self
,
writer
:
DataWriter
,
file_path
:
str
):
def
dump_middle_json
(
self
,
writer
:
DataWriter
,
file_path
:
str
):
"""Dump the result of pipeline.
"""Dump the result of pipeline.
...
@@ -116,9 +132,8 @@ class PipeResult:
...
@@ -116,9 +132,8 @@ class PipeResult:
writer (DataWriter): File writer handler
writer (DataWriter): File writer handler
file_path (str): The file location of middle json
file_path (str): The file location of middle json
"""
"""
writer
.
write_string
(
middle_json
=
self
.
get_middle_json
()
file_path
,
json
.
dumps
(
self
.
_pipe_res
,
ensure_ascii
=
False
,
indent
=
4
)
writer
.
write_string
(
file_path
,
middle_json
)
)
def
draw_layout
(
self
,
file_path
:
str
)
->
None
:
def
draw_layout
(
self
,
file_path
:
str
)
->
None
:
"""Draw the layout.
"""Draw the layout.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment