Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
b1ac8d03
Commit
b1ac8d03
authored
Mar 15, 2024
by
赵小蒙
Browse files
book_name生成逻辑更新
parent
84867933
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
5 additions
and
5 deletions
+5
-5
magic_pdf/pipeline.py
magic_pdf/pipeline.py
+5
-5
No files found.
magic_pdf/pipeline.py
View file @
b1ac8d03
...
...
@@ -57,7 +57,7 @@ def meta_scan(jso: dict, doc_layout_check=True) -> dict:
try
:
data_source
=
get_data_source
(
jso
)
file_id
=
jso
.
get
(
'file_id'
)
book_name
=
data_source
+
"/"
+
file_id
book_name
=
f
"
{
data_source
}
/
{
file_id
}
"
# 首页存在超量drawing问题
# special_pdf_list = ['zlib/zlib_21822650']
...
...
@@ -103,7 +103,7 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
pdf_meta
=
jso
.
get
(
'pdf_meta'
)
data_source
=
get_data_source
(
jso
)
file_id
=
jso
.
get
(
'file_id'
)
book_name
=
data_source
+
"/"
+
file_id
book_name
=
f
"
{
data_source
}
/
{
file_id
}
"
total_page
=
pdf_meta
[
"total_page"
]
page_width
=
pdf_meta
[
"page_width_pts"
]
page_height
=
pdf_meta
[
"page_height_pts"
]
...
...
@@ -169,7 +169,7 @@ def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:
try
:
data_source
=
get_data_source
(
jso
)
file_id
=
jso
.
get
(
'file_id'
)
book_name
=
data_source
+
"/"
+
file_id
book_name
=
f
"
{
data_source
}
/
{
file_id
}
"
title
=
jso
.
get
(
'title'
)
url_encode_title
=
quote
(
title
,
safe
=
''
)
if
data_source
!=
'scihub'
:
...
...
@@ -262,7 +262,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
model_output_json_list
=
jso
.
get
(
'doc_layout_result'
)
data_source
=
get_data_source
(
jso
)
file_id
=
jso
.
get
(
'file_id'
)
book_name
=
data_source
+
"/"
+
file_id
book_name
=
f
"
{
data_source
}
/
{
file_id
}
"
# 1.23.22已修复
# if debug_mode:
...
...
@@ -326,7 +326,7 @@ def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
model_output_json_list
=
jso
.
get
(
'doc_layout_result'
)
data_source
=
get_data_source
(
jso
)
file_id
=
jso
.
get
(
'file_id'
)
book_name
=
data_source
+
"/"
+
file_id
book_name
=
f
"
{
data_source
}
/
{
file_id
}
"
try
:
save_path
=
"s3://mllm-raw-media/pdf2md_img/"
image_s3_config
=
get_s3_config
(
save_path
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment