Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
4f1f7d62
"docs/vscode:/vscode.git/clone" did not exist on "440e44a0e03d575bf682d35ab2771e88f774f4dc"
Commit
4f1f7d62
authored
Mar 21, 2024
by
许瑞
Browse files
feat: add layout
parent
90ea9096
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
4 additions
and
3 deletions
+4
-3
magic_pdf/pipeline.py
magic_pdf/pipeline.py
+1
-1
magic_pdf/train_utils/convert_to_train_format.py
magic_pdf/train_utils/convert_to_train_format.py
+3
-2
No files found.
magic_pdf/pipeline.py
View file @
4f1f7d62
...
...
@@ -620,9 +620,9 @@ def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> d
jso
[
"need_drop"
]
=
True
jso
[
"drop_reason"
]
=
pdf_info_dict
[
"drop_reason"
]
else
:
# 正常返回,将 pdf_info_dict 压缩并存储
jso
[
"parsed_results"
]
=
convert_to_train_format
(
pdf_info_dict
)
pdf_info_dict
=
JsonCompressor
.
compress_json
(
pdf_info_dict
)
jso
[
"pdf_intermediate_dict"
]
=
pdf_info_dict
jso
[
"parsed_results"
]
=
convert_to_train_format
(
pdf_info_dict
)
end_time
=
time
.
time
()
# 记录完成时间
parse_time
=
int
(
end_time
-
start_time
)
# 计算执行时间
# 解析完成后打印一下book_name和耗时
...
...
magic_pdf/train_utils/convert_to_train_format.py
View file @
4f1f7d62
def
convert_to_train_format
(
jso
:
dict
)
->
[]:
pages
=
[]
for
k
,
v
in
jso
.
items
():
if
not
k
.
startswith
(
"page_"
):
continue
page_idx
=
v
[
"page_idx"
]
width
,
height
=
v
[
"page_size"
]
...
...
@@ -47,6 +47,7 @@ def convert_to_train_format(jso: dict) -> []:
bboxes
.
append
(
n_bbox
)
info
[
"bboxes"
]
=
bboxes
info
[
"layout_tree"
]
=
v
[
"layout_bboxes"
]
pages
.
append
(
info
)
return
pages
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment