Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
f65be6e0
Commit
f65be6e0
authored
Apr 08, 2024
by
赵小蒙
Browse files
pdf_parse_by_model.py ---> pdf_parse_by_txt.py
parent
0f3bfa10
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
5 additions
and
5 deletions
+5
-5
demo/pdf2md.py
demo/pdf2md.py
+2
-2
magic_pdf/pdf_parse_by_txt.py
magic_pdf/pdf_parse_by_txt.py
+1
-1
magic_pdf/pipeline.py
magic_pdf/pipeline.py
+2
-2
No files found.
demo/pdf2md.py
View file @
f65be6e0
...
@@ -8,7 +8,7 @@ from loguru import logger
...
@@ -8,7 +8,7 @@ from loguru import logger
from
magic_pdf.libs.commons
import
join_path
,
read_file
from
magic_pdf.libs.commons
import
join_path
,
read_file
from
magic_pdf.dict2md.mkcontent
import
mk_mm_markdown
,
mk_universal_format
from
magic_pdf.dict2md.mkcontent
import
mk_mm_markdown
,
mk_universal_format
from
magic_pdf.p
ipeline
import
parse_pdf_by_
model
from
magic_pdf.p
df_parse_by_txt
import
parse_pdf_by_
txt
...
@@ -25,7 +25,7 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p
...
@@ -25,7 +25,7 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p
pdf_bytes
=
read_file
(
s3_pdf_path
,
s3_pdf_profile
)
pdf_bytes
=
read_file
(
s3_pdf_path
,
s3_pdf_profile
)
try
:
try
:
paras_dict
=
parse_pdf_by_
model
(
paras_dict
=
parse_pdf_by_
txt
(
pdf_bytes
,
pdf_model_path
,
save_path
,
book_name
,
pdf_model_profile
,
start_page_num
,
debug_mode
=
debug_mode
pdf_bytes
,
pdf_model_path
,
save_path
,
book_name
,
pdf_model_profile
,
start_page_num
,
debug_mode
=
debug_mode
)
)
parent_dir
=
os
.
path
.
dirname
(
text_content_save_path
)
parent_dir
=
os
.
path
.
dirname
(
text_content_save_path
)
...
...
magic_pdf/pdf_parse_by_
model
.py
→
magic_pdf/pdf_parse_by_
txt
.py
View file @
f65be6e0
...
@@ -70,7 +70,7 @@ paraMergeException_msg = ParaMergeException().message
...
@@ -70,7 +70,7 @@ paraMergeException_msg = ParaMergeException().message
def
parse_pdf_by_
model
(
def
parse_pdf_by_
txt
(
pdf_bytes
,
pdf_bytes
,
pdf_model_output
,
pdf_model_output
,
save_path
,
save_path
,
...
...
magic_pdf/pipeline.py
View file @
f65be6e0
...
@@ -13,7 +13,7 @@ from magic_pdf.libs.commons import (
...
@@ -13,7 +13,7 @@ from magic_pdf.libs.commons import (
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.dict2md.mkcontent
import
mk_universal_format
from
magic_pdf.dict2md.mkcontent
import
mk_universal_format
from
magic_pdf.pdf_parse_by_
model
import
parse_pdf_by_
model
from
magic_pdf.pdf_parse_by_
txt
import
parse_pdf_by_
txt
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_classify_by_type
import
classify
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
from
loguru
import
logger
from
loguru
import
logger
...
@@ -310,7 +310,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
...
@@ -310,7 +310,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
f
"book_name is:
{
book_name
}
,start_time is:
{
formatted_time
(
start_time
)
}
"
,
f
"book_name is:
{
book_name
}
,start_time is:
{
formatted_time
(
start_time
)
}
"
,
file
=
sys
.
stderr
,
file
=
sys
.
stderr
,
)
)
pdf_info_dict
=
parse_pdf_by_
model
(
pdf_info_dict
=
parse_pdf_by_
txt
(
pdf_bytes
,
pdf_bytes
,
model_output_json_list
,
model_output_json_list
,
save_path
,
save_path
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment