Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhougaofeng
magic_pdf
Commits
114161e0
Commit
114161e0
authored
Oct 23, 2024
by
zhougaofeng
Browse files
Update common.py
parent
d61fddef
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
17 additions
and
56 deletions
+17
-56
magic_pdf/tools/common.py
magic_pdf/tools/common.py
+17
-56
No files found.
magic_pdf/tools/common.py
View file @
114161e0
...
@@ -73,9 +73,6 @@ def do_parse(
...
@@ -73,9 +73,6 @@ def do_parse(
image_writer
,
md_writer
=
DiskReaderWriter
(
image_writer
,
md_writer
=
DiskReaderWriter
(
local_image_dir
),
DiskReaderWriter
(
local_md_dir
)
local_image_dir
),
DiskReaderWriter
(
local_md_dir
)
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
# logger.info(f'model_list:{model_list}')
# logger.info(f'local_image_dir:::{local_image_dir}')
# logger.info(f'image_dir:::{image_dir}')
if
parse_method
==
'auto'
:
if
parse_method
==
'auto'
:
jso_useful_key
=
{
'_pdf_type'
:
''
,
'model_list'
:
model_list
}
jso_useful_key
=
{
'_pdf_type'
:
''
,
'model_list'
:
model_list
}
pipe
=
UNIPipe
(
pdf_bytes
,
jso_useful_key
,
image_writer
,
is_debug
=
True
,
pipe
=
UNIPipe
(
pdf_bytes
,
jso_useful_key
,
image_writer
,
is_debug
=
True
,
...
@@ -96,7 +93,6 @@ def do_parse(
...
@@ -96,7 +93,6 @@ def do_parse(
if
len
(
model_list
)
==
0
:
if
len
(
model_list
)
==
0
:
if
model_config
.
__use_inside_model__
:
if
model_config
.
__use_inside_model__
:
pipe
.
pipe_analyze
()
pipe
.
pipe_analyze
()
# logger.info(f'执行pipe.pipe_analyze()之后的pipe.model_list:{pipe.model_list}')
orig_model_list
=
copy
.
deepcopy
(
pipe
.
model_list
)
orig_model_list
=
copy
.
deepcopy
(
pipe
.
model_list
)
else
:
else
:
logger
.
error
(
'need model list input'
)
logger
.
error
(
'need model list input'
)
...
@@ -106,11 +102,6 @@ def do_parse(
...
@@ -106,11 +102,6 @@ def do_parse(
pdf_info
=
pipe
.
pdf_mid_data
[
'pdf_info'
]
pdf_info
=
pipe
.
pdf_mid_data
[
'pdf_info'
]
if
f_draw_layout_bbox
:
if
f_draw_layout_bbox
:
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
,
pdf_file_name
)
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
,
pdf_file_name
)
# if f_draw_span_bbox:
# draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
# if f_draw_model_bbox:
# drow_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
md_content
=
pipe
.
pipe_mk_markdown
(
config_path
,
local_image_dir
,
md_content
=
pipe
.
pipe_mk_markdown
(
config_path
,
local_image_dir
,
drop_mode
=
DropMode
.
NONE
,
drop_mode
=
DropMode
.
NONE
,
md_make_mode
=
f_make_md_mode
)
md_make_mode
=
f_make_md_mode
)
...
@@ -121,55 +112,25 @@ def do_parse(
...
@@ -121,55 +112,25 @@ def do_parse(
path
=
f
'
{
pdf_file_name
}
.txt'
,
path
=
f
'
{
pdf_file_name
}
.txt'
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
mode
=
AbsReaderWriter
.
MODE_TXT
,
)
)
except
Exception
as
e
:
logger
.
info
(
f
'
{
pdf_file_name
}
导出txt文件失败,具体原因为:
\n
{
e
}
'
)
filepath
=
os
.
path
.
join
(
str
(
local_md_dir
),
f
'
{
pdf_file_name
}
.txt'
)
filepath
=
os
.
path
.
join
(
str
(
local_md_dir
),
f
'
{
pdf_file_name
}
.txt'
)
logger
.
info
(
f
'txt文件保存在filepath:
{
filepath
}
'
)
logger
.
info
(
f
'txt文件保存在filepath:
{
filepath
}
'
)
remove_empty_lines_from_file
(
filepath
)
remove_empty_lines_from_file
(
filepath
)
# if f_dump_md:
# md_writer.write(
# content=md_content,
# path=f'{pdf_file_name}.md',
# mode=AbsReaderWriter.MODE_TXT,
# )
#
# if f_dump_middle_json:
# md_writer.write(
# content=json_parse.dumps(pipe.pdf_mid_data,
# ensure_ascii=False,
# indent=4),
# path=f'{pdf_file_name}_middle.json',
# mode=AbsReaderWriter.MODE_TXT,
# )
#
# if f_dump_model_json:
# md_writer.write(
# content=json_parse.dumps(orig_model_list,
# ensure_ascii=False,
# indent=4),
# path=f'{pdf_file_name}_model.json',
# mode=AbsReaderWriter.MODE_TXT,
# )
#
if
f_dump_orig_pdf
:
if
f_dump_orig_pdf
:
md_writer
.
write
(
md_writer
.
write
(
content
=
pdf_bytes
,
content
=
pdf_bytes
,
path
=
f
'
{
pdf_file_name
}
_origin.pdf'
,
path
=
f
'
{
pdf_file_name
}
_origin.pdf'
,
mode
=
AbsReaderWriter
.
MODE_BIN
,
mode
=
AbsReaderWriter
.
MODE_BIN
,
)
)
#
# content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
# if f_dump_content_list:
# md_writer.write(
# content=json_parse.dumps(content_list,
# ensure_ascii=False,
# indent=4),
# path=f'{pdf_file_name}_content_list.json',
# mode=AbsReaderWriter.MODE_TXT,
# )
logger
.
info
(
f
'local output dir is
{
local_md_dir
}
'
)
logger
.
info
(
f
'local output dir is
{
local_md_dir
}
'
)
return
filepath
except
Exception
as
e
:
logger
.
error
(
f
'
{
pdf_file_name
}
导出txt文件失败,具体原因为:
\n
{
e
}
'
)
return
None
parse_pdf_methods
=
click
.
Choice
([
'ocr'
,
'txt'
,
'auto'
])
parse_pdf_methods
=
click
.
Choice
([
'ocr'
,
'txt'
,
'auto'
])
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment