Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhougaofeng
magic_pdf
Commits
4f28604a
Commit
4f28604a
authored
Nov 15, 2024
by
zhougaofeng
Browse files
Update common.py
parent
e89b93b9
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
7 additions
and
1 deletion
+7
-1
magic_pdf/tools/common.py
magic_pdf/tools/common.py
+7
-1
No files found.
magic_pdf/tools/common.py
View file @
4f28604a
import
copy
import
copy
import
json
as
json_parse
import
json
as
json_parse
import
os
import
os
import
re
import
click
import
click
from
loguru
import
logger
from
loguru
import
logger
...
@@ -16,9 +17,13 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
...
@@ -16,9 +17,13 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.post_proc.remove_spaces_html
import
remove_extra_spaces_html_txt
from
magic_pdf.post_proc.remove_spaces_html
import
remove_extra_spaces_html_txt
def sanitize_filename(filename: str) -> str:
    """Return *filename* with every disallowed character replaced by ``_``.

    Characters that are kept unchanged: ASCII letters, ASCII digits,
    underscore, hyphen, and code points in the range U+4E00..U+9FFF.
    Every other character (including ``.``, ``/`` and whitespace) is
    replaced one-for-one with an underscore, so the result always has the
    same length as the input.

    Args:
        filename: The raw name to clean.

    Returns:
        The cleaned name; an empty string is returned unchanged.
    """
    # The trailing '-' inside the character class is a literal hyphen, and
    # the \uXXXX escapes are interpreted by the ``re`` module itself because
    # the pattern is a raw string.
    return re.sub(r'[^a-zA-Z0-9_\u4e00-\u9fff-]', '_', filename)
def
prepare_env
(
output_dir
,
pdf_file_name
,
method
):
def
prepare_env
(
output_dir
,
pdf_file_name
,
method
):
local_parent_dir
=
os
.
path
.
join
(
output_dir
,
pdf_file_name
,
method
)
# logger.info(f'pdf_file_name:{pdf_file_name}')
pdf_file_name
=
sanitize_filename
(
pdf_file_name
)
local_parent_dir
=
os
.
path
.
join
(
output_dir
,
pdf_file_name
)
local_image_dir
=
os
.
path
.
join
(
str
(
local_parent_dir
),
'images'
)
local_image_dir
=
os
.
path
.
join
(
str
(
local_parent_dir
),
'images'
)
local_md_dir
=
local_parent_dir
local_md_dir
=
local_parent_dir
...
@@ -116,6 +121,7 @@ def do_parse(
...
@@ -116,6 +121,7 @@ def do_parse(
md_make_mode
=
f_make_md_mode
)
md_make_mode
=
f_make_md_mode
)
try
:
try
:
pdf_file_name
=
sanitize_filename
(
pdf_file_name
)
txt_file
=
f
'
{
pdf_file_name
}
.txt'
txt_file
=
f
'
{
pdf_file_name
}
.txt'
md_writer
.
write
(
md_writer
.
write
(
content
=
md_content
,
content
=
md_content
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment