Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhougaofeng
magic_pdf
Commits
0a43c18c
Commit
0a43c18c
authored
Oct 24, 2024
by
zhougaofeng
Browse files
Update common_parse.py
parent
cbe2abc6
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
6 additions
and
6 deletions
+6
-6
magic_pdf/parse/common_parse.py
magic_pdf/parse/common_parse.py
+6
-6
No files found.
magic_pdf/parse/common_parse.py
View file @
0a43c18c
...
...
@@ -33,13 +33,13 @@ def parse_args():
args
=
parser
.
parse_args
()
return
args
def
process_file
(
file_path
,
pdf_ocr
,
excel_ocr
,
output_dir
,
config_path
):
def
process_file
(
file_path
,
pdf_ocr
,
excel_ocr
,
output_dir
):
"""Process a single file for OCR based on its extension."""
try
:
res
=
''
if
file_path
.
endswith
(
'.pdf'
):
res
=
pdf_ocr
.
ocr_pdf_client
(
path
=
file_path
,
output_dir
=
output_dir
,
config_path
=
config_path
)
res
=
pdf_ocr
.
ocr_pdf_client
(
path
=
file_path
,
output_dir
=
output_dir
)
elif
file_path
.
endswith
(
'.xls'
)
or
file_path
.
endswith
(
'.xlsx'
):
res
=
excel_ocr
.
parse
(
file_path
,
output_dir
)
...
...
@@ -63,17 +63,17 @@ def determine_output_dir(output_dir):
return
os
.
path
.
join
(
current_working_directory
,
output_dir
)
return
output_dir
def
process_input
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
,
config_path
):
def
process_input
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
):
"""Process the input path, which can be a directory or a single file."""
if
os
.
path
.
isdir
(
input_path
):
for
root
,
_
,
files
in
os
.
walk
(
input_path
):
for
file
in
files
:
file_path
=
os
.
path
.
join
(
root
,
file
)
logger
.
info
(
f
'正在处理文件:
{
file_path
}
'
)
process_file
(
file_path
,
pdf_ocr
,
excel_ocr
,
output_dir
,
config_path
)
process_file
(
file_path
,
pdf_ocr
,
excel_ocr
,
output_dir
)
else
:
logger
.
info
(
f
'正在处理单个文件:
{
input_path
}
'
)
process_file
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
,
config_path
)
process_file
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
)
def
main
():
args
=
parse_args
()
...
...
@@ -88,7 +88,7 @@ def main():
logger
.
info
(
f
'输入目录或文件的路径为:
{
input_path
}
'
)
logger
.
info
(
f
'输出目录为:
{
output_dir
}
'
)
process_input
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
,
args
.
config_path
)
process_input
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
)
if
__name__
==
"__main__"
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment