Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhougaofeng
magic_pdf
Commits
751928a0
Commit
751928a0
authored
Oct 22, 2024
by
zhougaofeng
Browse files
Update common_parse.py
parent
a410b338
Pipeline
#1792
failed with stages
in 0 seconds
Changes
1
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
52 additions
and
48 deletions
+52
-48
magic_pdf/parse/common_parse.py
magic_pdf/parse/common_parse.py
+52
-48
No files found.
magic_pdf/parse/common_parse.py
View file @
751928a0
...
@@ -29,63 +29,67 @@ def parse_args():
...
@@ -29,63 +29,67 @@ def parse_args():
return
args
return
args
import
os
import
requests
def process_file(file_path, pdf_ocr, excel_ocr, output_dir):
    """Run the matching parser on a single file, chosen by its extension.

    ``.pdf`` files are handed to ``pdf_ocr.ocr_pdf_client`` and ``.xls`` /
    ``.xlsx`` files to ``excel_ocr.parse``. The extension comparison is
    case-insensitive, so ``REPORT.PDF`` is treated like ``report.pdf``.
    A file with any other extension leaves the result empty and is reported
    through the same "empty result" warning as a parser that returned
    nothing.

    Args:
        file_path: Path of the file to process.
        pdf_ocr: Object exposing ``ocr_pdf_client(path=..., output_dir=...)``.
        excel_ocr: Object exposing ``parse(path, output_dir)``.
        output_dir: Directory passed through to the parser for its output.

    Returns:
        None. The outcome is only logged; exceptions raised while processing
        are caught and logged instead of being propagated, so one bad file
        does not abort a batch run.
    """
    try:
        # Match on a lower-cased copy so upper- or mixed-case extensions
        # are not silently skipped; the original path is still what the
        # parsers receive.
        lowered = file_path.lower()
        res = ''
        if lowered.endswith('.pdf'):
            res = pdf_ocr.ocr_pdf_client(path=file_path, output_dir=output_dir)
        elif lowered.endswith(('.xls', '.xlsx')):
            res = excel_ocr.parse(file_path, output_dir)

        if res:
            logger.info(f"文件处理成功,输出文件路径为: '{res}'")
        else:
            logger.warning(f"文件处理结果为空: '{file_path}'")
    except requests.exceptions.RequestException as req_err:
        # Network/HTTP failures are reported separately from other errors.
        logger.error(f"请求错误,文件: '{file_path}',错误信息: {req_err}")
    except Exception as err:
        # Batch boundary: log and continue with the next file.
        logger.error(f"处理文件时发生未知错误: '{file_path}',错误信息: {err}")
def normalize_path(input_path):
    """Return *input_path* with every backslash turned into a forward slash."""
    # Splitting on the backslash and re-joining with '/' swaps each
    # separator one-for-one, leaving all other characters untouched.
    segments = input_path.split('\\')
    return '/'.join(segments)
def determine_output_dir(output_dir):
    """Return *output_dir* as an absolute path.

    An absolute path is returned unchanged; a relative one is joined onto
    the current working directory (no further normalization is applied).
    """
    if os.path.isabs(output_dir):
        return output_dir
    return os.path.join(os.getcwd(), output_dir)
def process_input(input_path, pdf_ocr, excel_ocr, output_dir):
    """Dispatch *input_path* to ``process_file``, walking it if it is a directory.

    When *input_path* is a directory, every file found by ``os.walk``
    (including those in sub-directories) is logged and processed in turn.
    Anything that is not a directory is treated as a single file.
    """
    if not os.path.isdir(input_path):
        logger.info(f'正在处理单个文件: {input_path}')
        process_file(input_path, pdf_ocr, excel_ocr, output_dir)
        return

    for dirpath, _, filenames in os.walk(input_path):
        for name in filenames:
            target = os.path.join(dirpath, name)
            logger.info(f'正在处理文件: {target}')
            process_file(target, pdf_ocr, excel_ocr, output_dir)
def
main
():
def
main
():
args
=
parse_args
()
args
=
parse_args
()
input_path
=
args
.
path
input_path
=
normalize_path
(
args
.
path
)
output_dir
=
determine_output_dir
(
args
.
output_dir
)
pdf_ocr
=
ocrPdfClient
(
args
.
url
)
pdf_ocr
=
ocrPdfClient
(
args
.
url
)
excel_ocr
=
ExcelParser
()
excel_ocr
=
ExcelParser
()
if
not
os
.
path
.
isabs
(
args
.
output_dir
):
current_working_directory
=
os
.
getcwd
()
output_dir
=
os
.
path
.
join
(
current_working_directory
,
args
.
output_dir
)
# logger.info(f'相对路径output_dir:{output_dir}')
else
:
output_dir
=
args
.
output_dir
if
'
\\
'
in
input_path
:
logger
.
info
(
f
'输入目录或文件的路径为:
{
input_path
}
'
)
input_path
=
input_path
.
replace
(
'
\\
'
,
'/'
)
logger
.
info
(
f
'输出目录为:
{
output_dir
}
'
)
logger
.
info
(
f
'输入目录或文件的路径为:
{
input_path
}
'
)
logger
.
info
(
f
'output_dir:
{
output_dir
}
'
)
if
os
.
path
.
isdir
(
input_path
):
process_input
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
)
for
root
,
dirs
,
files
in
os
.
walk
(
input_path
):
# 查找所有的pdf文件
for
file
in
files
:
# 打印pdf文件的完整路径
doc_path
=
os
.
path
.
join
(
root
,
file
)
logger
.
info
(
f
'正在解析:
{
doc_path
}
'
)
try
:
res
=
''
if
file
.
endswith
(
'.pdf'
):
res
=
pdf_ocr
.
ocr_pdf_client
(
path
=
doc_path
,
output_dir
=
output_dir
)
elif
file
.
endswith
(
'.xls'
)
or
file
.
endswith
(
'.xlsx'
):
res
=
excel_ocr
.
parse
(
doc_path
,
output_dir
)
if
res
:
logger
.
info
(
f
"输出文件的的路径为: '
{
res
}
'"
)
else
:
logger
.
warning
(
"None"
)
except
requests
.
exceptions
.
RequestException
as
e
:
logger
.
error
(
f
"Error while making request to reranker service:
{
e
}
"
)
except
Exception
as
e
:
logger
.
error
(
f
"Unexpected error occurred:
{
e
}
"
)
else
:
# Example usage:
try
:
# main()
res
=
''
if
input_path
.
endswith
(
'.pdf'
):
res
=
pdf_ocr
.
ocr_pdf_client
(
path
=
input_path
,
output_dir
=
output_dir
)
elif
input_path
.
endswith
(
'.xls'
)
or
input_path
.
endswith
(
'.xlsx'
):
res
=
excel_ocr
.
parse
(
input_path
,
output_dir
)
if
res
:
logger
.
info
(
f
"output_dir: '
{
res
}
'"
)
else
:
logger
.
warning
(
"None"
)
except
requests
.
exceptions
.
RequestException
as
e
:
logger
.
error
(
f
"Error while making request to reranker service:
{
e
}
"
)
except
Exception
as
e
:
logger
.
error
(
f
"Unexpected error occurred:
{
e
}
"
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
main
()
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment