Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhougaofeng
magic_pdf
Commits
fb058635
Commit
fb058635
authored
Nov 13, 2024
by
zhougaofeng
Browse files
Update common_parse.py
parent
22ddf1a8
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
15 additions
and
7 deletions
+15
-7
magic_pdf/parse/common_parse.py
magic_pdf/parse/common_parse.py
+15
-7
No files found.
magic_pdf/parse/common_parse.py
View file @
fb058635
...
...
@@ -8,11 +8,15 @@ from excel_parse import ExcelParser
import
os
import
requests
import
configparser
from
magic_pdf.parse.ofd_parse
import
parse_ofd
from
magic_pdf.parse.ofd_parse
import
ocrOfdClient
logger
.
add
(
"parse.log"
,
rotation
=
"10 MB"
,
level
=
"INFO"
,
format
=
"{time} {level} {message}"
,
encoding
=
'utf-8'
,
enqueue
=
True
)
config_path
=
None
ofd_ocr
=
None
pdf_ocr
=
None
excel_ocr
=
None
def
parse_args
():
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
...
...
@@ -32,7 +36,7 @@ def parse_args():
args
=
parser
.
parse_args
()
return
args
def
process_file
(
file_path
,
pdf_ocr
,
excel_ocr
,
output_dir
):
def
process_file
(
file_path
,
output_dir
):
"""Process a single file for OCR based on its extension."""
try
:
res
=
''
...
...
@@ -43,7 +47,7 @@ def process_file(file_path, pdf_ocr, excel_ocr, output_dir):
elif
file_path
.
endswith
(
'.xls'
)
or
file_path
.
endswith
(
'.xlsx'
):
res
=
excel_ocr
.
parse
(
file_path
,
output_dir
)
elif
file_path
.
endswith
(
'.ofd'
):
res
=
parse_ofd
(
config_path
,
file_path
,
output_dir
)
res
=
ofd_ocr
.
parse_ofd
(
config_path
,
file_path
,
output_dir
)
end
=
time
.
time
()
...
...
@@ -67,7 +71,7 @@ def determine_output_dir(output_dir):
return
os
.
path
.
join
(
current_working_directory
,
output_dir
)
return
output_dir
def
process_input
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
):
def
process_input
(
input_path
,
output_dir
):
"""Process the input path, which can be a directory or a single file."""
if
os
.
path
.
isdir
(
input_path
):
logger
.
info
(
f
'开始处理
{
input_path
}
目录下的文件'
)
...
...
@@ -75,10 +79,10 @@ def process_input(input_path, pdf_ocr, excel_ocr, output_dir):
for
file
in
files
:
file_path
=
os
.
path
.
join
(
root
,
file
)
logger
.
info
(
f
'正在解析文件:
{
file_path
}
'
)
process_file
(
file_path
,
pdf_ocr
,
excel_ocr
,
output_dir
)
process_file
(
file_path
,
output_dir
)
else
:
logger
.
info
(
f
'正在解析单个文件:
{
input_path
}
'
)
process_file
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
)
process_file
(
input_path
,
output_dir
)
def
main
():
args
=
parse_args
()
...
...
@@ -87,9 +91,13 @@ def main():
config
=
configparser
.
ConfigParser
()
config
.
read
(
args
.
config_path
)
global
config_path
global
pdf_ocr
global
ofd_ocr
global
excel_ocr
config_path
=
args
.
config_path
pdf_server
=
config
.
get
(
'server'
,
'pdf_server'
)
pdf_ocr
=
ocrPdfClient
(
pdf_server
)
ofd_ocr
=
ocrOfdClient
(
pdf_server
)
status
=
pdf_ocr
.
check_health
()
if
not
status
:
pdf_ocr
=
None
...
...
@@ -102,7 +110,7 @@ def main():
# logger.info(f'输入目录或文件的路径为: {input_path},输出目录为: {output_dir}')
# logger.info(f'输出目录为: {output_dir}')
process_input
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
)
process_input
(
input_path
,
output_dir
)
if
__name__
==
"__main__"
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment