Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhougaofeng
magic_pdf
Commits
bf156ede
"vscode:/vscode.git/clone" did not exist on "af279434d03e6e3be7808ecd15c652338b31024b"
Commit
bf156ede
authored
Oct 23, 2024
by
zhougaofeng
Browse files
Update common_parse.py
parent
b6c39f3b
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
16 additions
and
18 deletions
+16
-18
magic_pdf/parse/common_parse.py
magic_pdf/parse/common_parse.py
+16
-18
No files found.
magic_pdf/parse/common_parse.py
View file @
bf156ede
...
@@ -7,13 +7,15 @@ import argparse
...
@@ -7,13 +7,15 @@ import argparse
import
os
import
os
from
pdf_client
import
ocrPdfClient
from
pdf_client
import
ocrPdfClient
from
excel_parse
import
ExcelParser
from
excel_parse
import
ExcelParser
import
os
import
requests
import
configparser
def
parse_args
():
def
parse_args
():
parser
=
argparse
.
ArgumentParser
()
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
parser
.
add_argument
(
'--
url
'
,
'--
config_path
'
,
default
=
'
http://0.0.0.0:6030
'
,
default
=
'
/home/practice/magic_pdf-main/magic_pdf/config.ini
'
,
)
)
parser
.
add_argument
(
parser
.
add_argument
(
'--path'
,
'--path'
,
...
@@ -28,16 +30,13 @@ def parse_args():
...
@@ -28,16 +30,13 @@ def parse_args():
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
return
args
return
args
def
process_file
(
file_path
,
pdf_ocr
,
excel_ocr
,
output_dir
,
config_path
):
import
os
import
requests
def
process_file
(
file_path
,
pdf_ocr
,
excel_ocr
,
output_dir
):
"""Process a single file for OCR based on its extension."""
"""Process a single file for OCR based on its extension."""
try
:
try
:
res
=
''
res
=
''
if
file_path
.
endswith
(
'.pdf'
):
if
file_path
.
endswith
(
'.pdf'
):
res
=
pdf_ocr
.
ocr_pdf_client
(
path
=
file_path
,
output_dir
=
output_dir
)
res
=
pdf_ocr
.
ocr_pdf_client
(
path
=
file_path
,
output_dir
=
output_dir
,
config_path
=
config_path
)
elif
file_path
.
endswith
(
'.xls'
)
or
file_path
.
endswith
(
'.xlsx'
):
elif
file_path
.
endswith
(
'.xls'
)
or
file_path
.
endswith
(
'.xlsx'
):
res
=
excel_ocr
.
parse
(
file_path
,
output_dir
)
res
=
excel_ocr
.
parse
(
file_path
,
output_dir
)
...
@@ -61,33 +60,32 @@ def determine_output_dir(output_dir):
...
@@ -61,33 +60,32 @@ def determine_output_dir(output_dir):
return
os
.
path
.
join
(
current_working_directory
,
output_dir
)
return
os
.
path
.
join
(
current_working_directory
,
output_dir
)
return
output_dir
return
output_dir
def
process_input
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
):
def
process_input
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
,
config_path
):
"""Process the input path, which can be a directory or a single file."""
"""Process the input path, which can be a directory or a single file."""
if
os
.
path
.
isdir
(
input_path
):
if
os
.
path
.
isdir
(
input_path
):
for
root
,
_
,
files
in
os
.
walk
(
input_path
):
for
root
,
_
,
files
in
os
.
walk
(
input_path
):
for
file
in
files
:
for
file
in
files
:
file_path
=
os
.
path
.
join
(
root
,
file
)
file_path
=
os
.
path
.
join
(
root
,
file
)
logger
.
info
(
f
'正在处理文件:
{
file_path
}
'
)
logger
.
info
(
f
'正在处理文件:
{
file_path
}
'
)
process_file
(
file_path
,
pdf_ocr
,
excel_ocr
,
output_dir
)
process_file
(
file_path
,
pdf_ocr
,
excel_ocr
,
output_dir
,
config_path
)
else
:
else
:
logger
.
info
(
f
'正在处理单个文件:
{
input_path
}
'
)
logger
.
info
(
f
'正在处理单个文件:
{
input_path
}
'
)
process_file
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
)
process_file
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
,
config_path
)
def
main
():
def
main
():
args
=
parse_args
()
args
=
parse_args
()
input_path
=
normalize_path
(
args
.
path
)
input_path
=
normalize_path
(
args
.
path
)
output_dir
=
determine_output_dir
(
args
.
output_dir
)
output_dir
=
determine_output_dir
(
args
.
output_dir
)
config
=
configparser
.
ConfigParser
()
pdf_ocr
=
ocrPdfClient
(
args
.
url
)
config
.
read
(
args
.
config_path
)
pdf_server
=
config
.
get
(
'server'
,
'pdf_server'
)
pdf_ocr
=
ocrPdfClient
(
pdf_server
)
excel_ocr
=
ExcelParser
()
excel_ocr
=
ExcelParser
()
logger
.
info
(
f
'输入目录或文件的路径为:
{
input_path
}
'
)
logger
.
info
(
f
'输入目录或文件的路径为:
{
input_path
}
'
)
logger
.
info
(
f
'输出目录为:
{
output_dir
}
'
)
logger
.
info
(
f
'输出目录为:
{
output_dir
}
'
)
process_input
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
)
process_input
(
input_path
,
pdf_ocr
,
excel_ocr
,
output_dir
,
args
.
config_path
)
# Example usage:
# main()
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment