Commit fb058635 authored by zhougaofeng
Browse files

Update common_parse.py

parent 22ddf1a8
...@@ -8,11 +8,15 @@ from excel_parse import ExcelParser ...@@ -8,11 +8,15 @@ from excel_parse import ExcelParser
import os import os
import requests import requests
import configparser import configparser
from magic_pdf.parse.ofd_parse import parse_ofd from magic_pdf.parse.ofd_parse import ocrOfdClient
logger.add("parse.log", rotation="10 MB", level="INFO", format="{time} {level} {message}", encoding='utf-8', enqueue=True) logger.add("parse.log", rotation="10 MB", level="INFO", format="{time} {level} {message}", encoding='utf-8', enqueue=True)
config_path = None config_path = None
ofd_ocr = None
pdf_ocr = None
excel_ocr = None
def parse_args(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
...@@ -32,7 +36,7 @@ def parse_args(): ...@@ -32,7 +36,7 @@ def parse_args():
args = parser.parse_args() args = parser.parse_args()
return args return args
def process_file(file_path, pdf_ocr, excel_ocr, output_dir): def process_file(file_path, output_dir):
"""Process a single file for OCR based on its extension.""" """Process a single file for OCR based on its extension."""
try: try:
res = '' res = ''
...@@ -43,7 +47,7 @@ def process_file(file_path, pdf_ocr, excel_ocr, output_dir): ...@@ -43,7 +47,7 @@ def process_file(file_path, pdf_ocr, excel_ocr, output_dir):
elif file_path.endswith('.xls') or file_path.endswith('.xlsx'): elif file_path.endswith('.xls') or file_path.endswith('.xlsx'):
res = excel_ocr.parse(file_path, output_dir) res = excel_ocr.parse(file_path, output_dir)
elif file_path.endswith('.ofd'): elif file_path.endswith('.ofd'):
res = parse_ofd(config_path,file_path,output_dir) res = ofd_ocr.parse_ofd(config_path,file_path,output_dir)
end = time.time() end = time.time()
...@@ -67,7 +71,7 @@ def determine_output_dir(output_dir): ...@@ -67,7 +71,7 @@ def determine_output_dir(output_dir):
return os.path.join(current_working_directory, output_dir) return os.path.join(current_working_directory, output_dir)
return output_dir return output_dir
def process_input(input_path, pdf_ocr, excel_ocr, output_dir): def process_input(input_path, output_dir):
"""Process the input path, which can be a directory or a single file.""" """Process the input path, which can be a directory or a single file."""
if os.path.isdir(input_path): if os.path.isdir(input_path):
logger.info(f'开始处理{input_path}目录下的文件') logger.info(f'开始处理{input_path}目录下的文件')
...@@ -75,10 +79,10 @@ def process_input(input_path, pdf_ocr, excel_ocr, output_dir): ...@@ -75,10 +79,10 @@ def process_input(input_path, pdf_ocr, excel_ocr, output_dir):
for file in files: for file in files:
file_path = os.path.join(root, file) file_path = os.path.join(root, file)
logger.info(f'正在解析文件: {file_path}') logger.info(f'正在解析文件: {file_path}')
process_file(file_path, pdf_ocr, excel_ocr, output_dir) process_file(file_path, output_dir)
else: else:
logger.info(f'正在解析单个文件: {input_path}') logger.info(f'正在解析单个文件: {input_path}')
process_file(input_path, pdf_ocr, excel_ocr, output_dir) process_file(input_path, output_dir)
def main(): def main():
args = parse_args() args = parse_args()
...@@ -87,9 +91,13 @@ def main(): ...@@ -87,9 +91,13 @@ def main():
config = configparser.ConfigParser() config = configparser.ConfigParser()
config.read(args.config_path) config.read(args.config_path)
global config_path global config_path
global pdf_ocr
global ofd_ocr
global excel_ocr
config_path = args.config_path config_path = args.config_path
pdf_server = config.get('server', 'pdf_server') pdf_server = config.get('server', 'pdf_server')
pdf_ocr = ocrPdfClient(pdf_server) pdf_ocr = ocrPdfClient(pdf_server)
ofd_ocr = ocrOfdClient(pdf_server)
status = pdf_ocr.check_health() status = pdf_ocr.check_health()
if not status: if not status:
pdf_ocr = None pdf_ocr = None
...@@ -102,7 +110,7 @@ def main(): ...@@ -102,7 +110,7 @@ def main():
# logger.info(f'输入目录或文件的路径为: {input_path},输出目录为: {output_dir}') # logger.info(f'输入目录或文件的路径为: {input_path},输出目录为: {output_dir}')
# logger.info(f'输出目录为: {output_dir}') # logger.info(f'输出目录为: {output_dir}')
process_input(input_path, pdf_ocr, excel_ocr, output_dir) process_input(input_path, output_dir)
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment