Commit 0e8f989e authored by zhougaofeng
Browse files

Update common_parse.py

parent 922eeb6e
......@@ -37,14 +37,15 @@ def process_file(file_path, pdf_ocr, excel_ocr, output_dir):
"""Process a single file for OCR based on its extension."""
try:
res = ''
start = time.time()
if file_path.endswith('.pdf'):
res = pdf_ocr.ocr_pdf_client(path=file_path, output_dir=output_dir)
elif file_path.endswith('.xls') or file_path.endswith('.xlsx'):
res = excel_ocr.parse(file_path, output_dir)
end = time.time()
if res:
logger.info(f"文件处理成功,输出文件路径为: '{res}'")
logger.info(f"文件处理成功,输出文件路径为: '{res}', 耗时为:{end-start}")
else:
logger.warning(f"文件处理结果为空: '{file_path}'")
except requests.exceptions.RequestException as req_err:
......@@ -83,12 +84,19 @@ def main():
config.read(args.config_path)
pdf_server = config.get('server', 'pdf_server')
pdf_ocr = ocrPdfClient(pdf_server)
excel_ocr = ExcelParser()
status = pdf_ocr.check_health()
if not status:
pdf_ocr = None
logger.warning(f'Health check failed. The server at "{pdf_server}" is not responding as expected.')
logger.info(f'文件解析服务无法正常运行')
return None
else:
excel_ocr = ExcelParser()
logger.info(f'输入目录或文件的路径为: {input_path}')
logger.info(f'输出目录为: {output_dir}')
logger.info(f'输入目录或文件的路径为: {input_path}')
logger.info(f'输出目录为: {output_dir}')
process_input(input_path, pdf_ocr, excel_ocr, output_dir)
process_input(input_path, pdf_ocr, excel_ocr, output_dir)
if __name__ == "__main__":
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment