Commit 751928a0 authored by zhougaofeng's avatar zhougaofeng
Browse files

Update common_parse.py

parent a410b338
Pipeline #1792 failed with stages
in 0 seconds
...@@ -29,63 +29,67 @@ def parse_args(): ...@@ -29,63 +29,67 @@ def parse_args():
return args return args
import os
import requests
def process_file(file_path, pdf_ocr, excel_ocr, output_dir):
    """Run the parser that matches ``file_path``'s extension and log the outcome.

    ``.pdf`` files are handed to ``pdf_ocr.ocr_pdf_client``; ``.xls`` and
    ``.xlsx`` files are handed to ``excel_ocr.parse``. The extension is
    compared case-insensitively so that names such as ``REPORT.PDF`` are not
    silently skipped. Any other extension leaves the result empty, which is
    reported with a warning.

    Args:
        file_path: Path of the file to process.
        pdf_ocr: Object exposing ``ocr_pdf_client(path=..., output_dir=...)``.
        excel_ocr: Object exposing ``parse(file_path, output_dir)``.
        output_dir: Directory passed through to the selected parser.

    Returns:
        None. Failures are logged instead of raised so that a caller looping
        over many files can carry on with the next one.
    """
    # Only the comparison is lower-cased; the parsers still receive the
    # original path so it resolves on case-sensitive file systems.
    lowered = file_path.lower()
    try:
        res = ''
        if lowered.endswith('.pdf'):
            res = pdf_ocr.ocr_pdf_client(path=file_path, output_dir=output_dir)
        elif lowered.endswith(('.xls', '.xlsx')):
            res = excel_ocr.parse(file_path, output_dir)

        if res:
            logger.info(f"文件处理成功,输出文件路径为: '{res}'")
        else:
            logger.warning(f"文件处理结果为空: '{file_path}'")
    except requests.exceptions.RequestException as req_err:
        logger.error(f"请求错误,文件: '{file_path}',错误信息: {req_err}")
    except Exception as err:
        # Deliberate catch-all boundary: one bad file must not abort a batch.
        logger.error(f"处理文件时发生未知错误: '{file_path}',错误信息: {err}")
def normalize_path(input_path):
    """Return ``input_path`` with every backslash turned into a forward slash."""
    segments = input_path.split('\\')
    return '/'.join(segments)
def determine_output_dir(output_dir):
    """Return ``output_dir`` as an absolute path.

    An absolute path is returned unchanged; a relative one is joined onto the
    current working directory.
    """
    if os.path.isabs(output_dir):
        return output_dir
    return os.path.join(os.getcwd(), output_dir)
def process_input(input_path, pdf_ocr, excel_ocr, output_dir):
    """Process ``input_path``, which is either one file or a directory tree.

    A non-directory path is passed straight to ``process_file``. A directory
    is walked recursively with ``os.walk`` and every file found is passed to
    ``process_file`` in turn.
    """
    if not os.path.isdir(input_path):
        logger.info(f'正在处理单个文件: {input_path}')
        process_file(input_path, pdf_ocr, excel_ocr, output_dir)
        return

    for parent, _subdirs, names in os.walk(input_path):
        for name in names:
            candidate = os.path.join(parent, name)
            logger.info(f'正在处理文件: {candidate}')
            process_file(candidate, pdf_ocr, excel_ocr, output_dir)
def main():
    """Parse the command-line arguments and process the requested input.

    Reads ``path``, ``output_dir`` and ``url`` from the parsed arguments,
    normalizes the input path, makes the output directory absolute, builds the
    PDF and Excel parsers, and delegates the file handling to
    ``process_input``.
    """
    args = parse_args()
    input_path = normalize_path(args.path)
    output_dir = determine_output_dir(args.output_dir)

    pdf_ocr = ocrPdfClient(args.url)
    excel_ocr = ExcelParser()

    logger.info(f'输入目录或文件的路径为: {input_path}')
    logger.info(f'输出目录为: {output_dir}')

    # Directory walking and per-file error handling live in the helpers, so
    # this entry point no longer repeats the dispatch logic inline.
    process_input(input_path, pdf_ocr, excel_ocr, output_dir)


if __name__ == "__main__":
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment