Update common_parse.py

0e8f989e · zhougaofeng · 922eeb6e · 0e8f989e
Commit 0e8f989e authored Oct 24, 2024 by zhougaofeng
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 14 additions and 6 deletions

magic_pdf/parse/common_parse.py magic_pdf/parse/common_parse.py +14 -6

No files found.
--- a/magic_pdf/parse/common_parse.py
+++ b/magic_pdf/parse/common_parse.py
@@ -37,14 +37,15 @@ def process_file(file_path, pdf_ocr, excel_ocr, output_dir):
    """Process a single file for OCR based on its extension."""
    try:
        res = ''
+        start = time.time()
        if file_path.endswith('.pdf'):

            res = pdf_ocr.ocr_pdf_client(path=file_path, output_dir=output_dir)
        elif file_path.endswith('.xls') or file_path.endswith('.xlsx'):
            res = excel_ocr.parse(file_path, output_dir)
-
+        end = time.time()
        if res:
-            logger.info(f"文件处理成功，输出文件路径为: '{res}'")
+            logger.info(f"文件处理成功，输出文件路径为: '{res}', 耗时为：{end-start}")
        else:
            logger.warning(f"文件处理结果为空: '{file_path}'")
    except requests.exceptions.RequestException as req_err:
@@ -83,12 +84,19 @@ def main():
    config.read(args.config_path)
    pdf_server = config.get('server', 'pdf_server')
    pdf_ocr = ocrPdfClient(pdf_server)
-    excel_ocr = ExcelParser()
+    status = pdf_ocr.check_health()
+    if not status:
+        pdf_ocr = None
+        logger.warning(f'Health check failed. The server at "{pdf_server}" is not responding as expected.')
+        logger.info(f'文件解析服务无法正常运行')
+        return None
+    else:
+        excel_ocr = ExcelParser()

-    logger.info(f'输入目录或文件的路径为: {input_path}')
-    logger.info(f'输出目录为: {output_dir}')
+        logger.info(f'输入目录或文件的路径为: {input_path}')
+        logger.info(f'输出目录为: {output_dir}')

-    process_input(input_path, pdf_ocr, excel_ocr, output_dir)
+        process_input(input_path, pdf_ocr, excel_ocr, output_dir)


 if __name__ == "__main__":