Update common_parse.py

751928a0 · zhougaofeng · a410b338 · 751928a0
Commit 751928a0 authored Oct 22, 2024 by zhougaofeng
Show whitespace changes
Inline Side-by-side

Showing with 52 additions and 48 deletions

magic_pdf/parse/common_parse.py magic_pdf/parse/common_parse.py +52 -48

No files found.
--- a/magic_pdf/parse/common_parse.py
+++ b/magic_pdf/parse/common_parse.py
@@ -29,63 +29,67 @@ def parse_args():
    return args
-def main():
+import os
-    args = parse_args()
+import requests
-    input_path = args.path
-    pdf_ocr = ocrPdfClient(args.url)
-    excel_ocr = ExcelParser()
-    if not os.path.isabs(args.output_dir):
-        current_working_directory = os.getcwd()
-        output_dir = os.path.join(current_working_directory, args.output_dir)
-        # logger.info(f'相对路径output_dir:{output_dir}')
-    else:
-        output_dir = args.output_dir
-    if '\\' in input_path:
-        input_path = input_path.replace('\\', '/')
-    logger.info(f'输入目录或文件的路径为:{input_path}')
-    logger.info(f'output_dir:{output_dir}')
-    if os.path.isdir(input_path):
+def process_file(file_path, pdf_ocr, excel_ocr, output_dir):
-        for root, dirs, files in os.walk(input_path):
+    """Process a single file for OCR based on its extension."""
-            # 查找所有的pdf文件
-            for file in files:
-                # 打印pdf文件的完整路径
-                doc_path = os.path.join(root, file)
-                logger.info(f'正在解析：{doc_path}')
    try:
        res = ''
-                    if file.endswith('.pdf'):
+        if file_path.endswith('.pdf'):
-                        res = pdf_ocr.ocr_pdf_client(path=doc_path,output_dir=output_dir)
+            res = pdf_ocr.ocr_pdf_client(path=file_path, output_dir=output_dir)
-                    elif file.endswith('.xls') or file.endswith('.xlsx'):
+        elif file_path.endswith('.xls') or file_path.endswith('.xlsx'):
-                        res = excel_ocr.parse(doc_path,output_dir)
+            res = excel_ocr.parse(file_path, output_dir)
        if res:
-                        logger.info(f"输出文件的的路径为: '{res}'")
+            logger.info(f"文件处理成功，输出文件路径为: '{res}'")
        else:
-                        logger.warning("None")
+            logger.warning(f"文件处理结果为空: '{file_path}'")
-                except requests.exceptions.RequestException as e:
+    except requests.exceptions.RequestException as req_err:
-                    logger.error(f"Error while making request to reranker service: {e}")
+        logger.error(f"请求错误，文件: '{file_path}'，错误信息: {req_err}")
-                except Exception as e:
+    except Exception as err:
-                    logger.error(f"Unexpected error occurred: {e}")
+        logger.error(f"处理文件时发生未知错误: '{file_path}'，错误信息: {err}")
+def normalize_path(input_path):
+    """Normalize file paths to use forward slashes."""
+    return input_path.replace('\\', '/')
+def determine_output_dir(output_dir):
+    """Return output_dir unchanged if it is absolute; otherwise join it onto the current working directory."""
+    if not os.path.isabs(output_dir):
+        current_working_directory = os.getcwd()
+        return os.path.join(current_working_directory, output_dir)
+    return output_dir
+def process_input(input_path, pdf_ocr, excel_ocr, output_dir):
+    """Process the input path, which can be a directory or a single file."""
+    if os.path.isdir(input_path):
+        for root, _, files in os.walk(input_path):
+            for file in files:
+                file_path = os.path.join(root, file)
+                logger.info(f'正在处理文件: {file_path}')
+                process_file(file_path, pdf_ocr, excel_ocr, output_dir)
    else:
-        try:
+        logger.info(f'正在处理单个文件: {input_path}')
-            res = ''
+        process_file(input_path, pdf_ocr, excel_ocr, output_dir)
-            if input_path.endswith('.pdf'):
-                res = pdf_ocr.ocr_pdf_client(path=input_path, output_dir=output_dir)
+def main():
-            elif input_path.endswith('.xls') or input_path.endswith('.xlsx'):
+    args = parse_args()
-                res = excel_ocr.parse(input_path,output_dir)
+    input_path = normalize_path(args.path)
-            if res:
+    output_dir = determine_output_dir(args.output_dir)
-                logger.info(f"output_dir: '{res}'")
-            else:
+    pdf_ocr = ocrPdfClient(args.url)
-                logger.warning("None")
+    excel_ocr = ExcelParser()
-        except requests.exceptions.RequestException as e:
-            logger.error(f"Error while making request to reranker service: {e}")
+    logger.info(f'输入目录或文件的路径为: {input_path}')
-        except Exception as e:
+    logger.info(f'输出目录为: {output_dir}')
-            logger.error(f"Unexpected error occurred: {e}")
+    process_input(input_path, pdf_ocr, excel_ocr, output_dir)
+# Script entry point: the guard below calls main() only when this file is
+# executed directly, not when it is imported.
 if __name__ == "__main__":
    main()