"vscode:/vscode.git/clone" did not exist on "af279434d03e6e3be7808ecd15c652338b31024b"
Commit bf156ede authored by zhougaofeng's avatar zhougaofeng
Browse files

Update common_parse.py

parent b6c39f3b
...@@ -7,13 +7,15 @@ import argparse ...@@ -7,13 +7,15 @@ import argparse
import os import os
from pdf_client import ocrPdfClient from pdf_client import ocrPdfClient
from excel_parse import ExcelParser from excel_parse import ExcelParser
import os
import requests
import configparser
def parse_args(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
'--url', '--config_path',
default='http://0.0.0.0:6030', default='/home/practice/magic_pdf-main/magic_pdf/config.ini',
) )
parser.add_argument( parser.add_argument(
'--path', '--path',
...@@ -28,16 +30,13 @@ def parse_args(): ...@@ -28,16 +30,13 @@ def parse_args():
args = parser.parse_args() args = parser.parse_args()
return args return args
def process_file(file_path, pdf_ocr, excel_ocr, output_dir,config_path):
import os
import requests
def process_file(file_path, pdf_ocr, excel_ocr, output_dir):
"""Process a single file for OCR based on its extension.""" """Process a single file for OCR based on its extension."""
try: try:
res = '' res = ''
if file_path.endswith('.pdf'): if file_path.endswith('.pdf'):
res = pdf_ocr.ocr_pdf_client(path=file_path, output_dir=output_dir)
res = pdf_ocr.ocr_pdf_client(path=file_path, output_dir=output_dir,config_path=config_path)
elif file_path.endswith('.xls') or file_path.endswith('.xlsx'): elif file_path.endswith('.xls') or file_path.endswith('.xlsx'):
res = excel_ocr.parse(file_path, output_dir) res = excel_ocr.parse(file_path, output_dir)
...@@ -61,33 +60,32 @@ def determine_output_dir(output_dir): ...@@ -61,33 +60,32 @@ def determine_output_dir(output_dir):
return os.path.join(current_working_directory, output_dir) return os.path.join(current_working_directory, output_dir)
return output_dir return output_dir
def process_input(input_path, pdf_ocr, excel_ocr, output_dir): def process_input(input_path, pdf_ocr, excel_ocr, output_dir,config_path):
"""Process the input path, which can be a directory or a single file.""" """Process the input path, which can be a directory or a single file."""
if os.path.isdir(input_path): if os.path.isdir(input_path):
for root, _, files in os.walk(input_path): for root, _, files in os.walk(input_path):
for file in files: for file in files:
file_path = os.path.join(root, file) file_path = os.path.join(root, file)
logger.info(f'正在处理文件: {file_path}') logger.info(f'正在处理文件: {file_path}')
process_file(file_path, pdf_ocr, excel_ocr, output_dir) process_file(file_path, pdf_ocr, excel_ocr, output_dir,config_path)
else: else:
logger.info(f'正在处理单个文件: {input_path}') logger.info(f'正在处理单个文件: {input_path}')
process_file(input_path, pdf_ocr, excel_ocr, output_dir) process_file(input_path, pdf_ocr, excel_ocr, output_dir,config_path)
def main(): def main():
args = parse_args() args = parse_args()
input_path = normalize_path(args.path) input_path = normalize_path(args.path)
output_dir = determine_output_dir(args.output_dir) output_dir = determine_output_dir(args.output_dir)
config = configparser.ConfigParser()
pdf_ocr = ocrPdfClient(args.url) config.read(args.config_path)
pdf_server = config.get('server', 'pdf_server')
pdf_ocr = ocrPdfClient(pdf_server)
excel_ocr = ExcelParser() excel_ocr = ExcelParser()
logger.info(f'输入目录或文件的路径为: {input_path}') logger.info(f'输入目录或文件的路径为: {input_path}')
logger.info(f'输出目录为: {output_dir}') logger.info(f'输出目录为: {output_dir}')
process_input(input_path, pdf_ocr, excel_ocr, output_dir) process_input(input_path, pdf_ocr, excel_ocr, output_dir,args.config_path)
# Example usage:
# main()
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment