# -*- coding: utf-8 -*-
import time

from loguru import logger
import argparse
from pdf_client import ocrPdfClient
from excel_parse import ExcelParser
import os
import requests
import configparser
from magic_pdf.parse.ofd_parse import parse_ofd

# Register a file sink: records at INFO level and above go to parse.log,
# encoded as UTF-8. The sink is configured with a "10 MB" rotation setting
# and enqueue=True (loguru's queued, multiprocess-safe delivery mode).
logger.add("parse.log", rotation="10 MB", level="INFO", format="{time} {level} {message}", encoding='utf-8', enqueue=True)
# Path of the INI configuration file. It is assigned by main() from the
# --config_path argument and read by process_file() when it calls parse_ofd.
config_path = None

def parse_args():
    """Parse the command-line options of the script.

    Options:
        --config_path: INI configuration file (optional, has a default).
        --path / -p: file or directory to parse (required).
        --output_dir / -o: directory that receives the results (required).

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--config_path', default='/home/practice/magic_pdf-main/magic_pdf/config.ini')
    cli.add_argument('--path', '-p', required=True)
    cli.add_argument('--output_dir', '-o', required=True)
    return cli.parse_args()

def process_file(file_path, pdf_ocr, excel_ocr, output_dir):
    """Parse a single file with the backend that matches its extension.

    Routing (extension comparison is case-insensitive):
        * ``.pdf``          -> ``pdf_ocr.ocr_pdf_client``
        * ``.xls``/``.xlsx`` -> ``excel_ocr.parse``
        * ``.ofd``          -> ``parse_ofd`` (uses the module-level ``config_path``)

    Files with any other extension are skipped with a warning. All errors are
    logged rather than propagated, so one bad file does not abort a batch.

    Args:
        file_path: Path of the file to parse.
        pdf_ocr: Client object exposing ``ocr_pdf_client(path=..., output_dir=...)``.
        excel_ocr: Parser object exposing ``parse(file_path, output_dir)``.
        output_dir: Directory handed to the backend for its output.

    Returns:
        None. The outcome is reported through the logger only.
    """
    res = ''
    try:
        # Match on a lower-cased copy so "REPORT.PDF" / "a.XLSX" are routed too;
        # the original path is still what gets passed to the backend.
        lowered = file_path.lower()
        start = time.time()
        if lowered.endswith('.pdf'):
            res = pdf_ocr.ocr_pdf_client(path=file_path, output_dir=output_dir)
        elif lowered.endswith(('.xls', '.xlsx')):
            res = excel_ocr.parse(file_path, output_dir)
        elif lowered.endswith('.ofd'):
            res = parse_ofd(config_path, file_path, output_dir)
        else:
            # Report unsupported types explicitly instead of as an "empty result".
            logger.warning(f"不支持的文件类型，已跳过: '{file_path}'")
            return
        elapsed = round(time.time() - start, 2)

        if res:
            logger.info(f"文件处理成功，输出文件路径为: '{res}', 耗时为：{elapsed}")
        else:
            logger.warning(f"文件处理结果为空: '{file_path}'")
    except requests.exceptions.RequestException as req_err:
        logger.error(f"请求错误，文件: '{file_path}'，错误信息: {req_err}")
    except Exception as err:
        # logger.exception logs at ERROR level and attaches the traceback,
        # which is needed to diagnose failures of unknown origin.
        logger.exception(f"处理文件时发生未知错误: '{file_path}'，错误信息: {err},res:{res}")

def normalize_path(input_path):
    """Return ``input_path`` with every backslash turned into a forward slash."""
    segments = input_path.split('\\')
    return '/'.join(segments)

def determine_output_dir(output_dir):
    """Return ``output_dir`` as an absolute path.

    An absolute path is returned unchanged; a relative one is joined onto the
    current working directory (no further normalization is applied).
    """
    if os.path.isabs(output_dir):
        return output_dir
    return os.path.join(os.getcwd(), output_dir)

def process_input(input_path, pdf_ocr, excel_ocr, output_dir):
    """Parse ``input_path``, which is either one file or a directory tree.

    A directory is walked recursively and every file found is handed to
    ``process_file``; anything that is not a directory is treated as a
    single file.
    """
    # Anything that is not a directory is handled as one file.
    if not os.path.isdir(input_path):
        logger.info(f'正在解析单个文件: {input_path}')
        process_file(input_path, pdf_ocr, excel_ocr, output_dir)
        return

    logger.info(f'开始处理{input_path}目录下的文件')
    for folder, _subdirs, names in os.walk(input_path):
        for name in names:
            candidate = os.path.join(folder, name)
            logger.info(f'正在解析文件: {candidate}')
            process_file(candidate, pdf_ocr, excel_ocr, output_dir)

def main():
    """Script entry point: read arguments and config, then parse the input.

    Publishes the config file path in the module-level ``config_path``, builds
    the PDF client from the ``[server] pdf_server`` setting and checks its
    health. If the check fails the problem is logged and nothing is parsed;
    otherwise the input path is processed.
    """
    global config_path

    cli_args = parse_args()
    source_path = normalize_path(cli_args.path)
    target_dir = determine_output_dir(cli_args.output_dir)

    settings = configparser.ConfigParser()
    settings.read(cli_args.config_path)
    # Expose the config location to process_file(), which passes it to parse_ofd.
    config_path = cli_args.config_path

    pdf_server = settings.get('server', 'pdf_server')
    pdf_ocr = ocrPdfClient(pdf_server)
    if not pdf_ocr.check_health():
        logger.warning(f'Health check failed. The server at "{pdf_server}" is not responding as expected.')
        logger.info('文件解析服务无法正常运行')
        return None

    process_input(source_path, pdf_ocr, ExcelParser(), target_dir)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


