# -*- coding: utf-8 -*-
import configparser
import time

import requests
from loguru import logger
import argparse
import os



class ocrPdfClient:
    """Thin HTTP client for a PDF OCR service.

    The client POSTs a JSON payload to ``<api_url>/pdf_ocr`` and reads the
    ``status_code`` and ``output_path`` fields from the JSON response body.
    """

    def __init__(self, api_url, timeout=None):
        """Remember where the OCR service lives.

        Args:
            api_url: Base URL of the service; ``/pdf_ocr`` is appended to it.
            timeout: Optional timeout in seconds handed to ``requests.post``.
                ``None`` (the default) waits indefinitely, which matches the
                behaviour of a call made without a timeout.
        """
        self.api_url = api_url
        self.timeout = timeout

    def ocr_pdf_client(self, path, output_dir):
        """Ask the OCR service to process one PDF.

        Args:
            path: Path of the PDF, sent to the service as a string.
            output_dir: Directory for the results, sent to the service as a
                string.

        Returns:
            The ``output_path`` value from the response body when the body's
            ``status_code`` field equals 200; ``None`` when the request fails,
            the body is not a JSON object, or the service reports any other
            status.
        """
        payload = {
            "path": str(path),
            "output_dir": str(output_dir),
        }
        logger.info(
            f'pdf_server: {self.api_url}, pdf path: {path}'
        )
        try:
            response = requests.post(
                f"{self.api_url}/pdf_ocr",
                json=payload,
                timeout=self.timeout,
            )
            # Check the HTTP status before touching the body: an error
            # response may carry no JSON at all, and the HTTP failure is the
            # more useful thing to report.
            response.raise_for_status()
            body = response.json()
        except requests.exceptions.RequestException as e:
            logger.error(f"OCR PDF API request failed: {e}")
            return None
        except ValueError as e:
            # Older requests releases raise a plain ValueError (not a
            # RequestException) when the body is not valid JSON.
            logger.error(f"OCR PDF API returned a non-JSON response: {e}")
            return None

        if not isinstance(body, dict):
            logger.error(f"OCR PDF API returned an unexpected body: {body!r}")
            return None
        if body.get('status_code') != 200:
            logger.error(
                f"OCR PDF API reported status_code={body.get('status_code')!r}"
            )
            return None
        # A missing 'output_path' yields None rather than a KeyError so the
        # caller sees the same "no result" signal as for any other failure.
        return body.get('output_path')

def parse_args():
    """Parse the command-line options of this script.

    Options:
        --path / -p: required.
        --output_dir / -o: required.
        --config_path: optional; defaults to the magic_pdf ``config.ini``
            path hard-coded below.

    Returns:
        The ``argparse.Namespace`` with ``path``, ``output_dir`` and
        ``config_path`` attributes.
    """
    cli = argparse.ArgumentParser()
    # Both mandatory options have the same shape: a long flag plus a short alias.
    for long_flag, short_flag in (('--path', '-p'), ('--output_dir', '-o')):
        cli.add_argument(long_flag, short_flag, required=True)
    cli.add_argument(
        '--config_path',
        default='/home/practice/magic_pdf-main/magic_pdf/config.ini',
    )
    return cli.parse_args()


def main():
    """Submit one PDF to the OCR service from the command line.

    Reads the service URL from the ``pdf_server`` option in the ``[server]``
    section of the config file, resolves a relative ``--output_dir`` against
    the current working directory, sends the request, and logs the result
    together with the elapsed time.

    Raises:
        configparser.Error: If the config file lacks the ``[server]`` section
            or its ``pdf_server`` option (including when the file could not
            be read at all).
    """
    args = parse_args()
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed; log that case so the lookup error
    # raised just below is easy to diagnose.
    if not config.read(args.config_path):
        logger.error(f"Config file could not be read: {args.config_path}")
    pdf_server = config.get('server', 'pdf_server')
    client = ocrPdfClient(pdf_server)
    # perf_counter is monotonic, so the measured duration is not affected by
    # system clock adjustments.
    doc_analyze_start = time.perf_counter()

    if os.path.isabs(args.output_dir):
        output_dir = args.output_dir
    else:
        output_dir = os.path.join(os.getcwd(), args.output_dir)
    logger.info(f'output_dir:{output_dir}')
    # NOTE(review): --path is forwarded as given (not made absolute), unlike
    # --output_dir; confirm the service resolves it as intended.

    try:
        res = client.ocr_pdf_client(path=args.path, output_dir=output_dir)
        if res:
            logger.info(f"output_dir: '{res}'")
        else:
            logger.warning("OCR PDF service returned no output path")
    except requests.exceptions.RequestException as e:
        logger.error(f"Error while making request to OCR PDF service: {e}")
    except Exception as e:
        # Top-level boundary: record the traceback instead of only the message.
        logger.exception(f"Unexpected error occurred: {e}")
    doc_analyze_cost = time.perf_counter() - doc_analyze_start

    logger.info(f'解析当前pdf{args.path}耗时为:{doc_analyze_cost}')

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


