import os
import time
from argparse import ArgumentParser, ArgumentTypeError, Namespace
from pathlib import Path
from typing import List

import click
import uvicorn
from fastapi import FastAPI, HTTPException, Request
from loguru import logger
from pydantic import BaseModel

import magic_pdf.model as model_config
from magic_pdf.libs.version import __version__
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.tools.common import do_parse, parse_pdf_methods


# FastAPI application: the /pdf_ocr route is registered on it and
# ocr_pdf_serve() hands it to uvicorn.
app = FastAPI()
# Parsing method that the /pdf_ocr handler forwards to do_parse().
method = 'auto'
# Request body accepted by the /pdf_ocr endpoint.
class ocrRequest(BaseModel):
    # Either a single file to parse or a directory that is walked
    # recursively for PDF files.
    path: str
    # Directory handed to do_parse() as the output location; the handler
    # creates it when it does not exist yet.
    output_dir: str

def parse_args(argv=None) -> Namespace:
    """Parse the command-line options of the PDF parsing server.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``.

    Returns:
        A namespace with the attributes ``dcu_id`` (str), ``pdf_port`` (int),
        ``method`` (whatever ``parse_pdf_methods`` returns for the given
        token) and ``debug`` (bool).
    """
    parser = ArgumentParser()
    parser.add_argument(
        '--dcu_id',
        default='0',
        help='设置DCU')
    parser.add_argument(
        '--pdf_port',
        # Without an explicit type a port given on the command line would
        # stay a string, while the default is an int.
        type=int,
        default=6030,
        help='Port the PDF parsing service listens on.')
    parser.add_argument(
        '--method',
        type=parse_pdf_methods,
        help = """the method for parsing pdf.
        ocr: using ocr technique to extract information from pdf.
        txt: suitable for the text-based pdf only and outperform ocr.
        auto: automatically choose the best method for parsing pdf from ocr and txt.
        without method specified, auto will be used by default.""",
        default = 'auto',
        )

    # NOTE: page-range flags (--start / --end) are intentionally not exposed
    # here; the request handler in this file always passes a fixed range.

    parser.add_argument(
        '--debug',
        # ``type=bool`` would turn every non-empty string (including
        # "False") into True, so the token is parsed explicitly instead.
        type=_str_to_bool,
        help='Enables detailed debugging information during the execution of the CLI commands.',
        default=False,
    )

    return parser.parse_args(argv)


def _str_to_bool(value):
    """Convert a command-line token such as "true" or "0" into a bool.

    Raises:
        ArgumentTypeError: If the token is not a recognised boolean spelling;
            argparse reports this as a usage error.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    # The empty string is kept falsy, matching what ``bool('')`` returned.
    if token in ('0', 'false', 'f', 'no', 'n', 'off', ''):
        return False
    raise ArgumentTypeError(f'expected a boolean value, got {value!r}')

def ocr_pdf_serve(args: Namespace):
    """Apply the parsed options and run the FastAPI app under uvicorn.

    Args:
        args: Parsed command-line options. ``dcu_id`` and ``pdf_port`` are
            required; ``method`` is optional and, when present, replaces the
            module-level default used by the request handler.

    This call blocks until the uvicorn server stops.
    """
    global method
    # The handler reads the module-level ``method``; without this assignment
    # the --method option would be parsed and then silently ignored.
    method = getattr(args, 'method', None) or method
    # Environment values must be strings.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.dcu_id)
    # NOTE(review): the server binds to all interfaces and the handler reads
    # and writes caller-supplied filesystem paths — confirm this is only
    # exposed on a trusted network.
    uvicorn.run(app, host="0.0.0.0", port=int(args.pdf_port))

@app.post("/pdf_ocr")
def pdf_ocr(request: ocrRequest):
    """Parse one PDF, or every PDF below a directory, into ``output_dir``.

    Args:
        request: Request body carrying ``path`` (a file or a directory) and
            ``output_dir`` (created when missing).

    Returns:
        None. A failure while parsing an individual document is logged and
        does not abort the remaining documents.

    Raises:
        HTTPException: 404 when ``request.path`` does not exist, so callers
            are told about a bad path instead of receiving an empty success.
    """
    path = request.path
    output_dir = request.output_dir
    # NOTE(review): both values come straight from the request body and are
    # used as filesystem paths without any restriction.
    if not os.path.exists(path):
        raise HTTPException(status_code=404, detail=f'path not found: {path}')

    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = 'full'
    os.makedirs(output_dir, exist_ok=True)
    logger.info(f'method:{method},path:{path},output_dir:{output_dir}')

    doc_paths = _collect_pdf_paths(path)
    if not doc_paths:
        logger.warning(f'no pdf files found under: {path}')
    for doc_path in doc_paths:
        _parse_single_pdf(doc_path, output_dir, method)


def _collect_pdf_paths(path):
    """Return the documents to parse: ``path`` itself, or all PDFs below it."""
    if not os.path.isdir(path):
        return [path]
    # The list is built before any parsing starts, so files created while
    # parsing (e.g. if output_dir lies inside ``path``) are not picked up by
    # the walk.
    found = []
    for root, _dirs, names in os.walk(path):
        for name in names:
            # Case-insensitive so that "REPORT.PDF" is not skipped.
            if name.lower().endswith('.pdf'):
                found.append(os.path.join(root, name))
    return found


def _parse_single_pdf(doc_path, output_dir, parse_method):
    """Read ``doc_path`` from disk, run ``do_parse`` on it and log the elapsed time."""
    logger.info(f'正在解析：{doc_path}')
    started = time.time()
    try:
        doc_dir, doc_name = os.path.split(doc_path)
        pdf_data = DiskReaderWriter(doc_dir).read(doc_name, AbsReaderWriter.MODE_BIN)
        do_parse(
            output_dir,
            Path(doc_path).stem,
            pdf_data,
            [],
            parse_method,
            False,  # debug output disabled
            start_page_id=0,
            end_page_id=None,
        )
    except Exception as e:
        # Deliberate best-effort: one broken document must not stop a batch.
        logger.exception(e)
    logger.info(f'解析：{doc_path}的耗时为：{time.time() - started}')

def main():
    """Parse the command-line options and start the PDF parsing server."""
    ocr_pdf_serve(parse_args())



# Start the server only when this file is executed as a script.
if __name__ == '__main__':
    main()




