# -*- coding: utf-8 -*-
import base64
import os
from magic_pdf.tools.ofd import OFD
from loguru import logger
from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image
# from magic_pdf.dict2md.ocr_client import PredictClient,compress_image
import configparser
from magic_pdf.parse.pdf_client import ocrPdfClient
import html

def decode_html_entities(text):
    """Return ``text`` with HTML character references turned into characters.

    Delegates to :func:`html.unescape`, so both named (``&amp;``) and numeric
    (``&#38;``) references are converted.
    """
    decoded = html.unescape(text)
    return decoded

def json_to_txt(json_data):
    """Flatten already-parsed JSON data into indented, human-readable text.

    Dicts are rendered as ``key: value`` lines, lists as ``- value`` lines,
    and every nesting level adds two spaces of indentation. A nested
    container gets a header line (``key:``, ``- Item N:`` for a dict inside
    a list, ``- List N:`` for a list inside a list, with N counted from 1)
    followed by its own content. Any other top-level value is returned as
    ``str(json_data)``.
    """
    rendered = []

    def emit(depth, text):
        # Every output line is the payload prefixed by `depth` spaces.
        rendered.append(f"{' ' * depth}{text}")

    def walk_mapping(mapping, depth=0):
        for name, val in mapping.items():
            if isinstance(val, dict):
                emit(depth, f"{name}:")
                walk_mapping(val, depth + 2)
            elif isinstance(val, list):
                emit(depth, f"{name}:")
                walk_sequence(val, depth + 2)
            else:
                emit(depth, f"{name}: {val}")

    def walk_sequence(seq, depth=0):
        for position, entry in enumerate(seq, start=1):
            if isinstance(entry, dict):
                emit(depth, f"- Item {position}:")
                walk_mapping(entry, depth + 2)
            elif isinstance(entry, list):
                emit(depth, f"- List {position}:")
                walk_sequence(entry, depth + 2)
            else:
                emit(depth, f"- {entry}")

    # Scalars (and anything that is neither dict nor list) are stringified.
    if isinstance(json_data, dict):
        walk_mapping(json_data)
    elif isinstance(json_data, list):
        walk_sequence(json_data)
    else:
        rendered.append(str(json_data))

    return "\n".join(rendered)


def ofd2pdf(file_path, output_dir, pdfbytes):
    """Write ``pdfbytes`` to ``<output_dir>/<stem of file_path>.pdf``.

    No conversion happens here: the PDF content must be supplied by the
    caller through ``pdfbytes``.

    Args:
        file_path: Path of the source OFD file; its base name without the
            extension names the output file.
        output_dir: Directory the PDF is written into.
        pdfbytes: Bytes-like PDF content to store.

    Returns:
        The path of the written ``.pdf`` file.
    """
    stem = os.path.splitext(os.path.basename(file_path))[0]

    # NOTE(review): the source file is still read and base64-encoded and an
    # OFD helper is still created and cleared, but none of it feeds the
    # output below. Kept to preserve behaviour (e.g. the error raised when
    # file_path is unreadable) -- confirm whether it can be removed.
    with open(file_path, "rb") as source:
        raw = source.read()
    ofdb64 = base64.b64encode(raw).decode("utf-8")
    converter = OFD()  # initialise the OFD helper

    pdf_path = f"{os.path.join(output_dir, stem)}.pdf"
    converter.del_data()

    with open(pdf_path, "wb") as target:
        target.write(pdfbytes)
    return pdf_path

def ofd2img(file_path, output_dir):
    """Convert an OFD file into JPEG files stored in ``output_dir``.

    The file is base64-encoded and handed to the ``OFD`` helper, whose
    ``to_jpg()`` result is unpacked into a sequence of images and a second
    value that is returned untouched as ``pdfbytes``. Each image is saved as
    ``<output_dir>/<stem>_<index>.jpg`` with a zero-based index.

    Args:
        file_path: Path of the OFD file to convert.
        output_dir: Directory receiving the JPEG files.

    Returns:
        A tuple ``(output_files, pdfbytes)``: the list of written JPEG paths
        in iteration order, and the second value returned by ``to_jpg()``
        (presumably the PDF rendering of the document -- confirm against
        ``OFD.to_jpg``).
    """
    stem = os.path.splitext(os.path.basename(file_path))[0]
    target_prefix = os.path.join(output_dir, stem)

    with open(file_path, "rb") as source:
        encoded = base64.b64encode(source.read()).decode("utf-8")

    converter = OFD()  # initialise the OFD helper
    converter.read(encoded, save_xml=False, xml_name=f"{target_prefix}_xml")  # load the base64 payload
    images, pdfbytes = converter.to_jpg()  # convert to images
    converter.del_data()

    written = []
    for index, image in enumerate(images):
        # Each image only needs a .save(path) method (presumably PIL images).
        jpg_path = f"{target_prefix}_{index}.jpg"
        image.save(jpg_path)
        written.append(jpg_path)

    return written, pdfbytes

def parse_ofd(config_path,file_path,output_dir):
    """Extract the text of an OFD file via image OCR, falling back to PDF OCR.

    The OFD file is converted to JPEG images with ``ofd2img`` and every image
    is sent to the OCR service configured under ``[server] ocr_server``,
    with a prompt asking it to extract the content if the image is an
    invoice and to answer ``False`` otherwise.

    * If every image is accepted, the per-image results are written, one
      after another and each followed by a newline, to
      ``<output_dir>/<stem of file_path>.txt``.
    * As soon as one answer contains ``False``/``false``, the document is
      instead stored as a PDF with ``ofd2pdf`` and handed to the PDF OCR
      service configured under ``[server] pdf_server``. That result is
      returned directly; text gathered from earlier images is discarded so
      it cannot override the whole-document fallback.

    Args:
        config_path: Path of an INI file with a ``[server]`` section.
        file_path: Path of the OFD file to parse.
        output_dir: Directory for all intermediate and final files.

    Returns:
        The path of the written ``.txt`` file, or whatever
        ``ocrPdfClient.ocr_pdf_client`` returns when the fallback is taken.

    Raises:
        ValueError: If the OFD file produced no images, so there is nothing
            to recognise.
    """
    config = configparser.ConfigParser()
    config.read(config_path)
    client = PredictClient(config.get('server', 'ocr_server'))
    ofd_imgs, pdfbytes = ofd2img(file_path, output_dir)
    text = '判断图片是否是发票，如果是发票精确提取图片中的内容，否则返回False'

    page_texts = []
    for ofd_img in ofd_imgs:
        compress_image(ofd_img)
        res = client.predict(ofd_img, text)
        if 'False' in res or 'false' in res:
            # Not an invoice: run PDF OCR on the whole document instead.
            ofd_pdf = ofd2pdf(file_path, output_dir, pdfbytes)
            logger.info(f'ofd_pdf:{ofd_pdf}')
            # pdf_server is looked up lazily so that it is only required
            # when the fallback is actually taken.
            pdf_ocr = ocrPdfClient(config.get('server', 'pdf_server'))
            return pdf_ocr.ocr_pdf_client(path=ofd_pdf, output_dir=output_dir)
        # NOTE(review): json_to_txt only restructures dict/list input; if
        # predict() returns a plain string it passes through unchanged.
        page_texts.append(json_to_txt(decode_html_entities(res)))

    if not page_texts:
        raise ValueError(f'no page images were produced from OFD file: {file_path}')

    # Name the output after the source file, without its extension.
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    ofd_txt = os.path.join(output_dir, file_name + '.txt')
    logger.info(f'ofd_txt:{ofd_txt}')
    with open(ofd_txt, 'w', encoding='utf-8') as f:
        f.write(''.join(page + '\n' for page in page_texts))

    return ofd_txt
#
# if __name__ == '__main__':
#     file_path = ''
#     out_path = ''
#     ofd2pdf()
