Merge remote-tracking branch 'origin/master'

# Conflicts:
#	magic_pdf/model/pdf_extract_kit.py

Merge remote-tracking branch 'origin/master'
# Conflicts:
#	magic_pdf/model/pdf_extract_kit.py
eb4625a9 · zhaoxiaomeng · 4101c357 · 899c7918 · eb4625a9 · eb4625a9
Commit eb4625a9 authored Jul 12, 2024 by zhaoxiaomeng
14 changed files
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -18,15 +18,12 @@ on:
  workflow_dispatch:
 jobs:
  pdf-test:
-    runs-on: mineru
+    runs-on: ubuntu-latest
    timeout-minutes: 180
    strategy:
      fail-fast: true
    steps:
-    - name: config-net
-      run: |
-          source activate base
    - name: PDF benchmark
      uses: actions/checkout@v3
      with:

--- a/.github/workflows/cli.yml
+++ b/.github/workflows/cli.yml
@@ -18,15 +18,12 @@ on:
  workflow_dispatch:
 jobs:
  cli-test:
-    runs-on: mineru
+    runs-on: ubuntu-latest
    timeout-minutes: 40
    strategy:
      fail-fast: true
    steps:
-    - name: config-net
-      run: |
-        source activate base
    - name: PDF cli
      uses: actions/checkout@v3
      with:
@@ -34,19 +31,11 @@ jobs:
    - name: check-requirements
      run: |
-        changed_files=$(git diff --name-only -r HEAD~1 HEAD)
+        pip install -r requirements.txt
-        echo $changed_files
+        pip install -r requirements-qa.txt
-        if [[ $changed_files =~ "requirements.txt" ]] || [[ $changed_files =~ "requirements-qa.txt" ]]; then
-          pip install -r requirements.txt
-          pip install -r requirements-qa.txt
-        fi
-    - name: config-net-reset
-      run: |
-        export http_proxy=""
-        export https_proxy=""
    - name: test_cli
      run: |
+        cp magic-pdf.template.json ~/magic-pdf.json
        echo $GITHUB_WORKSPACE
        cd $GITHUB_WORKSPACE && export PYTHONPATH=. && pytest -s -v tests/test_unit.py
        cd $GITHUB_WORKSPACE &&  pytest -s -v tests/test_cli/test_cli.py

--- a/others/README.md
+++ b/others/README.md
-# pdf_toolbox
-pdf 解析基础函数
-## pdf是否是文字类型/扫描类型的区分
-```shell
-cat s3_pdf_path.example.pdf | parallel --colsep ' ' -j 10 "python pdf_meta_scan.py --s3-pdf-path {2} --s3-profile {1} >> {/}.jsonl"
-find dir/to/jsonl/ -type f -name "*.jsonl" | parallel -j 10 "python pdf_classfy_by_type.py --json_file {} >> {/}.jsonl"
-```
-```shell
-# 如果单独运行脚本，合并到code-clean之后需要运行，参考如下：
-python -m pdf_meta_scan --s3-pdf-path "D:\pdf_files\内容排序测试_pdf\p3_图文混排 5.pdf" --s3-profile s2
-```
-## pdf
--- a/others/check_inline_formula.py
+++ b/others/check_inline_formula.py
-# 最终版：把那种text_block有重叠，且inline_formula位置在重叠部分的，认定整个页面都有问题，所有的inline_formula都改成no_check
-from magic_pdf.libs import fitz
-def check_inline_formula(page, inline_formula_boxes):
-    """
-    :param page :fitz读取的当前页的内容
-    :param inline_formula_boxes: list类型，每一个元素是一个元祖 (L, U, R, D)
-    :return: inline_formula_check: list类型，每一个元素是一个类别，其顺序对应输入的inline_formula_boxes，给每个行内公式打一个标签，包括：
-        - nocheck_inline_formula：这个公式框没有与任何span相交，有可能存在问题
-        - wrong_text_block：这个公式框同时存在多个block里，可能页面的text block存在问题
-        - false_inline_formula：只涉及一个span并且只占据这个span的小部分面积，判断可能不是公式
-        - true_inline_formula：两种情况判断为公式，一是横跨多个span，二是只涉及一个span但是几乎占据了这个span大部分的面积
-    """
-    # count = defaultdict(int)
-    ## ------------------------ Text --------------------------------------------
-    blocks = page.get_text(
-            "dict",
-            flags=fitz.TEXTFLAGS_TEXT,
-            #clip=clip,
-        )["blocks"]
-    # iterate over the bboxes
-    inline_formula_check = []
-    for result in inline_formula_boxes:
-        (x1, y1, x2, y2) = (result[0], result[1], result[2], result[3])
-        ## 逐个block##
-        in_block = 0
-        for bbox in blocks:
-            # image = cv2.rectangle(image, (int(bbox['bbox'][0]), int(bbox['bbox'][1])), (int(bbox['bbox'][2]), int(bbox['bbox'][3])), (0, 255, 0), 1)
-            if (y1 >= bbox['bbox'][1] and y2 <= bbox['bbox'][3]) and (x1 >= bbox['bbox'][0] and x2 <= bbox['bbox'][2]):       # 判定公式在哪一个block
-                in_block += 1
-                intersect = []
-                # ## 逐个span###
-                for line in bbox['lines']:
-                    if line['bbox'][1] <= ((y2 - y1) / 2) + y1 <= line['bbox'][3]:   # 判断公式在哪一行
-                        for item in line['spans']:
-                            (t_x1, t_y1, t_x2, t_y2) = item['bbox']
-                            if not ((t_x1 < x1 and t_x2 < x1) or (t_x1 > x2 and t_x2 > x2) or (t_y1 < y1 and t_y2 < y1) or (t_y1 > y2 and t_y2 > y2)):   # 判断是否相交
-                                intersect.append(item['bbox'])
-                                # image = cv2.rectangle(image, (int(t_x1), int(t_y1)), (int(t_x2), int(t_y2)), (0, 255, 0), 1)    # 可视化涉及到的span
-                # 可视化公式的分类
-                if len(intersect) == 0:  # 没有与任何一个span有相交，这个span或者这个inline_formula_box可能有问题
-                    # print(f'Wrong location, check {img_path}')
-                    inline_formula_check_result = "nocheck_inline_formula"
-                    # count['not_in_line'] += 1
-                elif len(intersect) == 1:  
-                    if abs((intersect[0][2] - intersect[0][0]) - (x2 - x1)) < (x2 - x1)*0.5: # 只涉及一个span但是几乎占据了这个span大部分的面积，判定为公式
-                        # image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 1)   
-                        inline_formula_check_result = "true_inline_formula"
-                        # count['one_span_large'] += 1
-                    else:  # 只涉及一个span并且只占据这个span的小部分面积，判断可能不是公式
-                        # image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255), 1)
-                        inline_formula_check_result = "false_inline_formula"
-                        # count['fail'] += 1
-                else:  # 横跨多个span,判定为公式
-                    # image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0), 1)
-                    inline_formula_check_result = "true_inline_formula"
-                    # count['multi_span'] += 1
-        if in_block == 0:  # 这个公式没有在任何的block里，这个公式可能有问题
-            # image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (255, 255, 0), 1)
-            inline_formula_check_result = "nocheck_inline_formula"
-            # count['not_in_block'] += 1
-        elif in_block > 1: # 这个公式存在于多个block里，这个页面可能有问题
-            inline_formula_check_result = "wrong_text_block"
-        inline_formula_check.append(inline_formula_check_result)
-    return inline_formula_check
--- a/others/pdf2json_infer.py
+++ b/others/pdf2json_infer.py
-import sys
-from typing import Tuple
-import os
-import boto3, json
-from botocore.config import Config
-from magic_pdf.libs import fitz
-from loguru import logger
-from pathlib import Path
-from tqdm import tqdm
-import numpy as np
-# sys.path.insert(0, "/mnt/petrelfs/ouyanglinke/code-clean/")
-# print(sys.path)
-from validation import cal_edit_distance, format_gt_bbox, label_match, detect_val
-# from pdf2text_recogFigure_20231107 import parse_images        # 获取figures的bbox
-# from pdf2text_recogTable_20231107 import parse_tables         # 获取tables的bbox
-# from pdf2text_recogEquation_20231108 import parse_equations    # 获取equations的bbox
-# from pdf2text_recogTitle_20231113 import parse_titles           # 获取Title的bbox
-# from pdf2text_recogPara import parse_blocks_per_page    
-# from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
-from magic_pdf.layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
-from magic_pdf.pre_proc import parse_images          # 获取figures的bbox
-from magic_pdf.pre_proc.detect_tables import parse_tables           # 获取tables的bbox
-from magic_pdf.pre_proc import parse_equations     # 获取equations的bbox
-# from pdf2text_recogFootnote import parse_footnotes     # 获取footnotes的bbox
-from magic_pdf.post_proc.detect_para import process_blocks_per_page
-from magic_pdf.libs import parse_aws_param, parse_bucket_key, read_file, join_path
-def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_profile: str):
-    """
-    从第page_num页的page中，根据bbox进行裁剪出一张jpg图片，返回图片路径
-    save_path：需要同时支持s3和本地, 图片存放在save_path下，文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
-    """
-    # 拼接路径
-    image_save_path = join_path(save_parent_path, f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}.jpg")
-    try:
-        # 将坐标转换为fitz.Rect对象
-        rect = fitz.Rect(*bbox)
-        # 配置缩放倍数为3倍
-        zoom = fitz.Matrix(3, 3)
-        # 截取图片
-        pix = page.get_pixmap(clip=rect, matrix=zoom)
-        # 打印图片文件名
-        # print(f"Saved {image_save_path}")
-        if image_save_path.startswith("s3://"):
-            ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
-            cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
-                            config=Config(s3={'addressing_style': addressing_style}))
-            bucket_name, bucket_key = parse_bucket_key(image_save_path)
-            # 将字节流上传到s3
-            cli.upload_fileobj(pix.tobytes(output='jpeg', jpg_quality=95), bucket_name, bucket_key)
-        else:
-            # 保存图片到本地
-            # 先检查一下image_save_path的父目录是否存在，如果不存在，就创建
-            parent_dir = os.path.dirname(image_save_path)
-            if not os.path.exists(parent_dir):
-                os.makedirs(parent_dir)
-            pix.save(image_save_path, jpg_quality=95)
-            # 为了直接能在markdown里看，这里把地址改为相对于mardown的地址
-            pth = Path(image_save_path)
-            image_save_path =  f"{pth.parent.name}/{pth.name}"
-            return image_save_path
-    except Exception as e:
-        logger.exception(e)
-        return image_save_path
-def get_images_by_bboxes(book_name:str, page_num:int, page: fitz.Page, save_path:str, s3_profile:str, image_bboxes:list, table_bboxes:list, equation_inline_bboxes:list, equation_interline_bboxes:list) -> dict:
-    """
-    返回一个dict, key为bbox, 值是图片地址
-    """
-    ret = {}
-    # 图片的保存路径组成是这样的： {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
-    image_save_path = join_path(save_path, book_name, "images") 
-    table_save_path = join_path(save_path, book_name, "tables") 
-    equation_inline_save_path = join_path(save_path, book_name, "equations_inline")
-    equation_interline_save_path = join_path(save_path, book_name, "equation_interline")
-    for bbox in image_bboxes:
-        image_path = cut_image(bbox, page_num, page, image_save_path, s3_profile)
-        ret[bbox] = (image_path, "image") # 第二个元素是"image"，表示是图片
-    for bbox in table_bboxes:
-        image_path = cut_image(bbox, page_num, page, table_save_path, s3_profile)
-        ret[bbox] = (image_path, "table")
-    # 对公式目前只截图，不返回
-    for bbox in equation_inline_bboxes:
-        cut_image(bbox, page_num, page, equation_inline_save_path, s3_profile)
-    for bbox in equation_interline_bboxes:
-        cut_image(bbox, page_num, page, equation_interline_save_path, s3_profile)
-    return ret
-def reformat_bboxes(images_box_path_dict:list, paras_dict:dict):
-    """
-    把bbox重新组装成一个list，每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y], 初始时候idx_x, idx_y都是None. 对于图片、公式来说，block_content是图片的地址， 对于段落来说，block_content是段落的内容
-    """
-    all_bboxes = []
-    for bbox, image_info in images_box_path_dict.items():
-        all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], image_info, None, None, 'image'])
-    paras_dict = paras_dict[f"page_{paras_dict['page_id']}"]
-    for block_id, kvpair in paras_dict.items():
-        bbox = kvpair['bbox']
-        content = kvpair
-        all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], content, None, None, 'text'])
-    return all_bboxes
-def concat2markdown(all_bboxes:list):
-    """
-    对排序后的bboxes拼接内容
-    """
-    content_md = ""
-    for box in all_bboxes:
-        content_type = box[CONTENT_TYPE_IDX]
-        if content_type == 'image':
-            image_type = box[CONTENT_IDX][1]
-            image_path = box[CONTENT_IDX][0]
-            content_md += f"![{image_type}]({image_path})"
-            content_md += "\n\n"
-        elif content_type == 'text': # 组装文本
-            paras = box[CONTENT_IDX]['paras']
-            text_content = ""
-            for para_id, para in paras.items():# 拼装内部的段落文本
-                text_content += para['text']
-                text_content += "\n\n"
-            content_md += text_content
-        else:
-            raise Exception(f"ERROR: {content_type} is not supported!")
-    return content_md
-def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path:str, pdf_model_profile:str, save_path: str, page_num: int):
-    """
-    """
-    pth = Path(s3_pdf_path)
-    book_name = pth.name
-    #book_name = "".join(os.path.basename(s3_pdf_path).split(".")[0:-1])
-    res_dir_path = None
-    exclude_bboxes = []
-    # text_content_save_path = f"{save_path}/{book_name}/book.md"
-    # metadata_save_path = f"{save_path}/{book_name}/metadata.json"  
-    try:
-        pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
-        pdf_docs = fitz.open("pdf", pdf_bytes)
-        page_id = page_num - 1
-        page = pdf_docs[page_id] # 验证集只需要读取特定页面即可
-        model_output_json = join_path(pdf_model_path, f"page_{page_num}.json") # 模型输出的页面编号从1开始的
-        json_from_docx = read_file(model_output_json, pdf_model_profile) # TODO 这个读取方法名字应该改一下，避免语义歧义
-        json_from_docx_obj = json.loads(json_from_docx)
-        # 解析图片
-        image_bboxes = parse_images(page_id, page, json_from_docx_obj)
-        # 解析表格
-        table_bboxes = parse_tables(page_id, page, json_from_docx_obj)
-        # 解析公式
-        equations_interline_bboxes, equations_inline_bboxes = parse_equations(page_id, page, json_from_docx_obj)
-        # # 解析标题
-        # title_bboxs = parse_titles(page_id, page, res_dir_path, json_from_docx_obj, exclude_bboxes)
-        # # 解析页眉
-        # header_bboxs = parse_headers(page_id, page, res_dir_path, json_from_docx_obj, exclude_bboxes)
-        # # 解析页码
-        # pageNo_bboxs = parse_pageNos(page_id, page, res_dir_path, json_from_docx_obj, exclude_bboxes)
-        # # 解析脚注
-        # footnote_bboxs = parse_footnotes(page_id, page, res_dir_path, json_from_docx_obj, exclude_bboxes)
-        # # 解析页脚
-        # footer_bboxs = parse_footers(page_id, page, res_dir_path, json_from_docx_obj, exclude_bboxes)
-        # # 评估Layout是否规整、简单
-        # isSimpleLayout_flag, fullColumn_cnt, subColumn_cnt, curPage_loss = evaluate_pdf_layout(page_id, page, res_dir_path, json_from_docx_obj, exclude_bboxes)
-        # 把图、表、公式都进行截图，保存到本地，返回图片路径作为内容
-        images_box_path_dict = get_images_by_bboxes(book_name, page_id, page, save_path, s3_pdf_profile, image_bboxes, table_bboxes, equations_inline_bboxes,
-                                                    equations_interline_bboxes)  # 只要表格和图片的截图
-        # 解析文字段落
-        footer_bboxes = []
-        header_bboxes = []
-        exclude_bboxes = image_bboxes + table_bboxes
-        paras_dict = process_blocks_per_page(page, page_id, image_bboxes, table_bboxes, equations_inline_bboxes, equations_interline_bboxes, footer_bboxes, header_bboxes)
-        # paras_dict = postprocess_paras_pipeline(paras_dict)
-        # 最后一步，根据bbox进行从左到右，从上到下的排序，之后拼接起来, 排序
-        all_bboxes = reformat_bboxes(images_box_path_dict, paras_dict)  # 由于公式目前还没有，所以equation_bboxes是None，多数存在段落里，暂时不解析
-        # 返回的是一个数组，每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y, type], 初始时候idx_x, idx_y都是None. 对于图片、公式来说，block_content是图片的地址， 对于段落来说，block_content是段落的内容
-        # sorted_bboxes = bbox_sort(all_bboxes)
-        # markdown_text = concat2markdown(sorted_bboxes)
-        # parent_dir = os.path.dirname(text_content_save_path)
-        # if not os.path.exists(parent_dir):
-        #     os.makedirs(parent_dir)
-        # with open(text_content_save_path, "a") as f:
-        #     f.write(markdown_text)
-        #     f.write(chr(12)) #换页符   
-        # end for
-        # 写一个小的json,记录元数据
-        # metadata = {"book_name": book_name, "pdf_path": s3_pdf_path, "pdf_model_path": pdf_model_path, "save_path": save_path}
-        # with open(metadata_save_path, "w") as f:
-        #     json.dump(metadata, f, ensure_ascii=False, indent=4)
-        return all_bboxes
-    except Exception as e:
-        print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
-        logger.exception(e)
-# @click.command()
-# @click.option('--pdf-file-sub-path', help='s3上pdf文件的路径')
-# @click.option('--save-path', help='解析出来的图片，文本的保存父目录')
-def validation(validation_dataset: str, pdf_bin_file_profile: str, pdf_model_dir: str, pdf_model_profile: str, save_path: str):
-    #pdf_bin_file_path = "s3://llm-raw-snew/llm-raw-scihub/scimag07865000-07865999/10.1007/"
-    # pdf_bin_file_parent_path = "s3://llm-raw-snew/llm-raw-scihub/"
-    # pdf_model_parent_dir = "s3://llm-pdf-text/layout_det/scihub/"
-    # p = Path(pdf_file_sub_path)
-    # pdf_parent_path = p.parent
-    # pdf_file_name = p.name   # pdf文件名字，含后缀
-    # pdf_bin_file_path  = join_path(pdf_bin_file_parent_path, pdf_parent_path)
-    with open(validation_dataset, 'r') as f:
-        samples = json.load(f)
-    labels = []
-    det_res = []
-    edit_distance_list = []
-    for sample in tqdm(samples):
-        pdf_name = sample['pdf_name']
-        s3_pdf_path = sample['s3_path']
-        page_num = sample['page']
-        gt_order = sample['order']
-        pre = main(s3_pdf_path, pdf_bin_file_profile, join_path(pdf_model_dir, pdf_name), pdf_model_profile, save_path, page_num)
-        pre_dict_list = []
-        for item in pre:
-            pre_sample = {
-                'box': [item[0],item[1],item[2],item[3]],
-                'type': item[7],
-                'score': 1
-            }
-            pre_dict_list.append(pre_sample)
-        det_res.append(pre_dict_list)
-        match_change_dict = {   # 待确认
-            "figure": "image",
-            "svg_figure": "image",
-            "inline_fomula": "equations_inline",
-            "fomula": "equation_interline",
-            "figure_caption": "text",
-            "table_caption": "text",
-            "fomula_caption": "text"
-        }
-        gt_annos = sample['annotations']
-        matched_label = label_match(gt_annos, match_change_dict)
-        labels.append(matched_label)
-        # 判断排序函数的精度
-        # 目前不考虑caption与图表相同序号的问题
-        ignore_category = ['abandon', 'figure_caption', 'table_caption', 'formula_caption'] 
-        gt_bboxes = format_gt_bbox(gt_annos, ignore_category)
-        sorted_bboxes = bbox_sort(gt_bboxes)
-        edit_distance = cal_edit_distance(sorted_bboxes)
-        edit_distance_list.append(edit_distance)
-    label_classes = ["image", "text", "table", "equation_interline"]
-    detect_matrix = detect_val(labels, det_res, label_classes)
-    print('detect_matrix', detect_matrix)
-    edit_distance_mean = np.mean(edit_distance_list)
-    print('edit_distance_mean', edit_distance_mean)
-if __name__ == '__main__':
-    # 输入可以用以下命令生成批量pdf
-    # aws s3 ls s3://llm-pdf-text/layout_det/scihub/ --profile langchao | tail -n 10 | awk '{print "s3://llm-pdf-text/layout_det/scihub/"$4}' | xargs -I{}  aws s3 ls {} --recursive --profile langchao  | awk '{print substr($4,19)}' | parallel -j 1 echo {//} | sort -u
-    pdf_bin_file_profile = "outsider"
-    pdf_model_dir = "s3://llm-pdf-text/eval_1k/layout_res/"
-    pdf_model_profile = "langchao"
-    # validation_dataset = "/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json"
-    validation_dataset = "/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_subset.json" # 测试
-    save_path = "/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_val_result"
-    validation(validation_dataset, pdf_bin_file_profile, pdf_model_dir, pdf_model_profile, save_path)
--- a/others/pdf2text_evaluatePdfLayout.py
+++ b/others/pdf2text_evaluatePdfLayout.py
-from magic_pdf.libs import fitz             # pyMuPDF库
-def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
-    # 计算两个rect，重叠面积各占2个rect面积的比例
-    if min(R1, R2) < max(L1, L2) or min(D1, D2) < max(U1, U2):
-        return 0, 0
-    square_1 = (R1 - L1) * (D1 - U1)
-    square_2 = (R2 - L2) * (D2 - U2)
-    if square_1 == 0 or square_2 == 0:
-        return 0, 0
-    square_overlap = (min(R1, R2) - max(L1, L2)) * (min(D1, D2) - max(U1, U2))
-    return square_overlap / square_1, square_overlap / square_2
-def evaluate_pdf_layout(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
-    """
-    :param page_ID: int类型，当前page在当前pdf文档中是第page_D页。
-    :param page :fitz读取的当前页的内容
-    :param res_dir_path: str类型，是每一个pdf文档，在当前.py文件的目录下生成一个与pdf文档同名的文件夹，res_dir_path就是文件夹的dir
-    :param json_from_DocXchain_obj: dict类型，把pdf文档送入DocXChain模型中后，提取bbox，结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
-    """
-    DPI = 72  # use this resolution
-    pix = page.get_pixmap(dpi=DPI)
-    pageL = 0
-    pageR = int(pix.w)
-    pageU = 0
-    pageD = int(pix.h)
-    #--------- 通过json_from_DocXchain来获取 title ---------#
-    title_bbox_from_DocXChain = []
-    xf_json = json_from_DocXchain_obj
-    width_from_json = xf_json['page_info']['width']
-    height_from_json = xf_json['page_info']['height']
-    LR_scaleRatio = width_from_json / (pageR - pageL)
-    UD_scaleRatio = height_from_json / (pageD - pageU)
-    # {0: 'title',  # 标题
-    # 1: 'figure', # 图片
-    #  2: 'plain text',  # 文本
-    #  3: 'header',      # 页眉
-    #  4: 'page number', # 页码
-    #  5: 'footnote',    # 脚注
-    #  6: 'footer',      # 页脚
-    #  7: 'table',       # 表格
-    #  8: 'table caption',  # 表格描述
-    #  9: 'figure caption', # 图片描述
-    #  10: 'equation',      # 公式
-    #  11: 'full column',   # 单栏
-    #  12: 'sub column',    # 多栏
-    #  13: 'embedding',     # 嵌入公式
-    #  14: 'isolated'}      # 单行公式
-    LOSS_THRESHOLD = 2000               # 经验值
-    fullColumn_bboxs = []
-    subColumn_bboxs = []
-    plainText_bboxs = []
-    #### read information of plain text
-    for xf in xf_json['layout_dets']:
-        L = xf['poly'][0] / LR_scaleRatio
-        U = xf['poly'][1] / UD_scaleRatio
-        R = xf['poly'][2] / LR_scaleRatio
-        D = xf['poly'][5] / UD_scaleRatio
-        L, R = min(L, R), max(L, R)
-        U, D = min(U, D), max(U, D)
-        if xf['category_id'] == 2:
-            plainText_bboxs.append((L, U, R, D))
-    #### read information of column
-    for xf in xf_json['subfield_dets']:
-        L = xf['poly'][0] / LR_scaleRatio
-        U = xf['poly'][1] / UD_scaleRatio
-        R = xf['poly'][2] / LR_scaleRatio
-        D = xf['poly'][5] / UD_scaleRatio
-        L, R = min(L, R), max(L, R)
-        U, D = min(U, D), max(U, D)
-        if xf['category_id'] == 11:
-            fullColumn_bboxs.append((L, U, R, D))
-        elif xf['category_id'] == 12:
-            subColumn_bboxs.append((L, U, R, D))
-    curPage_loss = 0        # 当前页的loss
-    fail_cnt = 0            # Text文本块没被圈到的情形。
-    for L, U, R, D in plainText_bboxs:
-        find = False
-        for L2, U2, R2, D2 in (fullColumn_bboxs + subColumn_bboxs):
-            ratio_1, _ = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, L2, U2, R2, D2)
-            if ratio_1 >= 0.9:
-                loss_1 = (L + R) / 2 - (L2 + R2) / 2
-                loss_2 = L - L2
-                cur_loss = min(abs(loss_1), abs(loss_2))
-                curPage_loss += cur_loss
-                find = True
-                break
-        if find == False:
-            fail_cnt += 1
-    isSimpleLayout_flag = False
-    if fail_cnt == 0 and len(fullColumn_bboxs) <= 1 and len(subColumn_bboxs) <= 2:
-        if curPage_loss <= LOSS_THRESHOLD:
-            isSimpleLayout_flag  = True
-    return isSimpleLayout_flag, len(fullColumn_bboxs), len(subColumn_bboxs), curPage_loss
--- a/others/pdf2text_getNumberOfColumn.py
+++ b/others/pdf2text_getNumberOfColumn.py
-from magic_pdf.libs import fitz
-from typing import List
-def show_image(item, title=""):
-    """Display a pixmap.
-    Just to display Pixmap image of "item" - ignore the man behind the curtain.
-    Args:
-        item: any PyMuPDF object having a "get_pixmap" method.
-        title: a string to be used as image title
-    Generates an RGB Pixmap from item using a constant DPI and using matplotlib
-    to show it inline of the notebook.
-    """
-    DPI = 150  # use this resolution
-    import numpy as np
-    import matplotlib.pyplot as plt
-    # %matplotlib inline
-    pix = item.get_pixmap(dpi=DPI)
-    img = np.ndarray([pix.h, pix.w, 3], dtype=np.uint8, buffer=pix.samples_mv)
-    plt.figure(dpi=DPI)  # set the figure's DPI
-    plt.title(title)  # set title of image
-    _ = plt.imshow(img, extent=(0, pix.w * 72 / DPI, pix.h * 72 / DPI, 0))
-def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
-    # 计算两个line，重叠line各占2个line长度的比例
-    if max(L1, L2) > min(R1, R2):
-        return 0, 0
-    if L1 == R1 or L2 == R2:
-        return 0, 0
-    overlap_line = min(R1, R2) - max(L1, L2)
-    return overlap_line / (R1 - L1), overlap_line / (R2 - L2)
-def get_targetAxis_and_splitAxis(page_ID: int, page: fitz.Page, columnNumber: int, textBboxs: List[(float, float, float, float)]) -> (List[float], List[float]):
-    """
-    param: page: fitz解析出来的格式
-    param: columnNumber: Text的列数
-    param: textBboxs: 文本块list。 [(L, U, R, D), ... ]
-    return: 
-    """
-    INF = 10 ** 9
-    pageL, pageU, pageR, pageD = INF, INF, 0, 0
-    for L, U, R, D in textBboxs:
-        assert L <= R and U <= D
-        pageL = min(pageL, L)
-        pageR = max(pageR, R)
-        pageU = min(pageU, U)
-        pageD = max(pageD, D)
-    pageWidth = pageR - pageL
-    pageHeight = pageD - pageU
-    pageL -= pageWidth / 10  # 10是经验值
-    pageR += pageWidth / 10
-    pageU -= pageHeight / 10
-    pageD += pageHeight / 10
-    pageWidth = pageR - pageL
-    pageHeight = pageD - pageU
-    x_targetAxis = []
-    x_splitAxis = []
-    for i in range(0, columnNumber * 2 + 1):
-        if i & 1:
-            x_targetAxis.append(pageL + pageWidth / (2 * columnNumber) * i)
-        else:
-            x_splitAxis.append(pageL + pageWidth / (2 * columnNumber) * i)
-    # # 可视化：分列的外框
-    # path_bbox = []
-    # N = len(x_targetAxis)
-    # for i in range(N):
-    #     L, R = x_splitAxis[i], x_splitAxis[i + 1]
-    #     path_bbox.append((L, pageU, R, pageD))
-    # shape = page.new_shape()
-    # # iterate over the bboxes
-    # color_map = [fitz.pdfcolor["red"], fitz.pdfcolor["blue"], fitz.pdfcolor["yellow"], fitz.pdfcolor["black"], fitz.pdfcolor["green"], fitz.pdfcolor["brown"]]
-    # for i, rect in enumerate(path_bbox):
-    #     # if i < 20:
-    #     #     continue
-    #     shape.draw_rect(rect)  # draw a border
-    #     shape.insert_text(Point(rect[0], rect[1])+(5, 15), str(i), color=fitz.pdfcolor["blue"])
-    #     shape.finish(color=color_map[i%len(color_map)])
-    #     # shape.finish(color=fitz.pdfcolor["blue"])
-    #     shape.commit()  # store to the page
-    #     # if i == 3:
-    #     #     print(rect)
-    #     #     break
-    #     # print(rect)
-    # show_image(page, f"Table & Header BBoxes")            
-    return x_targetAxis, x_splitAxis
-def calculate_loss(page_ID: int, x_targetAxis: List[float], x_splitAxis: List[float], textBboxs: List[(float, float, float, float)]) -> (float, bool):
-    INF = 10 ** 9
-    # page_artbox = page.artbox
-    # pageL, pageU, pageR, pageD = page_artbox[0], page_artbox[1], page_artbox[2], page_artbox[3]
-    pageL, pageU, pageR, pageD = INF, INF, 0, 0
-    for L, U, R, D in textBboxs:
-        assert L <= R and U <= D
-        pageL = min(pageL, L)
-        pageR = max(pageR, R)
-        pageU = min(pageU, U)
-        pageD = max(pageD, D)
-    pageWidth = pageR - pageL
-    pageHeight = pageD - pageU
-    pageL -= pageWidth / 10
-    pageR += pageWidth / 10
-    pageU -= pageHeight / 10
-    pageD += pageHeight / 10
-    pageWidth = pageR - pageL
-    pageHeight = pageD - pageU
-    col_N = len(x_targetAxis)  # 列数
-    col_texts_mid = [[] for _ in range(col_N)]
-    col_texts_LR = [[] for _ in range(col_N)]
-    oneLocateLoss_mid = 0
-    oneLocateLoss_LR = 0
-    oneLocateCnt_mid = 0  # 完美在一列中的个数
-    oneLocateCnt_LR = 0
-    oneLocateSquare_mid = 0.0  # 完美在一列的面积
-    oneLocateSquare_LR = 0.0
-    multiLocateLoss_mid = 0
-    multiLocateLoss_LR = 0
-    multiLocateCnt_mid = 0  # 在多列中的个数
-    multiLocateCnt_LR = 0
-    multiLocateSquare_mid = 0.0  # 在多列中的面积
-    multiLocateSquare_LR = 0.0
-    allLocateLoss_mid = 0
-    allLocateLoss_LR = 0
-    allLocateCnt_mid = 0  # 横跨页面的大框的个数
-    allLocateCnt_LR = 0
-    allLocateSquare_mid = 0.0  # 横跨整个页面的个数
-    allLocateSquare_LR = 0.0
-    isSimpleCondition = True  # 就1个。2种方式，只要有一种情况不规整，就是不规整。
-    colID_Textcnt_mid = [0 for _ in range(col_N)]  # 每一列中有多少个Text块，根据mid判断的
-    colID_Textcnt_LR = [0 for _ in range(col_N)]  # 每一列中有多少个Text块，根据区间边界判断
-    allLocateBboxs_mid = []  # 跨整页的，bbox
-    allLocateBboxs_LR = []
-    non_allLocateBboxs_mid = []
-    non_allLocateBboxs_LR = []  # 不在单独某一列，但又不是全列
-    for L, U, R, D in textBboxs:
-        if D - U < 40:  # 现在还没拼接好。先简单这样过滤页眉。也会牺牲一些很窄的长条
-            continue
-        if R - L < 40:
-            continue
-        located_cols_mid = []
-        located_cols_LR = []
-        for col_ID in range(col_N):
-            if col_N == 1:
-                located_cols_mid.append(col_ID)
-                located_cols_LR.append(col_ID)
-            else:
-                if L <= x_targetAxis[col_ID] <= R:
-                    located_cols_mid.append(col_ID)
-                if calculate_overlapRatio_between_line1_and_line2(x_splitAxis[col_ID], x_splitAxis[col_ID + 1], L, R)[0] >= 0.2:
-                    located_cols_LR.append(col_ID)
-        if len(located_cols_mid) == col_N:
-            allLocateBboxs_mid.append((L, U, R, D))
-        else:
-            non_allLocateBboxs_mid.append((L, U, R, D))
-        if len(located_cols_LR) == col_N:
-            allLocateBboxs_LR.append((L, U, R, D))
-        else:
-            non_allLocateBboxs_LR.append((L, U, R, D))
-    allLocateBboxs_mid.sort(key=lambda LURD: (LURD[1], LURD[0]))
-    non_allLocateBboxs_mid.sort(key=lambda LURD: (LURD[1], LURD[0]))
-    allLocateBboxs_LR.sort(key=lambda LURD: (LURD[1], LURD[0]))
-    non_allLocateBboxs_LR.sort(key=lambda LURD: (LURD[1], LURD[0]))
-    # --------------------判断，是不是有标题类的小块，掺杂在一列的pdf页面里。-------------#
-    isOneClumn = False
-    under_cnt = 0
-    under_square = 0.0
-    before_cnt = 0
-    before_square = 0.0
-    for nL, nU, nR, nD in non_allLocateBboxs_mid:
-        cnt = 0
-        for L, U, R, D in allLocateBboxs_mid:
-            if nD <= U:
-                cnt += 1
-        if cnt >= 1:
-            before_cnt += cnt
-            before_square += (R - L) * (D - U) * cnt
-        else:
-            under_cnt += 1
-            under_square += (R - L) * (D - U) * cnt
-    if (before_square + under_square) != 0 and before_square / (before_square + under_square) >= 0.2:
-        isOneClumn = True
-    if isOneClumn == True and col_N != 1:
-        return INF, False
-    if isOneClumn == True and col_N == 1:
-        return 0, True
-    #### 根据边界的统计情况，再判断一次
-    isOneClumn = False
-    under_cnt = 0
-    under_square = 0.0
-    before_cnt = 0
-    before_square = 0.0
-    for nL, nU, nR, nD in non_allLocateBboxs_LR:
-        cnt = 0
-        for L, U, R, D in allLocateBboxs_LR:
-            if nD <= U:
-                cnt += 1
-        if cnt >= 1:
-            before_cnt += cnt
-            before_square += (R - L) * (D - U) * cnt
-        else:
-            under_cnt += 1
-            under_square += (R - L) * (D - U) * cnt
-    if (before_square + under_square) != 0 and before_square / (before_square + under_square) >= 0.2:
-        isOneClumn = True
-    if isOneClumn == True and col_N != 1:
-        return INF, False
-    if isOneClumn == True and col_N == 1:
-        return 0, True
-    for L, U, R, D in textBboxs:
-        assert L < R and U < D, 'There is an error on bbox of text when calculate loss!'
-        # 简单排除页眉、迷你小块
-        # if (D - U) < pageHeight / 15 < 40 or (R - L) < pageWidth / 8:
-        if (D - U) < 40:
-            continue
-        if (R - L) < 40:
-            continue
-        mid = (L + R) / 2
-        located_cols_mid = []  # 在哪一列里，根据中点来判断
-        located_cols_LR = []  # 在哪一列里，根据边界判断
-        for col_ID in range(col_N):
-            if col_N == 1:
-                located_cols_mid.append(col_ID)
-            else:
-                # 根据中点判断
-                if L <= x_targetAxis[col_ID] <= R:
-                    located_cols_mid.append(col_ID)
-                # 根据边界判断
-                if calculate_overlapRatio_between_line1_and_line2(x_splitAxis[col_ID], x_splitAxis[col_ID + 1], L, R)[0] >= 0.2:
-                    located_cols_LR.append(col_ID)
-        ## 1列的情形
-        if col_N == 1:
-            oneLocateLoss_mid += abs(mid - x_targetAxis[located_cols_mid[0]]) * (D - U) * (R - L)
-            # oneLocateLoss_mid += abs(L - x_splitAxis[located_cols[0]]) * (D - U) * (R - L)
-            oneLocateLoss_LR += abs(L - x_splitAxis[located_cols_mid[0]]) * (D - U) * (R - L)
-            oneLocateCnt_mid += 1
-            oneLocateSquare_mid += (D - U) * (R - L)
-        ## 多列的情形
-        else:
-            ######## 根据mid判断
-            if len(located_cols_mid) == 1:
-                oneLocateLoss_mid += abs(mid - x_targetAxis[located_cols_mid[0]]) * (D - U) * (R - L)
-                # oneLocateLoss_mid += abs(L - x_splitAxis[located_cols[0]]) * (D - U) * (R - L)
-                oneLocateCnt_mid += 1
-                oneLocateSquare_mid += (D - U) * (R - L)
-            elif 1 <= len(located_cols_mid) < col_N:
-                ll, rr = located_cols_mid[0], located_cols_mid[-1]
-                # multiLocateLoss_mid += abs(mid - (x_targetAxis[ll] + x_targetAxis[rr]) / 2) * (D - U) * (R - L)
-                multiLocateLoss_mid += abs(mid - x_targetAxis[ll]) * (D - U) * (R - L)
-                # multiLocateLoss_mid += abs(mid - (pageL + pageR) / 2) * (D - U) * (R - L)
-                multiLocateCnt_mid += 1
-                multiLocateSquare_mid += (D - U) * (R - L)
-                isSimpleCondition = False
-            else:
-                allLocateLoss_mid += abs(mid - (pageR + pageL) / 2) * (D - U) * (R - L)
-                allLocateCnt_mid += 1
-                allLocateSquare_mid += (D - U) * (R - L)
-                isSimpleCondition = False
-            ######## 根据区间的边界判断
-            if len(located_cols_LR) == 1:
-                oneLocateLoss_LR += abs(mid - x_targetAxis[located_cols_LR[0]]) * (D - U) * (R - L)
-                # oneLocateLoss_LR += abs(L - x_splitAxis[located_cols_LR[0]]) * (D - U) * (R - L)
-                oneLocateCnt_LR += 1
-                oneLocateSquare_LR += (D - U) * (R - L)
-            elif 1 <= len(located_cols_LR) < col_N:
-                ll, rr = located_cols_LR[0], located_cols_LR[-1]
-                # multiLocateLoss_LR += abs(mid - (x_targetAxis[ll] + x_targetAxis[rr]) / 2) * (D - U) * (R - L)
-                multiLocateLoss_LR += abs(mid - x_targetAxis[ll]) * (D - U) * (R - L)
-                # multiLocateLoss_LR += abs(mid - (pageL + pageR) / 2) * (D - U) * (R - L)
-                multiLocateCnt_LR += 1
-                multiLocateSquare_LR += (D - U) * (R - L)
-                isSimpleCondition = False
-            else:
-                allLocateLoss_LR += abs(mid - (pageR + pageL) / 2) * (D - U) * (R - L)
-                allLocateCnt_LR += 1
-                allLocateSquare_LR += (D - U) * (R - L)
-                isSimpleCondition = False
-    tot_TextCnt = oneLocateCnt_mid + multiLocateCnt_mid + allLocateCnt_mid
-    tot_TextSquare = oneLocateSquare_mid + multiLocateSquare_mid + allLocateSquare_mid
-    # 1列的情形
-    if tot_TextSquare != 0 and allLocateSquare_mid / tot_TextSquare >= 0.85 and col_N == 1:
-        return 0, True
-    # 多列的情形
-    # if col_N >= 2:
-    #     if allLocateCnt >= 1:
-    #         oneLocateLoss_mid += ((pageR - pageL)) * oneLocateCnt_mid
-    #         multiLocateLoss_mid += ((pageR - pageL) ) * multiLocateCnt_mid
-    #     else:
-    #         if multiLocateCnt_mid >= 1:
-    #             oneLocateLoss_mid += ((pageR - pageL)) * oneLocateCnt_mid
-    totLoss_mid = oneLocateLoss_mid + multiLocateLoss_mid + allLocateLoss_mid
-    totLoss_LR = oneLocateCnt_LR + multiLocateCnt_LR + allLocateLoss_LR
-    return totLoss_mid + totLoss_LR, isSimpleCondition
def get_columnNumber(page_ID: int, page: fitz.Page, textBboxs) -> "tuple[list[int], dict[int, float], dict[int, bool]]":
    """
    Score every candidate column count (1 to 4) for a page and rank them.

    For each candidate the target/split axes are computed with
    ``get_targetAxis_and_splitAxis`` and scored with ``calculate_loss``.

    :param page_ID: index of the page, forwarded to the helpers.
    :param page: the fitz page, forwarded to ``get_targetAxis_and_splitAxis``.
    :param textBboxs: text bboxes of the page, forwarded to the helpers.
    :return: a 3-tuple ``(col_idxs, columnNumber_loss, columnNumber_isSimpleCondition)``:
        ``col_idxs`` is the list of candidate column counts ordered by
        ascending loss (ties broken by the smaller column count);
        the two dicts map each candidate column count to the loss and to the
        ``isSimpleCondition`` flag returned by ``calculate_loss``.
    """
    columnNumber_loss = dict()
    columnNumber_isSimpleCondition = dict()
    # Enumerate the candidate column counts.
    for columnNumber in range(1, 5):
        x_targetAxis, x_splitAxis = get_targetAxis_and_splitAxis(page_ID, page, columnNumber, textBboxs)
        loss, isSimpleCondition = calculate_loss(page_ID, x_targetAxis, x_splitAxis, textBboxs)
        columnNumber_loss[columnNumber] = loss
        columnNumber_isSimpleCondition[columnNumber] = isSimpleCondition
    # Best candidate first: lowest loss, then fewest columns.
    col_idxs = sorted(columnNumber_loss, key=lambda n: (columnNumber_loss[n], n))
    return col_idxs, columnNumber_loss, columnNumber_isSimpleCondition
--- a/others/pdf2text_recogFootnoteLine.py
+++ b/others/pdf2text_recogFootnoteLine.py
-import re
-from magic_pdf.libs import _is_in_or_part_overlap
-from magic_pdf.libs import fitz
-import collections
def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> "tuple[float, float]":
    """
    Return the share of each rectangle's area that is covered by their intersection.

    Rectangles are given as left / upper / right / lower coordinates.

    :return: ``(overlap / area_1, overlap / area_2)``; ``(0.0, 0.0)`` when the
        rectangles do not intersect or when either of them has zero area.
    """
    overlap_w = min(R1, R2) - max(L1, L2)
    overlap_h = min(D1, D2) - max(U1, U2)
    if overlap_w < 0 or overlap_h < 0:
        return 0.0, 0.0
    square_1 = (R1 - L1) * (D1 - U1)
    square_2 = (R2 - L2) * (D2 - U2)
    # A degenerate rectangle has no area to take a share of.
    if square_1 == 0 or square_2 == 0:
        return 0.0, 0.0
    square_overlap = overlap_w * overlap_h
    return square_overlap / square_1, square_overlap / square_2
def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> "tuple[float, float]":
    """
    Return the share of each 1-D interval's length that is covered by their intersection.

    :param L1, R1: start and end of the first interval.
    :param L2, R2: start and end of the second interval.
    :return: ``(overlap / length_1, overlap / length_2)``; ``(0.0, 0.0)`` when
        the intervals are disjoint or when either of them has zero length.
    """
    overlap_start = max(L1, L2)
    overlap_end = min(R1, R2)
    if overlap_start > overlap_end:
        return 0.0, 0.0
    # A zero-length interval has no length to take a share of.
    if L1 == R1 or L2 == R2:
        return 0.0, 0.0
    overlap_line = overlap_end - overlap_start
    return overlap_line / (R1 - L1), overlap_line / (R2 - L2)
def parse_footnoteLine(page_ID: int, page: fitz.Page, json_from_DocXchain_obj, exclude_bboxes):
    """
    Detect footnote separator rules on a page and collect the text blocks under them.

    Horizontal rules are read from ``page.get_cdrawings()`` and filtered by a
    chain of heuristics (rules that pair up with other rules, rules in the
    upper half of the page, underlines, rules inside ``exclude_bboxes``, rules
    that are not left-aligned with nearby text). For every surviving rule the
    text blocks below it are gathered.

    :param page_ID: index of the current page in the document (not used by the detection).
    :param page: the fitz page to analyse.
    :param json_from_DocXchain_obj: DocXChain layout result of the page (currently not used).
    :param exclude_bboxes: iterable of ``(L, U, R, D)`` boxes (figures, tables,
        equations); a rule lying inside one of them is discarded.
    :return: list of ``(L, U, R, D)`` tuples without duplicates. For every
        accepted rule it holds the rule itself, the text-block bboxes
        attributed to it and the union box of those blocks. Empty when no
        footnote is found.
    """
    DPI = 72  # resolution used to obtain the page size in pixels
    pix = page.get_pixmap(dpi=DPI)
    pageL, pageR = 0, int(pix.w)
    pageU, pageD = 0, int(pix.h)

    # ---------------------- text extraction via PyMuPDF ---------------------- #
    textSize_freq = collections.defaultdict(float)  # font size -> number of characters using it
    textLine_bboxs = []  # normalised bbox of every text span
    text_blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
    )["blocks"]
    for block in text_blocks:
        for line in block['lines']:
            for span in line['spans']:
                L, U, R, D = span['bbox']
                L, R = min(L, R), max(L, R)
                U, D = min(U, D), max(U, D)
                textLine_bboxs.append((L, U, R, D))
                textSize_freq[span['size']] += len(span['text'])
    textLine_bboxs.sort(key=lambda LURD: (LURD[0], LURD[1]))

    # Font size carrying the most characters; stays 0 when the page has no text.
    max_sizeFreq = 0
    textSize_withMaxFreq = 0
    for size, freq in textSize_freq.items():
        if freq > max_sizeFreq:
            max_sizeFreq = freq
            textSize_withMaxFreq = size

    # ---------------------- drawings via PyMuPDF ---------------------- #
    drawings = page.get_cdrawings()

    def collect_horizontal_rules(accept_top):
        """Return the flat (height <= 3), long enough drawing rects whose top passes ``accept_top``."""
        rules = []
        for drawing in drawings:
            try:
                L, U, R, D = drawing['rect']
                is_flat = U <= D and D - U <= 3
                is_long = (pageR - pageL) / 15 <= R - L
                if is_flat and is_long and accept_top(U):
                    rules.append((L, U, R, D))
            except (KeyError, TypeError, ValueError):
                # Best effort: a drawing without a usable rect is ignored.
                continue
        return rules

    def has_text_below_line(L: float, U: float, R: float, D: float, inLowerArea: bool) -> bool:
        """
        Check whether text sits right below the rule and looks like footnote text.

        Returns False when the rule is closer to the text above it than to the
        text below it (an underline rather than a footnote rule), or when the
        text found below it does not pass the character-count checks.
        """
        left_text_area = 0  # area of spans lying clearly to the left of the rule
        letter_cnt = 0      # characters for which str.isalpha() is true
        other_cnt = 0       # characters that are neither digit, listed punctuation nor alphabetic
        digit_cnt = 0
        dist_up = None      # distance to the last matching span above the rule
        dist_down = None    # distance to the last matching span below the rule
        for block in text_blocks:
            L0, U0, R0, D0 = block['bbox']
            if 0 < (R0 - L0) < pageR / 6 and (D0 - U0) / (R0 - L0) > 10:
                continue  # very narrow vertical strip, e.g. the arXiv stamp on the left margin
            for line in block['lines']:
                for span in line['spans']:
                    L2, U2, R2, D2 = span['bbox']
                    L2, R2 = min(L2, R2), max(L2, R2)
                    U2, D2 = min(U2, D2), max(U2, D2)
                    text = span['text']
                    if L > 0 and L2 < L and (L - L2) / L > 0.2:
                        left_text_area += abs(R2 - L2) * abs(D2 - U2)
                        continue
                    span_mid = (U2 + D2) / 2
                    covers_x = max(calculate_overlapRatio_between_line1_and_line2(L, R, L2, R2)) > 0.8
                    # Band of one body-text height right below the rule.
                    below_1, below_2 = calculate_overlapRatio_between_line1_and_line2(U, U + textSize_withMaxFreq, U2, D2)
                    if U < span_mid and below_1 > 0 and below_2 > 0 and covers_x:
                        dist_down = span_mid - U
                        for c in text:
                            if c == ' ':
                                continue
                            elif c.isdigit():
                                digit_cnt += 1
                            elif c in ',.:!?[]()%，。、！？：【】（）《》-':
                                continue  # common punctuation is not counted
                            elif c.isalpha():
                                letter_cnt += 1
                            else:
                                other_cnt += 1
                    # Band of one body-text height right above the rule.
                    above_1, above_2 = calculate_overlapRatio_between_line1_and_line2(U - textSize_withMaxFreq, U, U2, D2)
                    if span_mid < U and above_1 > 0 and above_2 > 0 and covers_x:
                        dist_up = U - span_mid
        if dist_up is not None and dist_down is not None:
            if dist_up * 1.5 < dist_down:
                return False  # closer to the text above: an underline, not a footnote rule
        counted = letter_cnt + other_cnt + digit_cnt
        # For rules higher up on the page the text to the left is taken into
        # account; for rules in the lowest area it is not.
        if not inLowerArea:
            if left_text_area >= 2000 / 500000 * pageR * pageD:
                return False
            return counted >= 10
        if counted == 0:
            return False
        if (letter_cnt + digit_cnt) / counted > 0.5 and other_cnt / counted > 0.4:
            return False
        return True

    # ------------- stage 1: rules in the body area of the page ------------- #
    # Rules outside the 80/800 .. 750/800 band are most likely header/footer rules.
    horizon_lines = collect_horizontal_rules(lambda top: 80 / 800 * pageD <= top <= 750 / 800 * pageD)
    horizon_lines.sort(key=lambda LURD: LURD[1])

    # Rules that pair up with another rule are probably part of a table (or decoration).
    discard = [False] * len(horizon_lines)
    for i, (Li, Ui, Ri, Di) in enumerate(horizon_lines):
        for j in range(i + 1, len(horizon_lines)):
            Lj, Uj, Rj, Dj = horizon_lines[j]
            # Same height and close to each other: segments of one broken line.
            gap = Li - Rj if Li > Lj else Lj - Ri
            same_height = max(Ui, Di, Uj, Dj) - min(Ui, Di, Uj, Dj) <= 5
            if same_height and gap <= pageR / 10:
                discard[i] = True
                discard[j] = True
            # Stacked rules covering the same horizontal range.
            ratio_i, ratio_j = calculate_overlapRatio_between_line1_and_line2(Li, Ri, Lj, Rj)
            if (ratio_i > 0.9 and ratio_j > 0.9) or max(ratio_i, ratio_j) > 0.95:
                discard[i] = True
            elif ratio_i > 0 and (Rj - Lj) / (Ri - Li) > 1:
                discard[i] = True
    horizon_lines = [rule for rule, dropped in zip(horizon_lines, discard) if not dropped]

    # A rule in the upper half of the page is not a footnote rule.
    THRESHOLD = (pageD - pageU) * 0.5
    horizon_lines = [rule for rule in horizon_lines if rule[1] >= THRESHOLD]

    # Of rules that still overlap horizontally, drop the upper one.
    unshadowed = []
    for i, (L1, U1, R1, D1) in enumerate(horizon_lines):
        shadowed = False
        for L2, U2, R2, D2 in horizon_lines[i + 1:]:
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
            if (ratio_1 > 0.2 and ratio_2 > 0.2) or max(ratio_1, ratio_2) > 0.7:
                shadowed = True
                break
        if not shadowed:
            unshadowed.append((L1, U1, R1, D1))
    horizon_lines = unshadowed

    # Text must follow right below the rule; keep at most the two lowest rules.
    horizon_lines = [LURD for LURD in horizon_lines if has_text_below_line(*LURD, True)]
    horizon_lines = horizon_lines[-2:]

    # ------------- stage 2: rules in the lowest 15% of the page ------------- #
    down_horizon_lines = collect_horizontal_rules(lambda top: top > pageD * 0.85)
    down_horizon_lines.sort(key=lambda LURD: (LURD[0], LURD[2], LURD[1]))
    discard = [False] * len(down_horizon_lines)
    for i in range(len(down_horizon_lines) - 1):
        L1, _, R1, _ = down_horizon_lines[i]
        L2, _, R2, _ = down_horizon_lines[i + 1]
        ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
        # Neighbouring rules that barely overlap but lie close together are both dropped.
        if ratio_1 <= 0.1 and ratio_2 <= 0.1 and L2 - R1 <= pageR / 3:
            discard[i] = True
            discard[i + 1] = True
    down_horizon_lines = [rule for rule, dropped in zip(down_horizon_lines, discard) if not dropped]
    down_horizon_lines = [LURD for LURD in down_horizon_lines if has_text_below_line(*LURD, True)]

    # Merge both stages without duplicates, keeping their order, and keep the last two.
    footnoteLines = list(dict.fromkeys(horizon_lines + down_horizon_lines))
    footnoteLines = footnoteLines[-2:]

    # ------------- final check: rule inside a figure / table / equation ------------- #
    def line_in_specialBboxes(L: float, U: float, R: float, D: float, specialBboxes) -> bool:
        """Return True when the rule lies vertically inside one of the boxes and the box covers more than 60% of it."""
        for L1, U1, R1, D1 in specialBboxes:
            if U1 <= U <= D < D1:
                ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L, R)
                if ratio_1 > 0 and ratio_2 > 0.6:
                    return True
        return False

    footnoteLines = [LURD for LURD in footnoteLines if not line_in_specialBboxes(*LURD, exclude_bboxes)]

    # ------------- the rule must be left-aligned with the surrounding text ------------- #
    def check_isOnTheLeftOfColumn(L: float, U: float, R: float, D: float) -> bool:
        """Return True when at least 4 spans near the rule start close to the rule's left end."""
        LL = L - textSize_withMaxFreq
        UU = max(pageD * 0.02, U - 100 / 800 * pageD)
        DD = min(U + 50 / 800 * pageD, pageD * 0.98)
        cnt = 0
        for L2, U2, R2, D2 in textLine_bboxs:
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(UU, DD, U2, D2)
            ratio_3, ratio_4 = calculate_overlapRatio_between_line1_and_line2(L, R, L2, R2)
            if ratio_1 > 0 and ratio_2 > 0 and max(ratio_3, ratio_4) > 0.8:
                if abs(LL - L2) <= 20 / 700 * pageR:
                    cnt += 1
        return cnt >= 4

    footnoteLines = [LURD for LURD in footnoteLines if check_isOnTheLeftOfColumn(*LURD)]

    # ------------- collect the text-block bboxes belonging to each rule ------------- #
    def get_footnoteBbox(L: float, U: float, R: float, D: float) -> list:
        """
        Collect the text-block bboxes under a rule.

        Returns the accepted block bboxes followed by their union box, or an
        empty list when the blocks do not look like one footnote area.
        """
        L1, U1, R1, D1 = L, U, R, D
        under_THRESHOLD = min(D1 + textSize_withMaxFreq * 20, pageD * 0.98)
        raw_bboxes = []
        for block in text_blocks:
            L2, U2, R2, D2 = block['bbox']
            if 0 < (R2 - L2) < pageR / 6 and (D2 - U2) / (R2 - L2) > 10:
                continue  # very narrow vertical strip, e.g. the arXiv stamp on the left margin
            if U2 < D2 < U1:
                continue  # entirely above the rule
            if U2 < under_THRESHOLD:
                ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
                if max(ratio_1, ratio_2) > 0.8:
                    raw_bboxes.append((L2, U2, R2, D2))
        if not raw_bboxes:
            return []
        raw_bboxes.sort(key=lambda LURD: (LURD[1], LURD[0]))
        # Footnote blocks start near the rule's left end or to the right of it.
        raw_bboxes = [LURD for LURD in raw_bboxes if (abs(LURD[0] - L1) < textSize_withMaxFreq * 6 or L1 < LURD[0])]
        if not raw_bboxes:
            return []
        # Union box of all candidate blocks.
        LL = min(b[0] for b in raw_bboxes)
        UU = min(b[1] for b in raw_bboxes)
        RR = max(b[2] for b in raw_bboxes)
        DD = max(b[3] for b in raw_bboxes)
        for bL, bU, bR, bD in raw_bboxes:
            # A mix of full-width and sub-column blocks is rejected.
            if (RR - LL) > pageR * 0.8 and (bR - bL) > pageR * 0.15 and (RR - LL) / (bR - bL) > 2:
                return []
            # All blocks must share the same left edge.
            if abs(LL - bL) > textSize_withMaxFreq * 3:
                return []
        # A wide area that starts too high on the page is rejected.
        if UU < 650 / 800 * pageD and (RR - LL) > 0.5 * pageR:
            return []
        # A tiny first block followed by much larger ones is rejected.
        if len(raw_bboxes) > 1:
            bbox_square = [abs(bR - bL) * abs(bD - bU) for bL, bU, bR, bD in raw_bboxes]
            s0 = bbox_square[0]
            s1n = sum(bbox_square[1:]) / len(bbox_square[1:])
            if s0 > 0:
                first_too_small = s1n / s0 > 10 or max(bbox_square) / s0 > 15
            else:
                # Zero-area first block: any block with real area dwarfs it.
                first_too_small = max(bbox_square) > 0
            if first_too_small:
                return []
        raw_bboxes.append((LL, UU, RR, DD))
        return raw_bboxes

    footnoteBboxes = []
    for L, U, R, D in footnoteLines:
        cur = get_footnoteBbox(L, U, R, D)
        if cur:
            footnoteBboxes.append((L, U, R, D))
            footnoteBboxes += cur
    # Remove duplicates while keeping a stable order.
    return list(dict.fromkeys(footnoteBboxes))
def __bbox_in(box1, box2):
    """
    Tell whether box1 lies inside box2.

    Both boxes are (L, U, R, D) sequences; every coordinate is passed through
    int() before it is compared, and touching edges count as inside.
    """
    inner_left, inner_top, inner_right, inner_bottom = box1
    outer_left, outer_top, outer_right, outer_bottom = box2
    return (
        int(outer_left) <= int(inner_left)
        and int(outer_top) <= int(inner_top)
        and int(inner_right) <= int(outer_right)
        and int(inner_bottom) <= int(outer_bottom)
    )
def remove_footnote_text(raw_text_block, footnote_bboxes):
    """
    Move the text blocks that touch a footnote bbox out of ``raw_text_block``.

    A block is treated as footnote text when ``_is_in_or_part_overlap`` reports
    that its ``'bbox'`` is inside, or partly overlaps, any footnote bbox. Such
    blocks get ``block['tag'] = 'footnote'``.

    :param raw_text_block: list of text-block dicts of the current page (each
        with a ``'bbox'`` entry); it is modified in place.
    :param footnote_bboxes: footnote bboxes of the current page.
    :return: ``(raw_text_block, footnote_text_blocks)``: the same list object
        with the footnote blocks removed, and the list of removed blocks.
    """
    footnote_text_blocks = []
    remaining_blocks = []
    for block in raw_text_block:
        text_bbox = block['bbox']
        # TODO: be stricter and decide at line level instead of block level.
        if any(_is_in_or_part_overlap(text_bbox, footnote_bbox) for footnote_bbox in footnote_bboxes):
            block['tag'] = 'footnote'
            footnote_text_blocks.append(block)
        else:
            remaining_blocks.append(block)
    # Replace the contents after the loop: removing while iterating would skip
    # elements, and slice assignment keeps the caller's list object.
    raw_text_block[:] = remaining_blocks
    return raw_text_block, footnote_text_blocks
def remove_footnote_image(image_blocks, footnote_bboxes):
    """
    Move the image blocks that lie inside a footnote bbox out of ``image_blocks``.

    :param image_blocks: list of image-block dicts of the current page (each
        with a ``'bbox'`` entry); it is modified in place.
    :param footnote_bboxes: footnote bboxes of the current page.
    :return: ``(image_blocks, footnote_imgs_blocks)``: the same list object
        with the footnote images removed, and the list of removed blocks.
    """
    footnote_imgs_blocks = []
    remaining_blocks = []
    for image_block in image_blocks:
        if any(__bbox_in(image_block['bbox'], footnote_bbox) for footnote_bbox in footnote_bboxes):
            footnote_imgs_blocks.append(image_block)
        else:
            remaining_blocks.append(image_block)
    # Slice assignment keeps the caller's list object while dropping the
    # footnote images in a single pass.
    image_blocks[:] = remaining_blocks
    return image_blocks, footnote_imgs_blocks
-def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs, page_no_bboxs, page_w, page_h):
-    """
-    Remove header, footer and page-number content from one page.
-    Deletion works at span level: spans lying in the header/footer bands are
-    dropped, lines left without spans are dropped, and a text block left
-    without lines is tagged 'in-foot-header-area' and moved to the removed list.
-    Parameters
-    ----------
-    text_raw_blocks : list
-        text blocks (dicts with 'bbox' and 'lines'); modified in place
-    image_bboxes : list
-        image bboxes of the page
-    table_bboxes : list
-        table bboxes of the page
-    header_bboxs : list
-        header bboxes as (x0, y0, x1, y1); may be empty
-    footer_bboxs : list
-        footer bboxes as (x0, y0, x1, y1); may be empty
-    page_no_bboxs : list
-        page-number bboxes; when empty a text heuristic is used instead
-    page_w : float
-        page width
-    page_h : float
-        page height
-    Returns
-    -------
-    tuple
-        (image_bbox_remain, table_bbox_remain, text_block_remain,
-        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)
-    """
-    # The header band ends at the lowest header bottom edge and the footer band
-    # starts at the highest footer top edge; without boxes the bands are empty.
-    header_y0 = max(b[3] for b in header_bboxs) if header_bboxs else 0
-    footer_y0 = min(b[1] for b in footer_bboxs) if footer_bboxs else page_h
-    if page_no_bboxs:
-        # Page numbers in the top/bottom half of the page can widen the bands.
-        top_part = [b for b in page_no_bboxs if b[3] < page_h/2]
-        btn_part = [b for b in page_no_bboxs if b[1] > page_h/2]
-        top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
-        btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h
-        header_y0 = max(header_y0, top_max_y0)
-        footer_y0 = min(footer_y0, btn_min_y1)
-    content_boundry = [0, header_y0, page_w, footer_y0]
-    header = [0, 0, page_w, header_y0]
-    footer = [0, footer_y0, page_w, page_h]
-    # The bands are known now; delete everything that falls into them.
-    text_block_to_remove = []
-    for blk in text_raw_blocks:
-        for line in blk['lines']:
-            # Slice assignment keeps the original list objects, as the in-place remove() calls did.
-            line['spans'][:] = [
-                span for span in line['spans']
-                if not (
-                    span['bbox'][3] < header_y0
-                    or _is_in_or_part_overlap(span['bbox'], header)
-                    or _is_in_or_part_overlap(span['bbox'], footer)
-                )
-            ]
-        blk['lines'][:] = [line for line in blk['lines'] if line['spans']]
-        # Checked after the deletion so that blocks emptied by it are removed too.
-        if not blk['lines']:
-            blk['tag'] = 'in-foot-header-area'
-            text_block_to_remove.append(blk)
-    # A page number is often so small that it still overlaps the content area
-    # a little and is kept as body text, so page numbers are handled per span.
-    page_no_block_2_remove = []
-    if page_no_bboxs:
-        for pagenobox in page_no_bboxs:
-            for block in text_raw_blocks:
-                if not _is_in_or_part_overlap(pagenobox, block['bbox']):
-                    continue
-                for line in block['lines']:
-                    for span in line['spans']:
-                        if _is_in_or_part_overlap(pagenobox, span['bbox']):
-                            span['tag'] = "page-no"
-                            # A block consisting of this single span is removed as a whole.
-                            if len(line['spans']) == 1 and len(block['lines']) == 1:
-                                page_no_block_2_remove.append(block)
-    elif len(text_raw_blocks) > 0:
-        # Heuristic: the bottom-most block is a page number when it has exactly
-        # one line with one span whose text contains a digit and no latin letter.
-        text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
-        last_block = text_raw_blocks[0]
-        if len(last_block['lines']) == 1:
-            last_line = last_block['lines'][0]
-            if len(last_line['spans']) == 1:
-                last_span = last_line['spans'][0]
-                if last_span['text'].strip() and not re.search('[a-zA-Z]', last_span['text']) and re.search('[0-9]', last_span['text']):
-                    last_span['tag'] = "page-no"
-                    page_no_block_2_remove.append(last_block)
-    text_block_to_remove.extend(page_no_block_2_remove)
-    for blk in text_block_to_remove:
-        if blk in text_raw_blocks:
-            text_raw_blocks.remove(blk)
-    text_block_remain = text_raw_blocks
-    # Images and tables are kept when they touch the content area at all.
-    image_bbox_remain, image_bbox_to_remove = [], []
-    for bbox in image_bboxes:
-        (image_bbox_remain if _is_in_or_part_overlap(bbox, content_boundry) else image_bbox_to_remove).append(bbox)
-    table_bbox_remain, table_bbox_to_remove = [], []
-    for bbox in table_bboxes:
-        (table_bbox_remain if _is_in_or_part_overlap(bbox, content_boundry) else table_bbox_to_remove).append(bbox)
-    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
--- a/others/pdf2text_recogPara_v2.py
+++ b/others/pdf2text_recogPara_v2.py
-import os
-import sys
-import json
-import re
-import math
-import unicodedata
-from collections import Counter
-import numpy as np
-from termcolor import cprint
-from magic_pdf.libs import fitz
-from magic_pdf.libs import NLPModels
-# Switch stdout to UTF-8 when running on Python 3 (presumably so the non-ASCII
-# messages printed by this script do not fail on consoles with another encoding).
-if sys.version_info[0] >= 3:
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-def open_pdf(pdf_path):
-    """
-    Open the pdf at `pdf_path` with fitz and return the document object.
-    A failure is reported on stdout and the exception is raised again.
-    """
-    try:
-        return fitz.open(pdf_path)  # type: ignore
-    except Exception as e:
-        print(f"无法打开PDF文件：{pdf_path}。原因是：{e}")
-        raise e
-def print_green_on_red(text):
-    """Print `text` in bold green on a red background, terminated by two newlines."""
-    cprint(text, color="green", on_color="on_red", attrs=["bold"], end="\n\n")
-def print_green(text):
-    """Print an empty line, then `text` in bold green terminated by two newlines."""
-    print()
-    cprint(text, color="green", attrs=["bold"], end="\n\n")
-def print_red(text):
-    """Print an empty line, then `text` in bold red terminated by two newlines."""
-    print()
-    cprint(text, color="red", attrs=["bold"], end="\n\n")
-def print_yellow(text):
-    """Print an empty line, then `text` in bold yellow terminated by two newlines."""
-    print()
-    cprint(text, color="yellow", attrs=["bold"], end="\n\n")
-def safe_get(dict_obj, key, default):
-    """
-    Look up `key` in `dict_obj`, falling back to `default` when the key is
-    missing or its stored value is None.
-    """
-    found = dict_obj.get(key)
-    return default if found is None else found
-def is_bbox_overlap(bbox1, bbox2):
-    """
-    Tell whether two boxes overlap; boxes that only touch at an edge or a
-    corner count as overlapping.
-    Parameters
-    ----------
-    bbox1 : list
-        first box as (x0, y0, x1, y1)
-    bbox2 : list
-        second box as (x0, y0, x1, y1)
-    Returns
-    -------
-    bool
-        False when one box lies strictly beside, above or below the other, else True
-    """
-    left_a, top_a, right_a, bottom_a = bbox1
-    left_b, top_b, right_b, bottom_b = bbox2
-    apart_horizontally = left_a > right_b or left_b > right_a
-    apart_vertically = top_a > bottom_b or top_b > bottom_a
-    return not (apart_horizontally or apart_vertically)
-def is_in_bbox(bbox1, bbox2):
-    """
-    Tell whether `bbox1` lies completely inside `bbox2` (shared edges allowed).
-    Parameters
-    ----------
-    bbox1 : list
-        inner candidate as (x0, y0, x1, y1)
-    bbox2 : list
-        outer box as (x0, y0, x1, y1)
-    Returns
-    -------
-    bool
-        True if no edge of bbox1 lies outside bbox2, else False
-    """
-    in_left, in_top, in_right, in_bottom = bbox1
-    out_left, out_top, out_right, out_bottom = bbox2
-    return bool(
-        in_left >= out_left
-        and in_top >= out_top
-        and in_right <= out_right
-        and in_bottom <= out_bottom
-    )
-def calculate_para_bbox(lines):
-    """
-    Compute the smallest box enclosing the boxes of all given lines.
-    Parameters
-    ----------
-    lines : list
-        line dicts, each with a "bbox" entry (x0, y0, x1, y1)
-    Returns
-    -------
-    para_bbox : list
-        [x0, y0, x1, y1] covering every line
-    """
-    boxes = [line["bbox"] for line in lines]
-    left = min(box[0] for box in boxes)
-    top = min(box[1] for box in boxes)
-    right = max(box[2] for box in boxes)
-    bottom = max(box[3] for box in boxes)
-    return [left, top, right, bottom]
-def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
-    """
-    Tell whether the right edge of the current line matches the right edge of
-    its neighbouring lines within half an average character width.
-    Parameters
-    ----------
-    curr_line_bbox : list
-        bbox of the current line
-    prev_line_bbox : list
-        bbox of the previous line; a falsy value is treated as (0, 0, 0, 0)
-    next_line_bbox : list
-        bbox of the next line; a falsy value is treated as (0, 0, 0, 0)
-    avg_char_width : float
-        average of char widths
-    direction : int
-        0 compares with prev, 1 with next, 2 with both; other values yield False
-    Returns
-    -------
-    bool
-        True if the line is right aligned with the selected neighbours, False otherwise.
-    """
-    tolerance = 0.5 * avg_char_width
-    _, _, right, _ = curr_line_bbox
-    _, _, prev_right, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
-    _, _, next_right, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
-    matches_prev = abs(right - prev_right) < tolerance
-    matches_next = abs(right - next_right) < tolerance
-    if direction == 0:
-        return matches_prev
-    if direction == 1:
-        return matches_next
-    if direction == 2:
-        return matches_prev and matches_next
-    return False
-def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
-    """
-    Tell whether the left edge of the current line matches the left edge of
-    its neighbouring lines within half an average character width.
-    Parameters
-    ----------
-    curr_line_bbox : list
-        bbox of the current line
-    prev_line_bbox : list
-        bbox of the previous line; a falsy value is treated as (0, 0, 0, 0)
-    next_line_bbox : list
-        bbox of the next line; a falsy value is treated as (0, 0, 0, 0)
-    avg_char_width : float
-        average of char widths
-    direction : int
-        0 compares with prev, 1 with next, 2 with both; other values yield False
-    Returns
-    -------
-    bool
-        True if the line is left aligned with the selected neighbours, False otherwise.
-    """
-    tolerance = 0.5 * avg_char_width
-    left, _, _, _ = curr_line_bbox
-    prev_left, _, _, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
-    next_left, _, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
-    matches_prev = abs(left - prev_left) < tolerance
-    matches_next = abs(left - next_left) < tolerance
-    if direction == 0:
-        return matches_prev
-    if direction == 1:
-        return matches_next
-    if direction == 2:
-        return matches_prev and matches_next
-    return False
-def end_with_punctuation(line_text):
-    """
-    Tell whether the last non-whitespace character of `line_text` is one of
-    the English or Chinese sentence-ending marks (. ? ! 。 ？ ！).
-    Returns False for empty or whitespace-only text.
-    """
-    sentence_enders = (".", "?", "!", "。", "？", "！")
-    trailing_char = next((ch for ch in reversed(line_text) if not ch.isspace()), None)
-    return trailing_char is not None and trailing_char in sentence_enders
-def is_nested_list(lst):
-    """Return True only when `lst` is a list that directly contains at least one list."""
-    return isinstance(lst, list) and any(isinstance(item, list) for item in lst)
-class DenseSingleLineBlockException(Exception):
-    """
-    Exception type for the dense single line-block case.
-    `message` is kept as an attribute and is the text of both str() and repr().
-    """
-    def __init__(self, message="DenseSingleLineBlockException"):
-        super().__init__(message)
-        self.message = message
-    def __str__(self):
-        return "{}".format(self.message)
-    __repr__ = __str__
-class TitleDetectionException(Exception):
-    """
-    Exception type for title detection.
-    `message` is kept as an attribute and is the text of both str() and repr().
-    """
-    def __init__(self, message="TitleDetectionException"):
-        super().__init__(message)
-        self.message = message
-    def __str__(self):
-        return "{}".format(self.message)
-    __repr__ = __str__
-class TitleLevelException(Exception):
-    """
-    Exception type for title level.
-    `message` is kept as an attribute and is the text of both str() and repr().
-    """
-    def __init__(self, message="TitleLevelException"):
-        super().__init__(message)
-        self.message = message
-    def __str__(self):
-        return "{}".format(self.message)
-    __repr__ = __str__
-class ParaSplitException(Exception):
-    """
-    Exception type for paragraph splitting.
-    `message` is kept as an attribute and is the text of both str() and repr().
-    """
-    def __init__(self, message="ParaSplitException"):
-        super().__init__(message)
-        self.message = message
-    def __str__(self):
-        return "{}".format(self.message)
-    __repr__ = __str__
-class ParaMergeException(Exception):
-    """
-    Exception type for paragraph merging.
-    `message` is kept as an attribute and is the text of both str() and repr().
-    """
-    def __init__(self, message="ParaMergeException"):
-        super().__init__(message)
-        self.message = message
-    def __str__(self):
-        return "{}".format(self.message)
-    __repr__ = __str__
-class DiscardByException:
-    """
-    Decides, per exception type, whether a pdf should be discarded.
-    Every `discard_by_*` method returns the exception message when the pdf is
-    to be discarded and None when it is kept.
-    """
-    def __init__(self) -> None:
-        pass
-    def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException):
-        """
-        Discard the pdf when too many of its pages consist almost only of
-        single-line blocks.
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary; only keys starting with "page_" are inspected
-        exception : DenseSingleLineBlockException
-            exception whose `message` is returned on discard
-        Returns
-        -------
-        error_message : str or None
-            exception.message if more than 10% of the pages have more than 90%
-            single-line blocks in "preproc_blocks", otherwise None
-        """
-        total_pages = 0
-        dense_pages = 0
-        for page_id, page in pdf_dic.items():
-            if not page_id.startswith("page_"):
-                continue
-            total_pages += 1
-            if "preproc_blocks" not in page.keys():
-                continue
-            preproc_blocks = page["preproc_blocks"]
-            single_line_count = sum(1 for block in preproc_blocks if len(block["lines"]) == 1)
-            if len(preproc_blocks) > 0 and single_line_count / len(preproc_blocks) > 0.9:
-                dense_pages += 1
-        if total_pages == 0:
-            return None
-        # The page ratio is deliberately low: the pdf is dropped almost as soon as the pattern shows up.
-        return exception.message if dense_pages / total_pages > 0.1 else None
-    def discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException):
-        """
-        Title detection check; currently disabled, so the pdf is always kept.
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary (unused)
-        exception : TitleDetectionException
-            exception (unused)
-        Returns
-        -------
-        error_message : None
-        """
-        return None
-    def discard_by_title_level(self, pdf_dic, exception: TitleLevelException):
-        """
-        Title level check; currently disabled, so the pdf is always kept.
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary (unused)
-        exception : TitleLevelException
-            exception (unused)
-        Returns
-        -------
-        error_message : None
-        """
-        return None
-    def discard_by_split_para(self, pdf_dic, exception: ParaSplitException):
-        """
-        Paragraph split check; currently disabled, so the pdf is always kept.
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary (unused)
-        exception : ParaSplitException
-            exception (unused)
-        Returns
-        -------
-        error_message : None
-        """
-        return None
-    def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException):
-        """
-        Paragraph merge check; currently disabled, so the pdf is always kept.
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary (unused)
-        exception : ParaMergeException
-            exception (unused)
-        Returns
-        -------
-        error_message : None
-        """
-        return None
-class LayoutFilterProcessor:
-    """
-    Marks every paragraph block of a page with whether it lies inside one of
-    the page's layout boxes.
-    """
-    def __init__(self) -> None:
-        pass
-    def batch_process_blocks(self, pdf_dict):
-        """
-        Set `in_layout` (1 or 0) on each paragraph block of each page.
-        Pages are the entries whose key starts with "page_"; a page is skipped
-        when it lacks "layout_bboxes" or "para_blocks" or when either is None.
-        Parameters
-        ----------
-        pdf_dict : dict
-            pdf dictionary; its paragraph blocks are modified in place
-        Returns
-        -------
-        pdf_dict : dict
-            the same dictionary
-        """
-        for page_id, blocks in pdf_dict.items():
-            if not page_id.startswith("page_"):
-                continue
-            if "layout_bboxes" not in blocks.keys() or "para_blocks" not in blocks.keys():
-                continue
-            layout_bbox_objs = blocks["layout_bboxes"]
-            para_blocks = blocks["para_blocks"]
-            if layout_bbox_objs is None or para_blocks is None:
-                continue
-            # Grow each layout box outwards to whole units (floor the top-left
-            # corner, ceil the bottom-right one) to prevent loss of text.
-            layout_bboxes = [
-                [math.floor(x0), math.floor(y0), math.ceil(x1), math.ceil(y1)]
-                for x0, y0, x1, y1 in (bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs)
-            ]
-            for para_block in para_blocks:
-                para_bbox = para_block["bbox"]
-                # A block counts as inside when ANY layout box contains it.
-                para_block["in_layout"] = int(
-                    any(is_in_bbox(para_bbox, lb_bbox) for lb_bbox in layout_bboxes)
-                )
-        return pdf_dict
-class RawBlockProcessor:
-    """
-    Turns the "preproc_blocks" of every page into "para_blocks": raw lines on
-    the same row are merged and every span gets its font flags decomposed.
-    """
-    def __init__(self) -> None:
-        # Largest top/bottom difference for two raw lines to be merged into one line.
-        self.y_tolerance = 2
-        self.pdf_dic = {}
-    def __span_flags_decomposer(self, span_flags):
-        """
-        Make font flags human readable.
-        Parameters
-        ----------
-        span_flags : int
-            span flags (bit field)
-        Returns
-        -------
-        flags : dict
-            one boolean per decoded property
-        """
-        is_serifed = bool(span_flags & 2**2)
-        is_monospaced = bool(span_flags & 2**3)
-        return {
-            "is_superscript": bool(span_flags & 2**0),  # superscript
-            "is_italic": bool(span_flags & 2**1),  # italic
-            "is_serifed": is_serifed,  # serif font
-            "is_sans_serifed": not is_serifed,  # sans-serif font
-            "is_monospaced": is_monospaced,  # monospaced font
-            "is_proportional": not is_monospaced,  # proportional font
-            "is_bold": bool(span_flags & 2**4),  # bold
-        }
-    def __make_new_lines(self, raw_lines):
-        """
-        Merge consecutive raw lines that sit on the same row into one line.
-        Two lines are on the same row when both their top and their bottom
-        differ by at most `self.y_tolerance`. Each span dict receives a
-        "decomposed_flags" entry; the span lists of the raw lines themselves
-        are not modified.
-        Parameters
-        ----------
-        raw_lines : list
-            raw lines, each with "bbox", "spans" and optionally "dir"
-        Returns
-        -------
-        new_lines : list
-            merged lines with "bbox", "text", "dir" and "spans"
-        """
-        new_lines = []
-        new_line = None
-        for raw_line in raw_lines:
-            raw_line_bbox = raw_line["bbox"]
-            raw_line_spans = raw_line["spans"]
-            raw_line_text = "".join(span["text"] for span in raw_line_spans)
-            # A missing or empty direction counts as (0, 0), also when merging.
-            raw_line_dir = raw_line.get("dir") or (0, 0)
-            for span in raw_line_spans:
-                span["decomposed_flags"] = self.__span_flags_decomposer(span["flags"])
-            on_same_row = (
-                new_line is not None
-                and abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
-                and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
-            )
-            if on_same_row:
-                new_line["bbox"] = (
-                    min(new_line["bbox"][0], raw_line_bbox[0]),  # left
-                    new_line["bbox"][1],  # top
-                    max(new_line["bbox"][2], raw_line_bbox[2]),  # right
-                    raw_line_bbox[3],  # bottom
-                )
-                new_line["text"] += raw_line_text
-                new_line["spans"].extend(raw_line_spans)
-                new_line["dir"] = (
-                    new_line["dir"][0] + raw_line_dir[0],
-                    new_line["dir"][1] + raw_line_dir[1],
-                )
-            else:
-                if new_line is not None:
-                    new_lines.append(new_line)
-                new_line = {
-                    "bbox": raw_line_bbox,
-                    "text": raw_line_text,
-                    "dir": raw_line_dir,
-                    # Copy so that later merges do not grow the raw line's own list.
-                    "spans": list(raw_line_spans),
-                }
-        if new_line is not None:
-            new_lines.append(new_line)
-        return new_lines
-    def __make_new_block(self, raw_block):
-        """
-        Build a paragraph block from a raw block.
-        Parameters
-        ----------
-        raw_block : dict
-            a raw block with "number", "bbox" and "lines"
-        Returns
-        -------
-        new_block : dict
-            block with "block_id", "bbox", "text" and merged "lines"
-        """
-        raw_lines = raw_block["lines"]
-        block_text = "".join(span["text"] for line in raw_lines for span in line["spans"])
-        return {
-            "block_id": raw_block["number"],
-            "bbox": raw_block["bbox"],
-            "text": block_text,
-            "lines": self.__make_new_lines(raw_lines),
-        }
-    def batch_process_blocks(self, pdf_dic):
-        """
-        Create "para_blocks" for every page of the pdf dictionary.
-        Pages are the entries whose key starts with "page_". A page without
-        "preproc_blocks" gets an empty "para_blocks" list.
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary; modified in place
-        Returns
-        -------
-        pdf_dic : dict
-            the same dictionary
-        """
-        for page_id, blocks in pdf_dic.items():
-            if not page_id.startswith("page_"):
-                continue
-            para_blocks = []
-            if "preproc_blocks" in blocks.keys():
-                para_blocks = [self.__make_new_block(raw_block) for raw_block in blocks["preproc_blocks"]]
-            blocks["para_blocks"] = para_blocks
-        return pdf_dic
-class BlockStatisticsCalculator:
-    """
-    This class calculates the statistics of the block.
-    """
-    def __init__(self) -> None:
-        pass
-    def __calc_stats_of_new_lines(self, new_lines):
-        """
-        Calculate the layout and font statistics of the lines of one block.
-        Parameters
-        ----------
-        new_lines : list
-            lines with "bbox", "text", "spans" and optionally "dir"
-        Returns
-        -------
-        tuple
-            X0 : median of the line x0 values (0 without lines)
-            X1 : median of the line x1 values (0 without lines)
-            avg_char_width : mean over lines of line width / non-space chars (0 if none)
-            avg_char_height : mean span size over all lines of the block (0 if none)
-            max_freq_font_type : font of the widest span (None if none)
-            avg_font_size : mean span size (None without spans)
-            (avg_dir_horizontal, avg_dir_vertical) : mean of the line directions
-            median_font_size : median span size as float (None without spans)
-        """
-        x0_values = []
-        x1_values = []
-        char_widths = []
-        char_heights = []
-        block_font_sizes = []
-        block_directions = []
-        max_span_length = 0
-        max_span_font_type = None
-        for line in new_lines:
-            line_bbox = line["bbox"]
-            x0_values.append(line_bbox[0])
-            x1_values.append(line_bbox[2])
-            num_chars = sum(1 for ch in line["text"] if not ch.isspace())
-            if num_chars > 0:
-                char_widths.append((line_bbox[2] - line_bbox[0]) / num_chars)
-            if "dir" in line:
-                block_directions.append(line["dir"])
-            for span in line["spans"]:
-                block_font_sizes.append(span["size"])
-                # Accumulate over every line; assigning here would keep only the last line.
-                char_heights.append(span["size"])
-                # The block font is the font of the widest span.
-                span_length = span["bbox"][2] - span["bbox"][0]
-                if span_length > max_span_length:
-                    max_span_length = span_length
-                    max_span_font_type = span["font"]
-        X0 = np.median(x0_values) if x0_values else 0
-        X1 = np.median(x1_values) if x1_values else 0
-        avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0
-        avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0
-        avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None
-        num_dirs = len(block_directions)
-        avg_dir_horizontal = sum(d[0] for d in block_directions) / num_dirs if num_dirs else 0
-        avg_dir_vertical = sum(d[1] for d in block_directions) / num_dirs if num_dirs else 0
-        median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None
-        return (
-            X0,
-            X1,
-            avg_char_width,
-            avg_char_height,
-            max_span_font_type,
-            avg_font_size,
-            (avg_dir_horizontal, avg_dir_vertical),
-            median_font_size,
-        )
-    def __make_new_block(self, input_block):
-        """
-        Return a copy of the block's core fields extended with its statistics.
-        Parameters
-        ----------
-        input_block : dict
-            block with "block_id", "bbox", "text" and "lines"
-        Returns
-        -------
-        new_block : dict
-            block with the statistics stored under their own keys
-        """
-        raw_lines = input_block["lines"]
-        (
-            left_boundary,
-            right_boundary,
-            char_width,
-            char_height,
-            font_type,
-            font_size,
-            direction,
-            median_font_size,
-        ) = self.__calc_stats_of_new_lines(raw_lines)
-        return {
-            "block_id": input_block["block_id"],
-            "bbox": input_block["bbox"],
-            "text": input_block["text"],
-            "dir": direction,
-            "X0": left_boundary,
-            "X1": right_boundary,
-            "avg_char_width": char_width,
-            "avg_char_height": char_height,
-            "block_font_type": font_type,
-            "block_font_size": font_size,
-            "lines": raw_lines,
-            "median_font_size": median_font_size,
-        }
-    def batch_process_blocks(self, pdf_dic):
-        """
-        Replace the "para_blocks" of every page by blocks carrying statistics.
-        Pages are the entries whose key starts with "page_". A page without
-        "para_blocks" gets an empty list.
-        Parameters
-        ----------
-        pdf_dic : dict
-            pdf dictionary; modified in place
-        Returns
-        -------
-        pdf_dic : dict
-            the same dictionary
-        """
-        for page_id, blocks in pdf_dic.items():
-            if not page_id.startswith("page_"):
-                continue
-            para_blocks = []
-            if "para_blocks" in blocks.keys():
-                para_blocks = [self.__make_new_block(input_block) for input_block in blocks["para_blocks"]]
-            blocks["para_blocks"] = para_blocks
-        return pdf_dic
-class DocStatisticsCalculator:
-    """
-    This class calculates the statistics of the document.
-    """
-    def __init__(self) -> None:
-        pass
-    def calc_stats_of_doc(self, pdf_dict):
-        """
-        Compute document-wide statistics and store them under "statistics".
-        Pages are the entries whose key starts with "page_". A page whose
-        "para_blocks" entry is missing or None counts as a page without blocks.
-        Parameters
-        ----------
-        pdf_dict : dict
-            pdf dictionary; receives the "statistics" entry
-        Returns
-        -------
-        pdf_dict : dict
-            the same dictionary
-        """
-        blocks_per_page = [
-            page.get("para_blocks") or []
-            for page_id, page in pdf_dict.items()
-            if page_id.startswith("page_")
-        ]
-        all_blocks = [block for page_blocks in blocks_per_page for block in page_blocks]
-        total_text_length = sum(len(block["text"]) for block in all_blocks)
-        avg_text_length = total_text_length / len(all_blocks) if all_blocks else 0
-        # Blocks shorter than half the average text length are ignored for the font ranking.
-        font_counter = Counter(
-            (safe_get(block, "block_font_type", ""), safe_get(block, "block_font_size", 0))
-            for block in all_blocks
-            if len(block.get("text", "")) >= avg_text_length * 0.5
-        )
-        most_common_font = font_counter.most_common(1)[0] if font_counter else (("", 0), 0)
-        second_most_common_font = font_counter.most_common(2)[1] if len(font_counter) > 1 else (("", 0), 0)
-        statistics = {
-            "num_pages": 0,
-            "num_blocks": 0,
-            "num_paras": 0,
-            "num_titles": 0,
-            "num_header_blocks": 0,
-            "num_footer_blocks": 0,
-            "num_watermark_blocks": 0,
-            "num_vertical_margin_note_blocks": 0,
-            "most_common_font_type": most_common_font[0][0],
-            "most_common_font_size": most_common_font[0][1],
-            "number_of_most_common_font": most_common_font[1],
-            "second_most_common_font_type": second_most_common_font[0][0],
-            "second_most_common_font_size": second_most_common_font[0][1],
-            "number_of_second_most_common_font": second_most_common_font[1],
-            "avg_text_length": avg_text_length,
-        }
-        # Block flag -> counter that is increased when the flag is truthy.
-        flag_counters = (
-            ("is_header", "num_header_blocks"),
-            ("is_footer", "num_footer_blocks"),
-            ("is_watermark", "num_watermark_blocks"),
-            ("is_vertical_margin_note", "num_vertical_margin_note_blocks"),
-        )
-        for page_blocks in blocks_per_page:
-            statistics["num_pages"] += 1
-            for block_data in page_blocks:
-                statistics["num_blocks"] += 1
-                if "paras" in block_data.keys():
-                    statistics["num_paras"] += len(block_data["paras"])
-                statistics["num_titles"] += sum(1 for line in block_data["lines"] if line.get("is_title", 0))
-                for flag, counter_key in flag_counters:
-                    if block_data.get(flag, 0):
-                        statistics[counter_key] += 1
-        pdf_dict["statistics"] = statistics
-        return pdf_dict
-class TitleProcessor:
-    """
-    This class processes the title.
-    """
-    def __init__(self, *doc_statistics) -> None:
-        """
-        Set up the title processor.
-        Parameters
-        ----------
-        doc_statistics : tuple
-            optional positional arguments; only the first one is stored
-        """
-        # NOTE(review): `self.doc_statistics` is only created when an argument
-        # is passed; without one the attribute does not exist on the instance.
-        if len(doc_statistics) > 0:
-            self.doc_statistics = doc_statistics[0]
-        self.nlp_model = NLPModels()
-        self.MAX_TITLE_LEVEL = 3
-        # Pattern text for numbered titles, written with inline comments and
-        # layout whitespace - presumably compiled with re.VERBOSE where it is
-        # used; confirm at the call site.
-        self.numbered_title_pattern = r"""
-            ^                                 # 行首
-            (                                 # 开始捕获组
-                [\(\（]\d+[\)\）]              # 括号内数字，支持中文和英文括号，例如：(1) 或 （1）
-                |\d+[\)\）]\s                  # 数字后跟右括号和空格，支持中文和英文括号，例如：2) 或 2）
-                |[\(\（][A-Z][\)\）]            # 括号内大写字母，支持中文和英文括号，例如：(A) 或 （A）
-                |[A-Z][\)\）]\s                # 大写字母后跟右括号和空格，例如：A) 或 A）
-                |[\(\（][IVXLCDM]+[\)\）]       # 括号内罗马数字，支持中文和英文括号，例如：(I) 或 （I）
-                |[IVXLCDM]+[\)\）]\s            # 罗马数字后跟右括号和空格，例如：I) 或 I）
-                |\d+(\.\d+)*\s                # 数字或复合数字编号后跟空格，例如：1. 或 3.2.1 
-                |[一二三四五六七八九十百千]+[、\s]       # 中文序号后跟顿号和空格，例如：一、
-                |[\（|\(][一二三四五六七八九十百千]+[\）|\)]\s*  # 中文括号内中文序号后跟空格，例如：（一）
-                |[A-Z]\.\d+(\.\d+)?\s         # 大写字母后跟点和数字，例如：A.1 或 A.1.1
-                |[\(\（][a-z][\)\）]            # 括号内小写字母，支持中文和英文括号，例如：(a) 或 （a）
-                |[a-z]\)\s                    # 小写字母后跟右括号和空格，例如：a) 
-                |[A-Z]-\s                     # 大写字母后跟短横线和空格，例如：A- 
-                |\w+:\s                       # 英文序号词后跟冒号和空格，例如：First: 
-                |第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格
-                |[IVXLCDM]+\.                 # 罗马数字后跟点，例如：I.
-                |\d+\.\s                      # 单个数字后跟点和空格，例如：1. 
-            )                                 # 结束捕获组
-            .+                                # 标题的其余部分
-        """
-        self.stage = (
-            0  # Used for distinguishing the stage of title detection, the number is occurred in paragraph process pipeline
-        )
-    def _is_potential_title(
-        self,
-        curr_line,
-        prev_line,
-        prev_line_is_title,
-        next_line,
-        avg_char_width,
-        avg_char_height,
-        median_font_size,
-    ):
-        """
-        This function checks if the line is a potential title.
-        Parameters
-        ----------
-        curr_line : dict
-            current line
-        prev_line : dict
-            previous line
-        next_line : dict
-            next line
-        avg_char_width : float
-            average of char widths
-        avg_char_height : float
-            average of line heights
-        Returns
-        -------
-        bool
-            True if the line is a potential title, False otherwise.
-        """
-        def __is_line_centered(line_bbox, page_bbox, avg_char_width):
-            """
-            This function checks if the line is centered on the page
-            Parameters
-            ----------
-            line_bbox : list
-                bbox of the line
-            page_bbox : list
-                bbox of the page
-            avg_char_width : float
-                average of char widths
-            Returns
-            -------
-            bool
-                True if the line is centered on the page, False otherwise.
-            """
-            horizontal_ratio = 0.5
-            horizontal_thres = horizontal_ratio * avg_char_width
-            x0, _, x1, _ = line_bbox
-            _, _, page_x1, _ = page_bbox
-            return abs((x0 + x1) / 2 - page_x1 / 2) < horizontal_thres
-        def __is_bold_font_line(line):
-            """
-            Check if a line contains any bold font style.
-            """
-            def _is_bold_span(span):
-                # if span text is empty or only contains space, return False
-                if not span["text"].strip():
-                    return False
-                return bool(span["flags"] & 2**4)  # Check if the font is bold
-            for span in line["spans"]:
-                if not _is_bold_span(span):
-                    return False
-            return True
-        def __is_italic_font_line(line):
-            """
-            Check if a line contains any italic font style.
-            """
-            def __is_italic_span(span):
-                return bool(span["flags"] & 2**1)  # Check if the font is italic
-            for span in line["spans"]:
-                if not __is_italic_span(span):
-                    return False
-            return True
-        def __is_punctuation_heavy(line_text):
-            """
-            Check if the line contains a high ratio of punctuation marks, which may indicate
-            that the line is not a title.
-            Parameters:
-            line_text (str): Text of the line.
-            Returns:
-            bool: True if the line is heavy with punctuation, False otherwise.
-            """
-            # Pattern for common title format like "X.Y. Title"
-            pattern = r"\b\d+\.\d+\..*\b"
-            # If the line matches the title format, return False
-            if re.match(pattern, line_text.strip()):
-                return False
-            # Find all punctuation marks in the line
-            punctuation_marks = re.findall(r"[^\w\s]", line_text)
-            number_of_punctuation_marks = len(punctuation_marks)
-            text_length = len(line_text)
-            if text_length == 0:
-                return False
-            punctuation_ratio = number_of_punctuation_marks / text_length
-            if punctuation_ratio >= 0.1:
-                return True
-            return False
-        def __has_mixed_font_styles(spans, strict_mode=False):
-            """
-            This function checks if the line has mixed font styles, the strict mode will compare the font types
-            Parameters
-            ----------
-            spans : list
-                spans of the line
-            strict_mode : bool
-                True for strict mode, the font types will be fully compared
-                False for non-strict mode, the font types will be compared by the most longest common prefix
-            Returns
-            -------
-            bool
-                True if the line has mixed font styles, False otherwise.
-            """
-            if strict_mode:
-                font_styles = set()
-                for span in spans:
-                    font_style = span["font"].lower()
-                    font_styles.add(font_style)
-                return len(font_styles) > 1
-            else:  # non-strict mode
-                font_styles = []
-                for span in spans:
-                    font_style = span["font"].lower()
-                    font_styles.append(font_style)
-                if len(font_styles) > 1:
-                    longest_common_prefix = os.path.commonprefix(font_styles)
-                    if len(longest_common_prefix) > 0:
-                        return False
-                    else:
-                        return True
-                else:
-                    return False
-        def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type):
-            """
-            This function checks if the current line has a different font type from the previous and next lines
-            Parameters
-            ----------
-            curr_line_font_type : str
-                font type of the current line
-            prev_line_font_type : str
-                font type of the previous line
-            next_line_font_type : str
-                font type of the next line
-            Returns
-            -------
-            bool
-                True if the current line has a different font type from the previous and next lines, False otherwise.
-            """
-            return all(
-                curr_line_font_type != other_font_type.lower()
-                for other_font_type in [prev_line_font_type, next_line_font_type]
-                if other_font_type is not None
-            )
-        def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size):
-            """
-            This function checks if the current line has a larger font size than the previous and next lines
-            Parameters
-            ----------
-            curr_line_font_size : float
-                font size of the current line
-            prev_line_font_size : float
-                font size of the previous line
-            next_line_font_size : float
-                font size of the next line
-            Returns
-            -------
-            bool
-                True if the current line has a larger font size than the previous and next lines, False otherwise.
-            """
-            return all(
-                curr_line_font_size > other_font_size * 1.2
-                for other_font_size in [prev_line_font_size, next_line_font_size]
-                if other_font_size is not None
-            )
-        def __is_similar_to_pre_line(curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size):
-            """
-            This function checks if the current line is similar to the previous line
-            Parameters
-            ----------
-            curr_line : dict
-                current line
-            prev_line : dict
-                previous line
-            Returns
-            -------
-            bool
-                True if the current line is similar to the previous line, False otherwise.
-            """
-            if curr_line_font_type == prev_line_font_type and curr_line_font_size == prev_line_font_size:
-                return True
-            else:
-                return False
-        def __is_same_font_type_of_docAvg(curr_line_font_type):
-            """
-            This function checks if the current line has the same font type as the document average font type
-            Parameters
-            ----------
-            curr_line_font_type : str
-                font type of the current line
-            Returns
-            -------
-            bool
-                True if the current line has the same font type as the document average font type, False otherwise.
-            """
-            doc_most_common_font_type = safe_get(self.doc_statistics, "most_common_font_type", "").lower()
-            doc_second_most_common_font_type = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower()
-            return curr_line_font_type.lower() in [doc_most_common_font_type, doc_second_most_common_font_type]
-        def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1):
-            """
-            This function checks if the current line has a large enough font size
-            Parameters
-            ----------
-            curr_line_font_size : float
-                font size of the current line
-            ratio : float
-                ratio of the current line font size to the document average font size
-            Returns
-            -------
-            bool
-                True if the current line has a large enough font size, False otherwise.
-            """
-            doc_most_common_font_size = safe_get(self.doc_statistics, "most_common_font_size", 0)
-            doc_second_most_common_font_size = safe_get(self.doc_statistics, "second_most_common_font_size", 0)
-            doc_avg_font_size = min(doc_most_common_font_size, doc_second_most_common_font_size)
-            return curr_line_font_size >= doc_avg_font_size * ratio
-        def __is_sufficient_spacing_above_and_below(
-            curr_line_bbox,
-            prev_line_bbox,
-            next_line_bbox,
-            avg_char_height,
-            median_font_size,
-        ):
-            """
-            This function checks if the current line has sufficient spacing above and below
-            Parameters
-            ----------
-            curr_line_bbox : list
-                bbox of the current line
-            prev_line_bbox : list
-                bbox of the previous line
-            next_line_bbox : list
-                bbox of the next line
-            avg_char_width : float
-                average of char widths
-            avg_char_height : float
-                average of line heights
-            Returns
-            -------
-            bool
-                True if the current line has sufficient spacing above and below, False otherwise.
-            """
-            vertical_ratio = 1.25
-            vertical_thres = vertical_ratio * median_font_size
-            _, y0, _, y1 = curr_line_bbox
-            sufficient_spacing_above = False
-            if prev_line_bbox:
-                vertical_spacing_above = min(y0 - prev_line_bbox[1], y1 - prev_line_bbox[3])
-                sufficient_spacing_above = vertical_spacing_above > vertical_thres
-            else:
-                sufficient_spacing_above = True
-            sufficient_spacing_below = False
-            if next_line_bbox:
-                vertical_spacing_below = min(next_line_bbox[1] - y0, next_line_bbox[3] - y1)
-                sufficient_spacing_below = vertical_spacing_below > vertical_thres
-            else:
-                sufficient_spacing_below = True
-            return (sufficient_spacing_above, sufficient_spacing_below)
-        def __is_word_list_line_by_rules(curr_line_text):
-            """
-            This function checks if the current line is a word list
-            Parameters
-            ----------
-            curr_line_text : str
-                text of the current line
-            Returns
-            -------
-            bool
-                True if the current line is a name list, False otherwise.
-            """
-            # name_list_pattern = r"([a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]|[\u4e00-\u9fa5·]{2,16})(?=[，,;；\s]|$)"
-            name_list_pattern = r"(?<![\u4e00-\u9fa5])([A-Z][a-z]{0,19}\s[A-Z][a-z]{0,19}|[\u4e00-\u9fa5]{2,6})(?=[，,;；\s]|$)"
-            compiled_pattern = re.compile(name_list_pattern)
-            if compiled_pattern.search(curr_line_text):
-                return True
-            else:
-                return False
-        def __get_text_catgr_by_nlp(curr_line_text):
-            """
-            This function checks if the current line is a name list using nlp model, such as spacy
-            Parameters
-            ----------
-            curr_line_text : str
-                text of the current line
-            Returns
-            -------
-            bool
-                True if the current line is a name list, False otherwise.
-            """
-            result = self.nlp_model.detect_entity_catgr_using_nlp(curr_line_text)
-            return result
-        def __is_numbered_title(curr_line_text):
-            """
-            This function checks if the current line is a numbered list
-            Parameters
-            ----------
-            curr_line_text : str
-                text of the current line
-            Returns
-            -------
-            bool
-                True if the current line is a numbered list, False otherwise.
-            """
-            compiled_pattern = re.compile(self.numbered_title_pattern, re.VERBOSE)
-            if compiled_pattern.search(curr_line_text):
-                return True
-            else:
-                return False
-        def __is_end_with_ending_puncs(line_text):
-            """
-            This function checks if the current line ends with a ending punctuation mark
-            Parameters
-            ----------
-            line_text : str
-                text of the current line
-            Returns
-            -------
-            bool
-                True if the current line ends with a punctuation mark, False otherwise.
-            """
-            end_puncs = [".", "?", "!", "。", "？", "！", "…"]
-            line_text = line_text.rstrip()
-            if line_text[-1] in end_puncs:
-                return True
-            return False
-        def __contains_only_no_meaning_symbols(line_text):
-            """
-            This function checks if the current line contains only symbols that have no meaning, if so, it is not a title.
-            Situation contains:
-            1. Only have punctuation marks
-            2. Only have other non-meaning symbols
-            Parameters
-            ----------
-            line_text : str
-                text of the current line
-            Returns
-            -------
-            bool
-                True if the current line contains only symbols that have no meaning, False otherwise.
-            """
-            punctuation_marks = re.findall(r"[^\w\s]", line_text)  # find all punctuation marks
-            number_of_punctuation_marks = len(punctuation_marks)
-            text_length = len(line_text)
-            if text_length == 0:
-                return False
-            punctuation_ratio = number_of_punctuation_marks / text_length
-            if punctuation_ratio >= 0.9:
-                return True
-            return False
-        def __is_equation(line_text):
-            """
-            This function checks if the current line is an equation.
-            Parameters
-            ----------
-            line_text : str
-            Returns
-            -------
-            bool
-                True if the current line is an equation, False otherwise.
-            """
-            equation_reg = r"\$.*?\\overline.*?\$"  # to match interline equations
-            if re.search(equation_reg, line_text):
-                return True
-            else:
-                return False
-        def __is_title_by_len(text, max_length=200):
-            """
-            This function checks if the current line is a title by length.
-            Parameters
-            ----------
-            text : str
-                text of the current line
-            max_length : int
-                max length of the title
-            Returns
-            -------
-            bool
-                True if the current line is a title, False otherwise.
-            """
-            text = text.strip()
-            return len(text) <= max_length
-        def __compute_line_font_type_and_size(curr_line):
-            """
-            This function computes the font type and font size of the line.
-            Parameters
-            ----------
-            line : dict
-                line
-            Returns
-            -------
-            font_type : str
-                font type of the line
-            font_size : float
-                font size of the line
-            """
-            spans = curr_line["spans"]
-            max_accumulated_length = 0
-            max_span_font_size = curr_line["spans"][0]["size"]  # default value, float type
-            max_span_font_type = curr_line["spans"][0]["font"].lower()  # default value, string type
-            for span in spans:
-                if span["text"].isspace():
-                    continue
-                span_length = span["bbox"][2] - span["bbox"][0]
-                if span_length > max_accumulated_length:
-                    max_accumulated_length = span_length
-                    max_span_font_size = span["size"]
-                    max_span_font_type = span["font"].lower()
-            return max_span_font_type, max_span_font_size
-        def __is_a_consistent_sub_title(pre_line, curr_line):
-            """
-            This function checks if the current line is a consistent sub title.
-            Parameters
-            ----------
-            pre_line : dict
-                previous line
-            curr_line : dict
-                current line
-            Returns
-            -------
-            bool
-                True if the current line is a consistent sub title, False otherwise.
-            """
-            if pre_line is None:
-                return False
-            start_letter_of_pre_line = pre_line["text"][0]
-            start_letter_of_curr_line = curr_line["text"][0]
-            has_same_prefix_digit = (
-                start_letter_of_pre_line.isdigit()
-                and start_letter_of_curr_line.isdigit()
-                and start_letter_of_pre_line == start_letter_of_curr_line
-            )
-            # prefix text of curr_line satisfies the following title format: x.x
-            prefix_text_pattern = r"^\d+\.\d+"
-            subtitle_format_match = re.match(prefix_text_pattern, curr_line["text"])
-            if subtitle_format_match:
-                has_subtitle_format = True
-            else:
-                has_subtitle_format = False
-            if has_same_prefix_digit or has_subtitle_format:
-                print("is a consistent sub title")
-                return True
-        """
-        Title detecting main Process.
-        """
-        """
-        Basic features about the current line.
-        """
-        curr_line_bbox = curr_line["bbox"]
-        curr_line_text = curr_line["text"]
-        curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line)
-        if len(curr_line_text.strip()) == 0:  # skip empty lines
-            return False, False
-        prev_line_bbox = prev_line["bbox"] if prev_line else None
-        if prev_line:
-            prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line)
-        else:
-            prev_line_font_type, prev_line_font_size = None, None
-        next_line_bbox = next_line["bbox"] if next_line else None
-        if next_line:
-            next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line)
-        else:
-            next_line_font_type, next_line_font_size = None, None
-        """
-        Aggregated features about the current line.
-        """
-        is_italc_font = __is_italic_font_line(curr_line)
-        is_bold_font = __is_bold_font_line(curr_line)
-        is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8)
-        is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1)
-        is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6)
-        is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type)
-        is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg
-        is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"], strict_mode=True)
-        is_mix_font_styles_loose = __has_mixed_font_styles(curr_line["spans"], strict_mode=False)
-        is_punctuation_heavy = __is_punctuation_heavy(curr_line_text)
-        is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text)
-        is_person_or_org_list_line_by_nlp = __get_text_catgr_by_nlp(curr_line_text) in ["PERSON", "GPE", "ORG"]
-        is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors(
-            curr_line_font_size, prev_line_font_size, next_line_font_size
-        )
-        is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors(
-            curr_line_font_type, prev_line_font_type, next_line_font_type
-        )
-        has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below(
-            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size
-        )
-        is_similar_to_pre_line = __is_similar_to_pre_line(
-            curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size
-        )
-        is_consistent_sub_title = __is_a_consistent_sub_title(prev_line, curr_line)
-        """
-        Further aggregated features about the current line.
-        Attention:
-            Features that start with __ are for internal use.
-        """
-        __is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors(
-            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width
-        )
-        __is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors
-        is_a_left_inline_title = (
-            is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors
-        )
-        is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font
-        is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font
-        is_title_by_check_pre_and_next_line = (
-            (prev_line is not None or next_line is not None)
-            and has_sufficient_spaces_above
-            and has_sufficient_spaces_below
-            and is_potential_title_font
-        )
-        is_numbered_title = __is_numbered_title(curr_line_text) and (
-            (has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None)
-        )
-        is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text)
-        is_not_only_no_meaning_symbols = not __contains_only_no_meaning_symbols(curr_line_text)
-        is_equation = __is_equation(curr_line_text)
-        is_title_by_len = __is_title_by_len(curr_line_text)
-        """
-        Decide if the line is a title.
-        """
-        is_title = (
-            is_not_end_with_ending_puncs  # not end with ending punctuation marks
-            and is_not_only_no_meaning_symbols  # not only have no meaning symbols
-            and is_title_by_len  # is a title by length, default max length is 200
-            and not is_equation  # an interline equation should never be a title
-            and is_potential_title_font  # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type
-            and (
-                (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
-                or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
-                or (
-                    is_much_larger_font_than_doc_avg
-                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
-                )
-                or (
-                    is_font_size_little_less_than_doc_avg
-                    and is_bold_font
-                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
-                )
-            )  # not the same font type as the document average font type, which includes the most common font type and the second most common font type
-            and (
-                (
-                    not is_person_or_org_list_line_by_nlp
-                    and (
-                        is_much_larger_font_than_doc_avg
-                        or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg)
-                    )
-                )
-                or (
-                    not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp)
-                    and not is_a_left_inline_title
-                    and not is_punctuation_heavy
-                    and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
-                )
-                or (
-                    is_person_or_org_list_line_by_nlp
-                    and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
-                    and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg)
-                )
-                or (is_numbered_title and not is_a_left_inline_title)
-            )
-            # )
-        ) or (prev_line_is_title and is_consistent_sub_title)
-        is_name_or_org_list_to_be_removed = (
-            (is_person_or_org_list_line_by_nlp)
-            and is_punctuation_heavy
-            and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line)
-        ) and not is_title
-        if is_name_or_org_list_to_be_removed:
-            is_author_or_org_list = True
-        else:
-            is_author_or_org_list = False
-        # return is_title, is_author_or_org_list
-        # """
-        """
-        # print reason why the line is a title
-        if is_title:
-            print_green("This line is a title.")
-            print_green("↓" * 10)
-            print()
-            print("curr_line_text: ", curr_line_text)
-            print()
-            print(f"prev_line_is_title: {prev_line_is_title}")
-            print()
-            print(f"is_consistent_sub_title: {is_consistent_sub_title}")
-        """
-        # print reason why the line is not a title
-        # line_text = curr_line_text.strip()
-        # test_text = "Career/Personal Life"
-        # text_content_condition = line_text == test_text
-        # if not is_title and text_content_condition: # Print specific line
-        """
-        if not is_title: # Print each line
-            print_red("This line is not a title.")
-            print_red("↓" * 10)
-            print()
-            print("curr_line_text: ", curr_line_text)
-            print()
-            if is_not_end_with_ending_puncs:
-                print_green(f"is_not_end_with_ending_puncs")
-            else:
-                print_red(f"is_end_with_ending_puncs")
-            if is_not_only_no_meaning_symbols:
-                print_green(f"is_not_only_no_meaning_symbols")
-            else:
-                print_red(f"is_only_no_meaning_symbols")
-            if is_title_by_len:
-                print_green(f"is_title_by_len: {is_title_by_len}")
-            else:
-                print_red(f"is_not_title_by_len: {is_title_by_len}")
-            if is_equation:
-                print_red(f"is_equation")
-            else:
-                print_green(f"is_not_equation")
-            if is_potential_title_font:
-                print_green(f"is_potential_title_font")
-            else:
-                print_red(f"is_not_potential_title_font")
-            if is_punctuation_heavy:
-                print_red("is_punctuation_heavy")
-            else:
-                print_green("is_not_punctuation_heavy")
-            if is_bold_font:
-                print_green(f"is_bold_font")
-            else:
-                print_red(f"is_not_bold_font")
-            if is_font_size_not_less_than_doc_avg:
-                print_green(f"is_larger_font_than_doc_avg")
-            else:
-                print_red(f"is_not_larger_font_than_doc_avg")
-            if is_much_larger_font_than_doc_avg:
-                print_green(f"is_much_larger_font_than_doc_avg")
-            else:
-                print_red(f"is_not_much_larger_font_than_doc_avg")
-            if is_not_same_font_type_of_docAvg:
-                print_green(f"is_not_same_font_type_of_docAvg")
-            else:
-                print_red(f"is_same_font_type_of_docAvg")
-            if is_word_list_line_by_rules:
-                print_red("is_word_list_line_by_rules")
-            else:
-                print_green("is_not_name_list_by_rules")
-            if is_person_or_org_list_line_by_nlp:
-                print_red("is_person_or_org_list_line_by_nlp")
-            else:
-                print_green("is_not_person_or_org_list_line_by_nlp")
-            if not is_numbered_title:
-                print_red("is_not_numbered_title")
-            else:
-                print_green("is_numbered_title")
-            if is_a_left_inline_title:
-                print_red("is_a_left_inline_title")
-            else:
-                print_green("is_not_a_left_inline_title")
-            if not is_title_by_check_prev_line:
-                print_red("is_not_title_by_check_prev_line")
-            else:
-                print_green("is_title_by_check_prev_line")
-            if not is_title_by_check_next_line:
-                print_red("is_not_title_by_check_next_line")
-            else:
-                print_green("is_title_by_check_next_line")
-            if not is_title_by_check_pre_and_next_line:
-                print_red("is_not_title_by_check_pre_and_next_line")
-            else:
-                print_green("is_title_by_check_pre_and_next_line")
-        # print_green("Common features:")
-        # print_green("↓" * 10)
-        # print(f"    curr_line_font_type: {curr_line_font_type}")
-        # print(f"    curr_line_font_size: {curr_line_font_size}")
-        # print()
-        """
-        # """
-        return is_title, is_author_or_org_list
-    def _detect_title(self, curr_block, pre_block):
-        """
-        Use the functions 'is_potential_title' to detect titles of each paragraph block.
-        If a line is a title, then the value of key 'is_title' of the line will be set to True.
-        """
-        raw_lines = curr_block["lines"]
-        blk_avg_char_width = curr_block["avg_char_width"]
-        blk_avg_char_height = curr_block["avg_char_height"]
-        blk_media_font_size = curr_block["median_font_size"]
-        if self.stage == 0:
-            is_prev_line_a_title = False
-            for i, curr_line in enumerate(raw_lines):
-                prev_line = raw_lines[i - 1] if i > 0 else None
-                next_line = raw_lines[i + 1] if i < len(raw_lines) - 1 else None
-                is_line_a_title, is_line_an_entities_list = self._is_potential_title(
-                    curr_line,
-                    prev_line,
-                    is_prev_line_a_title,
-                    next_line,
-                    blk_avg_char_width,
-                    blk_avg_char_height,
-                    blk_media_font_size,
-                )
-                if is_line_a_title:
-                    curr_line["is_title"] = is_line_a_title
-                    is_prev_line_a_title = True  # set the flag to True for the next line
-                else:
-                    curr_line["is_title"] = False
-                    is_prev_line_a_title = False  # set the flag to False for the next line
-                if is_line_an_entities_list:
-                    curr_line["is_author_or_org_list"] = is_line_an_entities_list
-                else:
-                    curr_line["is_author_or_org_list"] = False
-            return curr_block
-        if self.stage == 1:  # Check the block consistent titles.
-            if pre_block and "paras" in pre_block.keys():
-                print_red(f"Checking cross block title...")
-                last_para_content = None
-                paras_of_pre_block = pre_block["paras"]
-                last_key = sorted(paras_of_pre_block.keys())[-1]
-                last_para_content = paras_of_pre_block[last_key]
-                if last_para_content is not None:
-                    last_line_of_last_para_of_last_block = pre_block["lines"][-1]
-                    first_line_of_curr_block = raw_lines[0]
-                    next_line_of_curr_block = raw_lines[1] if len(raw_lines) > 1 else None
-                    is_line_a_title, is_line_an_entities_list = self._is_potential_title(
-                        first_line_of_curr_block,
-                        last_line_of_last_para_of_last_block,
-                        last_line_of_last_para_of_last_block["is_title"],
-                        next_line_of_curr_block,
-                        blk_avg_char_width,
-                        blk_avg_char_height,
-                        blk_media_font_size,
-                    )
-                    if is_line_a_title:
-                        first_line_of_curr_block["is_title"] = is_line_a_title
-                    else:
-                        first_line_of_curr_block["is_title"] = False
-                    if is_line_an_entities_list:
-                        first_line_of_curr_block["is_author_or_org_list"] = is_line_an_entities_list
-                    else:
-                        first_line_of_curr_block["is_author_or_org_list"] = False
-                    # print(f"first_line_of_curr_block: {first_line_of_curr_block['text']}")
-                    # print(f"last_line_of_pre_block: {last_line_of_last_para['text']}")
-                    return curr_block
-                else:
-                    print_red(f"last_para_content is None")
-    def batch_detect_titles(self, pdf_dic):
-        """
-        This function batch process the blocks to detect titles.
-        Parameters
-        ----------
-        pdf_dict : dict
-            result dictionary
-        Returns
-        -------
-        pdf_dict : dict
-            result dictionary
-        """
-        num_titles = 0
-        for page_id, page_content in pdf_dic.items():
-            if page_id.startswith("page_"):
-                para_blocks = []
-                if "para_blocks" in page_content.keys():
-                    para_blocks = page_content["para_blocks"]
-                    all_single_line_blocks = []
-                    for block in para_blocks:
-                        if len(block["lines"]) == 1:
-                            all_single_line_blocks.append(block)
-                    new_para_blocks = []
-                    if not len(all_single_line_blocks) == len(para_blocks):  # Not all blocks are single line blocks.
-                        for para_idx, para_block in enumerate(para_blocks):
-                            print(f"______________________________________________________")
-                            print(f"page_id: {page_id}")
-                            print(f"para_block id: {para_block['block_id']}")
-                            print(f"para_idx: {para_idx}")
-                            pre_block = para_blocks[para_idx - 1] if para_idx > 0 else None
-                            curr_block = para_block
-                            print_yellow(f"text of current block: {curr_block['text'] if curr_block else None}")
-                            print_green(f"text of previous block: {pre_block['text'] if pre_block else None}")
-                            new_block = self._detect_title(curr_block, pre_block)
-                            new_para_blocks.append(new_block)
-                            # num_titles += sum([line.get("is_title", 0) for line in new_block["lines"]])
-                            if new_block is not None:
-                                num_titles += sum([line.get("is_title", 0) for line in new_block["lines"]])
-                            else:
-                                num_titles += 0
-                    else:  # All blocks are single line blocks.
-                        for para_block in para_blocks:
-                            new_para_blocks.append(para_block)
-                            num_titles += sum([line.get("is_title", 0) for line in para_block["lines"]])
-                    para_blocks = new_para_blocks
-                page_content["para_blocks"] = para_blocks
-                for para_block in para_blocks:
-                    if para_block is not None:
-                        all_titles = all(safe_get(line, "is_title", False) for line in para_block["lines"])
-                        para_text_len = sum([len(line["text"]) for line in para_block["lines"]])
-                        if (
-                            all_titles and para_text_len < 200
-                        ):  # total length of the paragraph is less than 200, more than this should not be a title
-                            para_block["is_block_title"] = 1
-                        else:
-                            para_block["is_block_title"] = 0
-                        all_name_or_org_list_to_be_removed = all(
-                            safe_get(line, "is_author_or_org_list", False) for line in para_block["lines"]
-                        )
-                        if all_name_or_org_list_to_be_removed and page_id == "page_0":
-                            para_block["is_block_an_author_or_org_list"] = 1
-                        else:
-                            para_block["is_block_an_author_or_org_list"] = 0
-                    else:
-                        all_titles = False
-                        # para_block["is_block_title"] = 0
-                        # para_block["is_block_an_author_or_org_list"] = 0
-                # page_content["para_blocks"] = para_blocks
-        pdf_dic["statistics"]["num_titles"] = num_titles
-        return pdf_dic
-    def _recog_title_level(self, title_blocks):
-        """
-        This function determines the title level based on the font size of the title.
-        Parameters
-        ----------
-        title_blocks : list
-        Returns
-        -------
-        title_blocks : list
-        """
-        font_sizes = np.array([safe_get(tb["block"], "block_font_size", 0) for tb in title_blocks])
-        # Use the mean and std of font sizes to remove extreme values
-        mean_font_size = np.mean(font_sizes)
-        std_font_size = np.std(font_sizes)
-        min_extreme_font_size = mean_font_size - std_font_size  # type: ignore
-        max_extreme_font_size = mean_font_size + std_font_size  # type: ignore
-        # Compute the threshold for title level
-        middle_font_sizes = font_sizes[(font_sizes > min_extreme_font_size) & (font_sizes < max_extreme_font_size)]
-        if middle_font_sizes.size > 0:
-            middle_mean_font_size = np.mean(middle_font_sizes)
-            level_threshold = middle_mean_font_size
-        else:
-            level_threshold = mean_font_size
-        for tb in title_blocks:
-            title_block = tb["block"]
-            title_font_size = safe_get(title_block, "block_font_size", 0)
-            current_level = 1  # Initialize title level, the biggest level is 1
-            # print(f"Before adjustment by font size, {current_level}")
-            if title_font_size >= max_extreme_font_size:
-                current_level = 1
-            elif title_font_size <= min_extreme_font_size:
-                current_level = 3
-            elif float(title_font_size) >= float(level_threshold):
-                current_level = 2
-            else:
-                current_level = 3
-            # print(f"After adjustment by font size, {current_level}")
-            title_block["block_title_level"] = current_level
-        return title_blocks
-    def batch_recog_title_level(self, pdf_dic):
-        """
-        This function batch process the blocks to recognize title level.
-        Parameters
-        ----------
-        pdf_dict : dict
-            result dictionary
-        Returns
-        -------
-        pdf_dict : dict
-            result dictionary
-        """
-        title_blocks = []
-        # Collect all titles
-        for page_id, blocks in pdf_dic.items():
-            if page_id.startswith("page_"):
-                para_blocks = blocks.get("para_blocks", [])
-                for block in para_blocks:
-                    if block.get("is_block_title"):
-                        title_obj = {"page_id": page_id, "block": block}
-                        title_blocks.append(title_obj)
-        # Determine title level
-        if title_blocks:
-            # Determine title level based on font size
-            title_blocks = self._recog_title_level(title_blocks)
-        return pdf_dic
-class BlockTerminationProcessor:
-    """
-    This class is used to process the block termination.
-    """
-    def __init__(self) -> None:
-        pass
-    def _is_consistent_lines(
-        self,
-        curr_line,
-        prev_line,
-        next_line,
-        consistent_direction,  # 0 for prev, 1 for next, 2 for both
-    ):
-        """
-        This function checks if the line is consistent with its neighbors
-        Parameters
-        ----------
-        curr_line : dict
-            current line
-        prev_line : dict
-            previous line
-        next_line : dict
-            next line
-        consistent_direction : int
-            0 for prev, 1 for next, 2 for both
-        Returns
-        -------
-        bool
-            True if the line is consistent with its neighbors, False otherwise.
-        """
-        curr_line_font_size = curr_line["spans"][0]["size"]
-        curr_line_font_type = curr_line["spans"][0]["font"].lower()
-        if consistent_direction == 0:
-            if prev_line:
-                prev_line_font_size = prev_line["spans"][0]["size"]
-                prev_line_font_type = prev_line["spans"][0]["font"].lower()
-                return curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type
-            else:
-                return False
-        elif consistent_direction == 1:
-            if next_line:
-                next_line_font_size = next_line["spans"][0]["size"]
-                next_line_font_type = next_line["spans"][0]["font"].lower()
-                return curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
-            else:
-                return False
-        elif consistent_direction == 2:
-            if prev_line and next_line:
-                prev_line_font_size = prev_line["spans"][0]["size"]
-                prev_line_font_type = prev_line["spans"][0]["font"].lower()
-                next_line_font_size = next_line["spans"][0]["size"]
-                next_line_font_type = next_line["spans"][0]["font"].lower()
-                return (curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type) and (
-                    curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
-                )
-            else:
-                return False
-        else:
-            return False
-    def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height):
-        """
-        This function checks if the line is a regular line
-        Parameters
-        ----------
-        curr_line_bbox : list
-            bbox of the current line
-        prev_line_bbox : list
-            bbox of the previous line
-        next_line_bbox : list
-            bbox of the next line
-        avg_char_width : float
-            average of char widths
-        X0 : float
-            median of x0 values, which represents the left average boundary of the page
-        X1 : float
-            median of x1 values, which represents the right average boundary of the page
-        avg_line_height : float
-            average of line heights
-        Returns
-        -------
-        bool
-            True if the line is a regular line, False otherwise.
-        """
-        horizontal_ratio = 0.5
-        vertical_ratio = 0.5
-        horizontal_thres = horizontal_ratio * avg_char_width
-        vertical_thres = vertical_ratio * avg_line_height
-        x0, y0, x1, y1 = curr_line_bbox
-        x0_near_X0 = abs(x0 - X0) < horizontal_thres
-        x1_near_X1 = abs(x1 - X1) < horizontal_thres
-        prev_line_is_end_of_para = prev_line_bbox and (abs(prev_line_bbox[2] - X1) > avg_char_width)
-        sufficient_spacing_above = False
-        if prev_line_bbox:
-            vertical_spacing_above = y1 - prev_line_bbox[3]
-            sufficient_spacing_above = vertical_spacing_above > vertical_thres
-        sufficient_spacing_below = False
-        if next_line_bbox:
-            vertical_spacing_below = next_line_bbox[1] - y0
-            sufficient_spacing_below = vertical_spacing_below > vertical_thres
-        return (
-            (sufficient_spacing_above or sufficient_spacing_below)
-            or (not x0_near_X0 and not x1_near_X1)
-            or prev_line_is_end_of_para
-        )
-    def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size):
-        """
-        This function checks if the line is a possible start of a paragraph
-        Parameters
-        ----------
-        curr_line : dict
-            current line
-        prev_line : dict
-            previous line
-        next_line : dict
-            next line
-        X0 : float
-            median of x0 values, which represents the left average boundary of the page
-        X1 : float
-            median of x1 values, which represents the right average boundary of the page
-        avg_char_width : float
-            average of char widths
-        avg_line_height : float
-            average of line heights
-        Returns
-        -------
-        bool
-            True if the line is a possible start of a paragraph, False otherwise.
-        """
-        start_confidence = 0.5  # Initial confidence of the line being a start of a paragraph
-        decision_path = []  # Record the decision path
-        curr_line_bbox = curr_line["bbox"]
-        prev_line_bbox = prev_line["bbox"] if prev_line else None
-        next_line_bbox = next_line["bbox"] if next_line else None
-        indent_ratio = 1
-        vertical_ratio = 1.5
-        vertical_thres = vertical_ratio * avg_font_size
-        left_horizontal_ratio = 0.5
-        left_horizontal_thres = left_horizontal_ratio * avg_char_width
-        right_horizontal_ratio = 2.5
-        right_horizontal_thres = right_horizontal_ratio * avg_char_width
-        x0, y0, x1, y1 = curr_line_bbox
-        indent_condition = x0 > X0 + indent_ratio * avg_char_width
-        if indent_condition:
-            start_confidence += 0.2
-            decision_path.append("indent_condition_met")
-        x0_near_X0 = abs(x0 - X0) < left_horizontal_thres
-        if x0_near_X0:
-            start_confidence += 0.1
-            decision_path.append("x0_near_X0")
-        x1_near_X1 = abs(x1 - X1) < right_horizontal_thres
-        if x1_near_X1:
-            start_confidence += 0.1
-            decision_path.append("x1_near_X1")
-        if prev_line is None:
-            prev_line_is_end_of_para = True
-            start_confidence += 0.2
-            decision_path.append("no_prev_line")
-        else:
-            prev_line_is_end_of_para, _, _ = self._is_possible_end_of_para(prev_line, next_line, X0, X1, avg_char_width)
-            if prev_line_is_end_of_para:
-                start_confidence += 0.1
-                decision_path.append("prev_line_is_end_of_para")
-        sufficient_spacing_above = False
-        if prev_line_bbox:
-            vertical_spacing_above = y1 - prev_line_bbox[3]
-            sufficient_spacing_above = vertical_spacing_above > vertical_thres
-            if sufficient_spacing_above:
-                start_confidence += 0.2
-                decision_path.append("sufficient_spacing_above")
-        sufficient_spacing_below = False
-        if next_line_bbox:
-            vertical_spacing_below = next_line_bbox[1] - y0
-            sufficient_spacing_below = vertical_spacing_below > vertical_thres
-            if sufficient_spacing_below:
-                start_confidence += 0.2
-                decision_path.append("sufficient_spacing_below")
-        is_regular_line = self._is_regular_line(
-            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_font_size
-        )
-        if is_regular_line:
-            start_confidence += 0.1
-            decision_path.append("is_regular_line")
-        is_start_of_para = (
-            (sufficient_spacing_above or sufficient_spacing_below)
-            or (indent_condition)
-            or (not indent_condition and x0_near_X0 and x1_near_X1 and not is_regular_line)
-            or prev_line_is_end_of_para
-        )
-        return (is_start_of_para, start_confidence, decision_path)
-    def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width):
-        """
-        This function checks if the line is a possible end of a paragraph
-        Parameters
-        ----------
-        curr_line : dict
-            current line
-        next_line : dict
-            next line
-        X0 : float
-            median of x0 values, which represents the left average boundary of the page
-        X1 : float
-            median of x1 values, which represents the right average boundary of the page
-        avg_char_width : float
-            average of char widths
-        Returns
-        -------
-        bool
-            True if the line is a possible end of a paragraph, False otherwise.
-        """
-        end_confidence = 0.5  # Initial confidence of the line being a end of a paragraph
-        decision_path = []  # Record the decision path
-        curr_line_bbox = curr_line["bbox"]
-        next_line_bbox = next_line["bbox"] if next_line else None
-        left_horizontal_ratio = 0.5
-        right_horizontal_ratio = 0.5
-        x0, _, x1, y1 = curr_line_bbox
-        next_x0, next_y0, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
-        x0_near_X0 = abs(x0 - X0) < left_horizontal_ratio * avg_char_width
-        if x0_near_X0:
-            end_confidence += 0.1
-            decision_path.append("x0_near_X0")
-        x1_smaller_than_X1 = x1 < X1 - right_horizontal_ratio * avg_char_width
-        if x1_smaller_than_X1:
-            end_confidence += 0.1
-            decision_path.append("x1_smaller_than_X1")
-        next_line_is_start_of_para = (
-            next_line_bbox
-            and (next_x0 > X0 + left_horizontal_ratio * avg_char_width)
-            and (not is_line_left_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width, direction=1))
-        )
-        if next_line_is_start_of_para:
-            end_confidence += 0.2
-            decision_path.append("next_line_is_start_of_para")
-        is_line_left_aligned_from_neighbors_bool = is_line_left_aligned_from_neighbors(
-            curr_line_bbox, None, next_line_bbox, avg_char_width
-        )
-        if is_line_left_aligned_from_neighbors_bool:
-            end_confidence += 0.1
-            decision_path.append("line_is_left_aligned_from_neighbors")
-        is_line_right_aligned_from_neighbors_bool = is_line_right_aligned_from_neighbors(
-            curr_line_bbox, None, next_line_bbox, avg_char_width
-        )
-        if not is_line_right_aligned_from_neighbors_bool:
-            end_confidence += 0.1
-            decision_path.append("line_is_not_right_aligned_from_neighbors")
-        is_end_of_para = end_with_punctuation(curr_line["text"]) and (
-            (x0_near_X0 and x1_smaller_than_X1)
-            or (is_line_left_aligned_from_neighbors_bool and not is_line_right_aligned_from_neighbors_bool)
-        )
-        return (is_end_of_para, end_confidence, decision_path)
-    def _cut_paras_per_block(
-        self,
-        block,
-    ):
-        """
-        Processes a raw block from PyMuPDF and returns the processed block.
-        Parameters
-        ----------
-        raw_block : dict
-            A raw block from pymupdf.
-        Returns
-        -------
-        processed_block : dict
-        """
-        def _construct_para(lines, is_block_title, para_title_level):
-            """
-            Construct a paragraph from given lines.
-            """
-            font_sizes = [span["size"] for line in lines for span in line["spans"]]
-            avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0
-            font_colors = [span["color"] for line in lines for span in line["spans"]]
-            most_common_font_color = max(set(font_colors), key=font_colors.count) if font_colors else None
-            font_type_lengths = {}
-            for line in lines:
-                for span in line["spans"]:
-                    font_type = span["font"]
-                    bbox_width = span["bbox"][2] - span["bbox"][0]
-                    if font_type in font_type_lengths:
-                        font_type_lengths[font_type] += bbox_width
-                    else:
-                        font_type_lengths[font_type] = bbox_width
-            # get the font type with the longest bbox width
-            most_common_font_type = max(font_type_lengths, key=font_type_lengths.get) if font_type_lengths else None  # type: ignore
-            para_bbox = calculate_para_bbox(lines)
-            para_text = " ".join(line["text"] for line in lines)
-            return {
-                "para_bbox": para_bbox,
-                "para_text": para_text,
-                "para_font_type": most_common_font_type,
-                "para_font_size": avg_font_size,
-                "para_font_color": most_common_font_color,
-                "is_para_title": is_block_title,
-                "para_title_level": para_title_level,
-            }
-        block_bbox = block["bbox"]
-        block_text = block["text"]
-        block_lines = block["lines"]
-        X0 = safe_get(block, "X0", 0)
-        X1 = safe_get(block, "X1", 0)
-        avg_char_width = safe_get(block, "avg_char_width", 0)
-        avg_char_height = safe_get(block, "avg_char_height", 0)
-        avg_font_size = safe_get(block, "avg_font_size", 0)
-        is_block_title = safe_get(block, "is_block_title", False)
-        para_title_level = safe_get(block, "block_title_level", 0)
-        # Segment into paragraphs
-        para_ranges = []
-        in_paragraph = False
-        start_idx_of_para = None
-        # Create the processed paragraphs
-        processed_paras = {}
-        para_bboxes = []
-        end_idx_of_para = 0
-        for line_index, line in enumerate(block_lines):
-            curr_line = line
-            prev_line = block_lines[line_index - 1] if line_index > 0 else None
-            next_line = block_lines[line_index + 1] if line_index < len(block_lines) - 1 else None
-            """
-            Start processing paragraphs.
-            """
-            # Check if the line is the start of a paragraph
-            is_start_of_para, start_confidence, decision_path = self._is_possible_start_of_para(
-                curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size
-            )
-            if not in_paragraph and is_start_of_para:
-                in_paragraph = True
-                start_idx_of_para = line_index
-                # print_green(">>> Start of a paragraph")
-                # print("    curr_line_text: ", curr_line["text"])
-                # print("    start_confidence: ", start_confidence)
-                # print("    decision_path: ", decision_path)
-            # Check if the line is the end of a paragraph
-            is_end_of_para, end_confidence, decision_path = self._is_possible_end_of_para(
-                curr_line, next_line, X0, X1, avg_char_width
-            )
-            if in_paragraph and (is_end_of_para or not next_line):
-                para_ranges.append((start_idx_of_para, line_index))
-                start_idx_of_para = None
-                in_paragraph = False
-                # print_red(">>> End of a paragraph")
-                # print("    curr_line_text: ", curr_line["text"])
-                # print("    end_confidence: ", end_confidence)
-                # print("    decision_path: ", decision_path)
-        # Add the last paragraph if it is not added
-        if in_paragraph and start_idx_of_para is not None:
-            para_ranges.append((start_idx_of_para, len(block_lines) - 1))
-        # Process the matched paragraphs
-        for para_index, (start_idx, end_idx) in enumerate(para_ranges):
-            matched_lines = block_lines[start_idx : end_idx + 1]
-            para_properties = _construct_para(matched_lines, is_block_title, para_title_level)
-            para_key = f"para_{len(processed_paras)}"
-            processed_paras[para_key] = para_properties
-            para_bboxes.append(para_properties["para_bbox"])
-            end_idx_of_para = end_idx + 1
-        # Deal with the remaining lines
-        if end_idx_of_para < len(block_lines):
-            unmatched_lines = block_lines[end_idx_of_para:]
-            unmatched_properties = _construct_para(unmatched_lines, is_block_title, para_title_level)
-            unmatched_key = f"para_{len(processed_paras)}"
-            processed_paras[unmatched_key] = unmatched_properties
-            para_bboxes.append(unmatched_properties["para_bbox"])
-        block["paras"] = processed_paras
-        return block
-    def batch_process_blocks(self, pdf_dict):
-        """
-        Parses the blocks of all pages.
-        Parameters
-        ----------
-        pdf_dict : dict
-            PDF dictionary.
-        filter_blocks : list
-            List of bounding boxes to filter.
-        Returns
-        -------
-        result_dict : dict
-            Result dictionary.
-        """
-        num_paras = 0
-        for page_id, page in pdf_dict.items():
-            if page_id.startswith("page_"):
-                para_blocks = []
-                if "para_blocks" in page.keys():
-                    input_blocks = page["para_blocks"]
-                    for input_block in input_blocks:
-                        new_block = self._cut_paras_per_block(input_block)
-                        para_blocks.append(new_block)
-                        num_paras += len(new_block["paras"])
-                page["para_blocks"] = para_blocks
-        pdf_dict["statistics"]["num_paras"] = num_paras
-        return pdf_dict
-class BlockContinuationProcessor:
-    """
-    This class is used to process the blocks to detect block continuations.
-    """
-    def __init__(self) -> None:
-        pass
-    def __is_similar_font_type(self, font_type_1, font_type_2, prefix_length_ratio=0.3):
-        """
-        This function checks if the two font types are similar.
-        Definition of similar font types: the two font types have a common prefix,
-        and the length of the common prefix is at least a certain ratio of the length of the shorter font type.
-        Parameters
-        ----------
-        font_type1 : str
-            font type 1
-        font_type2 : str
-            font type 2
-        prefix_length_ratio : float
-            minimum ratio of the common prefix length to the length of the shorter font type
-        Returns
-        -------
-        bool
-            True if the two font types are similar, False otherwise.
-        """
-        if isinstance(font_type_1, list):
-            font_type_1 = font_type_1[0] if font_type_1 else ""
-        if isinstance(font_type_2, list):
-            font_type_2 = font_type_2[0] if font_type_2 else ""
-        if font_type_1 == font_type_2:
-            return True
-        # Find the length of the common prefix
-        common_prefix_length = len(os.path.commonprefix([font_type_1, font_type_2]))
-        # Calculate the minimum prefix length based on the ratio
-        min_prefix_length = int(min(len(font_type_1), len(font_type_2)) * prefix_length_ratio)
-        return common_prefix_length >= min_prefix_length
-    def __is_same_block_font(self, block_1, block_2):
-        """
-        This function compares the font of block1 and block2
-        Parameters
-        ----------
-        block1 : dict
-            block1
-        block2 : dict
-            block2
-        Returns
-        -------
-        is_same : bool
-            True if block1 and block2 have the same font, else False
-        """
-        block_1_font_type = safe_get(block_1, "block_font_type", "")
-        block_1_font_size = safe_get(block_1, "block_font_size", 0)
-        block_1_avg_char_width = safe_get(block_1, "avg_char_width", 0)
-        block_2_font_type = safe_get(block_2, "block_font_type", "")
-        block_2_font_size = safe_get(block_2, "block_font_size", 0)
-        block_2_avg_char_width = safe_get(block_2, "avg_char_width", 0)
-        if isinstance(block_1_font_size, list):
-            block_1_font_size = block_1_font_size[0] if block_1_font_size else 0
-        if isinstance(block_2_font_size, list):
-            block_2_font_size = block_2_font_size[0] if block_2_font_size else 0
-        block_1_text = safe_get(block_1, "text", "")
-        block_2_text = safe_get(block_2, "text", "")
-        if block_1_avg_char_width == 0 or block_2_avg_char_width == 0:
-            return False
-        if not block_1_text or not block_2_text:
-            return False
-        else:
-            text_len_ratio = len(block_2_text) / len(block_1_text)
-            if text_len_ratio < 0.2:
-                avg_char_width_condition = (
-                    abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
-                    < 0.5
-                )
-            else:
-                avg_char_width_condition = (
-                    abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
-                    < 0.2
-                )
-        block_font_size_condition = abs(block_1_font_size - block_2_font_size) < 1
-        return (
-            self.__is_similar_font_type(block_1_font_type, block_2_font_type)
-            and avg_char_width_condition
-            and block_font_size_condition
-        )
-    def _is_alphabet_char(self, char):
-        if (char >= "\u0041" and char <= "\u005a") or (char >= "\u0061" and char <= "\u007a"):
-            return True
-        else:
-            return False
-    def _is_chinese_char(self, char):
-        if char >= "\u4e00" and char <= "\u9fa5":
-            return True
-        else:
-            return False
-    def _is_other_letter_char(self, char):
-        try:
-            cat = unicodedata.category(char)
-            if cat == "Lu" or cat == "Ll":
-                return not self._is_alphabet_char(char) and not self._is_chinese_char(char)
-        except TypeError:
-            print("The input to the function must be a single character.")
-        return False
-    def _is_year(self, s: str):
-        try:
-            number = int(s)
-            return 1900 <= number <= 2099
-        except ValueError:
-            return False
-    def _match_brackets(self, text):
-        # pattern = r"^[\(\)\[\]（）【】{}｛｝<>＜＞〔〕〘〙\"\'“”‘’]"
-        pattern = r"^[\(\)\]（）】{}｛｝>＞〕〙\"\'“”‘’]"
-        return bool(re.match(pattern, text))
-    def _is_para_font_consistent(self, para_1, para_2):
-        """
-        This function compares the font of para1 and para2
-        Parameters
-        ----------
-        para1 : dict
-            para1
-        para2 : dict
-            para2
-        Returns
-        -------
-        is_same : bool
-            True if para1 and para2 have the same font, else False
-        """
-        if para_1 is None or para_2 is None:
-            return False
-        para_1_font_type = safe_get(para_1, "para_font_type", "")
-        para_1_font_size = safe_get(para_1, "para_font_size", 0)
-        para_1_font_color = safe_get(para_1, "para_font_color", "")
-        para_2_font_type = safe_get(para_2, "para_font_type", "")
-        para_2_font_size = safe_get(para_2, "para_font_size", 0)
-        para_2_font_color = safe_get(para_2, "para_font_color", "")
-        if isinstance(para_1_font_type, list):  # get the most common font type
-            para_1_font_type = max(set(para_1_font_type), key=para_1_font_type.count)
-        if isinstance(para_2_font_type, list):
-            para_2_font_type = max(set(para_2_font_type), key=para_2_font_type.count)
-        if isinstance(para_1_font_size, list):  # compute average font type
-            para_1_font_size = sum(para_1_font_size) / len(para_1_font_size)
-        if isinstance(para_2_font_size, list):  # compute average font type
-            para_2_font_size = sum(para_2_font_size) / len(para_2_font_size)
-        return (
-            self.__is_similar_font_type(para_1_font_type, para_2_font_type)
-            and abs(para_1_font_size - para_2_font_size) < 1.5
-            # and para_font_color1 == para_font_color2
-        )
-    def _is_para_puncs_consistent(self, para_1, para_2):
-        """
-        This function determines whether para1 and para2 are originally from the same paragraph by checking the puncs of para1(former) and para2(latter)
-        Parameters
-        ----------
-        para1 : dict
-            para1
-        para2 : dict
-            para2
-        Returns
-        -------
-        is_same : bool
-            True if para1 and para2 are from the same paragraph by using the puncs, else False
-        """
-        para_1_text = safe_get(para_1, "para_text", "").strip()
-        para_2_text = safe_get(para_2, "para_text", "").strip()
-        para_1_bboxes = safe_get(para_1, "para_bbox", [])
-        para_1_font_sizes = safe_get(para_1, "para_font_size", 0)
-        para_2_bboxes = safe_get(para_2, "para_bbox", [])
-        para_2_font_sizes = safe_get(para_2, "para_font_size", 0)
-        # print_yellow("    Features of determine puncs_consistent:")
-        # print(f"    para_1_text: {para_1_text}")
-        # print(f"    para_2_text: {para_2_text}")
-        # print(f"    para_1_bboxes: {para_1_bboxes}")
-        # print(f"    para_2_bboxes: {para_2_bboxes}")
-        # print(f"    para_1_font_sizes: {para_1_font_sizes}")
-        # print(f"    para_2_font_sizes: {para_2_font_sizes}")
-        if is_nested_list(para_1_bboxes):
-            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes[-1]
-        else:
-            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes
-        if is_nested_list(para_2_bboxes):
-            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes[0]
-            para_2_font_sizes = para_2_font_sizes[0]  # type: ignore
-        else:
-            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes
-        right_align_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
-        are_two_paras_right_aligned = abs(x1_1 - x1_2) < right_align_threshold
-        left_indent_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
-        is_para1_left_indent_than_papa2 = x0_1 - x0_2 > left_indent_threshold
-        is_para2_left_indent_than_papa1 = x0_2 - x0_1 > left_indent_threshold
-        # Check if either para_text1 or para_text2 is empty
-        if not para_1_text or not para_2_text:
-            return False
-        # Define the end puncs for a sentence to end and hyphen
-        end_puncs = [".", "?", "!", "。", "？", "！", "…"]
-        hyphen = ["-", "—"]
-        # Check if para_text1 ends with either hyphen or non-end punctuation or spaces
-        para_1_end_with_hyphen = para_1_text and para_1_text[-1] in hyphen
-        para_1_end_with_end_punc = para_1_text and para_1_text[-1] in end_puncs
-        para_1_end_with_space = para_1_text and para_1_text[-1] == " "
-        para_1_not_end_with_end_punc = para_1_text and para_1_text[-1] not in end_puncs
-        # print_yellow(f"    para_1_end_with_hyphen: {para_1_end_with_hyphen}")
-        # print_yellow(f"    para_1_end_with_end_punc: {para_1_end_with_end_punc}")
-        # print_yellow(f"    para_1_not_end_with_end_punc: {para_1_not_end_with_end_punc}")
-        # print_yellow(f"    para_1_end_with_space: {para_1_end_with_space}")
-        if para_1_end_with_hyphen:  # If para_text1 ends with hyphen
-            # print_red(f"para_1 is end with hyphen.")
-            para_2_is_consistent = para_2_text and (
-                para_2_text[0] in hyphen
-                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
-                or (self._is_chinese_char(para_2_text[0]))
-                or (self._is_other_letter_char(para_2_text[0]))
-            )
-            if para_2_is_consistent:
-                # print(f"para_2 is consistent.\n")
-                return True
-            else:
-                # print(f"para_2 is not consistent.\n")
-                pass
-        elif para_1_end_with_end_punc:  # If para_text1 ends with ending punctuations
-            # print_red(f"para_1 is end with end_punc.")
-            para_2_is_consistent = (
-                para_2_text
-                and (
-                    para_2_text[0]
-                    == " "
-                    # or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].isupper())
-                    # or (self._is_chinese_char(para_2_text[0]))
-                    # or (self._is_other_letter_char(para_2_text[0]))
-                )
-                and not is_para2_left_indent_than_papa1
-            )
-            if para_2_is_consistent:
-                # print(f"para_2 is consistent.\n")
-                return True
-            else:
-                # print(f"para_2 is not consistent.\n")
-                pass
-        elif para_1_not_end_with_end_punc:  # If para_text1 is not end with ending punctuations
-            # print_red(f"para_1 is NOT end with end_punc.")
-            para_2_is_consistent = para_2_text and (
-                para_2_text[0] == " "
-                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
-                or (self._is_alphabet_char(para_2_text[0]))
-                or (self._is_year(para_2_text[0:4]))
-                or (are_two_paras_right_aligned or is_para1_left_indent_than_papa2)
-                or (self._is_chinese_char(para_2_text[0]))
-                or (self._is_other_letter_char(para_2_text[0]))
-                or (self._match_brackets(para_2_text[0]))
-            )
-            if para_2_is_consistent:
-                # print(f"para_2 is consistent.\n")
-                return True
-            else:
-                # print(f"para_2 is not consistent.\n")
-                pass
-        elif para_1_end_with_space:  # If para_text1 ends with space
-            # print_red(f"para_1 is end with space.")
-            para_2_is_consistent = para_2_text and (
-                para_2_text[0] == " "
-                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
-                or (self._is_chinese_char(para_2_text[0]))
-                or (self._is_other_letter_char(para_2_text[0]))
-            )
-            if para_2_is_consistent:
-                # print(f"para_2 is consistent.\n")
-                return True
-            else:
-                pass
-                # print(f"para_2 is not consistent.\n")
-        return False
-    def _is_block_consistent(self, block_1, block_2):
-        """
-        This function determines whether block1 and block2 are originally from the same block
-        Parameters
-        ----------
-        block1 : dict
-            block1s
-        block2 : dict
-            block2
-        Returns
-        -------
-        is_same : bool
-            True if block1 and block2 are from the same block, else False
-        """
-        return self.__is_same_block_font(block_1, block_2)
-    def _is_para_continued(self, para_1, para_2):
-        """
-        This function determines whether para1 and para2 are originally from the same paragraph
-        Parameters
-        ----------
-        para1 : dict
-            para1
-        para2 : dict
-            para2
-        Returns
-        -------
-        is_same : bool
-            True if para1 and para2 are from the same paragraph, else False
-        """
-        is_para_font_consistent = self._is_para_font_consistent(para_1, para_2)
-        is_para_puncs_consistent = self._is_para_puncs_consistent(para_1, para_2)
-        return is_para_font_consistent and is_para_puncs_consistent
-    def _are_boundaries_of_block_consistent(self, block_1, block_2):
-        """
-        This function checks if the boundaries of block1 and block2 are consistent
-        Parameters
-        ----------
-        block1 : dict
-            block1
-        block2 : dict
-            block2
-        Returns
-        -------
-        is_consistent : bool
-            True if the boundaries of block1 and block2 are consistent, else False
-        """
-        last_line_of_block_1 = block_1["lines"][-1]
-        first_line_of_block_2 = block_2["lines"][0]
-        spans_of_last_line_of_block_1 = last_line_of_block_1["spans"]
-        spans_of_first_line_of_block_2 = first_line_of_block_2["spans"]
-        font_type_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["font"].lower()
-        font_size_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["size"]
-        font_color_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["color"]
-        font_flags_of_last_line_of_block_1 = spans_of_last_line_of_block_1[0]["flags"]
-        font_type_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["font"].lower()
-        font_size_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["size"]
-        font_color_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["color"]
-        font_flags_of_first_line_of_block_2 = spans_of_first_line_of_block_2[0]["flags"]
-        return (
-            self.__is_similar_font_type(font_type_of_last_line_of_block_1, font_type_of_first_line_of_block_2)
-            and abs(font_size_of_last_line_of_block_1 - font_size_of_first_line_of_block_2) < 1
-            # and font_color_of_last_line_of_block1 == font_color_of_first_line_of_block2
-            and font_flags_of_last_line_of_block_1 == font_flags_of_first_line_of_block_2
-        )
-    def should_merge_next_para(self, curr_para, next_para):
-        """
-        This function checks if the next_para should be merged into the curr_para.
-        Parameters
-        ----------
-        curr_para : dict
-            The current paragraph.
-        next_para : dict
-            The next paragraph.
-        Returns
-        -------
-        bool
-            True if the next_para should be merged into the curr_para, False otherwise.
-        """
-        if self._is_para_continued(curr_para, next_para):
-            return True
-        else:
-            return False
-    def batch_tag_paras(self, pdf_dict):
-        """
-        This function tags the paragraphs in the pdf_dict.
-        Parameters
-        ----------
-        pdf_dict : dict
-            PDF dictionary.
-        Returns
-        -------
-        pdf_dict : dict
-            PDF dictionary with tagged paragraphs.
-        """
-        the_last_page_id = len(pdf_dict) - 1
-        for curr_page_idx, (curr_page_id, curr_page_content) in enumerate(pdf_dict.items()):
-            if curr_page_id.startswith("page_") and curr_page_content.get("para_blocks", []):
-                para_blocks_of_curr_page = curr_page_content["para_blocks"]
-                next_page_idx = curr_page_idx + 1
-                next_page_id = f"page_{next_page_idx}"
-                next_page_content = pdf_dict.get(next_page_id, {})
-                for i, current_block in enumerate(para_blocks_of_curr_page):
-                    for para_id, curr_para in current_block["paras"].items():
-                        curr_para["curr_para_location"] = [
-                            curr_page_idx,
-                            current_block["block_id"],
-                            int(para_id.split("_")[-1]),
-                        ]
-                        curr_para["next_para_location"] = None  # 默认设置为None
-                        curr_para["merge_next_para"] = False  # 默认设置为False
-                    next_block = para_blocks_of_curr_page[i + 1] if i < len(para_blocks_of_curr_page) - 1 else None
-                    if next_block:
-                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
-                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
-                        next_block_first_para_key = list(next_block["paras"].keys())[0]
-                        next_blk_first_para = next_block["paras"][next_block_first_para_key]
-                        if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
-                            curr_blk_last_para["next_para_location"] = [
-                                curr_page_idx,
-                                next_block["block_id"],
-                                int(next_block_first_para_key.split("_")[-1]),
-                            ]
-                            curr_blk_last_para["merge_next_para"] = True
-                    else:
-                        # Handle the case where the next block is in a different page
-                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
-                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
-                        while not next_page_content.get("para_blocks", []) and next_page_idx <= the_last_page_id:
-                            next_page_idx += 1
-                            next_page_id = f"page_{next_page_idx}"
-                            next_page_content = pdf_dict.get(next_page_id, {})
-                        if next_page_content.get("para_blocks", []):
-                            next_blk_first_para_key = list(next_page_content["para_blocks"][0]["paras"].keys())[0]
-                            next_blk_first_para = next_page_content["para_blocks"][0]["paras"][next_blk_first_para_key]
-                            if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
-                                curr_blk_last_para["next_para_location"] = [
-                                    next_page_idx,
-                                    next_page_content["para_blocks"][0]["block_id"],
-                                    int(next_blk_first_para_key.split("_")[-1]),
-                                ]
-                                curr_blk_last_para["merge_next_para"] = True
-        return pdf_dict
-    def find_block_by_id(self, para_blocks, block_id):
-        """
-        This function finds a block by its id.
-        Parameters
-        ----------
-        para_blocks : list
-            List of blocks.
-        block_id : int
-            Id of the block to find.
-        Returns
-        -------
-        block : dict
-            The block with the given id.
-        """
-        for block in para_blocks:
-            if block.get("block_id") == block_id:
-                return block
-        return None
-    def batch_merge_paras(self, pdf_dict):
-        """
-        This function merges the paragraphs in the pdf_dict.
-        Parameters
-        ----------
-        pdf_dict : dict
-            PDF dictionary.
-        Returns
-        -------
-        pdf_dict : dict
-            PDF dictionary with merged paragraphs.
-        """
-        for page_id, page_content in pdf_dict.items():
-            if page_id.startswith("page_") and page_content.get("para_blocks", []):
-                para_blocks_of_page = page_content["para_blocks"]
-                for i in range(len(para_blocks_of_page)):
-                    current_block = para_blocks_of_page[i]
-                    paras = current_block["paras"]
-                    for para_id, curr_para in list(paras.items()):
-                        # 跳过标题段落
-                        if curr_para.get("is_para_title"):
-                            continue
-                        while curr_para.get("merge_next_para"):
-                            next_para_location = curr_para.get("next_para_location")
-                            if not next_para_location:
-                                break
-                            next_page_idx, next_block_id, next_para_id = next_para_location
-                            next_page_id = f"page_{next_page_idx}"
-                            next_page_content = pdf_dict.get(next_page_id)
-                            if not next_page_content:
-                                break
-                            next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
-                            if not next_block:
-                                break
-                            next_para = next_block["paras"].get(f"para_{next_para_id}")
-                            if not next_para or next_para.get("is_para_title"):
-                                break
-                            # 合并段落文本
-                            curr_para_text = curr_para.get("para_text", "")
-                            next_para_text = next_para.get("para_text", "")
-                            curr_para["para_text"] = curr_para_text + " " + next_para_text
-                            # 更新 next_para_location
-                            curr_para["next_para_location"] = next_para.get("next_para_location")
-                            # 将下一个段落文本置为空，表示已被合并
-                            next_para["para_text"] = ""
-                            # 更新 merge_next_para 标记
-                            curr_para["merge_next_para"] = next_para.get("merge_next_para", False)
-        return pdf_dict
-class DrawAnnos:
-    """
-    This class draws annotations on the pdf file
-    ----------------------------------------
-                Color Code
-    ----------------------------------------
-        Red: (1, 0, 0)
-        Green: (0, 1, 0)
-        Blue: (0, 0, 1)
-        Yellow: (1, 1, 0) - mix of red and green
-        Cyan: (0, 1, 1) - mix of green and blue
-        Magenta: (1, 0, 1) - mix of red and blue
-        White: (1, 1, 1) - red, green and blue full intensity
-        Black: (0, 0, 0) - no color component whatsoever
-        Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
-        Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
-    """
-    def __init__(self) -> None:
-        pass
-    def __is_nested_list(self, lst):
-        """
-        This function returns True if the given list is a nested list of any degree.
-        """
-        if isinstance(lst, list):
-            return any(self.__is_nested_list(i) for i in lst) or any(isinstance(i, list) for i in lst)
-        return False
-    def __valid_rect(self, bbox):
-        # Ensure that the rectangle is not empty or invalid
-        if isinstance(bbox[0], list):
-            return False  # It's a nested list, hence it can't be valid rect
-        else:
-            return bbox[0] < bbox[2] and bbox[1] < bbox[3]
-    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
-        """
-        This function draws the nested boxes
-        Parameters
-        ----------
-        page : fitz.Page
-            page
-        nested_bbox : list
-            nested bbox
-        color : tuple
-            color, by default (0, 1, 1)    # draw with cyan color for combined paragraph
-        """
-        if self.__is_nested_list(nested_bbox):  # If it's a nested list
-            for bbox in nested_bbox:
-                self.__draw_nested_boxes(page, bbox, color)  # Recursively call the function
-        elif self.__valid_rect(nested_bbox):  # If valid rectangle
-            para_rect = fitz.Rect(nested_bbox)
-            para_anno = page.add_rect_annot(para_rect)
-            para_anno.set_colors(stroke=color)  # draw with cyan color for combined paragraph
-            para_anno.set_border(width=1)
-            para_anno.update()
-    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
-        """
-        This function draws annotations on the pdf file.
-        Parameters
-        ----------
-        input_pdf_path : str
-            path to the input pdf file
-        pdf_dic : dict
-            pdf dictionary
-        output_pdf_path : str
-            path to the output pdf file
-        pdf_dic : dict
-            pdf dictionary
-        """
-        pdf_doc = open_pdf(input_pdf_path)
-        if pdf_dic is None:
-            pdf_dic = {}
-        if output_pdf_path is None:
-            output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf")
-        for page_id, page in enumerate(pdf_doc):  # type: ignore
-            page_key = f"page_{page_id}"
-            for ele_key, ele_data in pdf_dic[page_key].items():
-                if ele_key == "para_blocks":
-                    para_blocks = ele_data
-                    for para_block in para_blocks:
-                        if "paras" in para_block.keys():
-                            paras = para_block["paras"]
-                            for para_key, para_content in paras.items():
-                                para_bbox = para_content["para_bbox"]
-                                # print(f"para_bbox: {para_bbox}")
-                                # print(f"is a nested list: {self.__is_nested_list(para_bbox)}")
-                                if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
-                                    color = (0, 1, 1)
-                                    self.__draw_nested_boxes(
-                                        page, para_bbox, color
-                                    )  # draw with cyan color for combined paragraph
-                                else:
-                                    if self.__valid_rect(para_bbox):
-                                        para_rect = fitz.Rect(para_bbox)
-                                        para_anno = page.add_rect_annot(para_rect)
-                                        para_anno.set_colors(stroke=(0, 1, 0))  # draw with green color for normal paragraph
-                                        para_anno.set_border(width=0.5)
-                                        para_anno.update()
-                                is_para_title = para_content["is_para_title"]
-                                if is_para_title:
-                                    if self.__is_nested_list(para_content["para_bbox"]) and len(para_content["para_bbox"]) > 1:
-                                        color = (0, 0, 1)
-                                        self.__draw_nested_boxes(
-                                            page, para_content["para_bbox"], color
-                                        )  # draw with cyan color for combined title
-                                    else:
-                                        if self.__valid_rect(para_content["para_bbox"]):
-                                            para_rect = fitz.Rect(para_content["para_bbox"])
-                                            if self.__valid_rect(para_content["para_bbox"]):
-                                                para_anno = page.add_rect_annot(para_rect)
-                                                para_anno.set_colors(stroke=(0, 0, 1))  # draw with blue color for normal title
-                                                para_anno.set_border(width=0.5)
-                                                para_anno.update()
-        pdf_doc.save(output_pdf_path)
-        pdf_doc.close()
-class ParaProcessPipeline:
-    def __init__(self) -> None:
-        pass
-    def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None):
-        """
-        This function processes the paragraphs, including:
-        1. Read raw input json file into pdf_dic
-        2. Detect and replace equations
-        3. Combine spans into a natural line
-        4. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
-        5. Compute statistics for each block
-        6. Detect titles in the document
-        7. Detect paragraphs inside each block
-        8. Divide the level of the titles
-        9. Detect and combine paragraphs from different blocks into one paragraph
-        10. Check whether the final results after checking headings, dividing paragraphs within blocks, and merging paragraphs between blocks are plausible and reasonable.
-        11. Draw annotations on the pdf file
-        Parameters
-        ----------
-        pdf_dic_json_fpath : str
-            path to the pdf dictionary json file.
-            Notice: data noises, including overlap blocks, header, footer, watermark, vertical margin note have been removed already.
-        input_pdf_doc : str
-            path to the input pdf file
-        output_pdf_path : str
-            path to the output pdf file
-        Returns
-        -------
-        pdf_dict : dict
-            result dictionary
-        """
-        error_info = None
-        output_json_file = ""
-        output_dir = ""
-        if input_pdf_path is not None:
-            input_pdf_path = os.path.abspath(input_pdf_path)
-            # print_green_on_red(f">>>>>>>>>>>>>>>>>>> Process the paragraphs of {input_pdf_path}")
-        if output_pdf_path is not None:
-            output_dir = os.path.dirname(output_pdf_path)
-            output_json_file = f"{output_dir}/pdf_dic.json"
-        def __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode):
-            """
-            Save the pdf_dic to a json file
-            """
-            output_pdf_file_name = os.path.basename(output_pdf_path)
-            # output_dir = os.path.dirname(output_pdf_path)
-            output_dir = "\\tmp\\pdf_parse"
-            output_pdf_file_name = output_pdf_file_name.replace(".pdf", f"_stage_{stage}.json")
-            pdf_dic_json_fpath = os.path.join(output_dir, output_pdf_file_name)
-            if not os.path.exists(output_dir):
-                os.makedirs(output_dir)
-            if para_debug_mode == "full":
-                with open(pdf_dic_json_fpath, "w", encoding="utf-8") as f:
-                    json.dump(pdf_dic, f, indent=2, ensure_ascii=False)
-            # Validate the output already exists
-            if not os.path.exists(pdf_dic_json_fpath):
-                print_red(f"Failed to save the pdf_dic to {pdf_dic_json_fpath}")
-                return None
-            else:
-                print_green(f"Succeed to save the pdf_dic to {pdf_dic_json_fpath}")
-            return pdf_dic_json_fpath
-        """
-        Preprocess the lines of block
-        """
-        # Combine spans into a natural line
-        rawBlockProcessor = RawBlockProcessor()
-        pdf_dic = rawBlockProcessor.batch_process_blocks(pdf_info_dict)
-        # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
-        # Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
-        layoutFilter = LayoutFilterProcessor()
-        pdf_dic = layoutFilter.batch_process_blocks(pdf_dic)
-        # Compute statistics for each block
-        blockStatisticsCalculator = BlockStatisticsCalculator()
-        pdf_dic = blockStatisticsCalculator.batch_process_blocks(pdf_dic)
-        # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
-        # Compute statistics for all blocks(namely this pdf document)
-        docStatisticsCalculator = DocStatisticsCalculator()
-        pdf_dic = docStatisticsCalculator.calc_stats_of_doc(pdf_dic)
-        # print(f"pdf_dic['statistics']: {pdf_dic['statistics']}", end="\n\n")
-        # Dump the first three stages of pdf_dic to a json file
-        if para_debug_mode == "full":
-            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode)
-        """
-        Detect titles in the document
-        """
-        doc_statistics = pdf_dic["statistics"]
-        titleProcessor = TitleProcessor(doc_statistics)
-        titleProcessor.stage = 0
-        pdf_dic = titleProcessor.batch_detect_titles(pdf_dic)
-        if para_debug_mode == "full":
-            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="1", para_debug_mode=para_debug_mode)
-        """
-        Detect and divide the level of the titles
-        """
-        titleProcessor = TitleProcessor()
-        pdf_dic = titleProcessor.batch_recog_title_level(pdf_dic)
-        if para_debug_mode == "full":
-            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="2", para_debug_mode=para_debug_mode)
-        """
-        Detect and split paragraphs inside each block
-        """
-        blockInnerParasProcessor = BlockTerminationProcessor()
-        pdf_dic = blockInnerParasProcessor.batch_process_blocks(pdf_dic)
-        if para_debug_mode == "full":
-            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode=para_debug_mode)
-        # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode="full")
-        # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
-        """
-        Detect and combine paragraphs from different blocks into one paragraph
-        """
-        blockContinuationProcessor = BlockContinuationProcessor()
-        pdf_dic = blockContinuationProcessor.batch_tag_paras(pdf_dic)
-        pdf_dic = blockContinuationProcessor.batch_merge_paras(pdf_dic)
-        """
-        Detect titles in the document again
-        """
-        doc_statistics = pdf_dic["statistics"]
-        titleProcessor = TitleProcessor(doc_statistics)
-        titleProcessor.stage = 1
-        # pdf_dic = titleProcessor.batch_detect_titles(pdf_dic)
-        if para_debug_mode == "full":
-            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode=para_debug_mode)
-        # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode="full")
-        # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
-        """
-        Discard pdf files by checking exceptions and return the error info to the caller
-        """
-        discardByException = DiscardByException()
-        is_discard_by_single_line_block = discardByException.discard_by_single_line_block(
-            pdf_dic, exception=DenseSingleLineBlockException()
-        )
-        is_discard_by_title_detection = discardByException.discard_by_title_detection(
-            pdf_dic, exception=TitleDetectionException()
-        )
-        is_discard_by_title_level = discardByException.discard_by_title_level(pdf_dic, exception=TitleLevelException())
-        is_discard_by_split_para = discardByException.discard_by_split_para(pdf_dic, exception=ParaSplitException())
-        is_discard_by_merge_para = discardByException.discard_by_merge_para(pdf_dic, exception=ParaMergeException())
-        if is_discard_by_single_line_block is not None:
-            error_info = is_discard_by_single_line_block
-        elif is_discard_by_title_detection is not None:
-            error_info = is_discard_by_title_detection
-        elif is_discard_by_title_level is not None:
-            error_info = is_discard_by_title_level
-        elif is_discard_by_split_para is not None:
-            error_info = is_discard_by_split_para
-        elif is_discard_by_merge_para is not None:
-            error_info = is_discard_by_merge_para
-        if error_info is not None:
-            return pdf_dic, error_info
-        """
-        Dump the final pdf_dic to a json file
-        """
-        if para_debug_mode is not None:
-            with open(output_json_file, "w", encoding="utf-8") as f:
-                json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
-        """
-        Draw the annotations
-        """
-        if para_debug_mode is not None:
-            drawAnnos = DrawAnnos()
-            drawAnnos.draw_annos(input_pdf_path, pdf_dic, output_pdf_path)
-        """
-        Remove the intermediate files which are generated in the process of paragraph processing if debug_mode is simple
-        """
-        if para_debug_mode is not None:
-            for fpath in os.listdir(output_dir):
-                if fpath.endswith(".json") and "stage" in fpath:
-                    os.remove(os.path.join(output_dir, fpath))
-        return pdf_dic, error_info
-"""
-Run this script to test the function with Command: 
-python detect_para.py [pdf_path] [output_pdf_path]
-Params:
- pdf_path: the path of the pdf file
- output_pdf_path: the path of the output pdf file
-"""
-if __name__ == "__main__":
-    DEFAULT_PDF_PATH = (
-        "app/pdf_toolbox/tests/assets/paper/paper.pdf" if os.name != "nt" else "app\\pdf_toolbox\\tests\\assets\\paper\\paper.pdf"
-    )
-    input_pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
-    output_pdf_path = sys.argv[2] if len(sys.argv) > 2 else input_pdf_path.split(".")[0] + "_recogPara.pdf"
-    output_json_path = sys.argv[3] if len(sys.argv) > 3 else input_pdf_path.split(".")[0] + "_recogPara.json"
-    import stat
-    # Remove existing output file if it exists
-    if os.path.exists(output_pdf_path):
-        os.chmod(output_pdf_path, stat.S_IWRITE)
-        os.remove(output_pdf_path)
-    input_pdf_doc = open_pdf(input_pdf_path)
-    # postprocess the paragraphs
-    paraProcessPipeline = ParaProcessPipeline()
-    # parse paragraph and save to json file
-    pdf_dic = {}
-    blockInnerParasProcessor = BlockTerminationProcessor()
-    """
-    Construct the pdf dictionary.
-    """
-    for page_id, page in enumerate(input_pdf_doc):  # type: ignore
-        # print(f"Processing page {page_id}")
-        # print(f"page: {page}")
-        raw_blocks = page.get_text("dict")["blocks"]
-        # Save text blocks to "preproc_blocks"
-        preproc_blocks = []
-        for block in raw_blocks:
-            if block["type"] == 0:
-                preproc_blocks.append(block)
-        layout_bboxes = []
-        # Construct the pdf dictionary as schema above
-        page_dict = {
-            "para_blocks": None,
-            "preproc_blocks": preproc_blocks,
-            "images": None,
-            "tables": None,
-            "interline_equations": None,
-            "inline_equations": None,
-            "layout_bboxes": None,
-            "pymu_raw_blocks": None,
-            "global_statistic": None,
-            "droped_text_block": None,
-            "droped_image_block": None,
-            "droped_table_block": None,
-            "image_backup": None,
-            "table_backup": None,
-        }
-        pdf_dic[f"page_{page_id}"] = page_dict
-    # print(f"pdf_dic: {pdf_dic}")
-    with open(output_json_path, "w", encoding="utf-8") as f:
-        json.dump(pdf_dic, f, ensure_ascii=False, indent=4)
-    pdf_dic = paraProcessPipeline.para_process_pipeline(output_json_path, input_pdf_doc, output_pdf_path)
--- a/others/pdf2text_recogTitle.py
+++ b/others/pdf2text_recogTitle.py
-from magic_pdf.libs.commons import fitz             # pyMuPDF库
-def parse_titles(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, exclude_bboxes):
-    """
-    :param page_ID: int类型，当前page在当前pdf文档中是第page_D页。
-    :param page :fitz读取的当前页的内容
-    :param res_dir_path: str类型，是每一个pdf文档，在当前.py文件的目录下生成一个与pdf文档同名的文件夹，res_dir_path就是文件夹的dir
-    :param json_from_DocXchain_obj: dict类型，把pdf文档送入DocXChain模型中后，提取bbox，结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
-    """
-    DPI = 72  # use this resolution
-    pix = page.get_pixmap(dpi=DPI)
-    pageL = 0
-    pageR = int(pix.w)
-    pageU = 0
-    pageD = int(pix.h)
-    #--------- 通过json_from_DocXchain来获取 title ---------#
-    title_bbox_from_DocXChain = []
-    xf_json = json_from_DocXchain_obj
-    width_from_json = xf_json['page_info']['width']
-    height_from_json = xf_json['page_info']['height']
-    LR_scaleRatio = width_from_json / (pageR - pageL)
-    UD_scaleRatio = height_from_json / (pageD - pageU)
-    # {0: 'title',  # 标题
-    # 1: 'figure', # 图片
-    #  2: 'plain text',  # 文本
-    #  3: 'header',      # 页眉
-    #  4: 'page number', # 页码
-    #  5: 'footnote',    # 脚注
-    #  6: 'footer',      # 页脚
-    #  7: 'table',       # 表格
-    #  8: 'table caption',  # 表格描述
-    #  9: 'figure caption', # 图片描述
-    #  10: 'equation',      # 公式
-    #  11: 'full column',   # 单栏
-    #  12: 'sub column',    # 多栏
-    #  13: 'embedding',     # 嵌入公式
-    #  14: 'isolated'}      # 单行公式
-    for xf in xf_json['layout_dets']:
-        L = xf['poly'][0] / LR_scaleRatio
-        U = xf['poly'][1] / UD_scaleRatio
-        R = xf['poly'][2] / LR_scaleRatio
-        D = xf['poly'][5] / UD_scaleRatio
-        # L += pageL          # 有的页面，artBox偏移了。不在（0,0）
-        # R += pageL
-        # U += pageU
-        # D += pageU
-        L, R = min(L, R), max(L, R)
-        U, D = min(U, D), max(U, D)
-        if xf['category_id'] == 0 and xf['score'] >= 0.3:
-            title_bbox_from_DocXChain.append((L, U, R, D))
-    title_final_names = []
-    title_final_bboxs = []
-    title_ID = 0
-    for L, U, R, D in title_bbox_from_DocXChain:
-        # cur_title = page.get_pixmap(clip=(L,U,R,D))
-        new_title_name = "title_{}_{}.png".format(page_ID, title_ID)    # 标题name
-        # cur_title.save(res_dir_path + '/' + new_title_name)           # 把标题存储在新建的文件夹，并命名
-        title_final_names.append(new_title_name)                        # 把标题的名字存在list中
-        title_final_bboxs.append((L, U, R, D))
-        title_ID += 1
-    title_final_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0]))
-    curPage_all_title_bboxs = title_final_bboxs
-    return curPage_all_title_bboxs
--- a/others/vali_bbox_sort.py
+++ b/others/vali_bbox_sort.py
-import numpy as np
-import tqdm
-import json
-from validation import cal_edit_distance, format_gt_bbox
-from magic_pdf.layout.layout_sort import sort_with_layout
-with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json', 'r') as f:
-    samples = json.load(f)
-# labels = []
-# det_res = []
-edit_distance_dict = []
-edit_distance_list = []
-for i, sample in tqdm.tqdm(enumerate(samples)):
-    pdf_name = sample['pdf_name']
-    s3_pdf_path = sample['s3_path']
-    page_num = sample['page']
-    page_width = sample['annotations']['width']
-    page_height = sample['annotations']['height']
-    # pre = main(s3_pdf_path, pdf_bin_file_profile, join_path(pdf_model_dir, pdf_name), pdf_model_profile, save_path, page_num)
-    # pre_dict_list = []
-    # for item in pre:
-    #     pre_sample = {
-    #         'box': [item[0],item[1],item[2],item[3]],
-    #         'type': item[7],
-    #         'score': 1
-    #     }
-    #     pre_dict_list.append(pre_sample)
-    # det_res.append(pre_dict_list)
-    # match_change_dict = {   # 待确认
-    #     "figure": "image",
-    #     "svg_figure": "image",
-    #     "inline_fomula": "equations_inline",
-    #     "fomula": "equation_interline",
-    #     "figure_caption": "text",
-    #     "table_caption": "text",
-    #     "fomula_caption": "text"
-    # }
-    gt_annos = sample['annotations']
-    # matched_label = label_match(gt_annos, match_change_dict)
-    # labels.append(matched_label)
-    # 判断排序函数的精度
-    # 目前不考虑caption与图表相同序号的问题
-    ignore_category = ['abandon', 'figure_caption', 'table_caption', 'formula_caption', 'inline_fomula'] 
-    gt_bboxes = format_gt_bbox(gt_annos, ignore_category)
-    sorted_bboxes, _ = sort_with_layout(gt_bboxes, page_width, page_height)
-    if sorted_bboxes:
-        edit_distance = cal_edit_distance(sorted_bboxes)
-        edit_distance_list.append(edit_distance)
-        edit_distance_dict.append({
-            "sample_id": i,
-            "s3_path": s3_pdf_path,
-            "page_num": page_num,
-            "page_s2_path": sample['page_path'],
-            "edit_distance": edit_distance
-        })
-# label_classes = ["image", "text", "table", "equation_interline"]
-# detect_matrix = detect_val(labels, det_res, label_classes)
-# print('detect_matrix', detect_matrix)
-edit_distance_mean = np.mean(edit_distance_list)
-print('edit_distance_mean', edit_distance_mean)
-edit_distance_dict_sorted = sorted(edit_distance_dict, key=lambda x: x['edit_distance'], reverse=True)
-# print(edit_distance_dict_sorted)
-result = {
-    "edit_distance_mean": edit_distance_mean,
-    "edit_distance_dict_sorted": edit_distance_dict_sorted
-}
-with open('vali_bbox_sort_result.json', 'w') as f:
-    json.dump(result, f)
\ No newline at end of file
--- a/others/validation.py
+++ b/others/validation.py
-import numpy as np
-from mmeval import COCODetection
-import distance
-def reformat_gt_and_pred(labels, det_res, label_classes):
-    preds = []
-    gts = []
-    for idx, (ann, pred) in enumerate(zip(labels, det_res)):
-        # with open(label_path, "r") as f:
-        #     ann = json.load(f)
-        gt_bboxes = []
-        gt_labels = []
-        for item in ann['step_1']['result']:
-            if item['attribute'] in label_classes:
-                gt_bboxes.append([item['x'], item['y'], item['x']+item['width'], item['y']+item['height']])
-                gt_labels.append(label_classes.index(item['attribute']))
-        gts.append({
-            'img_id': idx,
-            'width': ann['width'],
-            'height': ann['height'],
-            'bboxes': np.array(gt_bboxes),
-            'labels': np.array(gt_labels),
-            'ignore_flags': [False]*len(gt_labels),
-        })
-        bboxes = []
-        labels = []
-        scores = []
-        for item in pred:
-            bboxes.append(item['box'])
-            labels.append(label_classes.index(item['type']))
-            scores.append(item['score'])
-        preds.append({
-            'img_id': idx,
-            'bboxes': np.array(bboxes),
-            'scores': np.array(scores),
-            'labels': np.array(labels),
-        })
-    return gts, preds
-def detect_val(labels, det_res, label_classes):
-    # label_classes = ['inline_formula', "formula"]
-    meta={'CLASSES':tuple(label_classes)}
-    coco_det_metric = COCODetection(dataset_meta=meta, metric=['bbox'])
-    gts, preds = reformat_gt_and_pred(labels, det_res, label_classes)
-    res = coco_det_metric(predictions=preds, groundtruths=gts)
-    return res
-def label_match(annotations, match_change_dict):
-    for item in annotations['step_1']['result']:
-        if item['attribute'] in match_change_dict.keys():
-            item['attribute'] = match_change_dict[item['attribute']]
-    return annotations
-def format_gt_bbox(annotations, ignore_category):
-    gt_bboxes = []
-    for item in annotations['step_1']['result']:
-        if item['textAttribute'] and item['attribute'] not in ignore_category:
-            x0 = item['x']
-            y0 = item['y']
-            x1 = item['x'] + item['width']
-            y1 = item['y'] + item['height']
-            order = item['textAttribute']
-            category = item['attribute']
-            gt_bboxes.append([x0, y0, x1, y1, order, None, None, category])
-    return gt_bboxes
-def cal_edit_distance(sorted_bboxes):  
-    # order_list = [int(bbox[4]) for bbox in sorted_bboxes]
-    # print(sorted_bboxes[0][0][12])
-    order_list = [int(bbox[12]) for bbox in sorted_bboxes]
-    sorted_order = sorted(order_list, key=int)
-    distance_cal = distance.levenshtein(order_list, sorted_order)
-    if len(order_list) > 0:
-        return distance_cal / len(order_list)
-    else:
-        return 0
\ No newline at end of file
--- a/requirements-qa.txt
+++ b/requirements-qa.txt
+pytest
 Levenshtein
 nltk
 rapidfuzz
@@ -11,4 +12,5 @@ scipy
 scikit-learn
 tqdm
 htmltabletomd
 pypandoc
\ No newline at end of file
+pyopenssl==24.0.0
\ No newline at end of file
--- a/tests/test_cli/conf/conf.py
+++ b/tests/test_cli/conf/conf.py
@@ -2,7 +2,6 @@ import os
 conf = {
 "code_path": os.environ.get('GITHUB_WORKSPACE'),
 "pdf_dev_path" : os.environ.get('GITHUB_WORKSPACE') + "/tests/test_cli/pdf_dev",
-"pdf_res_path": "/share/quyuan/mineru/data/mineru"
+"pdf_res_path": "/tmp"
 }