Commit 826086d2 authored by zhougaofeng

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/__init__.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, magic_pdf/libs/vis_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, 
magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pipe/__init__.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/detect_para.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/rw/__init__.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py files
parent 57aaa1cf
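# ---- magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py (deleted file; boundary inferred from the commit's file list) ----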
from .visualizer import Visualizer
from .rcnn_vl import *
from .backbone import *
from detectron2.config import get_cfg
from detectron2.config import CfgNode as CN
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.data.datasets import register_coco_instances
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch, DefaultPredictor
def add_vit_config(cfg):
"""
Add config for VIT.
"""
_C = cfg
_C.MODEL.VIT = CN()
# CoaT model name.
_C.MODEL.VIT.NAME = ""
# Output features from CoaT backbone.
_C.MODEL.VIT.OUT_FEATURES = ["layer3", "layer5", "layer7", "layer11"]
_C.MODEL.VIT.IMG_SIZE = [224, 224]
_C.MODEL.VIT.POS_TYPE = "shared_rel"
_C.MODEL.VIT.DROP_PATH = 0.
_C.MODEL.VIT.MODEL_KWARGS = "{}"
_C.SOLVER.OPTIMIZER = "ADAMW"
_C.SOLVER.BACKBONE_MULTIPLIER = 1.0
_C.AUG = CN()
_C.AUG.DETR = False
_C.MODEL.IMAGE_ONLY = True
_C.PUBLAYNET_DATA_DIR_TRAIN = ""
_C.PUBLAYNET_DATA_DIR_TEST = ""
_C.FOOTNOTE_DATA_DIR_TRAIN = ""
_C.FOOTNOTE_DATA_DIR_VAL = ""
_C.SCIHUB_DATA_DIR_TRAIN = ""
_C.SCIHUB_DATA_DIR_TEST = ""
_C.JIAOCAI_DATA_DIR_TRAIN = ""
_C.JIAOCAI_DATA_DIR_TEST = ""
_C.ICDAR_DATA_DIR_TRAIN = ""
_C.ICDAR_DATA_DIR_TEST = ""
_C.M6DOC_DATA_DIR_TEST = ""
_C.DOCSTRUCTBENCH_DATA_DIR_TEST = ""
_C.DOCSTRUCTBENCHv2_DATA_DIR_TEST = ""
_C.CACHE_DIR = ""
_C.MODEL.CONFIG_PATH = ""
# effective update steps would be MAX_ITER/GRADIENT_ACCUMULATION_STEPS
# maybe need to set MAX_ITER *= GRADIENT_ACCUMULATION_STEPS
_C.SOLVER.GRADIENT_ACCUMULATION_STEPS = 1
def setup(args, device):
"""
Create configs and perform basic setups.
"""
cfg = get_cfg()
# add_coat_config(cfg)
add_vit_config(cfg)
cfg.merge_from_file(args.config_file)
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.2 # set threshold for this model
cfg.merge_from_list(args.opts)
# use the unified device configuration
cfg.MODEL.DEVICE = device
cfg.freeze()
default_setup(cfg, args)
# TODO: can this block be removed?
# register_coco_instances(
# "scihub_train",
# {},
# cfg.SCIHUB_DATA_DIR_TRAIN + ".json",
# cfg.SCIHUB_DATA_DIR_TRAIN
# )
return cfg
class DotDict(dict):
def __init__(self, *args, **kwargs):
super(DotDict, self).__init__(*args, **kwargs)
def __getattr__(self, key):
if key not in self.keys():
return None
value = self[key]
if isinstance(value, dict):
value = DotDict(value)
return value
def __setattr__(self, key, value):
self[key] = value
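# Example (sketch, not in the original file): DotDict makes nested dicts
# attribute-accessible, and missing keys return None instead of raising.
#
#     cfg = DotDict({"model": {"name": "layoutlmv3"}})
#     cfg.model.name   # -> "layoutlmv3"
#     cfg.missing      # -> None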
class Layoutlmv3_Predictor(object):
def __init__(self, weights, config_file, device):
layout_args = {
"config_file": config_file,
"resume": False,
"eval_only": False,
"num_gpus": 1,
"num_machines": 1,
"machine_rank": 0,
"dist_url": "tcp://127.0.0.1:57823",
"opts": ["MODEL.WEIGHTS", weights],
}
layout_args = DotDict(layout_args)
cfg = setup(layout_args, device)
self.mapping = ["title", "plain text", "abandon", "figure", "figure_caption", "table", "table_caption",
"table_footnote", "isolate_formula", "formula_caption"]
MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).thing_classes = self.mapping
self.predictor = DefaultPredictor(cfg)
def __call__(self, image, ignore_catids=None):
    # avoid a mutable default argument
    if ignore_catids is None:
        ignore_catids = []
# page_layout_result = {
# "layout_dets": []
# }
layout_dets = []
outputs = self.predictor(image)
instances = outputs["instances"].to("cpu")  # move to CPU once instead of three times
boxes = instances.pred_boxes.tensor.tolist()
labels = instances.pred_classes.tolist()
scores = instances.scores.tolist()
for bbox_idx in range(len(boxes)):
if labels[bbox_idx] in ignore_catids:
continue
layout_dets.append({
"category_id": labels[bbox_idx],
"poly": [
boxes[bbox_idx][0], boxes[bbox_idx][1],
boxes[bbox_idx][2], boxes[bbox_idx][1],
boxes[bbox_idx][2], boxes[bbox_idx][3],
boxes[bbox_idx][0], boxes[bbox_idx][3],
],
"score": scores[bbox_idx]
})
return layout_dets
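# Usage sketch (not part of the original file; the weights path is a
# placeholder, while the config file is the one shipped under
# magic_pdf/resources/model_config/layoutlmv3/ per this commit's file list):
#
#     predictor = Layoutlmv3_Predictor(
#         weights="models/model_final.pth",
#         config_file="magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml",
#         device="cuda",
#     )
#     layout_dets = predictor(image)  # image: ndarray accepted by DefaultPredictor
#
# Each entry has the form {"category_id": int, "poly": [x0, y0, ..., x3, y3],
# "score": float}, with "poly" listing the four box corners clockwise from
# the top-left.

# ---- magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py (deleted file; boundary inferred from the commit's file list) ----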
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import numpy as np
from typing import Dict, List, Optional, Tuple
import torch
from torch import nn
from detectron2.config import configurable
from detectron2.structures import ImageList, Instances
from detectron2.utils.events import get_event_storage
from detectron2.modeling.backbone import Backbone, build_backbone
from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from detectron2.modeling.meta_arch import GeneralizedRCNN
from detectron2.modeling.postprocessing import detector_postprocess
from detectron2.modeling.roi_heads.fast_rcnn import fast_rcnn_inference_single_image
from contextlib import contextmanager
from itertools import count
@META_ARCH_REGISTRY.register()
class VLGeneralizedRCNN(GeneralizedRCNN):
"""
Generalized R-CNN. Any model that contains the following three components:
1. Per-image feature extraction (aka backbone)
2. Region proposal generation
3. Per-region feature extraction and prediction
"""
def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
"""
Args:
batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
Each item in the list contains the inputs for one image.
For now, each item in the list is a dict that contains:
* image: Tensor, image in (C, H, W) format.
* instances (optional): groundtruth :class:`Instances`
* proposals (optional): :class:`Instances`, precomputed proposals.
Other information that's included in the original dicts, such as:
* "height", "width" (int): the output resolution of the model, used in inference.
See :meth:`postprocess` for details.
Returns:
list[dict]:
Each dict is the output for one input image.
The dict contains one key "instances" whose value is a :class:`Instances`.
The :class:`Instances` object has the following keys:
"pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
"""
if not self.training:
return self.inference(batched_inputs)
images = self.preprocess_image(batched_inputs)
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
else:
gt_instances = None
# features = self.backbone(images.tensor)
input = self.get_batch(batched_inputs, images)
features = self.backbone(input)
if self.proposal_generator is not None:
proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
else:
assert "proposals" in batched_inputs[0]
proposals = [x["proposals"].to(self.device) for x in batched_inputs]
proposal_losses = {}
_, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
if self.vis_period > 0:
storage = get_event_storage()
if storage.iter % self.vis_period == 0:
self.visualize_training(batched_inputs, proposals)
losses = {}
losses.update(detector_losses)
losses.update(proposal_losses)
return losses
def inference(
self,
batched_inputs: List[Dict[str, torch.Tensor]],
detected_instances: Optional[List[Instances]] = None,
do_postprocess: bool = True,
):
"""
Run inference on the given inputs.
Args:
batched_inputs (list[dict]): same as in :meth:`forward`
detected_instances (None or list[Instances]): if not None, it
contains an `Instances` object per image. The `Instances`
object contains "pred_boxes" and "pred_classes" which are
known boxes in the image.
The inference will then skip the detection of bounding boxes,
and only predict other per-ROI outputs.
do_postprocess (bool): whether to apply post-processing on the outputs.
Returns:
When do_postprocess=True, same as in :meth:`forward`.
Otherwise, a list[Instances] containing raw network outputs.
"""
assert not self.training
images = self.preprocess_image(batched_inputs)
# features = self.backbone(images.tensor)
input = self.get_batch(batched_inputs, images)
features = self.backbone(input)
if detected_instances is None:
if self.proposal_generator is not None:
proposals, _ = self.proposal_generator(images, features, None)
else:
assert "proposals" in batched_inputs[0]
proposals = [x["proposals"].to(self.device) for x in batched_inputs]
results, _ = self.roi_heads(images, features, proposals, None)
else:
detected_instances = [x.to(self.device) for x in detected_instances]
results = self.roi_heads.forward_with_given_boxes(features, detected_instances)
if do_postprocess:
assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
else:
return results
def get_batch(self, examples, images):
    if len(examples) >= 1 and "bbox" not in examples[0]:  # image_only
        return {"images": images.tensor}
    # The multimodal (text + bbox) branch is unused in this pipeline; the
    # original fall-through returned the built-in `input`, which was a bug.
    raise NotImplementedError("get_batch only supports image-only inputs")
def _batch_inference(self, batched_inputs, detected_instances=None):
"""
Execute inference on a list of inputs,
using a fixed batch size of 2 instead of the length of the list.
Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference`
"""
if detected_instances is None:
detected_instances = [None] * len(batched_inputs)
outputs = []
inputs, instances = [], []
for idx, input, instance in zip(count(), batched_inputs, detected_instances):
inputs.append(input)
instances.append(instance)
if len(inputs) == 2 or idx == len(batched_inputs) - 1:
outputs.extend(
self.inference(
inputs,
instances if instances[0] is not None else None,
do_postprocess=True, # False
)
)
inputs, instances = [], []
return outputs
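# ---- magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py (deleted file; boundary inferred from the commit's file list) ----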
# Copyright (c) Facebook, Inc. and its affiliates.
import colorsys
import logging
import math
import numpy as np
from enum import Enum, unique
import cv2
import matplotlib as mpl
import matplotlib.colors as mplc
import matplotlib.figure as mplfigure
import pycocotools.mask as mask_util
import torch
from matplotlib.backends.backend_agg import FigureCanvasAgg
from PIL import Image
from detectron2.data import MetadataCatalog
from detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes
from detectron2.utils.file_io import PathManager
from detectron2.utils.colormap import random_color
logger = logging.getLogger(__name__)
__all__ = ["ColorMode", "VisImage", "Visualizer"]
_SMALL_OBJECT_AREA_THRESH = 1000
_LARGE_MASK_AREA_THRESH = 120000
_OFF_WHITE = (1.0, 1.0, 240.0 / 255)
_BLACK = (0, 0, 0)
_RED = (1.0, 0, 0)
_KEYPOINT_THRESHOLD = 0.05
#CLASS_NAMES = ["footnote", "footer", "header"]
@unique
class ColorMode(Enum):
"""
Enum of different color modes to use for instance visualizations.
"""
IMAGE = 0
"""
Picks a random color for every instance and overlay segmentations with low opacity.
"""
SEGMENTATION = 1
"""
Let instances of the same category have similar colors
(from metadata.thing_colors), and overlay them with
high opacity. This provides more attention on the quality of segmentation.
"""
IMAGE_BW = 2
"""
Same as IMAGE, but convert all areas without masks to gray-scale.
Only available for drawing per-instance mask predictions.
"""
class GenericMask:
"""
Attributes:
    polygons (list[ndarray]): polygons for this mask.
        Each ndarray has format [x, y, x, y, ...]
mask (ndarray): a binary mask
"""
def __init__(self, mask_or_polygons, height, width):
self._mask = self._polygons = self._has_holes = None
self.height = height
self.width = width
m = mask_or_polygons
if isinstance(m, dict):
# RLEs
assert "counts" in m and "size" in m
if isinstance(m["counts"], list): # uncompressed RLEs
h, w = m["size"]
assert h == height and w == width
m = mask_util.frPyObjects(m, h, w)
self._mask = mask_util.decode(m)[:, :]
return
if isinstance(m, list): # list[ndarray]
self._polygons = [np.asarray(x).reshape(-1) for x in m]
return
if isinstance(m, np.ndarray): # assumed to be a binary mask
assert m.shape[1] != 2, m.shape
assert m.shape == (
height,
width,
), f"mask shape: {m.shape}, target dims: {height}, {width}"
self._mask = m.astype("uint8")
return
raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m)))
@property
def mask(self):
if self._mask is None:
self._mask = self.polygons_to_mask(self._polygons)
return self._mask
@property
def polygons(self):
if self._polygons is None:
self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
return self._polygons
@property
def has_holes(self):
if self._has_holes is None:
if self._mask is not None:
self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
else:
self._has_holes = False # if original format is polygon, does not have holes
return self._has_holes
def mask_to_polygons(self, mask):
# cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level
# hierarchy. External contours (boundary) of the object are placed in hierarchy-1.
# Internal contours (holes) are placed in hierarchy-2.
# cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours.
mask = np.ascontiguousarray(mask)  # some versions of cv2 do not support non-contiguous arrays
res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
hierarchy = res[-1]
if hierarchy is None: # empty mask
return [], False
has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0
res = res[-2]
res = [x.flatten() for x in res]
# These coordinates from OpenCV are integers in range [0, W-1 or H-1].
# We add 0.5 to turn them into real-value coordinate space. A better solution
# would be to first +0.5 and then dilate the returned polygon by 0.5.
res = [x + 0.5 for x in res if len(x) >= 6]
return res, has_holes
def polygons_to_mask(self, polygons):
rle = mask_util.frPyObjects(polygons, self.height, self.width)
rle = mask_util.merge(rle)
return mask_util.decode(rle)[:, :]
def area(self):
return self.mask.sum()
def bbox(self):
p = mask_util.frPyObjects(self.polygons, self.height, self.width)
p = mask_util.merge(p)
bbox = mask_util.toBbox(p)
bbox[2] += bbox[0]
bbox[3] += bbox[1]
return bbox
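# Example (sketch, not in the original file): round-trip a square polygon.
#
#     gm = GenericMask([[10, 10, 40, 10, 40, 40, 10, 40]], height=50, width=50)
#     gm.mask      # 50x50 uint8 binary mask, rasterized via pycocotools
#     gm.bbox()    # -> approximately [10., 10., 40., 40.] in XYXY (toBbox
#                  #    returns XYWH; bbox() converts by adding x0/y0)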
class _PanopticPrediction:
"""
Unify different panoptic annotation/prediction formats
"""
def __init__(self, panoptic_seg, segments_info, metadata=None):
if segments_info is None:
assert metadata is not None
# If "segments_info" is None, we assume "panoptic_img" is a
# H*W int32 image storing the panoptic_id in the format of
# category_id * label_divisor + instance_id. We reserve -1 for
# VOID label.
label_divisor = metadata.label_divisor
segments_info = []
for panoptic_label in np.unique(panoptic_seg.numpy()):
if panoptic_label == -1:
# VOID region.
continue
pred_class = panoptic_label // label_divisor
isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values()
segments_info.append(
{
"id": int(panoptic_label),
"category_id": int(pred_class),
"isthing": bool(isthing),
}
)
del metadata
self._seg = panoptic_seg
self._sinfo = {s["id"]: s for s in segments_info} # seg id -> seg info
segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True)
areas = areas.numpy()
sorted_idxs = np.argsort(-areas)
self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs]
self._seg_ids = self._seg_ids.tolist()
for sid, area in zip(self._seg_ids, self._seg_areas):
if sid in self._sinfo:
self._sinfo[sid]["area"] = float(area)
def non_empty_mask(self):
"""
Returns:
(H, W) array, a mask for all pixels that have a prediction
"""
empty_ids = []
for sid in self._seg_ids:
    if sid not in self._sinfo:
        empty_ids.append(sid)
if len(empty_ids) == 0:
    return np.zeros(self._seg.shape, dtype=np.uint8)
assert (
    len(empty_ids) == 1
), ">1 ids corresponds to no labels. This is currently not supported"
return (self._seg != empty_ids[0]).numpy().astype(bool)
def semantic_masks(self):
for sid in self._seg_ids:
sinfo = self._sinfo.get(sid)
if sinfo is None or sinfo["isthing"]:
# Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions.
continue
yield (self._seg == sid).numpy().astype(bool), sinfo
def instance_masks(self):
for sid in self._seg_ids:
sinfo = self._sinfo.get(sid)
if sinfo is None or not sinfo["isthing"]:
continue
mask = (self._seg == sid).numpy().astype(bool)
if mask.sum() > 0:
yield mask, sinfo
def _create_text_labels(classes, scores, class_names, is_crowd=None):
"""
Args:
classes (list[int] or None):
scores (list[float] or None):
class_names (list[str] or None):
is_crowd (list[bool] or None):
Returns:
list[str] or None
"""
#class_names = CLASS_NAMES
labels = None
if classes is not None:
if class_names is not None and len(class_names) > 0:
labels = [class_names[i] for i in classes]
else:
labels = [str(i) for i in classes]
if scores is not None:
if labels is None:
labels = ["{:.0f}%".format(s * 100) for s in scores]
else:
labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
if labels is not None and is_crowd is not None:
labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
return labels
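# Example (sketch, not in the original file), using the class mapping from
# Layoutlmv3_Predictor above:
#
#     _create_text_labels([0, 5], [0.98, 0.87],
#                         ["title", "plain text", "abandon", "figure",
#                          "figure_caption", "table", "table_caption",
#                          "table_footnote", "isolate_formula", "formula_caption"])
#     # -> ["title 98%", "table 87%"]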
class VisImage:
def __init__(self, img, scale=1.0):
"""
Args:
img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
scale (float): scale the input image
"""
self.img = img
self.scale = scale
self.width, self.height = img.shape[1], img.shape[0]
self._setup_figure(img)
def _setup_figure(self, img):
"""
Args:
Same as in :meth:`__init__()`.
Returns:
fig (matplotlib.pyplot.figure): top level container for all the image plot elements.
ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system.
"""
fig = mplfigure.Figure(frameon=False)
self.dpi = fig.get_dpi()
# add a small 1e-2 to avoid precision loss due to matplotlib's truncation
# (https://github.com/matplotlib/matplotlib/issues/15363)
fig.set_size_inches(
(self.width * self.scale + 1e-2) / self.dpi,
(self.height * self.scale + 1e-2) / self.dpi,
)
self.canvas = FigureCanvasAgg(fig)
# self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
ax.axis("off")
self.fig = fig
self.ax = ax
self.reset_image(img)
def reset_image(self, img):
"""
Args:
img: same as in __init__
"""
img = img.astype("uint8")
self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")
def save(self, filepath):
"""
Args:
filepath (str): a string that contains the absolute path, including the file name, where
the visualized image will be saved.
"""
self.fig.savefig(filepath)
def get_image(self):
"""
Returns:
ndarray:
the visualized image of shape (H, W, 3) (RGB) in uint8 type.
The shape is scaled w.r.t the input image using the given `scale` argument.
"""
canvas = self.canvas
s, (width, height) = canvas.print_to_buffer()
# buf = io.BytesIO() # works for cairo backend
# canvas.print_rgba(buf)
# width, height = self.width, self.height
# s = buf.getvalue()
buffer = np.frombuffer(s, dtype="uint8")
img_rgba = buffer.reshape(height, width, 4)
rgb, alpha = np.split(img_rgba, [3], axis=2)
return rgb.astype("uint8")
class Visualizer:
"""
Visualizer that draws data about detection/segmentation on images.
It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}`
that draw primitive objects to images, as well as high-level wrappers like
`draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}`
that draw composite data in some pre-defined style.
Note that the exact visualization style for the high-level wrappers are subject to change.
Style such as color, opacity, label contents, visibility of labels, or even the visibility
of objects themselves (e.g. when the object is too small) may change according
to different heuristics, as long as the results still look visually reasonable.
To obtain a consistent style, you can implement custom drawing functions with the
abovementioned primitive methods instead. If you need more customized visualization
styles, you can process the data yourself following their format documented in
tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not
intend to satisfy everyone's preference on drawing styles.
This visualizer focuses on high rendering quality rather than performance. It is not
designed to be used for real-time applications.
"""
# TODO implement a fast, rasterized version using OpenCV
def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE):
"""
Args:
img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
the height and width of the image respectively. C is the number of
color channels. The image is required to be in RGB format since that
is a requirement of the Matplotlib library. The image is also expected
to be in the range [0, 255].
metadata (Metadata): dataset metadata (e.g. class names and colors)
instance_mode (ColorMode): defines one of the pre-defined style for drawing
instances on an image.
"""
self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
if metadata is None:
metadata = MetadataCatalog.get("__nonexist__")
self.metadata = metadata
self.output = VisImage(self.img, scale=scale)
self.cpu_device = torch.device("cpu")
# very small text is useless, therefore clamp the default font size
self._default_font_size = max(
np.sqrt(self.output.height * self.output.width) // 90, 10 // scale
)
self._instance_mode = instance_mode
self.keypoint_threshold = _KEYPOINT_THRESHOLD
def draw_instance_predictions(self, predictions):
"""
Draw instance-level prediction results on an image.
Args:
predictions (Instances): the output of an instance detection/segmentation
model. Following fields will be used to draw:
"pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
Returns:
output (VisImage): image object with visualizations.
"""
boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
scores = predictions.scores if predictions.has("scores") else None
classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None
labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None
if predictions.has("pred_masks"):
masks = np.asarray(predictions.pred_masks)
masks = [GenericMask(x, self.output.height, self.output.width) for x in masks]
else:
masks = None
if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
colors = [
self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes
]
alpha = 0.8
else:
colors = None
alpha = 0.5
if self._instance_mode == ColorMode.IMAGE_BW:
self.output.reset_image(
self._create_grayscale_image(
(predictions.pred_masks.any(dim=0) > 0).numpy()
if predictions.has("pred_masks")
else None
)
)
alpha = 0.3
self.overlay_instances(
masks=masks,
boxes=boxes,
labels=labels,
keypoints=keypoints,
assigned_colors=colors,
alpha=alpha,
)
return self.output
def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8):
"""
Draw semantic segmentation predictions/labels.
Args:
sem_seg (Tensor or ndarray): the segmentation of shape (H, W).
Each value is the integer label of the pixel.
area_threshold (int): segments with less than `area_threshold` are not drawn.
alpha (float): the larger it is, the more opaque the segmentations are.
Returns:
output (VisImage): image object with visualizations.
"""
if isinstance(sem_seg, torch.Tensor):
sem_seg = sem_seg.numpy()
labels, areas = np.unique(sem_seg, return_counts=True)
sorted_idxs = np.argsort(-areas).tolist()
labels = labels[sorted_idxs]
for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels):
try:
mask_color = [x / 255 for x in self.metadata.stuff_colors[label]]
except (AttributeError, IndexError):
mask_color = None
binary_mask = (sem_seg == label).astype(np.uint8)
text = self.metadata.stuff_classes[label]
self.draw_binary_mask(
binary_mask,
color=mask_color,
edge_color=_OFF_WHITE,
text=text,
alpha=alpha,
area_threshold=area_threshold,
)
return self.output
def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7):
"""
Draw panoptic prediction annotations or results.
Args:
panoptic_seg (Tensor): of shape (height, width) where the values are ids for each
segment.
segments_info (list[dict] or None): Describe each segment in `panoptic_seg`.
If it is a ``list[dict]``, each dict contains keys "id", "category_id".
If None, category id of each pixel is computed by
``pixel // metadata.label_divisor``.
area_threshold (int): stuff segments with less than `area_threshold` are not drawn.
Returns:
output (VisImage): image object with visualizations.
"""
pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata)
if self._instance_mode == ColorMode.IMAGE_BW:
self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask()))
# draw mask for all semantic segments first i.e. "stuff"
for mask, sinfo in pred.semantic_masks():
category_idx = sinfo["category_id"]
try:
mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
except AttributeError:
mask_color = None
text = self.metadata.stuff_classes[category_idx]
self.draw_binary_mask(
mask,
color=mask_color,
edge_color=_OFF_WHITE,
text=text,
alpha=alpha,
area_threshold=area_threshold,
)
# draw mask for all instances second
all_instances = list(pred.instance_masks())
if len(all_instances) == 0:
return self.output
masks, sinfo = list(zip(*all_instances))
category_ids = [x["category_id"] for x in sinfo]
try:
scores = [x["score"] for x in sinfo]
except KeyError:
scores = None
labels = _create_text_labels(
category_ids, scores, self.metadata.thing_classes, [x.get("iscrowd", 0) for x in sinfo]
)
try:
colors = [
self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in category_ids
]
except AttributeError:
colors = None
self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha)
return self.output
draw_panoptic_seg_predictions = draw_panoptic_seg # backward compatibility
def draw_dataset_dict(self, dic):
"""
Draw annotations/segmentations in Detectron2 Dataset format.
Args:
dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
Returns:
output (VisImage): image object with visualizations.
"""
annos = dic.get("annotations", None)
if annos:
if "segmentation" in annos[0]:
masks = [x["segmentation"] for x in annos]
else:
masks = None
if "keypoints" in annos[0]:
keypts = [x["keypoints"] for x in annos]
keypts = np.array(keypts).reshape(len(annos), -1, 3)
else:
keypts = None
boxes = [
BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS)
if len(x["bbox"]) == 4
else x["bbox"]
for x in annos
]
colors = None
category_ids = [x["category_id"] for x in annos]
if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
colors = [
self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
for c in category_ids
]
names = self.metadata.get("thing_classes", None)
labels = _create_text_labels(
category_ids,
scores=None,
class_names=names,
is_crowd=[x.get("iscrowd", 0) for x in annos],
)
self.overlay_instances(
labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors
)
sem_seg = dic.get("sem_seg", None)
if sem_seg is None and "sem_seg_file_name" in dic:
with PathManager.open(dic["sem_seg_file_name"], "rb") as f:
sem_seg = Image.open(f)
sem_seg = np.asarray(sem_seg, dtype="uint8")
if sem_seg is not None:
self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5)
pan_seg = dic.get("pan_seg", None)
if pan_seg is None and "pan_seg_file_name" in dic:
with PathManager.open(dic["pan_seg_file_name"], "rb") as f:
pan_seg = Image.open(f)
pan_seg = np.asarray(pan_seg)
from panopticapi.utils import rgb2id
pan_seg = rgb2id(pan_seg)
if pan_seg is not None:
segments_info = dic["segments_info"]
pan_seg = torch.tensor(pan_seg)
self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.5)
return self.output
def overlay_instances(
self,
*,
boxes=None,
labels=None,
masks=None,
keypoints=None,
assigned_colors=None,
alpha=0.5,
):
"""
Args:
boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`,
or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
or a :class:`RotatedBoxes`,
or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
for the N objects in a single image,
labels (list[str]): the text to be displayed for each instance.
masks (masks-like object): Supported types are:
* :class:`detectron2.structures.PolygonMasks`,
:class:`detectron2.structures.BitMasks`.
* list[list[ndarray]]: contains the segmentation masks for all objects in one image.
The first level of the list corresponds to individual instances. The second
level to all the polygons that compose the instance, and the third level
to the polygon coordinates. The third level should have the format of
[x0, y0, x1, y1, ..., xn, yn] (n >= 3).
* list[ndarray]: each ndarray is a binary mask of shape (H, W).
* list[dict]: each dict is a COCO-style RLE.
keypoints (Keypoints or array-like): an array-like object of shape (N, K, 3),
where N is the number of instances and K is the number of keypoints.
The last dimension corresponds to (x, y, visibility or score).
assigned_colors (list[matplotlib.colors]): a list of colors, where each color
corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
for full list of formats that the colors are accepted in.
Returns:
output (VisImage): image object with visualizations.
"""
num_instances = 0
if boxes is not None:
boxes = self._convert_boxes(boxes)
num_instances = len(boxes)
if masks is not None:
masks = self._convert_masks(masks)
if num_instances:
assert len(masks) == num_instances
else:
num_instances = len(masks)
if keypoints is not None:
if num_instances:
assert len(keypoints) == num_instances
else:
num_instances = len(keypoints)
keypoints = self._convert_keypoints(keypoints)
if labels is not None:
assert len(labels) == num_instances
if assigned_colors is None:
assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
if num_instances == 0:
return self.output
if boxes is not None and boxes.shape[1] == 5:
return self.overlay_rotated_instances(
boxes=boxes, labels=labels, assigned_colors=assigned_colors
)
# Display in largest to smallest order to reduce occlusion.
areas = None
if boxes is not None:
areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
elif masks is not None:
areas = np.asarray([x.area() for x in masks])
if areas is not None:
sorted_idxs = np.argsort(-areas).tolist()
# Re-order overlapped instances in descending order.
boxes = boxes[sorted_idxs] if boxes is not None else None
labels = [labels[k] for k in sorted_idxs] if labels is not None else None
masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None
assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
keypoints = keypoints[sorted_idxs] if keypoints is not None else None
for i in range(num_instances):
color = assigned_colors[i]
if boxes is not None:
self.draw_box(boxes[i], edge_color=color)
if masks is not None:
for segment in masks[i].polygons:
self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha)
if labels is not None:
# first get a box
if boxes is not None:
x0, y0, x1, y1 = boxes[i]
text_pos = (x0, y0) # if drawing boxes, put text on the box corner.
horiz_align = "left"
elif masks is not None:
# skip small mask without polygon
if len(masks[i].polygons) == 0:
continue
x0, y0, x1, y1 = masks[i].bbox()
# draw text in the center (defined by median) when box is not drawn
# median is less sensitive to outliers.
text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1]
horiz_align = "center"
else:
continue # drawing the box confidence for keypoints isn't very useful.
# for small objects, draw text at the side to avoid occlusion
instance_area = (y1 - y0) * (x1 - x0)
if (
instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale
or y1 - y0 < 40 * self.output.scale
):
if y1 >= self.output.height - 5:
text_pos = (x1, y0)
else:
text_pos = (x0, y1)
height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width)
lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
font_size = (
np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
* 0.5
* self._default_font_size
)
self.draw_text(
labels[i],
text_pos,
color=lighter_color,
horizontal_alignment=horiz_align,
font_size=font_size,
)
# draw keypoints
if keypoints is not None:
for keypoints_per_instance in keypoints:
self.draw_and_connect_keypoints(keypoints_per_instance)
return self.output
def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None):
"""
Args:
boxes (ndarray): an Nx5 numpy array of
(x_center, y_center, width, height, angle_degrees) format
for the N objects in a single image.
labels (list[str]): the text to be displayed for each instance.
assigned_colors (list[matplotlib.colors]): a list of colors, where each color
corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
for full list of formats that the colors are accepted in.
Returns:
output (VisImage): image object with visualizations.
"""
num_instances = len(boxes)
if assigned_colors is None:
assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
if num_instances == 0:
return self.output
# Display in largest to smallest order to reduce occlusion.
if boxes is not None:
areas = boxes[:, 2] * boxes[:, 3]
sorted_idxs = np.argsort(-areas).tolist()
# Re-order overlapped instances in descending order.
boxes = boxes[sorted_idxs]
labels = [labels[k] for k in sorted_idxs] if labels is not None else None
colors = [assigned_colors[idx] for idx in sorted_idxs]
for i in range(num_instances):
self.draw_rotated_box_with_label(
boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None
)
return self.output
def draw_and_connect_keypoints(self, keypoints):
"""
Draws keypoints of an instance and follows the rules for keypoint connections
to draw lines between appropriate keypoints. This follows color heuristics for
line color.
Args:
keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints
and the last dimension corresponds to (x, y, probability).
Returns:
output (VisImage): image object with visualizations.
"""
visible = {}
keypoint_names = self.metadata.get("keypoint_names")
for idx, keypoint in enumerate(keypoints):
# draw keypoint
x, y, prob = keypoint
if prob > self.keypoint_threshold:
self.draw_circle((x, y), color=_RED)
if keypoint_names:
keypoint_name = keypoint_names[idx]
visible[keypoint_name] = (x, y)
if self.metadata.get("keypoint_connection_rules"):
for kp0, kp1, color in self.metadata.keypoint_connection_rules:
if kp0 in visible and kp1 in visible:
x0, y0 = visible[kp0]
x1, y1 = visible[kp1]
color = tuple(x / 255.0 for x in color)
self.draw_line([x0, x1], [y0, y1], color=color)
# draw lines from nose to mid-shoulder and mid-shoulder to mid-hip
# Note that this strategy is specific to person keypoints.
# For other keypoints, it should just do nothing
try:
ls_x, ls_y = visible["left_shoulder"]
rs_x, rs_y = visible["right_shoulder"]
mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2
except KeyError:
pass
else:
# draw line from nose to mid-shoulder
nose_x, nose_y = visible.get("nose", (None, None))
if nose_x is not None:
self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED)
try:
# draw line from mid-shoulder to mid-hip
lh_x, lh_y = visible["left_hip"]
rh_x, rh_y = visible["right_hip"]
except KeyError:
pass
else:
mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2
self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED)
return self.output
"""
Primitive drawing functions:
"""
def draw_text(
self,
text,
position,
*,
font_size=None,
color="g",
horizontal_alignment="center",
rotation=0,
):
"""
Args:
text (str): class label
position (tuple): a tuple of the x and y coordinates to place text on image.
font_size (int, optional): font size of the text. If not provided, a font size
proportional to the image width is calculated and used.
color: color of the text. Refer to `matplotlib.colors` for full list
of formats that are accepted.
horizontal_alignment (str): see `matplotlib.text.Text`
rotation: rotation angle in degrees CCW
Returns:
output (VisImage): image object with text drawn.
"""
if not font_size:
font_size = self._default_font_size
# since the text background is dark, we don't want the text to be dark
color = np.maximum(list(mplc.to_rgb(color)), 0.2)
color[np.argmax(color)] = max(0.8, np.max(color))
x, y = position
self.output.ax.text(
x,
y,
text,
size=font_size * self.output.scale,
family="sans-serif",
bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
verticalalignment="top",
horizontalalignment=horizontal_alignment,
color=color,
zorder=10,
rotation=rotation,
)
return self.output
def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
"""
Args:
box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
are the coordinates of the image's top left corner. x1 and y1 are the
coordinates of the image's bottom right corner.
alpha (float): blending coefficient. Smaller values lead to more transparent masks.
edge_color: color of the outline of the box. Refer to `matplotlib.colors`
for full list of formats that are accepted.
line_style (string): the string to use to create the outline of the boxes.
Returns:
output (VisImage): image object with box drawn.
"""
x0, y0, x1, y1 = box_coord
width = x1 - x0
height = y1 - y0
linewidth = max(self._default_font_size / 4, 1)
self.output.ax.add_patch(
mpl.patches.Rectangle(
(x0, y0),
width,
height,
fill=False,
edgecolor=edge_color,
linewidth=linewidth * self.output.scale,
alpha=alpha,
linestyle=line_style,
)
)
return self.output
def draw_rotated_box_with_label(
self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None
):
"""
Draw a rotated box with label on its top-left corner.
Args:
rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle),
where cnt_x and cnt_y are the center coordinates of the box.
w and h are the width and height of the box. angle represents how
many degrees the box is rotated CCW with regard to the 0-degree box.
alpha (float): blending coefficient. Smaller values lead to more transparent masks.
edge_color: color of the outline of the box. Refer to `matplotlib.colors`
for full list of formats that are accepted.
line_style (string): the string to use to create the outline of the boxes.
label (string): label for rotated box. It will not be rendered when set to None.
Returns:
output (VisImage): image object with box drawn.
"""
cnt_x, cnt_y, w, h, angle = rotated_box
area = w * h
# use thinner lines when the box is small
linewidth = self._default_font_size / (
6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3
)
theta = angle * math.pi / 180.0
c = math.cos(theta)
s = math.sin(theta)
rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)]
# x: left->right ; y: top->down
rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect]
for k in range(4):
j = (k + 1) % 4
self.draw_line(
[rotated_rect[k][0], rotated_rect[j][0]],
[rotated_rect[k][1], rotated_rect[j][1]],
color=edge_color,
linestyle="--" if k == 1 else line_style,
linewidth=linewidth,
)
if label is not None:
text_pos = rotated_rect[1] # topleft corner
height_ratio = h / np.sqrt(self.output.height * self.output.width)
label_color = self._change_color_brightness(edge_color, brightness_factor=0.7)
font_size = (
np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size
)
self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle)
return self.output
def draw_circle(self, circle_coord, color, radius=3):
"""
Args:
circle_coord (list(int) or tuple(int)): contains the x and y coordinates
of the center of the circle.
color: color of the polygon. Refer to `matplotlib.colors` for a full list of
formats that are accepted.
radius (int): radius of the circle.
Returns:
output (VisImage): image object with box drawn.
"""
x, y = circle_coord
self.output.ax.add_patch(
mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color)
)
return self.output
def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None):
"""
Args:
x_data (list[int]): a list containing x values of all the points being drawn.
Length of list should match the length of y_data.
y_data (list[int]): a list containing y values of all the points being drawn.
Length of list should match the length of x_data.
color: color of the line. Refer to `matplotlib.colors` for a full list of
formats that are accepted.
linestyle: style of the line. Refer to `matplotlib.lines.Line2D`
for a full list of formats that are accepted.
linewidth (float or None): width of the line. When it's None,
a default value will be computed and used.
Returns:
output (VisImage): image object with line drawn.
"""
if linewidth is None:
linewidth = self._default_font_size / 3
linewidth = max(linewidth, 1)
self.output.ax.add_line(
mpl.lines.Line2D(
x_data,
y_data,
linewidth=linewidth * self.output.scale,
color=color,
linestyle=linestyle,
)
)
return self.output
def draw_binary_mask(
self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=0
):
"""
Args:
binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and
W is the image width. Each value in the array is either a 0 or 1 value of uint8
type.
color: color of the mask. Refer to `matplotlib.colors` for a full list of
formats that are accepted. If None, will pick a random color.
edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
full list of formats that are accepted.
text (str): if provided, this text will be drawn at the object's center of mass.
alpha (float): blending coefficient. Smaller values lead to more transparent masks.
area_threshold (float): a connected component smaller than this will not be shown.
Returns:
output (VisImage): image object with mask drawn.
"""
if color is None:
color = random_color(rgb=True, maximum=1)
color = mplc.to_rgb(color)
has_valid_segment = False
binary_mask = binary_mask.astype("uint8") # opencv needs uint8
mask = GenericMask(binary_mask, self.output.height, self.output.width)
shape2d = (binary_mask.shape[0], binary_mask.shape[1])
if not mask.has_holes:
# draw polygons for regular masks
for segment in mask.polygons:
area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1]))
if area < (area_threshold or 0):
continue
has_valid_segment = True
segment = segment.reshape(-1, 2)
self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha)
else:
# TODO: Use Path/PathPatch to draw vector graphics:
# https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon
rgba = np.zeros(shape2d + (4,), dtype="float32")
rgba[:, :, :3] = color
rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha
has_valid_segment = True
self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0))
if text is not None and has_valid_segment:
# TODO sometimes drawn on wrong objects. the heuristics here can improve.
lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
_num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8)
largest_component_id = np.argmax(stats[1:, -1]) + 1
# draw text on the largest component, as well as other very large components.
for cid in range(1, _num_cc):
if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH:
# median is more stable than centroid
# center = centroids[largest_component_id]
center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1]
self.draw_text(text, center, color=lighter_color)
return self.output
def draw_polygon(self, segment, color, edge_color=None, alpha=0.5):
"""
Args:
segment: numpy array of shape Nx2, containing all the points in the polygon.
color: color of the polygon. Refer to `matplotlib.colors` for a full list of
formats that are accepted.
edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
full list of formats that are accepted. If not provided, a darker shade
of the polygon color will be used instead.
alpha (float): blending coefficient. Smaller values lead to more transparent masks.
Returns:
output (VisImage): image object with polygon drawn.
"""
if edge_color is None:
# make edge color darker than the polygon color
if alpha > 0.8:
edge_color = self._change_color_brightness(color, brightness_factor=-0.7)
else:
edge_color = color
edge_color = mplc.to_rgb(edge_color) + (1,)
polygon = mpl.patches.Polygon(
segment,
fill=True,
facecolor=mplc.to_rgb(color) + (alpha,),
edgecolor=edge_color,
linewidth=max(self._default_font_size // 15 * self.output.scale, 1),
)
self.output.ax.add_patch(polygon)
return self.output
"""
Internal methods:
"""
def _jitter(self, color):
"""
Randomly modifies given color to produce a slightly different color than the color given.
Args:
color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
picked. The values in the list are in the [0.0, 1.0] range.
Returns:
jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
color after being jittered. The values in the list are in the [0.0, 1.0] range.
"""
color = mplc.to_rgb(color)
vec = np.random.rand(3)
# better to do it in another color space
vec = vec / np.linalg.norm(vec) * 0.5
res = np.clip(vec + color, 0, 1)
return tuple(res)
def _create_grayscale_image(self, mask=None):
"""
Create a grayscale version of the original image.
The colors in masked area, if given, will be kept.
"""
img_bw = self.img.astype("f4").mean(axis=2)
img_bw = np.stack([img_bw] * 3, axis=2)
if mask is not None:
img_bw[mask] = self.img[mask]
return img_bw
def _change_color_brightness(self, color, brightness_factor):
"""
Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
less or more saturation than the original color.
Args:
color: color of the polygon. Refer to `matplotlib.colors` for a full list of
formats that are accepted.
brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
0 will correspond to no change, a factor in [-1.0, 0) range will result in
a darker color and a factor in (0, 1.0] range will result in a lighter color.
Returns:
modified_color (tuple[double]): a tuple containing the RGB values of the
modified color. Each value in the tuple is in the [0.0, 1.0] range.
"""
assert brightness_factor >= -1.0 and brightness_factor <= 1.0
color = mplc.to_rgb(color)
polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color))
modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1])
modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness
modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness
modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2])
return modified_color
def _convert_boxes(self, boxes):
"""
Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension.
"""
if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes):
return boxes.tensor.detach().numpy()
else:
return np.asarray(boxes)
def _convert_masks(self, masks_or_polygons):
"""
Convert different format of masks or polygons to a tuple of masks and polygons.
Returns:
list[GenericMask]:
"""
m = masks_or_polygons
if isinstance(m, PolygonMasks):
m = m.polygons
if isinstance(m, BitMasks):
m = m.tensor.numpy()
if isinstance(m, torch.Tensor):
m = m.numpy()
ret = []
for x in m:
if isinstance(x, GenericMask):
ret.append(x)
else:
ret.append(GenericMask(x, self.output.height, self.output.width))
return ret
def _convert_keypoints(self, keypoints):
if isinstance(keypoints, Keypoints):
keypoints = keypoints.tensor
keypoints = np.asarray(keypoints)
return keypoints
def get_output(self):
"""
Returns:
output (VisImage): the image output containing the visualizations added
to the image.
"""
return self.output
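# ---- magic_pdf/model/pek_sub_modules/post_process.py (deleted file; boundary inferred from the commit's file list) ----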
import re
def layout_rm_equation(layout_res):
rm_idxs = []
for idx, ele in enumerate(layout_res['layout_dets']):
if ele['category_id'] == 10:
rm_idxs.append(idx)
for idx in rm_idxs[::-1]:
del layout_res['layout_dets'][idx]
return layout_res
def get_croped_image(image_pil, bbox):
x_min, y_min, x_max, y_max = bbox
croped_img = image_pil.crop((x_min, y_min, x_max, y_max))
return croped_img
def latex_rm_whitespace(s: str):
"""Remove unnecessary whitespace from LaTeX code.
"""
text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})'
letter = '[a-zA-Z]'
noletter = r'[\W_^\d]'
names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)]
s = re.sub(text_reg, lambda match: str(names.pop(0)), s)
news = s
while True:
s = news
news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s)
news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news)
news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news)
if news == s:
break
return s
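# Example (sketch, not in the original file): whitespace around non-letter
# tokens is collapsed, while spaces separating letter runs are kept.
#
#     latex_rm_whitespace(r"\alpha + \beta = 1")  # -> r"\alpha+\beta=1"

# ---- magic_pdf/model/pek_sub_modules/self_modify.py (deleted file; boundary inferred from the commit's file list) ----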
import time
import copy
import base64
import cv2
import numpy as np
from io import BytesIO
from PIL import Image
from paddleocr import PaddleOCR
from paddleocr.ppocr.utils.logging import get_logger
from paddleocr.ppocr.utils.utility import check_and_read, alpha_to_color, binarize_img
from paddleocr.tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
logger = get_logger()
def img_decode(content: bytes):
np_arr = np.frombuffer(content, dtype=np.uint8)
return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
def check_img(img):
if isinstance(img, bytes):
img = img_decode(img)
if isinstance(img, str):
image_file = img
img, flag_gif, flag_pdf = check_and_read(image_file)
if not flag_gif and not flag_pdf:
with open(image_file, 'rb') as f:
img_str = f.read()
img = img_decode(img_str)
if img is None:
try:
buf = BytesIO()
image = BytesIO(img_str)
im = Image.open(image)
rgb = im.convert('RGB')
rgb.save(buf, 'jpeg')
buf.seek(0)
image_bytes = buf.read()
data_base64 = str(base64.b64encode(image_bytes),
encoding="utf-8")
image_decode = base64.b64decode(data_base64)
img_array = np.frombuffer(image_decode, np.uint8)
img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
except Exception:
logger.error("error in loading image:{}".format(image_file))
return None
if img is None:
logger.error("error in loading image:{}".format(image_file))
return None
if isinstance(img, np.ndarray) and len(img.shape) == 2:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
return img
def sorted_boxes(dt_boxes):
"""
Sort text boxes in order from top to bottom, left to right
args:
dt_boxes(array): detected text boxes with shape (N, 4, 2)
return:
sorted boxes(list): boxes ordered top to bottom, then left to right
"""
num_boxes = dt_boxes.shape[0]
sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
_boxes = list(sorted_boxes)
for i in range(num_boxes - 1):
for j in range(i, -1, -1):
if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \
(_boxes[j + 1][0][0] < _boxes[j][0][0]):
tmp = _boxes[j]
_boxes[j] = _boxes[j + 1]
_boxes[j + 1] = tmp
else:
break
return _boxes
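# Illustrative sketch (boxes invented for this comment): two boxes whose top
# y-coordinates differ by less than 10px are treated as one line and ordered
# left to right, so the box starting at x=10 comes out before the one at x=60:
#   a = [[60, 10], [80, 10], [80, 20], [60, 20]]
#   b = [[10, 12], [30, 12], [30, 22], [10, 22]]
#   sorted_boxes(np.array([a, b]))  ->  [b, a]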
def bbox_to_points(bbox):
""" 将bbox格式转换为四个顶点的数组 """
x0, y0, x1, y1 = bbox
return np.array([[x0, y0], [x1, y0], [x1, y1], [x0, y1]]).astype('float32')
def points_to_bbox(points):
""" 将四个顶点的数组转换为bbox格式 """
x0, y0 = points[0]
x1, _ = points[1]
_, y1 = points[2]
return [x0, y0, x1, y1]
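# Illustrative sketch: the two helpers are inverses of each other:
#   bbox_to_points([10, 20, 30, 40])
#   -> [[10, 20], [30, 20], [30, 40], [10, 40]] as a float32 array
#   points_to_bbox(bbox_to_points([10, 20, 30, 40]))  ->  [10.0, 20.0, 30.0, 40.0]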
def merge_intervals(intervals):
# Sort the intervals based on the start value
intervals.sort(key=lambda x: x[0])
merged = []
for interval in intervals:
# If the list of merged intervals is empty or if the current
# interval does not overlap with the previous, simply append it.
if not merged or merged[-1][1] < interval[0]:
merged.append(interval)
else:
# Otherwise, there is overlap, so we merge the current and previous intervals.
merged[-1][1] = max(merged[-1][1], interval[1])
return merged
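# Illustrative sketch (intervals invented for this comment): overlapping
# [start, end] pairs are unioned after sorting by start:
#   merge_intervals([[1, 3], [2, 6], [8, 10], [15, 18]])  ->  [[1, 6], [8, 10], [15, 18]]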
def remove_intervals(original, masks):
# Merge all mask intervals
merged_masks = merge_intervals(masks)
result = []
original_start, original_end = original
for mask in merged_masks:
mask_start, mask_end = mask
# If the mask starts after the original range, ignore it
if mask_start > original_end:
continue
# If the mask ends before the original range starts, ignore it
if mask_end < original_start:
continue
# Remove the masked part from the original range
if original_start < mask_start:
result.append([original_start, mask_start - 1])
original_start = max(mask_end + 1, original_start)
# Add the remaining part of the original range, if any
if original_start <= original_end:
result.append([original_start, original_end])
return result
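# Illustrative sketch (values invented for this comment): subtracting the
# merged masks from the original range leaves the uncovered sub-ranges:
#   remove_intervals([0, 100], [[20, 30], [40, 50]])  ->  [[0, 19], [31, 39], [51, 100]]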
def update_det_boxes(dt_boxes, mfd_res):
new_dt_boxes = []
for text_box in dt_boxes:
text_bbox = points_to_bbox(text_box)
masks_list = []
for mf_box in mfd_res:
mf_bbox = mf_box['bbox']
if __is_overlaps_y_exceeds_threshold(text_bbox, mf_bbox):
masks_list.append([mf_bbox[0], mf_bbox[2]])
text_x_range = [text_bbox[0], text_bbox[2]]
text_remove_mask_range = remove_intervals(text_x_range, masks_list)
temp_dt_box = []
for text_remove_mask in text_remove_mask_range:
temp_dt_box.append(bbox_to_points([text_remove_mask[0], text_bbox[1], text_remove_mask[1], text_bbox[3]]))
if len(temp_dt_box) > 0:
new_dt_boxes.extend(temp_dt_box)
return new_dt_boxes
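# Illustrative sketch (coordinates invented for this comment): a text box whose
# x-range is [0, 100] and which overlaps a formula bbox spanning x=[40, 60] on
# the same line is split into two boxes covering x=[0, 39] and x=[61, 100];
# the formula region itself is left to the formula recognizer.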
def merge_overlapping_spans(spans):
"""
Merges overlapping spans on the same line.
:param spans: A list of span coordinates [(x1, y1, x2, y2), ...]
:return: A list of merged spans
"""
# Return an empty list if the input spans list is empty
if not spans:
return []
# Sort spans by their starting x-coordinate
spans.sort(key=lambda x: x[0])
# Initialize the list of merged spans
merged = []
for span in spans:
# Unpack span coordinates
x1, y1, x2, y2 = span
# If the merged list is empty or there's no horizontal overlap, add the span directly
if not merged or merged[-1][2] < x1:
merged.append(span)
else:
# If there is horizontal overlap, merge the current span with the previous one
last_span = merged.pop()
# Update the merged span's top-left corner to the smaller (x1, y1) and bottom-right to the larger (x2, y2)
x1 = min(last_span[0], x1)
y1 = min(last_span[1], y1)
x2 = max(last_span[2], x2)
y2 = max(last_span[3], y2)
# Add the merged span back to the list
merged.append((x1, y1, x2, y2))
# Return the list of merged spans
return merged
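# Illustrative sketch (spans invented for this comment): horizontally
# overlapping spans are unioned, disjoint ones are kept apart:
#   merge_overlapping_spans([(0, 0, 10, 5), (8, 1, 20, 6), (25, 0, 30, 5)])
#   -> [(0, 0, 20, 6), (25, 0, 30, 5)]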
def merge_det_boxes(dt_boxes):
"""
Merge detection boxes.
This function takes a list of detected bounding boxes, each represented by four corner points.
The goal is to merge these bounding boxes into larger text regions.
Parameters:
dt_boxes (list): A list containing multiple text detection boxes, where each box is defined by four corner points.
Returns:
list: A list containing the merged text regions, where each region is represented by four corner points.
"""
# Convert the detection boxes into a dictionary format with bounding boxes and type
dt_boxes_dict_list = []
for text_box in dt_boxes:
text_bbox = points_to_bbox(text_box)
text_box_dict = {
'bbox': text_bbox,
'type': 'text',
}
dt_boxes_dict_list.append(text_box_dict)
# Merge adjacent text regions into lines
lines = merge_spans_to_line(dt_boxes_dict_list)
# Initialize a new list for storing the merged text regions
new_dt_boxes = []
for line in lines:
line_bbox_list = []
for span in line:
line_bbox_list.append(span['bbox'])
# Merge overlapping text regions within the same line
merged_spans = merge_overlapping_spans(line_bbox_list)
# Convert the merged text regions back to point format and add them to the new detection box list
for span in merged_spans:
new_dt_boxes.append(bbox_to_points(span))
return new_dt_boxes
class ModifiedPaddleOCR(PaddleOCR):
def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, mfd_res=None, alpha_color=(255, 255, 255)):
"""
OCR with PaddleOCR
args:
img: image for OCR; supports ndarray, img_path, and a list of ndarrays
det: use text detection or not. If False, only rec will be exec. Default is True
rec: use text recognition or not. If False, only det will be exec. Default is True
cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
bin: binarize image to black and white. Default is False.
inv: invert image colors. Default is False.
alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white.
"""
assert isinstance(img, (np.ndarray, list, str, bytes))
if isinstance(img, list) and det == True:
logger.error('When input a list of images, det must be false')
exit(0)
if cls == True and self.use_angle_cls == False:
pass
# logger.warning(
# 'Since the angle classifier is not initialized, it will not be used during the forward process'
# )
img = check_img(img)
# for infer pdf file
if isinstance(img, list):
if self.page_num > len(img) or self.page_num == 0:
self.page_num = len(img)
imgs = img[:self.page_num]
else:
imgs = [img]
def preprocess_image(_image):
_image = alpha_to_color(_image, alpha_color)
if inv:
_image = cv2.bitwise_not(_image)
if bin:
_image = binarize_img(_image)
return _image
if det and rec:
ocr_res = []
for idx, img in enumerate(imgs):
img = preprocess_image(img)
dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
if not dt_boxes and not rec_res:
ocr_res.append(None)
continue
tmp_res = [[box.tolist(), res]
for box, res in zip(dt_boxes, rec_res)]
ocr_res.append(tmp_res)
return ocr_res
elif det and not rec:
ocr_res = []
for idx, img in enumerate(imgs):
img = preprocess_image(img)
dt_boxes, elapse = self.text_detector(img)
if not dt_boxes:
ocr_res.append(None)
continue
tmp_res = [box.tolist() for box in dt_boxes]
ocr_res.append(tmp_res)
return ocr_res
else:
ocr_res = []
cls_res = []
for idx, img in enumerate(imgs):
if not isinstance(img, list):
img = preprocess_image(img)
img = [img]
if self.use_angle_cls and cls:
img, cls_res_tmp, elapse = self.text_classifier(img)
if not rec:
cls_res.append(cls_res_tmp)
rec_res, elapse = self.text_recognizer(img)
ocr_res.append(rec_res)
if not rec:
return cls_res
return ocr_res
def __call__(self, img, cls=True, mfd_res=None):
time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
if img is None:
logger.debug("no valid image provided")
return None, None, time_dict
start = time.time()
ori_im = img.copy()
dt_boxes, elapse = self.text_detector(img)
time_dict['det'] = elapse
if dt_boxes is None:
logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
end = time.time()
time_dict['all'] = end - start
return None, None, time_dict
else:
logger.debug("dt_boxes num : {}, elapsed : {}".format(
len(dt_boxes), elapse))
img_crop_list = []
dt_boxes = sorted_boxes(dt_boxes)
dt_boxes = merge_det_boxes(dt_boxes)
if mfd_res:
bef = time.time()
dt_boxes = update_det_boxes(dt_boxes, mfd_res)
aft = time.time()
logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
len(dt_boxes), aft - bef))
for bno in range(len(dt_boxes)):
tmp_box = copy.deepcopy(dt_boxes[bno])
if self.args.det_box_type == "quad":
img_crop = get_rotate_crop_image(ori_im, tmp_box)
else:
img_crop = get_minarea_rect_crop(ori_im, tmp_box)
img_crop_list.append(img_crop)
if self.use_angle_cls and cls:
img_crop_list, angle_list, elapse = self.text_classifier(
img_crop_list)
time_dict['cls'] = elapse
logger.debug("cls num : {}, elapsed : {}".format(
len(img_crop_list), elapse))
rec_res, elapse = self.text_recognizer(img_crop_list)
time_dict['rec'] = elapse
logger.debug("rec_res num : {}, elapsed : {}".format(
len(rec_res), elapse))
if self.args.save_crop_res:
self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list,
rec_res)
filter_boxes, filter_rec_res = [], []
for box, rec_result in zip(dt_boxes, rec_res):
text, score = rec_result
if score >= self.drop_score:
filter_boxes.append(box)
filter_rec_res.append(rec_result)
end = time.time()
time_dict['all'] = end - start
return filter_boxes, filter_rec_res, time_dict
from struct_eqtable.model import StructTable
from pypandoc import convert_text
class StructTableModel:
def __init__(self, model_path, max_new_tokens=2048, max_time=400, device='cpu'):
# init
self.model_path = model_path
self.max_new_tokens = max_new_tokens # maximum output tokens length
self.max_time = max_time # timeout for processing in seconds
if device == 'cuda':
self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time).cuda()
else:
self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time)
def image2latex(self, image) -> str:
table_latex = self.model.forward(image)
return table_latex
def image2html(self, image) -> str:
table_latex = self.image2latex(image)
table_html = convert_text(table_latex, 'html', format='latex')
return table_html
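# Usage sketch (the weights path is hypothetical; requires struct_eqtable and pypandoc):
#   model = StructTableModel('/path/to/StructEqTable', device='cuda')
#   latex = model.image2latex(table_image)  # table_image: a PIL.Image crop of a table
#   html = model.image2html(table_image)    # the LaTeX converted to HTML via pandoc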
from paddleocr.ppstructure.table.predict_table import TableSystem
from paddleocr.ppstructure.utility import init_args
from magic_pdf.libs.Constants import *
import os
from PIL import Image
import numpy as np
class ppTableModel(object):
"""
This class is responsible for converting image of table into HTML format using a pre-trained model.
Attributes:
- table_sys: An instance of TableSystem initialized with parsed arguments.
Methods:
- __init__(config): Initializes the model with configuration parameters.
- img2html(image): Converts a PIL Image or NumPy array to HTML string.
- parse_args(**kwargs): Parses configuration arguments.
"""
def __init__(self, config):
"""
Parameters:
- config (dict): Configuration dictionary containing model_dir and device.
"""
args = self.parse_args(**config)
self.table_sys = TableSystem(args)
def img2html(self, image):
"""
Parameters:
- image (PIL.Image or np.ndarray): The image of the table to be converted.
Return:
- HTML (str): A string representing the HTML structure with content of the table.
"""
if isinstance(image, Image.Image):
image = np.array(image)
pred_res, _ = self.table_sys(image)
pred_html = pred_res["html"]
res = '<td><table border="1">' + pred_html.replace("<html><body><table>", "").replace("</table></body></html>",
"") + "</table></td>\n"
return res
def parse_args(self, **kwargs):
parser = init_args()
model_dir = kwargs.get("model_dir")
table_model_dir = os.path.join(model_dir, TABLE_MASTER_DIR)
table_char_dict_path = os.path.join(model_dir, TABLE_MASTER_DICT)
det_model_dir = os.path.join(model_dir, DETECT_MODEL_DIR)
rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR)
rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT)
device = kwargs.get("device", "cpu")
use_gpu = device == "cuda"
config = {
"use_gpu": use_gpu,
"table_max_len": kwargs.get("table_max_len", TABLE_MAX_LEN),
"table_algorithm": TABLE_MASTER,
"table_model_dir": table_model_dir,
"table_char_dict_path": table_char_dict_path,
"det_model_dir": det_model_dir,
"rec_model_dir": rec_model_dir,
"rec_char_dict_path": rec_char_dict_path,
}
parser.set_defaults(**config)
return parser.parse_args([])
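# Usage sketch (the model_dir value is hypothetical): the config dict needs the
# directory holding the table/det/rec weights named in magic_pdf.libs.Constants:
#   table_model = ppTableModel({'model_dir': '/path/to/table_models', 'device': 'cpu'})
#   html = table_model.img2html(pil_or_ndarray_table_image)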
import random
from loguru import logger
try:
from paddleocr import PPStructure
except ImportError:
logger.error('paddleocr not installed, please install by "pip install magic-pdf[lite]"')
exit(1)
def region_to_bbox(region):
x0 = region[0][0]
y0 = region[0][1]
x1 = region[2][0]
y1 = region[2][1]
return [x0, y0, x1, y1]
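# Illustrative sketch: a PaddleOCR quad region is reduced to its top-left and
# bottom-right corners:
#   region_to_bbox([[10, 20], [50, 20], [50, 40], [10, 40]])  ->  [10, 20, 50, 40]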
class CustomPaddleModel:
def __init__(self, ocr: bool = False, show_log: bool = False):
self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
def __call__(self, img, index, end_page_id):
try:
import cv2
except ImportError:
logger.error("opencv-python not installed, please install by pip.")
exit(1)
# Convert the RGB image to BGR to match Paddle's expected input
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
result = self.model(img)
spans = []
for line in result:
line.pop("img")
"""
为paddle输出适配type no.
title: 0 # 标题
text: 1 # 文本
header: 2 # abandon
footer: 2 # abandon
reference: 1 # 文本 or abandon
equation: 8 # 行间公式 block
equation: 14 # 行间公式 text
figure: 3 # 图片
figure_caption: 4 # 图片描述
table: 5 # 表格
table_caption: 6 # 表格描述
"""
if line["type"] == "title":
line["category_id"] = 0
elif line["type"] in ["text", "reference"]:
line["category_id"] = 1
elif line["type"] == "figure":
line["category_id"] = 3
elif line["type"] == "figure_caption":
line["category_id"] = 4
elif line["type"] == "table":
line["category_id"] = 5
elif line["type"] == "table_caption":
line["category_id"] = 6
elif line["type"] == "equation":
line["category_id"] = 8
elif line["type"] in ["header", "footer"]:
line["category_id"] = 2
else:
logger.warning(f"unknown type: {line['type']}")
# Compatibility with paddleocr versions that do not output a score
if line.get("score") is None:
line["score"] = 0.5 + random.random() * 0.5
res = line.pop("res", None)
if res is not None and len(res) > 0:
for span in res:
new_span = {
"category_id": 15,
"bbox": region_to_bbox(span["text_region"]),
"score": span["confidence"],
"text": span["text"],
}
spans.append(new_span)
if len(spans) > 0:
result.extend(spans)
return result
import os
import unicodedata
from magic_pdf.para.commons import *
if sys.version_info[0] >= 3:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
class BlockContinuationProcessor:
"""
This class is used to process the blocks to detect block continuations.
"""
def __init__(self) -> None:
pass
def __is_similar_font_type(self, font_type1, font_type2, prefix_length_ratio=0.3):
"""
This function checks if the two font types are similar.
Definition of similar font types: the two font types have a common prefix,
and the length of the common prefix is at least a certain ratio of the length of the shorter font type.
Parameters
----------
font_type1 : str
font type 1
font_type2 : str
font type 2
prefix_length_ratio : float
minimum ratio of the common prefix length to the length of the shorter font type
Returns
-------
bool
True if the two font types are similar, False otherwise.
"""
if isinstance(font_type1, list):
font_type1 = font_type1[0] if font_type1 else ""
if isinstance(font_type2, list):
font_type2 = font_type2[0] if font_type2 else ""
if font_type1 == font_type2:
return True
# Find the length of the common prefix
common_prefix_length = len(os.path.commonprefix([font_type1, font_type2]))
# Calculate the minimum prefix length based on the ratio
min_prefix_length = int(min(len(font_type1), len(font_type2)) * prefix_length_ratio)
return common_prefix_length >= min_prefix_length
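# Illustrative worked example (font names invented for this comment): for
# 'TimesNewRoman' vs 'TimesBold', the common prefix 'Times' has length 5, the
# shorter name has length 9, and 5 >= int(9 * 0.3) == 2, so the two font
# types are judged similar.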
def __is_same_block_font(self, block1, block2):
"""
This function compares the font of block1 and block2
Parameters
----------
block1 : dict
block1
block2 : dict
block2
Returns
-------
is_same : bool
True if block1 and block2 have the same font, else False
"""
block_1_font_type = safe_get(block1, "block_font_type", "")
block_1_font_size = safe_get(block1, "block_font_size", 0)
block_1_avg_char_width = safe_get(block1, "avg_char_width", 0)
block_2_font_type = safe_get(block2, "block_font_type", "")
block_2_font_size = safe_get(block2, "block_font_size", 0)
block_2_avg_char_width = safe_get(block2, "avg_char_width", 0)
if isinstance(block_1_font_size, list):
block_1_font_size = block_1_font_size[0] if block_1_font_size else 0
if isinstance(block_2_font_size, list):
block_2_font_size = block_2_font_size[0] if block_2_font_size else 0
block_1_text = safe_get(block1, "text", "")
block_2_text = safe_get(block2, "text", "")
if block_1_avg_char_width == 0 or block_2_avg_char_width == 0:
return False
if not block_1_text or not block_2_text:
return False
else:
text_len_ratio = len(block_2_text) / len(block_1_text)
if text_len_ratio < 0.2:
avg_char_width_condition = (
abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
< 0.5
)
else:
avg_char_width_condition = (
abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
< 0.2
)
block_font_size_condition = abs(block_1_font_size - block_2_font_size) < 1
return (
self.__is_similar_font_type(block_1_font_type, block_2_font_type)
and avg_char_width_condition
and block_font_size_condition
)
def _is_alphabet_char(self, char):
if (char >= "\u0041" and char <= "\u005a") or (char >= "\u0061" and char <= "\u007a"):
return True
else:
return False
def _is_chinese_char(self, char):
if char >= "\u4e00" and char <= "\u9fa5":
return True
else:
return False
def _is_other_letter_char(self, char):
try:
cat = unicodedata.category(char)
if cat == "Lu" or cat == "Ll":
return not self._is_alphabet_char(char) and not self._is_chinese_char(char)
return False
except TypeError:
print("The input to the function must be a single character.")
return False
def _is_year(self, s: str):
try:
number = int(s)
return 1900 <= number <= 2099
except ValueError:
return False
def __is_para_font_consistent(self, para_1, para_2):
"""
This function compares the font of para1 and para2
Parameters
----------
para1 : dict
para1
para2 : dict
para2
Returns
-------
is_same : bool
True if para1 and para2 have the same font, else False
"""
if para_1 is None or para_2 is None:
return False
para_1_font_type = safe_get(para_1, "para_font_type", "")
para_1_font_size = safe_get(para_1, "para_font_size", 0)
para_1_font_color = safe_get(para_1, "para_font_color", "")
para_2_font_type = safe_get(para_2, "para_font_type", "")
para_2_font_size = safe_get(para_2, "para_font_size", 0)
para_2_font_color = safe_get(para_2, "para_font_color", "")
if isinstance(para_1_font_type, list): # get the most common font type
para_1_font_type = max(set(para_1_font_type), key=para_1_font_type.count)
if isinstance(para_2_font_type, list):
para_2_font_type = max(set(para_2_font_type), key=para_2_font_type.count)
if isinstance(para_1_font_size, list): # compute the average font size
para_1_font_size = sum(para_1_font_size) / len(para_1_font_size)
if isinstance(para_2_font_size, list): # compute the average font size
para_2_font_size = sum(para_2_font_size) / len(para_2_font_size)
return (
self.__is_similar_font_type(para_1_font_type, para_2_font_type)
and abs(para_1_font_size - para_2_font_size) < 1.5
# and para_font_color1 == para_font_color2
)
def _is_para_puncs_consistent(self, para_1, para_2):
"""
This function determines whether para1 and para2 originally belong to the same paragraph by checking the punctuation of para1 (the former) and para2 (the latter)
Parameters
----------
para1 : dict
para1
para2 : dict
para2
Returns
-------
is_same : bool
True if para1 and para2 are from the same paragraph by using the puncs, else False
"""
para_1_text = safe_get(para_1, "para_text", "").strip()
para_2_text = safe_get(para_2, "para_text", "").strip()
para_1_bboxes = safe_get(para_1, "para_bbox", [])
para_1_font_sizes = safe_get(para_1, "para_font_size", 0)
para_2_bboxes = safe_get(para_2, "para_bbox", [])
para_2_font_sizes = safe_get(para_2, "para_font_size", 0)
# print_yellow(" Features of determine puncs_consistent:")
# print(f" para_1_text: {para_1_text}")
# print(f" para_2_text: {para_2_text}")
# print(f" para_1_bboxes: {para_1_bboxes}")
# print(f" para_2_bboxes: {para_2_bboxes}")
# print(f" para_1_font_sizes: {para_1_font_sizes}")
# print(f" para_2_font_sizes: {para_2_font_sizes}")
if is_nested_list(para_1_bboxes):
x0_1, y0_1, x1_1, y1_1 = para_1_bboxes[-1]
else:
x0_1, y0_1, x1_1, y1_1 = para_1_bboxes
if is_nested_list(para_2_bboxes):
x0_2, y0_2, x1_2, y1_2 = para_2_bboxes[0]
para_2_font_sizes = para_2_font_sizes[0] # type: ignore
else:
x0_2, y0_2, x1_2, y1_2 = para_2_bboxes
right_align_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
are_two_paras_right_aligned = abs(x1_1 - x1_2) < right_align_threshold
left_indent_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
is_para1_left_indent_than_para2 = x0_1 - x0_2 > left_indent_threshold
is_para2_left_indent_than_para1 = x0_2 - x0_1 > left_indent_threshold
# Check if either para_text1 or para_text2 is empty
if not para_1_text or not para_2_text:
return False
# Define the end puncs for a sentence to end and hyphen
end_puncs = [".", "?", "!", "。", "?", "!", "…"]
hyphen = ["-", "—"]
# Check if para_text1 ends with either hyphen or non-end punctuation or spaces
para_1_end_with_hyphen = para_1_text and para_1_text[-1] in hyphen
para_1_end_with_end_punc = para_1_text and para_1_text[-1] in end_puncs
para_1_end_with_space = para_1_text and para_1_text[-1] == " "
para_1_not_end_with_end_punc = para_1_text and para_1_text[-1] not in end_puncs
# print_yellow(f" para_1_end_with_hyphen: {para_1_end_with_hyphen}")
# print_yellow(f" para_1_end_with_end_punc: {para_1_end_with_end_punc}")
# print_yellow(f" para_1_not_end_with_end_punc: {para_1_not_end_with_end_punc}")
# print_yellow(f" para_1_end_with_space: {para_1_end_with_space}")
if para_1_end_with_hyphen: # If para_text1 ends with hyphen
# print_red(f"para_1 is end with hyphen.")
para_2_is_consistent = para_2_text and (
para_2_text[0] in hyphen
or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
or (self._is_chinese_char(para_2_text[0]))
or (self._is_other_letter_char(para_2_text[0]))
)
if para_2_is_consistent:
# print(f"para_2 is consistent.\n")
return True
else:
# print(f"para_2 is not consistent.\n")
pass
elif para_1_end_with_end_punc: # If para_text1 ends with ending punctuations
# print_red(f"para_1 is end with end_punc.")
para_2_is_consistent = (
para_2_text
and (
para_2_text[0] == " "
or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].isupper())
or (self._is_chinese_char(para_2_text[0]))
or (self._is_other_letter_char(para_2_text[0]))
)
and not is_para2_left_indent_than_para1
)
if para_2_is_consistent:
# print(f"para_2 is consistent.\n")
return True
else:
# print(f"para_2 is not consistent.\n")
pass
elif para_1_not_end_with_end_punc: # If para_text1 is not end with ending punctuations
# print_red(f"para_1 is NOT end with end_punc.")
para_2_is_consistent = para_2_text and (
para_2_text[0] == " "
or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
or (self._is_alphabet_char(para_2_text[0]))
or (self._is_year(para_2_text[0:4]))
or (are_two_paras_right_aligned or is_para1_left_indent_than_para2)
or (self._is_chinese_char(para_2_text[0]))
or (self._is_other_letter_char(para_2_text[0]))
)
if para_2_is_consistent:
# print(f"para_2 is consistent.\n")
return True
else:
# print(f"para_2 is not consistent.\n")
pass
elif para_1_end_with_space: # If para_text1 ends with space
# print_red(f"para_1 is end with space.")
para_2_is_consistent = para_2_text and (
para_2_text[0] == " "
or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
or (self._is_chinese_char(para_2_text[0]))
or (self._is_other_letter_char(para_2_text[0]))
)
if para_2_is_consistent:
# print(f"para_2 is consistent.\n")
return True
else:
pass
# print(f"para_2 is not consistent.\n")
return False
def _is_block_consistent(self, block1, block2):
"""
This function determines whether block1 and block2 are originally from the same block
Parameters
----------
block1 : dict
block1s
block2 : dict
block2
Returns
-------
is_same : bool
True if block1 and block2 are from the same block, else False
"""
return self.__is_same_block_font(block1, block2)
def _is_para_continued(self, para1, para2):
"""
This function determines whether para1 and para2 are originally from the same paragraph
Parameters
----------
para1 : dict
para1
para2 : dict
para2
Returns
-------
is_same : bool
True if para1 and para2 are from the same paragraph, else False
"""
is_para_font_consistent = self.__is_para_font_consistent(para1, para2)
is_para_puncs_consistent = self._is_para_puncs_consistent(para1, para2)
return is_para_font_consistent and is_para_puncs_consistent
def _are_boundaries_of_block_consistent(self, block1, block2):
"""
This function checks if the boundaries of block1 and block2 are consistent
Parameters
----------
block1 : dict
block1
block2 : dict
block2
Returns
-------
is_consistent : bool
True if the boundaries of block1 and block2 are consistent, else False
"""
last_line_of_block1 = block1["lines"][-1]
first_line_of_block2 = block2["lines"][0]
spans_of_last_line_of_block1 = last_line_of_block1["spans"]
spans_of_first_line_of_block2 = first_line_of_block2["spans"]
font_type_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["font"].lower()
font_size_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["size"]
font_color_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["color"]
font_flags_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["flags"]
font_type_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["font"].lower()
font_size_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["size"]
font_color_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["color"]
font_flags_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["flags"]
return (
self.__is_similar_font_type(font_type_of_last_line_of_block1, font_type_of_first_line_of_block2)
and abs(font_size_of_last_line_of_block1 - font_size_of_first_line_of_block2) < 1
# and font_color_of_last_line_of_block1 == font_color_of_first_line_of_block2
and font_flags_of_last_line_of_block1 == font_flags_of_first_line_of_block2
)
def _get_last_paragraph(self, block):
"""
Retrieves the last paragraph from a block.
Parameters
----------
block : dict
The block from which to retrieve the paragraph.
Returns
-------
dict
The last paragraph of the block.
"""
if block["paras"]:
last_para_key = list(block["paras"].keys())[-1]
return block["paras"][last_para_key]
else:
return None
def _get_first_paragraph(self, block):
"""
Retrieves the first paragraph from a block.
Parameters
----------
block : dict
The block from which to retrieve the paragraph.
Returns
-------
dict
The first paragraph of the block.
"""
if block["paras"]:
first_para_key = list(block["paras"].keys())[0]
return block["paras"][first_para_key]
else:
return None
def should_merge_next_para(self, curr_para, next_para):
if self._is_para_continued(curr_para, next_para):
return True
else:
return False
def batch_tag_paras(self, pdf_dict):
the_last_page_id = len(pdf_dict) - 1
for curr_page_idx, (curr_page_id, curr_page_content) in enumerate(pdf_dict.items()):
if curr_page_id.startswith("page_") and curr_page_content.get("para_blocks", []):
para_blocks_of_curr_page = curr_page_content["para_blocks"]
next_page_idx = curr_page_idx + 1
next_page_id = f"page_{next_page_idx}"
next_page_content = pdf_dict.get(next_page_id, {})
for i, current_block in enumerate(para_blocks_of_curr_page):
for para_id, curr_para in current_block["paras"].items():
curr_para["curr_para_location"] = [
curr_page_idx,
current_block["block_id"],
int(para_id.split("_")[-1]),
]
curr_para["next_para_location"] = None # 默认设置为None
curr_para["merge_next_para"] = False # 默认设置为False
next_block = para_blocks_of_curr_page[i + 1] if i < len(para_blocks_of_curr_page) - 1 else None
if next_block:
curr_block_last_para_key = list(current_block["paras"].keys())[-1]
curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
next_block_first_para_key = list(next_block["paras"].keys())[0]
next_blk_first_para = next_block["paras"][next_block_first_para_key]
if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
curr_blk_last_para["next_para_location"] = [
curr_page_idx,
next_block["block_id"],
int(next_block_first_para_key.split("_")[-1]),
]
curr_blk_last_para["merge_next_para"] = True
else:
# Handle the case where the next block is in a different page
curr_block_last_para_key = list(current_block["paras"].keys())[-1]
curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
while not next_page_content.get("para_blocks", []) and next_page_idx <= the_last_page_id:
next_page_idx += 1
next_page_id = f"page_{next_page_idx}"
next_page_content = pdf_dict.get(next_page_id, {})
if next_page_content.get("para_blocks", []):
next_blk_first_para_key = list(next_page_content["para_blocks"][0]["paras"].keys())[0]
next_blk_first_para = next_page_content["para_blocks"][0]["paras"][next_blk_first_para_key]
if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
curr_blk_last_para["next_para_location"] = [
next_page_idx,
next_page_content["para_blocks"][0]["block_id"],
int(next_blk_first_para_key.split("_")[-1]),
]
curr_blk_last_para["merge_next_para"] = True
return pdf_dict
def find_block_by_id(self, para_blocks, block_id):
for block in para_blocks:
if block.get("block_id") == block_id:
return block
return None
def batch_merge_paras(self, pdf_dict):
for page_id, page_content in pdf_dict.items():
if page_id.startswith("page_") and page_content.get("para_blocks", []):
para_blocks_of_page = page_content["para_blocks"]
for i in range(len(para_blocks_of_page)):
current_block = para_blocks_of_page[i]
paras = current_block["paras"]
for para_id, curr_para in list(paras.items()):
# Skip title paragraphs
if curr_para.get("is_para_title"):
continue
while curr_para.get("merge_next_para"):
next_para_location = curr_para.get("next_para_location")
if not next_para_location:
break
next_page_idx, next_block_id, next_para_id = next_para_location
next_page_id = f"page_{next_page_idx}"
next_page_content = pdf_dict.get(next_page_id)
if not next_page_content:
break
next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
if not next_block:
break
next_para = next_block["paras"].get(f"para_{next_para_id}")
if not next_para or next_para.get("is_para_title"):
break
# Merge the paragraph texts
curr_para_text = curr_para.get("para_text", "")
next_para_text = next_para.get("para_text", "")
curr_para["para_text"] = curr_para_text + " " + next_para_text
# Update next_para_location
curr_para["next_para_location"] = next_para.get("next_para_location")
# Clear the next paragraph's text to mark it as merged
next_para["para_text"] = ""
# Update the merge_next_para flag
curr_para["merge_next_para"] = next_para.get("merge_next_para", False)
return pdf_dict
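# Usage sketch (pdf_dict follows this module's page_*/para_blocks layout):
#   processor = BlockContinuationProcessor()
#   pdf_dict = processor.batch_tag_paras(pdf_dict)    # mark cross-block/cross-page continuations
#   pdf_dict = processor.batch_merge_paras(pdf_dict)  # stitch the tagged paragraph texts together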
from magic_pdf.para.commons import *
if sys.version_info[0] >= 3:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
class BlockTerminationProcessor:
def __init__(self) -> None:
pass
def _is_consistent_lines(
self,
curr_line,
prev_line,
next_line,
consistent_direction, # 0 for prev, 1 for next, 2 for both
):
"""
This function checks if the line is consistent with its neighbors
Parameters
----------
curr_line : dict
current line
prev_line : dict
previous line
next_line : dict
next line
consistent_direction : int
0 for prev, 1 for next, 2 for both
Returns
-------
bool
True if the line is consistent with its neighbors, False otherwise.
"""
curr_line_font_size = curr_line["spans"][0]["size"]
curr_line_font_type = curr_line["spans"][0]["font"].lower()
if consistent_direction == 0:
if prev_line:
prev_line_font_size = prev_line["spans"][0]["size"]
prev_line_font_type = prev_line["spans"][0]["font"].lower()
return curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type
else:
return False
elif consistent_direction == 1:
if next_line:
next_line_font_size = next_line["spans"][0]["size"]
next_line_font_type = next_line["spans"][0]["font"].lower()
return curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
else:
return False
elif consistent_direction == 2:
if prev_line and next_line:
prev_line_font_size = prev_line["spans"][0]["size"]
prev_line_font_type = prev_line["spans"][0]["font"].lower()
next_line_font_size = next_line["spans"][0]["size"]
next_line_font_type = next_line["spans"][0]["font"].lower()
return (curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type) and (
curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
)
else:
return False
else:
return False
def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height):
"""
This function checks if the line is a regular line
Parameters
----------
curr_line_bbox : list
bbox of the current line
prev_line_bbox : list
bbox of the previous line
next_line_bbox : list
bbox of the next line
avg_char_width : float
average of char widths
X0 : float
median of x0 values, which represents the left average boundary of the page
X1 : float
median of x1 values, which represents the right average boundary of the page
avg_line_height : float
average of line heights
Returns
-------
bool
True if the line is a regular line, False otherwise.
"""
horizontal_ratio = 0.5
vertical_ratio = 0.5
horizontal_thres = horizontal_ratio * avg_char_width
vertical_thres = vertical_ratio * avg_line_height
x0, y0, x1, y1 = curr_line_bbox
x0_near_X0 = abs(x0 - X0) < horizontal_thres
x1_near_X1 = abs(x1 - X1) < horizontal_thres
prev_line_is_end_of_para = prev_line_bbox and (abs(prev_line_bbox[2] - X1) > avg_char_width)
sufficient_spacing_above = False
if prev_line_bbox:
vertical_spacing_above = y1 - prev_line_bbox[3]
sufficient_spacing_above = vertical_spacing_above > vertical_thres
sufficient_spacing_below = False
if next_line_bbox:
vertical_spacing_below = next_line_bbox[1] - y0
sufficient_spacing_below = vertical_spacing_below > vertical_thres
return (
(sufficient_spacing_above or sufficient_spacing_below)
or (not x0_near_X0 and not x1_near_X1)
or prev_line_is_end_of_para
)
def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size):
"""
This function checks if the line is a possible start of a paragraph
Parameters
----------
curr_line : dict
current line
prev_line : dict
previous line
next_line : dict
next line
X0 : float
median of x0 values, which represents the left average boundary of the page
X1 : float
median of x1 values, which represents the right average boundary of the page
avg_char_width : float
average of char widths
avg_font_size : float
average font size
Returns
-------
bool
True if the line is a possible start of a paragraph, False otherwise.
"""
start_confidence = 0.5 # Initial confidence of the line being a start of a paragraph
decision_path = [] # Record the decision path
curr_line_bbox = curr_line["bbox"]
prev_line_bbox = prev_line["bbox"] if prev_line else None
next_line_bbox = next_line["bbox"] if next_line else None
indent_ratio = 1
vertical_ratio = 1.5
vertical_thres = vertical_ratio * avg_font_size
left_horizontal_ratio = 0.5
left_horizontal_thres = left_horizontal_ratio * avg_char_width
right_horizontal_ratio = 2.5
right_horizontal_thres = right_horizontal_ratio * avg_char_width
x0, y0, x1, y1 = curr_line_bbox
indent_condition = x0 > X0 + indent_ratio * avg_char_width
if indent_condition:
start_confidence += 0.2
decision_path.append("indent_condition_met")
x0_near_X0 = abs(x0 - X0) < left_horizontal_thres
if x0_near_X0:
start_confidence += 0.1
decision_path.append("x0_near_X0")
x1_near_X1 = abs(x1 - X1) < right_horizontal_thres
if x1_near_X1:
start_confidence += 0.1
decision_path.append("x1_near_X1")
if prev_line is None:
prev_line_is_end_of_para = True
start_confidence += 0.2
decision_path.append("no_prev_line")
else:
prev_line_is_end_of_para, _, _ = self._is_possible_end_of_para(prev_line, next_line, X0, X1, avg_char_width)
if prev_line_is_end_of_para:
start_confidence += 0.1
decision_path.append("prev_line_is_end_of_para")
sufficient_spacing_above = False
if prev_line_bbox:
vertical_spacing_above = y1 - prev_line_bbox[3]
sufficient_spacing_above = vertical_spacing_above > vertical_thres
if sufficient_spacing_above:
start_confidence += 0.2
decision_path.append("sufficient_spacing_above")
sufficient_spacing_below = False
if next_line_bbox:
vertical_spacing_below = next_line_bbox[1] - y0
sufficient_spacing_below = vertical_spacing_below > vertical_thres
if sufficient_spacing_below:
start_confidence += 0.2
decision_path.append("sufficient_spacing_below")
is_regular_line = self._is_regular_line(
curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_font_size
)
if is_regular_line:
start_confidence += 0.1
decision_path.append("is_regular_line")
is_start_of_para = (
(sufficient_spacing_above or sufficient_spacing_below)
or (indent_condition)
or (not indent_condition and x0_near_X0 and x1_near_X1 and not is_regular_line)
or prev_line_is_end_of_para
)
return (is_start_of_para, start_confidence, decision_path)
def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width):
"""
This function checks if the line is a possible end of a paragraph
Parameters
----------
curr_line : dict
current line
next_line : dict
next line
X0 : float
median of x0 values, which represents the left average boundary of the page
X1 : float
median of x1 values, which represents the right average boundary of the page
avg_char_width : float
average of char widths
Returns
-------
bool
True if the line is a possible end of a paragraph, False otherwise.
"""
end_confidence = 0.5 # Initial confidence of the line being an end of a paragraph
decision_path = [] # Record the decision path
curr_line_bbox = curr_line["bbox"]
next_line_bbox = next_line["bbox"] if next_line else None
left_horizontal_ratio = 0.5
right_horizontal_ratio = 0.5
x0, _, x1, y1 = curr_line_bbox
next_x0, next_y0, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
x0_near_X0 = abs(x0 - X0) < left_horizontal_ratio * avg_char_width
if x0_near_X0:
end_confidence += 0.1
decision_path.append("x0_near_X0")
x1_smaller_than_X1 = x1 < X1 - right_horizontal_ratio * avg_char_width
if x1_smaller_than_X1:
end_confidence += 0.1
decision_path.append("x1_smaller_than_X1")
next_line_is_start_of_para = (
next_line_bbox
and (next_x0 > X0 + left_horizontal_ratio * avg_char_width)
and (not is_line_left_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width, direction=1))
)
if next_line_is_start_of_para:
end_confidence += 0.2
decision_path.append("next_line_is_start_of_para")
is_line_left_aligned_from_neighbors_bool = is_line_left_aligned_from_neighbors(
curr_line_bbox, None, next_line_bbox, avg_char_width
)
if is_line_left_aligned_from_neighbors_bool:
end_confidence += 0.1
decision_path.append("line_is_left_aligned_from_neighbors")
is_line_right_aligned_from_neighbors_bool = is_line_right_aligned_from_neighbors(
curr_line_bbox, None, next_line_bbox, avg_char_width
)
if not is_line_right_aligned_from_neighbors_bool:
end_confidence += 0.1
decision_path.append("line_is_not_right_aligned_from_neighbors")
is_end_of_para = end_with_punctuation(curr_line["text"]) and (
(x0_near_X0 and x1_smaller_than_X1)
or (is_line_left_aligned_from_neighbors_bool and not is_line_right_aligned_from_neighbors_bool)
)
return (is_end_of_para, end_confidence, decision_path)
def _cut_paras_per_block(
self,
block,
):
"""
Processes a raw block from PyMuPDF and returns the processed block.
Parameters
----------
block : dict
A raw block from PyMuPDF.
Returns
-------
processed_block : dict
"""
def _construct_para(lines, is_block_title, para_title_level):
"""
Construct a paragraph from given lines.
"""
font_sizes = [span["size"] for line in lines for span in line["spans"]]
avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0
font_colors = [span["color"] for line in lines for span in line["spans"]]
most_common_font_color = max(set(font_colors), key=font_colors.count) if font_colors else None
# font_types = [span["font"] for line in lines for span in line["spans"]]
# most_common_font_type = max(set(font_types), key=font_types.count) if font_types else None
font_type_lengths = {}
for line in lines:
for span in line["spans"]:
font_type = span["font"]
bbox_width = span["bbox"][2] - span["bbox"][0]
if font_type in font_type_lengths:
font_type_lengths[font_type] += bbox_width
else:
font_type_lengths[font_type] = bbox_width
# get the font type with the longest bbox width
most_common_font_type = max(font_type_lengths, key=font_type_lengths.get) if font_type_lengths else None # type: ignore
para_bbox = calculate_para_bbox(lines)
para_text = " ".join(line["text"] for line in lines)
return {
"para_bbox": para_bbox,
"para_text": para_text,
"para_font_type": most_common_font_type,
"para_font_size": avg_font_size,
"para_font_color": most_common_font_color,
"is_para_title": is_block_title,
"para_title_level": para_title_level,
}
block_bbox = block["bbox"]
block_text = block["text"]
block_lines = block["lines"]
X0 = safe_get(block, "X0", 0)
X1 = safe_get(block, "X1", 0)
avg_char_width = safe_get(block, "avg_char_width", 0)
avg_char_height = safe_get(block, "avg_char_height", 0)
avg_font_size = safe_get(block, "avg_font_size", 0)
is_block_title = safe_get(block, "is_block_title", False)
para_title_level = safe_get(block, "block_title_level", 0)
# Segment into paragraphs
para_ranges = []
in_paragraph = False
start_idx_of_para = None
# Create the processed paragraphs
processed_paras = {}
para_bboxes = []
end_idx_of_para = 0
for line_index, line in enumerate(block_lines):
curr_line = line
prev_line = block_lines[line_index - 1] if line_index > 0 else None
next_line = block_lines[line_index + 1] if line_index < len(block_lines) - 1 else None
"""
Start processing paragraphs.
"""
# Check if the line is the start of a paragraph
is_start_of_para, start_confidence, decision_path = self._is_possible_start_of_para(
curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size
)
if not in_paragraph and is_start_of_para:
in_paragraph = True
start_idx_of_para = line_index
# print_green(">>> Start of a paragraph")
# print(" curr_line_text: ", curr_line["text"])
# print(" start_confidence: ", start_confidence)
# print(" decision_path: ", decision_path)
# Check if the line is the end of a paragraph
is_end_of_para, end_confidence, decision_path = self._is_possible_end_of_para(
curr_line, next_line, X0, X1, avg_char_width
)
if in_paragraph and (is_end_of_para or not next_line):
para_ranges.append((start_idx_of_para, line_index))
start_idx_of_para = None
in_paragraph = False
# print_red(">>> End of a paragraph")
# print(" curr_line_text: ", curr_line["text"])
# print(" end_confidence: ", end_confidence)
# print(" decision_path: ", decision_path)
# Add the last paragraph if it is not added
if in_paragraph and start_idx_of_para is not None:
para_ranges.append((start_idx_of_para, len(block_lines) - 1))
# Process the matched paragraphs
for para_index, (start_idx, end_idx) in enumerate(para_ranges):
matched_lines = block_lines[start_idx : end_idx + 1]
para_properties = _construct_para(matched_lines, is_block_title, para_title_level)
para_key = f"para_{len(processed_paras)}"
processed_paras[para_key] = para_properties
para_bboxes.append(para_properties["para_bbox"])
end_idx_of_para = end_idx + 1
# Deal with the remaining lines
if end_idx_of_para < len(block_lines):
unmatched_lines = block_lines[end_idx_of_para:]
unmatched_properties = _construct_para(unmatched_lines, is_block_title, para_title_level)
unmatched_key = f"para_{len(processed_paras)}"
processed_paras[unmatched_key] = unmatched_properties
para_bboxes.append(unmatched_properties["para_bbox"])
block["paras"] = processed_paras
return block
def batch_process_blocks(self, pdf_dict):
"""
Parses the blocks of all pages.
Parameters
----------
pdf_dict : dict
PDF dictionary.
Returns
-------
result_dict : dict
Result dictionary.
"""
num_paras = 0
for page_id, page in pdf_dict.items():
if page_id.startswith("page_"):
para_blocks = []
if "para_blocks" in page.keys():
input_blocks = page["para_blocks"]
for input_block in input_blocks:
new_block = self._cut_paras_per_block(input_block)
para_blocks.append(new_block)
num_paras += len(new_block["paras"])
page["para_blocks"] = para_blocks
pdf_dict["statistics"]["num_paras"] = num_paras
return pdf_dict
import sys
from magic_pdf.libs.commons import fitz
from termcolor import cprint
if sys.version_info[0] >= 3:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
def open_pdf(pdf_path):
try:
pdf_document = fitz.open(pdf_path) # type: ignore
return pdf_document
except Exception as e:
print(f"无法打开PDF文件:{pdf_path}。原因是:{e}")
raise e
def print_green_on_red(text):
cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")
def print_green(text):
print()
cprint(text, "green", attrs=["bold"], end="\n\n")
def print_red(text):
print()
cprint(text, "red", attrs=["bold"], end="\n\n")
def print_yellow(text):
print()
cprint(text, "yellow", attrs=["bold"], end="\n\n")
def safe_get(dict_obj, key, default):
val = dict_obj.get(key)
if val is None:
return default
else:
return val
def is_bbox_overlap(bbox1, bbox2):
"""
This function checks if bbox1 and bbox2 overlap or not
Parameters
----------
bbox1 : list
bbox1
bbox2 : list
bbox2
Returns
-------
bool
True if bbox1 and bbox2 overlap, else False
"""
x0_1, y0_1, x1_1, y1_1 = bbox1
x0_2, y0_2, x1_2, y1_2 = bbox2
if x0_1 > x1_2 or x0_2 > x1_1:
return False
if y0_1 > y1_2 or y0_2 > y1_1:
return False
return True
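# Illustrative sketch (boxes invented for this comment):
#   is_bbox_overlap([0, 0, 10, 10], [5, 5, 15, 15])   ->  True
#   is_bbox_overlap([0, 0, 10, 10], [11, 0, 20, 10])  ->  False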
def is_in_bbox(bbox1, bbox2):
"""
This function checks if bbox1 is in bbox2
Parameters
----------
bbox1 : list
bbox1
bbox2 : list
bbox2
Returns
-------
bool
True if bbox1 is in bbox2, else False
"""
x0_1, y0_1, x1_1, y1_1 = bbox1
x0_2, y0_2, x1_2, y1_2 = bbox2
if x0_1 >= x0_2 and y0_1 >= y0_2 and x1_1 <= x1_2 and y1_1 <= y1_2:
return True
else:
return False
def calculate_para_bbox(lines):
"""
This function calculates the minimum bbox of the paragraph
Parameters
----------
lines : list
lines
Returns
-------
para_bbox : list
bbox of the paragraph
"""
x0 = min(line["bbox"][0] for line in lines)
y0 = min(line["bbox"][1] for line in lines)
x1 = max(line["bbox"][2] for line in lines)
y1 = max(line["bbox"][3] for line in lines)
return [x0, y0, x1, y1]
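# Illustrative sketch (lines invented for this comment): the result is the
# union of the line bboxes:
#   calculate_para_bbox([{'bbox': [10, 10, 50, 20]}, {'bbox': [12, 22, 60, 32]}])
#   -> [10, 10, 60, 32]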
def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
"""
This function checks if the line is right aligned from its neighbors
Parameters
----------
curr_line_bbox : list
bbox of the current line
prev_line_bbox : list
bbox of the previous line
next_line_bbox : list
bbox of the next line
avg_char_width : float
average of char widths
direction : int
0 for prev, 1 for next, 2 for both
Returns
-------
bool
True if the line is right aligned from its neighbors, False otherwise.
"""
horizontal_ratio = 0.5
horizontal_thres = horizontal_ratio * avg_char_width
_, _, x1, _ = curr_line_bbox
_, _, prev_x1, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
_, _, next_x1, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
if direction == 0:
return abs(x1 - prev_x1) < horizontal_thres
elif direction == 1:
return abs(x1 - next_x1) < horizontal_thres
elif direction == 2:
return abs(x1 - prev_x1) < horizontal_thres and abs(x1 - next_x1) < horizontal_thres
else:
return False
def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
"""
This function checks if the line is left aligned from its neighbors
Parameters
----------
curr_line_bbox : list
bbox of the current line
prev_line_bbox : list
bbox of the previous line
next_line_bbox : list
bbox of the next line
avg_char_width : float
average of char widths
direction : int
0 for prev, 1 for next, 2 for both
Returns
-------
bool
True if the line is left aligned from its neighbors, False otherwise.
"""
horizontal_ratio = 0.5
horizontal_thres = horizontal_ratio * avg_char_width
x0, _, _, _ = curr_line_bbox
prev_x0, _, _, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
next_x0, _, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
if direction == 0:
return abs(x0 - prev_x0) < horizontal_thres
elif direction == 1:
return abs(x0 - next_x0) < horizontal_thres
elif direction == 2:
return abs(x0 - prev_x0) < horizontal_thres and abs(x0 - next_x0) < horizontal_thres
else:
return False
def end_with_punctuation(line_text):
"""
This function checks if the line ends with punctuation marks
"""
english_end_puncs = [".", "?", "!"]
chinese_end_puncs = ["。", "?", "!"]
end_puncs = english_end_puncs + chinese_end_puncs
last_non_space_char = None
for ch in line_text[::-1]:
if not ch.isspace():
last_non_space_char = ch
break
if last_non_space_char is None:
return False
return last_non_space_char in end_puncs
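# Illustrative sketch: trailing whitespace is skipped before the check:
#   end_with_punctuation('Done. ')     ->  True
#   end_with_punctuation('a heading')  ->  False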
def is_nested_list(lst):
if isinstance(lst, list):
return any(isinstance(sub, list) for sub in lst)
return False
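# Illustrative sketch:
#   is_nested_list([[1, 2], [3, 4]])  ->  True
#   is_nested_list([1, 2, 3])         ->  False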
import math
from collections import defaultdict
from magic_pdf.para.commons import *
if sys.version_info[0] >= 3:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
class HeaderFooterProcessor:
def __init__(self) -> None:
pass
def get_most_common_bboxes(self, bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
"""
This function gets the most common bboxes from the bboxes
Parameters
----------
bboxes : list
bboxes
page_height : float
height of the page
position : str, optional
"top" or "bottom", by default "top"
threshold : float, optional
threshold, by default 0.25
num_bboxes : int, optional
number of bboxes to return, by default 3
min_frequency : int, optional
minimum frequency of the bbox, by default 2
Returns
-------
common_bboxes : list
common bboxes
"""
# Filter bbox by position
if position == "top":
filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold]
else:
filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)]
# Find the most common bbox
bbox_count = defaultdict(int)
for bbox in filtered_bboxes:
bbox_count[tuple(bbox)] += 1
# Get the most frequently occurring bbox, but only consider it when the frequency exceeds min_frequency
common_bboxes = [
bbox for bbox, count in sorted(bbox_count.items(), key=lambda item: item[1], reverse=True) if count >= min_frequency
][:num_bboxes]
return common_bboxes
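# Illustrative sketch (boxes invented for this comment): a bbox that appears
# three times in the top 25% of an 800pt-high page clears min_frequency=2 and
# is returned as a tuple:
#   proc = HeaderFooterProcessor()
#   proc.get_most_common_bboxes([[0, 10, 100, 30]] * 3 + [[0, 500, 100, 520]], 800)
#   -> [(0, 10, 100, 30)]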
def detect_footer_header(self, result_dict, similarity_threshold=0.5):
"""
This function detects the header and footer of the document.
Parameters
----------
result_dict : dict
result dictionary
Returns
-------
result_dict : dict
result dictionary
"""
def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)
def is_single_line_block(block):
# Determine based on the width and height of the block
block_width = block["X1"] - block["X0"]
block_height = block["bbox"][3] - block["bbox"][1]
# If the height of the block is close to the average character height and the width is large, it is considered a single line
return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3
# Traverse all blocks in the document
single_preproc_blocks = 0
total_blocks = 0
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_key, block in blocks.items():
if block_key.startswith("block_"):
total_blocks += 1
if is_single_line_block(block):
single_preproc_blocks += 1
# If there are no blocks, skip the header and footer detection
if total_blocks == 0:
print("No blocks found. Skipping header/footer detection.")
return result_dict
# If most of the blocks are single-line, skip the header and footer detection
if single_preproc_blocks / total_blocks > 0.5: # 50% of the blocks are single-line
return result_dict
# Collect the bounding boxes of all blocks
all_bboxes = []
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_key, block in blocks.items():
if block_key.startswith("block_"):
all_bboxes.append(block["bbox"])
# Get the height of the page
page_height = max(bbox[3] for bbox in all_bboxes)
# Get the most common bbox lists for headers and footers
common_header_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="top") if all_bboxes else []
common_footer_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="bottom") if all_bboxes else []
# Detect and mark headers and footers
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_key, block in blocks.items():
if block_key.startswith("block_"):
bbox = block["bbox"]
text = block["text"]
is_header = compare_bbox_with_list(bbox, common_header_bboxes)
is_footer = compare_bbox_with_list(bbox, common_footer_bboxes)
block["is_header"] = int(is_header)
block["is_footer"] = int(is_footer)
return result_dict
class NonHorizontalTextProcessor:
def __init__(self) -> None:
pass
def detect_non_horizontal_texts(self, result_dict):
"""
This function detects watermarks and vertical margin notes in the document.
Watermarks are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
If these conditions are met, the blocks are highly likely to be watermarks, as opposed to headers or footers, which can change from page to page.
If the direction of these blocks is not horizontal, they are definitely considered to be watermarks.
Vertical margin notes are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
If these conditions are met, the blocks are highly likely to be vertical margin notes, which typically appear on the left and right sides of the page.
If the direction of these blocks is vertical, they are definitely considered to be vertical margin notes.
Parameters
----------
result_dict : dict
The result dictionary.
Returns
-------
result_dict : dict
The updated result dictionary.
"""
# Dictionary to store information about potential watermarks
potential_watermarks = {}
potential_margin_notes = {}
for page_id, page_content in result_dict.items():
if page_id.startswith("page_"):
for block_id, block_data in page_content.items():
if block_id.startswith("block_"):
if "dir" in block_data:
coordinates_text = (tuple(block_data["bbox"]), block_data["text"]) # Tuple of coordinates and text; bbox cast to tuple so the key is hashable
angle = math.atan2(block_data["dir"][1], block_data["dir"][0])
angle = abs(math.degrees(angle))
if angle > 5 and angle < 85: # Oblique text direction: likely a watermark
if coordinates_text in potential_watermarks:
potential_watermarks[coordinates_text] += 1
else:
potential_watermarks[coordinates_text] = 1
if angle > 85 and angle < 105: # Check if direction is vertical
if coordinates_text in potential_margin_notes:
potential_margin_notes[coordinates_text] += 1 # Increment count
else:
potential_margin_notes[coordinates_text] = 1 # Initialize count
# Identify watermarks by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
watermark_threshold = len(result_dict) // 2
watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold}
# Identify margin notes by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
margin_note_threshold = len(result_dict) // 2
margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold}
# Add watermark information to the result dictionary
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_id, block_data in blocks.items():
if not block_id.startswith("block_"):
continue
coordinates_text = (tuple(block_data["bbox"]), block_data["text"])
if coordinates_text in watermarks:
block_data["is_watermark"] = 1
else:
block_data["is_watermark"] = 0
if coordinates_text in margin_notes:
block_data["is_vertical_margin_note"] = 1
else:
block_data["is_vertical_margin_note"] = 0
return result_dict
class NoiseRemover:
def __init__(self) -> None:
pass
def skip_data_noises(self, result_dict):
"""
This function skips data noise, including overlapping blocks, headers, footers, watermarks, vertical margin notes, and titles
"""
filtered_result_dict = {}
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
filtered_blocks = {}
for block_id, block in blocks.items():
if block_id.startswith("block_"):
if any(
block.get(key, 0)
for key in [
"is_overlap",
"is_header",
"is_footer",
"is_watermark",
"is_vertical_margin_note",
"is_block_title",
]
):
continue
filtered_blocks[block_id] = block
if filtered_blocks:
filtered_result_dict[page_id] = filtered_blocks
return filtered_result_dict
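# Hedged usage sketch: any block carrying a truthy noise flag is dropped, and
# pages left with no blocks disappear from the returned dict entirely.
def _demo_skip_data_noises():
    result_dict = {
        "page_0": {
            "block_0": {"text": "body text", "is_watermark": 0},
            "block_1": {"text": "CONFIDENTIAL", "is_watermark": 1},
        }
    }
    cleaned = NoiseRemover().skip_data_noises(result_dict)
    assert list(cleaned["page_0"].keys()) == ["block_0"]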
from magic_pdf.libs.commons import fitz
from magic_pdf.para.commons import *
if sys.version_info[0] >= 3:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
class DrawAnnos:
"""
This class draws annotations on the pdf file
----------------------------------------
Color Code
----------------------------------------
Red: (1, 0, 0)
Green: (0, 1, 0)
Blue: (0, 0, 1)
Yellow: (1, 1, 0) - mix of red and green
Cyan: (0, 1, 1) - mix of green and blue
Magenta: (1, 0, 1) - mix of red and blue
White: (1, 1, 1) - red, green and blue full intensity
Black: (0, 0, 0) - no color component whatsoever
Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
"""
def __init__(self) -> None:
pass
def __is_nested_list(self, lst):
"""
This function returns True if the given list is a nested list of any degree.
"""
        if isinstance(lst, list):
            # any element that is itself a list makes the list nested
            return any(isinstance(i, list) for i in lst)
return False
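    # Hedged examples for the check above: __is_nested_list([1, 2]) is False,
    # __is_nested_list([[0, 0, 1, 1]]) is True, and a non-list argument such
    # as __is_nested_list(3) is False.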
def __valid_rect(self, bbox):
# Ensure that the rectangle is not empty or invalid
        if isinstance(bbox[0], list):
            return False  # a nested list cannot be a valid rect
else:
return bbox[0] < bbox[2] and bbox[1] < bbox[3]
def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
"""
This function draws the nested boxes
Parameters
----------
page : fitz.Page
page
nested_bbox : list
nested bbox
color : tuple
color, by default (0, 1, 1) # draw with cyan color for combined paragraph
"""
if self.__is_nested_list(nested_bbox): # If it's a nested list
for bbox in nested_bbox:
self.__draw_nested_boxes(page, bbox, color) # Recursively call the function
elif self.__valid_rect(nested_bbox): # If valid rectangle
para_rect = fitz.Rect(nested_bbox)
para_anno = page.add_rect_annot(para_rect)
para_anno.set_colors(stroke=color) # draw with cyan color for combined paragraph
para_anno.set_border(width=1)
para_anno.update()
def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
pdf_doc = open_pdf(input_pdf_path)
if pdf_dic is None:
pdf_dic = {}
if output_pdf_path is None:
output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf")
for page_id, page in enumerate(pdf_doc): # type: ignore
page_key = f"page_{page_id}"
            for ele_key, ele_data in pdf_dic.get(page_key, {}).items():  # tolerate pages missing from pdf_dic
if ele_key == "para_blocks":
para_blocks = ele_data
for para_block in para_blocks:
if "paras" in para_block.keys():
paras = para_block["paras"]
for para_key, para_content in paras.items():
para_bbox = para_content["para_bbox"]
# print(f"para_bbox: {para_bbox}")
# print(f"is a nested list: {self.__is_nested_list(para_bbox)}")
if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
color = (0, 1, 1)
self.__draw_nested_boxes(
page, para_bbox, color
) # draw with cyan color for combined paragraph
else:
if self.__valid_rect(para_bbox):
para_rect = fitz.Rect(para_bbox)
para_anno = page.add_rect_annot(para_rect)
para_anno.set_colors(stroke=(0, 1, 0)) # draw with green color for normal paragraph
para_anno.set_border(width=0.5)
para_anno.update()
is_para_title = para_content["is_para_title"]
if is_para_title:
                                    if self.__is_nested_list(para_content["para_bbox"]) and len(para_content["para_bbox"]) > 1:
                                        color = (0, 0, 1)
                                        self.__draw_nested_boxes(
                                            page, para_content["para_bbox"], color
                                        )  # draw with blue color for combined title
                                    else:
                                        if self.__valid_rect(para_content["para_bbox"]):
                                            para_rect = fitz.Rect(para_content["para_bbox"])
                                            para_anno = page.add_rect_annot(para_rect)
                                            para_anno.set_colors(stroke=(0, 0, 1))  # draw with blue color for normal title
                                            para_anno.set_border(width=0.5)
                                            para_anno.update()
pdf_doc.save(output_pdf_path)
pdf_doc.close()
class DenseSingleLineBlockException(Exception):
"""
This class defines the exception type for dense single line-block.
"""
def __init__(self, message="DenseSingleLineBlockException"):
self.message = message
super().__init__(self.message)
def __str__(self):
return f"{self.message}"
def __repr__(self):
return f"{self.message}"
class TitleDetectionException(Exception):
"""
This class defines the exception type for title detection.
"""
def __init__(self, message="TitleDetectionException"):
self.message = message
super().__init__(self.message)
def __str__(self):
return f"{self.message}"
def __repr__(self):
return f"{self.message}"
class TitleLevelException(Exception):
"""
This class defines the exception type for title level.
"""
def __init__(self, message="TitleLevelException"):
self.message = message
super().__init__(self.message)
def __str__(self):
return f"{self.message}"
def __repr__(self):
return f"{self.message}"
class ParaSplitException(Exception):
"""
This class defines the exception type for paragraph splitting.
"""
def __init__(self, message="ParaSplitException"):
self.message = message
super().__init__(self.message)
def __str__(self):
return f"{self.message}"
def __repr__(self):
return f"{self.message}"
class ParaMergeException(Exception):
"""
This class defines the exception type for paragraph merging.
"""
def __init__(self, message="ParaMergeException"):
self.message = message
super().__init__(self.message)
def __str__(self):
return f"{self.message}"
def __repr__(self):
return f"{self.message}"
class DiscardByException:
"""
This class discards pdf files by exception
"""
def __init__(self) -> None:
pass
def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException):
"""
        This function discards pdf files by the dense single-line-block exception
        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : DenseSingleLineBlockException
            exception whose message is returned when the pdf is discarded
        Returns
        -------
        error_message : str or None
"""
exception_page_nums = 0
page_num = 0
for page_id, page in pdf_dic.items():
if page_id.startswith("page_"):
page_num += 1
if "preproc_blocks" in page.keys():
preproc_blocks = page["preproc_blocks"]
all_single_line_blocks = []
for block in preproc_blocks:
if len(block["lines"]) == 1:
all_single_line_blocks.append(block)
if len(preproc_blocks) > 0 and len(all_single_line_blocks) / len(preproc_blocks) > 0.9:
exception_page_nums += 1
if page_num == 0:
return None
        if exception_page_nums / page_num > 0.1:  # if more than 10% of pages are dominated by single-line blocks, discard the file
return exception.message
return None
def discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException):
"""
        This function discards pdf files by the title-detection exception
        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : TitleDetectionException
            exception instance
        Returns
        -------
        error_message : str or None
"""
# return exception.message
return None
def discard_by_title_level(self, pdf_dic, exception: TitleLevelException):
"""
        This function discards pdf files by the title-level exception
        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : TitleLevelException
            exception instance
        Returns
        -------
        error_message : str or None
"""
# return exception.message
return None
def discard_by_split_para(self, pdf_dic, exception: ParaSplitException):
"""
        This function discards pdf files by the paragraph-split exception
        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : ParaSplitException
            exception instance
        Returns
        -------
        error_message : str or None
"""
# return exception.message
return None
def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException):
"""
        This function discards pdf files by the paragraph-merge exception
        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : ParaMergeException
            exception instance
        Returns
        -------
        error_message : str or None
"""
# return exception.message
return None
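# Hedged sketch of the single-line-block rule above: one of the two pages is
# dominated by single-line blocks (ratio 1.0 > 0.9), so the exception-page
# ratio is 0.5 > 0.1 and the file is discarded with the exception message.
def _demo_discard_by_single_line_block():
    pdf_dic = {
        "page_0": {"preproc_blocks": [{"lines": ["l1"]}, {"lines": ["l2"]}]},
        "page_1": {"preproc_blocks": [{"lines": ["l1", "l2"]}]},
    }
    error_message = DiscardByException().discard_by_single_line_block(
        pdf_dic, exception=DenseSingleLineBlockException()
    )
    assert error_message == "DenseSingleLineBlockException"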
import math
from magic_pdf.para.commons import *
if sys.version_info[0] >= 3:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
class LayoutFilterProcessor:
def __init__(self) -> None:
pass
def batch_process_blocks(self, pdf_dict):
for page_id, blocks in pdf_dict.items():
if page_id.startswith("page_"):
if "layout_bboxes" in blocks.keys() and "para_blocks" in blocks.keys():
layout_bbox_objs = blocks["layout_bboxes"]
if layout_bbox_objs is None:
continue
layout_bboxes = [bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs]
                    # Round up each of x0, y0, x1, y1 of every layout_bbox with math.ceil
layout_bboxes = [
[math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)] for x0, y0, x1, y1 in layout_bboxes
]
                    para_blocks = blocks["para_blocks"]
                    if para_blocks is None:
                        continue
                    # Default every block to "outside any layout", then flag the blocks that fall inside a layout bbox
                    # (flagging in a single pass avoids a later layout bbox resetting a flag set by an earlier one)
                    for para_block in para_blocks:
                        para_block["in_layout"] = 0
                    for lb_bbox in layout_bboxes:
                        for para_block in para_blocks:
                            if is_in_bbox(para_block["bbox"], lb_bbox):
                                para_block["in_layout"] = 1
                    blocks["para_blocks"] = para_blocks
return pdf_dict
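# Hedged usage sketch (assumption: is_in_bbox, imported via
# magic_pdf.para.commons, returns True when the first bbox lies inside the
# second). Each para block ends up with an "in_layout" flag of 1 or 0.
def _demo_layout_filter():
    pdf_dict = {
        "page_0": {
            "layout_bboxes": [{"layout_bbox": (0, 0, 300, 400)}],
            "para_blocks": [
                {"bbox": (10, 10, 100, 30)},  # inside the layout -> expect 1
                {"bbox": (500, 500, 600, 520)},  # outside -> expect 0
            ],
        }
    }
    out = LayoutFilterProcessor().batch_process_blocks(pdf_dict)
    return [block["in_layout"] for block in out["page_0"]["para_blocks"]]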
import os
import json
from magic_pdf.para.commons import *
from magic_pdf.para.raw_processor import RawBlockProcessor
from magic_pdf.para.layout_match_processor import LayoutFilterProcessor
from magic_pdf.para.stats import BlockStatisticsCalculator
from magic_pdf.para.stats import DocStatisticsCalculator
from magic_pdf.para.title_processor import TitleProcessor
from magic_pdf.para.block_termination_processor import BlockTerminationProcessor
from magic_pdf.para.block_continuation_processor import BlockContinuationProcessor
from magic_pdf.para.draw import DrawAnnos
from magic_pdf.para.exceptions import (
DenseSingleLineBlockException,
TitleDetectionException,
TitleLevelException,
ParaSplitException,
ParaMergeException,
DiscardByException,
)
if sys.version_info[0] >= 3:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
class ParaProcessPipeline:
def __init__(self) -> None:
pass
def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None):
"""
This function processes the paragraphs, including:
1. Read raw input json file into pdf_dic
2. Detect and replace equations
3. Combine spans into a natural line
4. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
5. Compute statistics for each block
6. Detect titles in the document
7. Detect paragraphs inside each block
8. Divide the level of the titles
9. Detect and combine paragraphs from different blocks into one paragraph
        10. Check whether the final results, after detecting titles, splitting paragraphs within blocks, and merging paragraphs across blocks, are plausible and reasonable.
        11. Draw annotations on the pdf file
        Parameters
        ----------
        pdf_info_dict : dict
            the pdf dictionary.
            Notice: data noises, including overlap blocks, headers, footers, watermarks and vertical margin notes, have been removed already.
        para_debug_mode : str or None
            "simple" or "full" debug mode; None disables debug output
        input_pdf_path : str
            path to the input pdf file
        output_pdf_path : str
            path to the output pdf file
        Returns
        -------
        pdf_dict : dict
            result dictionary
        """
error_info = None
output_json_file = ""
output_dir = ""
if input_pdf_path is not None:
input_pdf_path = os.path.abspath(input_pdf_path)
# print_green_on_red(f">>>>>>>>>>>>>>>>>>> Process the paragraphs of {input_pdf_path}")
if output_pdf_path is not None:
output_dir = os.path.dirname(output_pdf_path)
output_json_file = f"{output_dir}/pdf_dic.json"
def __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode):
"""
Save the pdf_dic to a json file
"""
output_pdf_file_name = os.path.basename(output_pdf_path)
# output_dir = os.path.dirname(output_pdf_path)
            output_dir = "/tmp/pdf_parse"  # fixed scratch directory for stage dumps
output_pdf_file_name = output_pdf_file_name.replace(".pdf", f"_stage_{stage}.json")
pdf_dic_json_fpath = os.path.join(output_dir, output_pdf_file_name)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
if para_debug_mode == "full":
with open(pdf_dic_json_fpath, "w", encoding="utf-8") as f:
json.dump(pdf_dic, f, indent=2, ensure_ascii=False)
            # Verify that the output file was actually written
            if not os.path.exists(pdf_dic_json_fpath):
                print_red(f"Failed to save the pdf_dic to {pdf_dic_json_fpath}")
                return None
            else:
                print_green(f"Saved the pdf_dic to {pdf_dic_json_fpath}")
return pdf_dic_json_fpath
"""
Preprocess the lines of block
"""
# Find and replace the interline and inline equations, should be better done before the paragraph processing
# Create "para_blocks" for each page.
# equationProcessor = EquationsProcessor()
# pdf_dic = equationProcessor.batch_process_blocks(pdf_info_dict)
# Combine spans into a natural line
rawBlockProcessor = RawBlockProcessor()
pdf_dic = rawBlockProcessor.batch_process_blocks(pdf_info_dict)
# print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
# Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
layoutFilter = LayoutFilterProcessor()
pdf_dic = layoutFilter.batch_process_blocks(pdf_dic)
# Compute statistics for each block
blockStatisticsCalculator = BlockStatisticsCalculator()
pdf_dic = blockStatisticsCalculator.batch_process_blocks(pdf_dic)
# print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
        # Compute statistics for all blocks (namely, the whole pdf document)
docStatisticsCalculator = DocStatisticsCalculator()
pdf_dic = docStatisticsCalculator.calc_stats_of_doc(pdf_dic)
# print(f"pdf_dic['statistics']: {pdf_dic['statistics']}", end="\n\n")
# Dump the first three stages of pdf_dic to a json file
if para_debug_mode == "full":
pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode)
"""
Detect titles in the document
"""
doc_statistics = pdf_dic["statistics"]
titleProcessor = TitleProcessor(doc_statistics)
pdf_dic = titleProcessor.batch_process_blocks_detect_titles(pdf_dic)
if para_debug_mode == "full":
pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="1", para_debug_mode=para_debug_mode)
"""
Detect and divide the level of the titles
"""
titleProcessor = TitleProcessor()
pdf_dic = titleProcessor.batch_process_blocks_recog_title_level(pdf_dic)
if para_debug_mode == "full":
pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="2", para_debug_mode=para_debug_mode)
"""
Detect and split paragraphs inside each block
"""
blockInnerParasProcessor = BlockTerminationProcessor()
pdf_dic = blockInnerParasProcessor.batch_process_blocks(pdf_dic)
if para_debug_mode == "full":
pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode=para_debug_mode)
# pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode="full")
# print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
"""
Detect and combine paragraphs from different blocks into one paragraph
"""
blockContinuationProcessor = BlockContinuationProcessor()
pdf_dic = blockContinuationProcessor.batch_tag_paras(pdf_dic)
pdf_dic = blockContinuationProcessor.batch_merge_paras(pdf_dic)
if para_debug_mode == "full":
pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode=para_debug_mode)
# pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode="full")
# print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
"""
Discard pdf files by checking exceptions and return the error info to the caller
"""
discardByException = DiscardByException()
is_discard_by_single_line_block = discardByException.discard_by_single_line_block(
pdf_dic, exception=DenseSingleLineBlockException()
)
is_discard_by_title_detection = discardByException.discard_by_title_detection(
pdf_dic, exception=TitleDetectionException()
)
is_discard_by_title_level = discardByException.discard_by_title_level(pdf_dic, exception=TitleLevelException())
is_discard_by_split_para = discardByException.discard_by_split_para(pdf_dic, exception=ParaSplitException())
is_discard_by_merge_para = discardByException.discard_by_merge_para(pdf_dic, exception=ParaMergeException())
"""
if any(
info is not None
for info in [
is_discard_by_single_line_block,
is_discard_by_title_detection,
is_discard_by_title_level,
is_discard_by_split_para,
is_discard_by_merge_para,
]
):
error_info = next(
(
info
for info in [
is_discard_by_single_line_block,
is_discard_by_title_detection,
is_discard_by_title_level,
is_discard_by_split_para,
is_discard_by_merge_para,
]
if info is not None
),
None,
)
return pdf_dic, error_info
if any(
info is not None
for info in [
is_discard_by_single_line_block,
is_discard_by_title_detection,
is_discard_by_title_level,
is_discard_by_split_para,
is_discard_by_merge_para,
]
):
error_info = next(
(
info
for info in [
is_discard_by_single_line_block,
is_discard_by_title_detection,
is_discard_by_title_level,
is_discard_by_split_para,
is_discard_by_merge_para,
]
if info is not None
),
None,
)
return pdf_dic, error_info
"""
"""
Dump the final pdf_dic to a json file
"""
if para_debug_mode is not None:
with open(output_json_file, "w", encoding="utf-8") as f:
json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
"""
Draw the annotations
"""
if is_discard_by_single_line_block is not None:
error_info = is_discard_by_single_line_block
elif is_discard_by_title_detection is not None:
error_info = is_discard_by_title_detection
elif is_discard_by_title_level is not None:
error_info = is_discard_by_title_level
elif is_discard_by_split_para is not None:
error_info = is_discard_by_split_para
elif is_discard_by_merge_para is not None:
error_info = is_discard_by_merge_para
if error_info is not None:
return pdf_dic, error_info
"""
Dump the final pdf_dic to a json file
"""
if para_debug_mode is not None:
with open(output_json_file, "w", encoding="utf-8") as f:
json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
"""
Draw the annotations
"""
if para_debug_mode is not None:
drawAnnos = DrawAnnos()
drawAnnos.draw_annos(input_pdf_path, pdf_dic, output_pdf_path)
"""
Remove the intermediate files which are generated in the process of paragraph processing if debug_mode is simple
"""
if para_debug_mode is not None:
for fpath in os.listdir(output_dir):
if fpath.endswith(".json") and "stage" in fpath:
os.remove(os.path.join(output_dir, fpath))
return pdf_dic, error_info
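# Hedged usage sketch: with para_debug_mode=None the pipeline writes no JSON
# dumps and no annotated PDF; the paths below are placeholders, and
# pdf_info_dict is assumed to be the pre-denoised dictionary the docstring
# describes.
def _demo_para_process_pipeline(pdf_info_dict):
    pdf_dict, error_info = ParaProcessPipeline().para_process_pipeline(
        pdf_info_dict,
        para_debug_mode=None,
        input_pdf_path="input.pdf",
        output_pdf_path="output/input_anno.pdf",
    )
    if error_info is not None:
        print(f"document discarded: {error_info}")
    return pdf_dict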
from sklearn.cluster import DBSCAN
import numpy as np
from loguru import logger
from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout
from magic_pdf.libs.ocr_content_type import ContentType
LINE_STOP_FLAG = ['.', '!', '?', '。', '!', '?', ':', ':', ')', ')', ';']
INLINE_EQUATION = ContentType.InlineEquation
INTERLINE_EQUATION = ContentType.InterlineEquation
TEXT = ContentType.Text
def __get_span_text(span):
c = span.get('content', '')
if len(c)==0:
c = span.get('image_path', '')
return c
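# Hedged examples for the helper above: text spans yield their content, while
# image spans (no "content" key) fall back to their stored image path.
def _demo_get_span_text():
    assert __get_span_text({'content': 'Hello'}) == 'Hello'
    assert __get_span_text({'image_path': 'images/p0.png'}) == 'images/p0.png'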
def __detect_list_lines(lines, new_layout_bboxes, lang):
"""
探测是否包含了列表,并且把列表的行分开.
这样的段落特点是,顶格字母大写/数字,紧跟着几行缩进的。缩进的行首字母含小写的。
"""
def find_repeating_patterns(lst):
indices = []
ones_indices = []
i = 0
        while i < len(lst) - 1:  # make sure at least 2 elements remain
            if lst[i] == 1 and lst[i+1] in [2, 3]:  # extra check to guard against consecutive 1s
start = i
ones_in_this_interval = [i]
i += 1
while i < len(lst) and lst[i] in [2, 3]:
i += 1
                # Verify whether the next sequence meets the condition
if i < len(lst) - 1 and lst[i] == 1 and lst[i+1] in [2, 3] and lst[i-1] in [2, 3]:
while i < len(lst) and lst[i] in [1, 2, 3]:
if lst[i] == 1:
ones_in_this_interval.append(i)
i += 1
indices.append((start, i - 1))
ones_indices.append(ones_in_this_interval)
else:
i += 1
else:
i += 1
return indices, ones_indices
"""===================="""
def split_indices(slen, index_array):
result = []
last_end = 0
for start, end in sorted(index_array):
            if start > last_end:
                # Mark the stretch between the previous interval's end and this interval's start as "text"
                result.append(('text', last_end, start - 1))
            # Mark the interval itself as "list"
            result.append(('list', start, end))
            last_end = end + 1
        if last_end < slen:
            # Mark whatever remains after the last interval as "text"
            result.append(('text', last_end, slen - 1))
return result
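    # Hedged illustration of the two helpers above (values invented): for the
    # line encoding [1, 2, 3, 1, 2, 4], find_repeating_patterns returns
    # ([(0, 4)], [[0, 3]]), and split_indices(6, [(0, 4)]) yields
    # [('list', 0, 4), ('text', 5, 5)].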
"""===================="""
if lang!='en':
return lines, None
else:
total_lines = len(lines)
line_fea_encode = []
"""
对每一行进行特征编码,编码规则如下:
1. 如果行顶格,且大写字母开头或者数字开头,编码为1
2. 如果顶格,其他非大写开头编码为4
3. 如果非顶格,首字符大写,编码为2
4. 如果非顶格,首字符非大写编码为3
"""
for l in lines:
first_char = __get_span_text(l['spans'][0])[0]
layout_left = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)[0]
if l['bbox'][0] == layout_left:
if first_char.isupper() or first_char.isdigit():
line_fea_encode.append(1)
else:
line_fea_encode.append(4)
else:
if first_char.isupper():
line_fea_encode.append(2)
else:
line_fea_encode.append(3)
        # Split by the encoding: runs where a 1 is followed by 2/3 at least twice in a row are treated as lists.
        list_indice, list_start_idx = find_repeating_patterns(line_fea_encode)
        # if len(list_indice)>0:
        #     logger.info(f"Found a list, list line ranges: {list_indice}, {list_start_idx}")
        # TODO check whether the indented lines inside these lists are left-aligned.
segments = []
for start, end in list_indice:
for i in range(start, end+1):
if i>0:
                    if line_fea_encode[i] == 4:
                        # logger.info(f"Line {i} of the list lines is not flush left")
                        break
        # else:
        #     logger.info(f"Lines {start} to {end} form a list")
return split_indices(total_lines, list_indice), list_start_idx
def __valign_lines(blocks, layout_bboxes):
"""
在一个layoutbox内对齐行的左侧和右侧。
扫描行的左侧和右侧,如果x0, x1差距不超过一个阈值,就强行对齐到所处layout的左右两侧(和layout有一段距离)。
3是个经验值,TODO,计算得来,可以设置为1.5个正文字符。
"""
min_distance = 3
min_sample = 2
new_layout_bboxes = []
for layout_box in layout_bboxes:
blocks_in_layoutbox = [b for b in blocks if is_in_layout(b['bbox'], layout_box['layout_bbox'])]
if len(blocks_in_layoutbox)==0:
continue
x0_lst = np.array([[line['bbox'][0], 0] for block in blocks_in_layoutbox for line in block['lines']])
x1_lst = np.array([[line['bbox'][2], 0] for block in blocks_in_layoutbox for line in block['lines']])
x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
x1_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x1_lst)
x0_uniq_label = np.unique(x0_clusters.labels_)
x1_uniq_label = np.unique(x1_clusters.labels_)
        x0_2_new_val = {}  # maps old values to their new snapped values
x1_2_new_val = {}
for label in x0_uniq_label:
if label==-1:
continue
x0_index_of_label = np.where(x0_clusters.labels_==label)
x0_raw_val = x0_lst[x0_index_of_label][:,0]
x0_new_val = np.min(x0_lst[x0_index_of_label][:,0])
x0_2_new_val.update({idx: x0_new_val for idx in x0_raw_val})
for label in x1_uniq_label:
if label==-1:
continue
x1_index_of_label = np.where(x1_clusters.labels_==label)
x1_raw_val = x1_lst[x1_index_of_label][:,0]
x1_new_val = np.max(x1_lst[x1_index_of_label][:,0])
x1_2_new_val.update({idx: x1_new_val for idx in x1_raw_val})
for block in blocks_in_layoutbox:
for line in block['lines']:
x0, x1 = line['bbox'][0], line['bbox'][2]
if x0 in x0_2_new_val:
line['bbox'][0] = int(x0_2_new_val[x0])
if x1 in x1_2_new_val:
line['bbox'][2] = int(x1_2_new_val[x1])
                # lines that could not be aligned are left untouched
        # The line widths inside blocks changed, so recompute each block's bbox
for block in blocks_in_layoutbox:
block['bbox'] = [min([line['bbox'][0] for line in block['lines']]),
min([line['bbox'][1] for line in block['lines']]),
max([line['bbox'][2] for line in block['lines']]),
max([line['bbox'][3] for line in block['lines']])]
"""新计算layout的bbox,因为block的bbox变了。"""
layout_x0 = min([block['bbox'][0] for block in blocks_in_layoutbox])
layout_y0 = min([block['bbox'][1] for block in blocks_in_layoutbox])
layout_x1 = max([block['bbox'][2] for block in blocks_in_layoutbox])
layout_y1 = max([block['bbox'][3] for block in blocks_in_layoutbox])
new_layout_bboxes.append([layout_x0, layout_y0, layout_x1, layout_y1])
return new_layout_bboxes
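# Hedged illustration of the snapping step above (values invented): x0 values
# within min_distance of each other land in one DBSCAN cluster and get snapped
# to the cluster minimum; isolated values are labelled -1 (noise) and stay
# untouched.
def _demo_x0_clustering():
    x0_lst = np.array([[70.4, 0], [71.9, 0], [120.0, 0]])
    labels = DBSCAN(eps=3, min_samples=2).fit(x0_lst).labels_
    # 70.4 and 71.9 share a cluster label; 120.0 is noise.
    return labels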
def __align_text_in_layout(blocks, layout_bboxes):
"""
由于ocr出来的line,有时候会在前后有一段空白,这个时候需要对文本进行对齐,超出的部分被layout左右侧截断。
"""
for layout in layout_bboxes:
lb = layout['layout_bbox']
blocks_in_layoutbox = [b for b in blocks if is_in_layout(b['bbox'], lb)]
if len(blocks_in_layoutbox)==0:
continue
for block in blocks_in_layoutbox:
for line in block['lines']:
x0, x1 = line['bbox'][0], line['bbox'][2]
if x0 < lb[0]:
line['bbox'][0] = lb[0]
if x1 > lb[2]:
line['bbox'][2] = lb[2]
def __common_pre_proc(blocks, layout_bboxes):
"""
不分语言的,对文本进行预处理
"""
#__add_line_period(blocks, layout_bboxes)
__align_text_in_layout(blocks, layout_bboxes)
aligned_layout_bboxes = __valign_lines(blocks, layout_bboxes)
return aligned_layout_bboxes
def __pre_proc_zh_blocks(blocks, layout_bboxes):
"""
对中文文本进行分段预处理
"""
pass
def __pre_proc_en_blocks(blocks, layout_bboxes):
"""
对英文文本进行分段预处理
"""
pass
def __group_line_by_layout(blocks, layout_bboxes, lang="en"):
"""
每个layout内的行进行聚合
"""
# 因为只是一个block一行目前, 一个block就是一个段落
lines_group = []
for lyout in layout_bboxes:
lines = [line for block in blocks if is_in_layout(block['bbox'], lyout['layout_bbox']) for line in block['lines']]
lines_group.append(lines)
return lines_group
def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_len=10):
"""
lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。
1. 先计算每个group的左右边界。
2. 然后根据行末尾特征进行分段。
末尾特征:以句号等结束符结尾。并且距离右侧边界有一定距离。
且下一行开头不留空白。
"""
list_info = [] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
layout_paras = []
right_tail_distance = 1.5 * char_avg_len
for lines in lines_group:
paras = []
total_lines = len(lines)
        if total_lines==0:
            continue  # nothing to do for 0 lines
        if total_lines==1:  # a single line cannot be split
            layout_paras.append([lines])
            list_info.append([False, False])
continue
"""在进入到真正的分段之前,要对文字块从统计维度进行对齐方式的探测,
对齐方式分为以下:
1. 左对齐的文本块(特点是左侧顶格,或者左侧不顶格但是右侧顶格的行数大于非顶格的行数,顶格的首字母有大写也有小写)
1) 右侧对齐的行,单独成一段
2) 中间对齐的行,按照字体/行高聚合成一段
2. 左对齐的列表块(其特点是左侧顶格的行数小于等于非顶格的行数,非定格首字母会有小写,顶格90%是大写。并且左侧顶格行数大于1,大于1是为了这种模式连续出现才能称之为列表)
这样的文本块,顶格的为一个段落开头,紧随其后非顶格的行属于这个段落。
"""
text_segments, list_start_line = __detect_list_lines(lines, new_layout_bbox, lang)
"""根据list_range,把lines分成几个部分
"""
layout_right = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[2]
layout_left = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0]
para = [] # 元素是line
layout_list_info = [False, False] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
for content_type, start, end in text_segments:
if content_type == 'list':
for i, line in enumerate(lines[start:end+1]):
line_x0 = line['bbox'][0]
                    if line_x0 == layout_left:  # start of a list item
if len(para)>0:
paras.append(para)
para = []
para.append(line)
else:
para.append(line)
if len(para)>0:
paras.append(para)
para = []
if start==0:
layout_list_info[0] = True
if end==total_lines-1:
layout_list_info[1] = True
            else:  # ordinary text
                for i, line in enumerate(lines[start:end+1]):
                    # If a following line exists, use its position to decide whether to split here; otherwise only this line's own ending matters.
                    cur_line_type = line['spans'][-1]['type']
                    next_line = lines[start + i + 1] if start + i + 1 < total_lines else None
                    if cur_line_type in [TEXT, INLINE_EQUATION]:
                        if line['bbox'][2] < layout_right - right_tail_distance:
                            para.append(line)
                            paras.append(para)
                            para = []
                        elif line['bbox'][2] >= layout_right - right_tail_distance and next_line and next_line['bbox'][0] == layout_left:  # this line runs to the right edge, and the next line exists and is flush left
                            para.append(line)
                        else:
                            para.append(line)
                            paras.append(para)
                            para = []
                    else:  # otherwise (figure, table, interline equation) the line forms a paragraph of its own
                        if len(para)>0:  # first flush the pending paragraph into the results
                            paras.append(para)
                            para = []
                        paras.append([line])  # then add the current line as its own paragraph (interline equation, figure, table, ...)
para = []
if len(para)>0:
paras.append(para)
para = []
list_info.append(layout_list_info)
layout_paras.append(paras)
paras = []
return layout_paras, list_info
def __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info, page_num, lang):
"""
如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。
根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。
"""
if len(layout_paras)==0 or len(layout_list_info)==0: # 0的时候最后的return 会出错
return layout_paras, [False, False]
for i in range(1, len(layout_paras)):
pre_layout_list_info = layout_list_info[i-1]
next_layout_list_info = layout_list_info[i]
pre_last_para = layout_paras[i-1][-1]
next_paras = layout_paras[i]
next_first_para = next_paras[0]
        if pre_layout_list_info[1] and not next_layout_list_info[0]:  # previous layout ends with a list, the next does not start with one: check for matching indentation
            # logger.info(f"Connecting list inside page {page_num}")
            # Look in layout_paras[i] for leading consecutive lines with the same indentation
            may_list_lines = []
            for j in range(len(next_paras)):
                line = next_paras[j]
                if len(line)==1:  # can only be a single line; multi-line cases would need further analysis
if line[0]['bbox'][0] > __find_layout_bbox_by_line(line[0]['bbox'], new_layout_bbox)[0]:
may_list_lines.append(line[0])
else:
break
else:
break
            # If these lines share the same indentation, attach them to the last paragraph of the previous layout.
            if len(may_list_lines)>0 and len(set([x['bbox'][0] for x in may_list_lines]))==1:
pre_last_para.extend(may_list_lines)
layout_paras[i] = layout_paras[i][len(may_list_lines):]
    return layout_paras, [layout_list_info[0][0], layout_list_info[-1][1]]  # also return whether this page starts and ends with a list
def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, pre_page_list_info, next_page_list_info, page_num, lang):
"""
如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。
根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。
"""
if len(pre_page_paras)==0 or len(next_page_paras)==0: # 0的时候最后的return 会出错
return False
if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
# logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines = []
for j in range(len(next_page_paras[0])):
line = next_page_paras[0][j]
            if len(line)==1:  # can only be a single line; multi-line cases would need further analysis
if line[0]['bbox'][0] > __find_layout_bbox_by_line(line[0]['bbox'], next_page_layout_bbox)[0]:
may_list_lines.append(line[0])
else:
break
else:
break
        # If these lines share the same indentation, attach them to the last paragraph of the previous page.
        if len(may_list_lines)>0 and len(set([x['bbox'][0] for x in may_list_lines]))==1:
pre_page_paras[-1].append(may_list_lines)
next_page_paras[0] = next_page_paras[0][len(may_list_lines):]
return True
return False
def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
"""
根据line找到所在的layout
"""
for layout in layout_bboxes:
if is_in_layout(line_bbox, layout):
return layout
return None
def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang):
"""
layout之间进行分段。
主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。
连接的条件需要同时满足:
1. 上一个layout的最后一行沾满整个行。并且没有结尾符号。
2. 下一行开头不留空白。
"""
connected_layout_paras = []
if len(layout_paras)==0:
return connected_layout_paras
connected_layout_paras.append(layout_paras[0])
for i in range(1, len(layout_paras)):
try:
            if len(layout_paras[i])==0 or len(layout_paras[i-1])==0:  # TODO: think about how to connect in this case
continue
pre_last_line = layout_paras[i-1][-1][-1]
next_first_line = layout_paras[i][0][0]
except Exception as e:
# logger.error(f"page layout {i} has no line")
continue
pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']])
pre_last_line_type = pre_last_line['spans'][-1]['type']
next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']])
next_first_line_type = next_first_line['spans'][0]['type']
if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]:
connected_layout_paras.append(layout_paras[i])
continue
pre_x2_max = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox)[2]
next_x0_min = __find_layout_bbox_by_line(next_first_line['bbox'], new_layout_bbox)[0]
pre_last_line_text = pre_last_line_text.strip()
next_first_line_text = next_first_line_text.strip()
        if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text and pre_last_line_text[-1] not in LINE_STOP_FLAG and next_first_line['bbox'][0]==next_x0_min:  # the previous line runs to the right edge with no sentence-final mark, and the next line has no leading blank
            """Joining condition holds: connect the previous layout's last paragraph with the next layout's first."""
            connected_layout_paras[-1][-1].extend(layout_paras[i][0])
            layout_paras[i].pop(0)  # drop the next layout's first paragraph, since it was merged into the previous layout's last paragraph
if len(layout_paras[i])==0:
layout_paras.pop(i)
else:
connected_layout_paras.append(layout_paras[i])
else:
"""连接段落条件不成立,将前一个layout的段落加入到结果中。"""
connected_layout_paras.append(layout_paras[i])
return connected_layout_paras
def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, lang):
"""
连接起来相邻两个页面的段落——前一个页面最后一个段落和后一个页面的第一个段落。
是否可以连接的条件:
1. 前一个页面的最后一个段落最后一行沾满整个行。并且没有结尾符号。
2. 后一个页面的第一个段落第一行没有空白开头。
"""
# 有的页面可能压根没有文字
if len(pre_page_paras)==0 or len(next_page_paras)==0 or len(pre_page_paras[0])==0 or len(next_page_paras[0])==0: # TODO [[]]为什么出现在pre_page_paras里?
return False
pre_last_para = pre_page_paras[-1][-1]
next_first_para = next_page_paras[0][0]
pre_last_line = pre_last_para[-1]
next_first_line = next_first_para[0]
pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']])
pre_last_line_type = pre_last_line['spans'][-1]['type']
next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']])
next_first_line_type = next_first_line['spans'][0]['type']
    if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]:  # TODO: to do this properly, crossing tables, images and interline equations must be considered
        # not text, do not connect
return False
pre_x2_max = __find_layout_bbox_by_line(pre_last_line['bbox'], pre_page_layout_bbox)[2]
next_x0_min = __find_layout_bbox_by_line(next_first_line['bbox'], next_page_layout_bbox)[0]
pre_last_line_text = pre_last_line_text.strip()
next_first_line_text = next_first_line_text.strip()
    if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text and pre_last_line_text[-1] not in LINE_STOP_FLAG and next_first_line['bbox'][0]==next_x0_min:  # the previous line runs to the right edge with no sentence-final mark, and the next line has no leading blank
        """Joining condition holds: connect the two paragraphs."""
        pre_last_para.extend(next_first_para)
        next_page_paras[0].pop(0)  # drop the next page's first paragraph, since it was merged into the previous page's last paragraph
return True
else:
return False
def find_consecutive_true_regions(input_array):
    start_index = None  # start index of the current run of True values
    regions = []  # holds (start, end) for every run of consecutive True values
    for i in range(len(input_array)):
        # Found a True value while not inside a run: record the start
        if input_array[i] and start_index is None:
            start_index = i
        # Found a False value while inside a run: close it
        elif not input_array[i] and start_index is not None:
            # Keep the run only if it is longer than one element
            if i - start_index > 1:
                regions.append((start_index, i-1))
            start_index = None  # reset the start index
    # If the array ends inside a run, close the final run as well
    if start_index is not None and len(input_array) - start_index > 1:
        regions.append((start_index, len(input_array)-1))
return regions
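# Hedged example for the helper above: runs of True shorter than two elements
# are ignored, so only the three-element run is reported.
def _demo_find_consecutive_true_regions():
    flags = [True, False, True, True, True, False]
    assert find_consecutive_true_regions(flags) == [(2, 4)]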
def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, debug_mode):
"""
找出来中间对齐的连续单行文本,如果连续行高度相同,那么合并为一个段落。
一个line居中的条件是:
1. 水平中心点跨越layout的中心点。
2. 左右两侧都有空白
"""
for layout_i, layout_para in enumerate(page_paras):
layout_box = new_layout_bbox[layout_i]
single_line_paras_tag = []
for i in range(len(layout_para)):
single_line_paras_tag.append(len(layout_para[i])==1 and layout_para[i][0]['spans'][0]['type']==TEXT)
"""找出来连续的单行文本,如果连续行高度相同,那么合并为一个段落。"""
consecutive_single_line_indices = find_consecutive_true_regions(single_line_paras_tag)
if len(consecutive_single_line_indices)>0:
index_offset = 0
"""检查这些行是否是高度相同的,居中的"""
for start, end in consecutive_single_line_indices:
start += index_offset
end += index_offset
line_hi = np.array([line[0]['bbox'][3]-line[0]['bbox'][1] for line in layout_para[start:end+1]])
first_line_text = ''.join([__get_span_text(span) for span in layout_para[start][0]['spans']])
if "Table" in first_line_text or "Figure" in first_line_text:
pass
if debug_mode:
# logger.debug(line_hi.std())
if line_hi.std()<2:
"""行高度相同,那么判断是否居中"""
all_left_x0 = [line[0]['bbox'][0] for line in layout_para[start:end+1]]
all_right_x1 = [line[0]['bbox'][2] for line in layout_para[start:end+1]]
layout_center = (layout_box[0] + layout_box[2]) / 2
if all([x0 < layout_center < x1 for x0, x1 in zip(all_left_x0, all_right_x1)]) \
and not all([x0==layout_box[0] for x0 in all_left_x0]) \
and not all([x1==layout_box[2] for x1 in all_right_x1]):
merge_para = [l[0] for l in layout_para[start:end+1]]
para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']])
# if debug_mode:
# logger.debug(para_text)
layout_para[start:end+1] = [merge_para]
index_offset -= end-start
return
def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang):
"""
找出来连续的单行文本,如果首行顶格,接下来的几个单行段落缩进对齐,那么合并为一个段落。
"""
pass
def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
"""
根据line和layout情况进行分段
先实现一个根据行末尾特征分段的简单方法。
"""
"""
算法思路:
1. 扫描layout里每一行,找出来行尾距离layout有边界有一定距离的行。
2. 从上述行中找到末尾是句号等可作为断行标志的行。
3. 参照上述行尾特征进行分段。
4. 图、表,目前独占一行,不考虑分段。
"""
if page_num==343:
pass
lines_group = __group_line_by_layout(blocks, layout_bboxes, lang) # block内分段
layout_paras, layout_list_info = __split_para_in_layoutbox(lines_group, new_layout_bbox, lang) # layout内分段
layout_paras2, page_list_info = __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info, page_num, lang) # layout之间连接列表段落
connected_layout_paras = __connect_para_inter_layoutbox(layout_paras2, new_layout_bbox, lang) # layout间链接段落
return connected_layout_paras, page_list_info
def para_split(pdf_info_dict, debug_mode, lang="en"):
"""
根据line和layout情况进行分段
"""
new_layout_of_pages = [] # 数组的数组,每个元素是一个页面的layoutS
all_page_list_info = [] # 保存每个页面开头和结尾是否是列表
for page_num, page in pdf_info_dict.items():
blocks = page['preproc_blocks']
layout_bboxes = page['layout_bboxes']
new_layout_bbox = __common_pre_proc(blocks, layout_bboxes)
new_layout_of_pages.append(new_layout_bbox)
splited_blocks, page_list_info = __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang)
all_page_list_info.append(page_list_info)
page['para_blocks'] = splited_blocks
"""连接页面与页面之间的可能合并的段落"""
pdf_infos = list(pdf_info_dict.values())
for page_num, page in enumerate(pdf_info_dict.values()):
if page_num==0:
continue
pre_page_paras = pdf_infos[page_num-1]['para_blocks']
next_page_paras = pdf_infos[page_num]['para_blocks']
pre_page_layout_bbox = new_layout_of_pages[page_num-1]
next_page_layout_bbox = new_layout_of_pages[page_num]
is_conn = __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, lang)
        # if debug_mode:
        #     if is_conn:
        #         logger.info(f"Connected the paragraphs of page {page_num-1} and page {page_num}")
        #
is_list_conn = __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, all_page_list_info[page_num-1], all_page_list_info[page_num], page_num, lang)
        # if debug_mode:
        #     if is_list_conn:
        #         logger.info(f"Connected the list paragraphs of page {page_num-1} and page {page_num}")
    """Some mergeable content may still be missed; connect those paragraphs as well:
    1. Body text sometimes has one flush-left line followed by several indented lines.
    2. Consecutive centered single lines of the same height may form one paragraph.
    """
for page_num, page in enumerate(pdf_info_dict.values()):
page_paras = page['para_blocks']
new_layout_bbox = new_layout_of_pages[page_num]
__connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, debug_mode=debug_mode)
__merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang)
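# Hedged usage sketch: para_split mutates pdf_info_dict in place, writing the
# grouped paragraphs back under each page's "para_blocks" key; the pages are
# assumed to already carry "preproc_blocks" and "layout_bboxes".
def _demo_para_split(pdf_info_dict):
    para_split(pdf_info_dict, debug_mode=False, lang="en")
    return {page_id: page["para_blocks"] for page_id, page in pdf_info_dict.items()}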
import copy
from sklearn.cluster import DBSCAN
import numpy as np
from loguru import logger
import re
from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
from magic_pdf.model.magic_model import MagicModel
from magic_pdf.libs.Constants import *
LINE_STOP_FLAG = ['.', '!', '?', '。', '!', '?', ':', ':', ')', ')', ';']
INLINE_EQUATION = ContentType.InlineEquation
INTERLINE_EQUATION = ContentType.InterlineEquation
TEXT = ContentType.Text
debug_able = False
def __get_span_text(span):
c = span.get('content', '')
if len(c) == 0:
c = span.get('image_path', '')
return c
def __detect_list_lines(lines, new_layout_bboxes, lang):
global debug_able
"""
探测是否包含了列表,并且把列表的行分开.
这样的段落特点是,顶格字母大写/数字,紧跟着几行缩进的。缩进的行首字母含小写的。
"""
def find_repeating_patterns2(lst):
indices = []
ones_indices = []
i = 0
while i < len(lst): # Loop through the entire list
if lst[i] == 1: # If we encounter a '1', we might be at the start of a pattern
start = i
ones_in_this_interval = [i]
i += 1
# Traverse elements that are 1, 2 or 3, until we encounter something else
while i < len(lst) and lst[i] in [1, 2, 3]:
if lst[i] == 1:
ones_in_this_interval.append(i)
i += 1
if len(ones_in_this_interval) > 1 or (
start < len(lst) - 1 and ones_in_this_interval and lst[start + 1] in [2, 3]):
indices.append((start, i - 1))
ones_indices.append(ones_in_this_interval)
else:
i += 1
return indices, ones_indices
def find_repeating_patterns(lst):
indices = []
ones_indices = []
i = 0
        while i < len(lst) - 1:  # make sure at least 2 elements remain
            if lst[i] == 1 and lst[i + 1] in [2, 3]:  # extra check to guard against consecutive 1s
start = i
ones_in_this_interval = [i]
i += 1
while i < len(lst) and lst[i] in [2, 3]:
i += 1
                # Verify whether the next sequence meets the condition
if i < len(lst) - 1 and lst[i] == 1 and lst[i + 1] in [2, 3] and lst[i - 1] in [2, 3]:
while i < len(lst) and lst[i] in [1, 2, 3]:
if lst[i] == 1:
ones_in_this_interval.append(i)
i += 1
indices.append((start, i - 1))
ones_indices.append(ones_in_this_interval)
else:
i += 1
else:
i += 1
return indices, ones_indices
"""===================="""
def split_indices(slen, index_array):
result = []
last_end = 0
for start, end in sorted(index_array):
            if start > last_end:
                # Mark the stretch between the previous interval's end and this interval's start as "text"
                result.append(('text', last_end, start - 1))
            # Mark the interval itself as "list"
            result.append(('list', start, end))
            last_end = end + 1
        if last_end < slen:
            # Mark whatever remains after the last interval as "text"
            result.append(('text', last_end, slen - 1))
return result
"""===================="""
if lang != 'en':
return lines, None
total_lines = len(lines)
line_fea_encode = []
"""
对每一行进行特征编码,编码规则如下:
1. 如果行顶格,且大写字母开头或者数字开头,编码为1
2. 如果顶格,其他非大写开头编码为4
3. 如果非顶格,首字符大写,编码为2
4. 如果非顶格,首字符非大写编码为3
"""
if len(lines) > 0:
x_map_tag_dict, min_x_tag = cluster_line_x(lines)
for l in lines:
span_text = __get_span_text(l['spans'][0])
if not span_text:
line_fea_encode.append(0)
continue
first_char = span_text[0]
layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
if not layout:
line_fea_encode.append(0)
else:
#
if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
# if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
if not first_char.isalnum() or if_match_reference_list(span_text):
line_fea_encode.append(1)
else:
line_fea_encode.append(4)
else:
if first_char.isupper():
line_fea_encode.append(2)
else:
line_fea_encode.append(3)
    # Split by the encoding: runs where a 1 is followed by 2/3 at least twice in a row are treated as lists.
    list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
    # if len(list_indice) > 0:
    #     if debug_able:
    #         logger.info(f"Found a list, list line ranges: {list_indice}, {list_start_idx}")
    # TODO check whether the indented lines inside these lists are left-aligned.
segments = []
for start, end in list_indice:
for i in range(start, end + 1):
if i > 0:
if line_fea_encode[i] == 4:
                    # if debug_able:
                    #     logger.info(f"Line {i} of the list lines is not flush left")
                    break
    # else:
    #     if debug_able:
    #         logger.info(f"Lines {start} to {end} form a list")
return split_indices(total_lines, list_indice), list_start_idx
def cluster_line_x(lines: list) -> dict:
"""
对一个block内所有lines的bbox的x0聚类
"""
min_distance = 5
min_sample = 1
x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines])
x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
x0_uniq_label = np.unique(x0_clusters.labels_)
    # x1_lst = np.array([[line['bbox'][2], 0] for line in lines])
    x0_2_new_val = {}  # maps old values to their new snapped values
min_x0 = round(lines[0]["bbox"][0])
for label in x0_uniq_label:
if label == -1:
continue
x0_index_of_label = np.where(x0_clusters.labels_ == label)
x0_raw_val = x0_lst[x0_index_of_label][:, 0]
x0_new_val = np.min(x0_lst[x0_index_of_label][:, 0])
x0_2_new_val.update({round(raw_val): round(x0_new_val) for raw_val in x0_raw_val})
if x0_new_val < min_x0:
min_x0 = x0_new_val
return x0_2_new_val, min_x0
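# Hedged example for cluster_line_x (values invented): near-identical x0
# values map to one snapped value, and the smallest snapped x0 is returned as
# the left-margin estimate.
def _demo_cluster_line_x():
    lines = [{'bbox': [50, 0, 200, 10]}, {'bbox': [52, 12, 200, 22]}, {'bbox': [80, 24, 200, 34]}]
    x_map, min_x = cluster_line_x(lines)
    assert x_map == {50: 50, 52: 50, 80: 80} and min_x == 50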
def if_match_reference_list(text: str) -> bool:
    # matches numbered reference-list lines such as "12. ..."
    pattern = re.compile(r'^\d+\..*')
    return bool(pattern.match(text))
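# Hedged examples: the pattern matches numbered reference-list lines such as
# "12. A. Author, Some Title" but not plain prose.
def _demo_if_match_reference_list():
    assert if_match_reference_list("12. A. Author, Some Title") is True
    assert if_match_reference_list("Introduction") is False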
def __valign_lines(blocks, layout_bboxes):
"""
在一个layoutbox内对齐行的左侧和右侧。
扫描行的左侧和右侧,如果x0, x1差距不超过一个阈值,就强行对齐到所处layout的左右两侧(和layout有一段距离)。
3是个经验值,TODO,计算得来,可以设置为1.5个正文字符。
"""
min_distance = 3
min_sample = 2
new_layout_bboxes = []
# add bbox_fs for para split calculation
for block in blocks:
block["bbox_fs"] = copy.deepcopy(block["bbox"])
for layout_box in layout_bboxes:
blocks_in_layoutbox = [b for b in blocks if
b["type"] == BlockType.Text and is_in_layout(b['bbox'], layout_box['layout_bbox'])]
if len(blocks_in_layoutbox) == 0 or len(blocks_in_layoutbox[0]["lines"]) == 0:
new_layout_bboxes.append(layout_box['layout_bbox'])
continue
x0_lst = np.array([[line['bbox'][0], 0] for block in blocks_in_layoutbox for line in block['lines']])
x1_lst = np.array([[line['bbox'][2], 0] for block in blocks_in_layoutbox for line in block['lines']])
x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
x1_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x1_lst)
x0_uniq_label = np.unique(x0_clusters.labels_)
x1_uniq_label = np.unique(x1_clusters.labels_)
        x0_2_new_val = {}  # maps old values to their new snapped values
x1_2_new_val = {}
for label in x0_uniq_label:
if label == -1:
continue
x0_index_of_label = np.where(x0_clusters.labels_ == label)
x0_raw_val = x0_lst[x0_index_of_label][:, 0]
x0_new_val = np.min(x0_lst[x0_index_of_label][:, 0])
x0_2_new_val.update({idx: x0_new_val for idx in x0_raw_val})
for label in x1_uniq_label:
if label == -1:
continue
x1_index_of_label = np.where(x1_clusters.labels_ == label)
x1_raw_val = x1_lst[x1_index_of_label][:, 0]
x1_new_val = np.max(x1_lst[x1_index_of_label][:, 0])
x1_2_new_val.update({idx: x1_new_val for idx in x1_raw_val})
for block in blocks_in_layoutbox:
for line in block['lines']:
x0, x1 = line['bbox'][0], line['bbox'][2]
if x0 in x0_2_new_val:
line['bbox'][0] = int(x0_2_new_val[x0])
if x1 in x1_2_new_val:
line['bbox'][2] = int(x1_2_new_val[x1])
                # lines that could not be aligned are left untouched
        # The line widths inside blocks changed, so recompute each block's bbox
for block in blocks_in_layoutbox:
if len(block["lines"]) > 0:
block['bbox_fs'] = [min([line['bbox'][0] for line in block['lines']]),
min([line['bbox'][1] for line in block['lines']]),
max([line['bbox'][2] for line in block['lines']]),
max([line['bbox'][3] for line in block['lines']])]
"""新计算layout的bbox,因为block的bbox变了。"""
layout_x0 = min([block['bbox_fs'][0] for block in blocks_in_layoutbox])
layout_y0 = min([block['bbox_fs'][1] for block in blocks_in_layoutbox])
layout_x1 = max([block['bbox_fs'][2] for block in blocks_in_layoutbox])
layout_y1 = max([block['bbox_fs'][3] for block in blocks_in_layoutbox])
new_layout_bboxes.append([layout_x0, layout_y0, layout_x1, layout_y1])
return new_layout_bboxes
def __align_text_in_layout(blocks, layout_bboxes):
"""
由于ocr出来的line,有时候会在前后有一段空白,这个时候需要对文本进行对齐,超出的部分被layout左右侧截断。
"""
for layout in layout_bboxes:
lb = layout['layout_bbox']
blocks_in_layoutbox = [block for block in blocks if
block["type"] == BlockType.Text and is_in_layout(block['bbox'], lb)]
if len(blocks_in_layoutbox) == 0:
continue
for block in blocks_in_layoutbox:
for line in block.get("lines", []):
x0, x1 = line['bbox'][0], line['bbox'][2]
if x0 < lb[0]:
line['bbox'][0] = lb[0]
if x1 > lb[2]:
line['bbox'][2] = lb[2]
def __common_pre_proc(blocks, layout_bboxes):
"""
不分语言的,对文本进行预处理
"""
# __add_line_period(blocks, layout_bboxes)
__align_text_in_layout(blocks, layout_bboxes)
aligned_layout_bboxes = __valign_lines(blocks, layout_bboxes)
return aligned_layout_bboxes
def __pre_proc_zh_blocks(blocks, layout_bboxes):
"""
对中文文本进行分段预处理
"""
pass
def __pre_proc_en_blocks(blocks, layout_bboxes):
"""
对英文文本进行分段预处理
"""
pass
def __group_line_by_layout(blocks, layout_bboxes):
"""
每个layout内的行进行聚合
"""
# 因为只是一个block一行目前, 一个block就是一个段落
blocks_group = []
for lyout in layout_bboxes:
blocks_in_layout = [block for block in blocks if is_in_layout(block.get('bbox_fs', None), lyout['layout_bbox'])]
blocks_group.append(blocks_in_layout)
return blocks_group
def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
"""
lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。
1. 先计算每个group的左右边界。
2. 然后根据行末尾特征进行分段。
末尾特征:以句号等结束符结尾。并且距离右侧边界有一定距离。
且下一行开头不留空白。
"""
list_info = [] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾
for blocks in blocks_group:
is_start_list = None
is_end_list = None
if len(blocks) == 0:
list_info.append([False, False])
continue
if blocks[0]["type"] != BlockType.Text and blocks[-1]["type"] != BlockType.Text:
list_info.append([False, False])
continue
if blocks[0]["type"] != BlockType.Text:
is_start_list = False
if blocks[-1]["type"] != BlockType.Text:
is_end_list = False
lines = [line for block in blocks if
block["type"] == BlockType.Text for line in
block['lines']]
total_lines = len(lines)
if total_lines == 1 or total_lines == 0:
list_info.append([False, False])
continue
"""在进入到真正的分段之前,要对文字块从统计维度进行对齐方式的探测,
对齐方式分为以下:
1. 左对齐的文本块(特点是左侧顶格,或者左侧不顶格但是右侧顶格的行数大于非顶格的行数,顶格的首字母有大写也有小写)
1) 右侧对齐的行,单独成一段
2) 中间对齐的行,按照字体/行高聚合成一段
2. 左对齐的列表块(其特点是左侧顶格的行数小于等于非顶格的行数,非定格首字母会有小写,顶格90%是大写。并且左侧顶格行数大于1,大于1是为了这种模式连续出现才能称之为列表)
这样的文本块,顶格的为一个段落开头,紧随其后非顶格的行属于这个段落。
"""
text_segments, list_start_line = __detect_list_lines(lines, new_layout_bbox, lang)
"""根据list_range,把lines分成几个部分
"""
for list_start in list_start_line:
if len(list_start) > 1:
for i in range(0, len(list_start)):
index = list_start[i] - 1
if index >= 0:
if "content" in lines[index]["spans"][-1] and lines[index]["spans"][-1].get('type', '') not in [
ContentType.InlineEquation, ContentType.InterlineEquation]:
lines[index]["spans"][-1]["content"] += '\n\n'
        layout_list_info = [False, False]  # whether this layout starts and/or ends with a list
for content_type, start, end in text_segments:
if content_type == 'list':
if start == 0 and is_start_list is None:
layout_list_info[0] = True
if end == total_lines - 1 and is_end_list is None:
layout_list_info[1] = True
list_info.append(layout_list_info)
return list_info
def __split_para_lines(lines: list, text_blocks: list) -> list:
text_paras = []
other_paras = []
text_lines = []
for line in lines:
        spans_types = [span["type"] for span in line["spans"]]  # a line is a dict whose "spans" list carries the content types
if ContentType.Table in spans_types:
other_paras.append([line])
continue
if ContentType.Image in spans_types:
other_paras.append([line])
continue
if ContentType.InterlineEquation in spans_types:
other_paras.append([line])
continue
text_lines.append(line)
for block in text_blocks:
block_bbox = block["bbox"]
para = []
for line in text_lines:
bbox = line["bbox"]
if is_in_layout(bbox, block_bbox):
para.append(line)
if len(para) > 0:
text_paras.append(para)
    other_paras.extend(text_paras)  # extend mutates in place and returns None, so sort the extended list itself
    paras_sorted = sorted(other_paras, key=lambda x: x[0]["bbox"][1])  # sort paragraphs top-to-bottom by y0
return paras_sorted
def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info, page_num, lang):
global debug_able
"""
如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。
根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。
"""
if len(blocks_group) == 0 or len(blocks_group) == 0: # 0的时候最后的return 会出错
return blocks_group, [False, False]
for i in range(1, len(blocks_group)):
if len(blocks_group[i]) == 0 or len(blocks_group[i - 1]) == 0:
continue
pre_layout_list_info = layout_list_info[i - 1]
next_layout_list_info = layout_list_info[i]
pre_last_para = blocks_group[i - 1][-1].get("lines", [])
next_paras = blocks_group[i]
next_first_para = next_paras[0]
        if pre_layout_list_info[1] and not next_layout_list_info[0] and next_first_para[
                "type"] == BlockType.Text:  # previous layout ends with a list, the next does not start with one: check for matching indentation
            # if debug_able:
            #     logger.info(f"Connecting list inside page {page_num}")
            # Look in blocks_group[i] for leading consecutive lines with the same indentation
may_list_lines = []
lines = next_first_para.get("lines", [])
for line in lines:
if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], new_layout_bbox)[0]:
may_list_lines.append(line)
else:
break
            # If these lines share the same indentation, attach them to the last paragraph of the previous layout.
            if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
pre_last_para.extend(may_list_lines)
next_first_para["lines"] = next_first_para["lines"][len(may_list_lines):]
    return blocks_group, [layout_list_info[0][0], layout_list_info[-1][1]]  # also return whether this page starts and ends with a list
def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox,
pre_page_list_info, next_page_list_info, page_num, lang):
"""
如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。
根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。
"""
if len(pre_page_paras) == 0 or len(next_page_paras) == 0: # 0的时候最后的return 会出错
return False
if len(pre_page_paras[-1]) == 0 or len(next_page_paras[0]) == 0:
return False
if pre_page_paras[-1][-1]["type"] != BlockType.Text or next_page_paras[0][0]["type"] != BlockType.Text:
return False
    if pre_page_list_info[1] and not next_page_list_info[0]:  # previous page ends with a list, the next does not start with one: check for matching indentation
        # if debug_able:
        #     logger.info(f"Connecting list across page {page_num}")
        # Look in the next page's first paragraph for leading consecutive lines with the same indentation
may_list_lines = []
next_page_first_para = next_page_paras[0][0]
if next_page_first_para["type"] == BlockType.Text:
lines = next_page_first_para["lines"]
for line in lines:
if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], next_page_layout_bbox)[0]:
may_list_lines.append(line)
else:
break
        # If these lines share the same indentation, attach them to the last paragraph of the previous page.
        if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
            # pre_page_paras[-1].append(may_list_lines)
            # Merge into the previous page's last paragraph and tag the moved lines as cross_page
for line in may_list_lines:
for span in line["spans"]:
span[CROSS_PAGE] = True
pre_page_paras[-1][-1]["lines"].extend(may_list_lines)
next_page_first_para["lines"] = next_page_first_para["lines"][len(may_list_lines):]
return True
return False
def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
"""
根据line找到所在的layout
"""
for layout in layout_bboxes:
if is_in_layout(line_bbox, layout):
return layout
return None
def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
"""
layout之间进行分段。
主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。
连接的条件需要同时满足:
1. 上一个layout的最后一行沾满整个行。并且没有结尾符号。
2. 下一行开头不留空白。
"""
connected_layout_blocks = []
if len(blocks_group) == 0:
return connected_layout_blocks
connected_layout_blocks.append(blocks_group[0])
for i in range(1, len(blocks_group)):
try:
if len(blocks_group[i]) == 0:
continue
            if len(blocks_group[i - 1]) == 0:  # TODO: think about how to connect in this case
connected_layout_blocks.append(blocks_group[i])
continue
            # Only text blocks need to be considered for merging across layouts
if blocks_group[i - 1][-1]["type"] != BlockType.Text or blocks_group[i][0]["type"] != BlockType.Text:
connected_layout_blocks.append(blocks_group[i])
continue
if len(blocks_group[i - 1][-1]["lines"]) == 0 or len(blocks_group[i][0]["lines"]) == 0:
connected_layout_blocks.append(blocks_group[i])
continue
pre_last_line = blocks_group[i - 1][-1]["lines"][-1]
next_first_line = blocks_group[i][0]["lines"][0]
except Exception as e:
logger.error(f"page layout {i} has no line")
continue
pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']])
pre_last_line_type = pre_last_line['spans'][-1]['type']
next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']])
next_first_line_type = next_first_line['spans'][0]['type']
if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]:
connected_layout_blocks.append(blocks_group[i])
continue
pre_layout = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox)
next_layout = __find_layout_bbox_by_line(next_first_line['bbox'], new_layout_bbox)
pre_x2_max = pre_layout[2] if pre_layout else -1
next_x0_min = next_layout[0] if next_layout else -1
pre_last_line_text = pre_last_line_text.strip()
next_first_line_text = next_first_line_text.strip()
        if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text and pre_last_line_text[
            -1] not in LINE_STOP_FLAG and \
                next_first_line['bbox'][0] == next_x0_min:  # the previous line runs to the right edge with no sentence-final mark, and the next line has no leading blank
            """Joining condition holds: connect the previous layout's last paragraph with the next layout's first."""
            connected_layout_blocks[-1][-1]["lines"].extend(blocks_group[i][0]["lines"])
            blocks_group[i][0]["lines"] = []  # clear the next layout's first paragraph lines, since they were merged into the previous layout's last paragraph
            blocks_group[i][0][LINES_DELETED] = True
# if len(layout_paras[i]) == 0:
# layout_paras.pop(i)
# else:
# connected_layout_paras.append(layout_paras[i])
connected_layout_blocks.append(blocks_group[i])
        else:
            """Joining condition fails: append this layout's paragraphs to the results as-is."""
            connected_layout_blocks.append(blocks_group[i])
return connected_layout_blocks
def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num,
lang):
"""
连接起来相邻两个页面的段落——前一个页面最后一个段落和后一个页面的第一个段落。
是否可以连接的条件:
1. 前一个页面的最后一个段落最后一行沾满整个行。并且没有结尾符号。
2. 后一个页面的第一个段落第一行没有空白开头。
"""
# 有的页面可能压根没有文字
if len(pre_page_paras) == 0 or len(next_page_paras) == 0 or len(pre_page_paras[0]) == 0 or len(
next_page_paras[0]) == 0: # TODO [[]]为什么出现在pre_page_paras里?
return False
pre_last_block = pre_page_paras[-1][-1]
next_first_block = next_page_paras[0][0]
if pre_last_block["type"] != BlockType.Text or next_first_block["type"] != BlockType.Text:
return False
if len(pre_last_block["lines"]) == 0 or len(next_first_block["lines"]) == 0:
return False
pre_last_para = pre_last_block["lines"]
next_first_para = next_first_block["lines"]
pre_last_line = pre_last_para[-1]
next_first_line = next_first_para[0]
pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']])
pre_last_line_type = pre_last_line['spans'][-1]['type']
next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']])
next_first_line_type = next_first_line['spans'][0]['type']
    if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]:  # TODO: to do this properly, crossing tables, images and interline equations must be considered
        # not text, do not connect
return False
pre_x2_max_bbox = __find_layout_bbox_by_line(pre_last_line['bbox'], pre_page_layout_bbox)
if not pre_x2_max_bbox:
return False
next_x0_min_bbox = __find_layout_bbox_by_line(next_first_line['bbox'], next_page_layout_bbox)
if not next_x0_min_bbox:
return False
pre_x2_max = pre_x2_max_bbox[2]
next_x0_min = next_x0_min_bbox[0]
pre_last_line_text = pre_last_line_text.strip()
next_first_line_text = next_first_line_text.strip()
if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text and pre_last_line_text[-1] not in LINE_STOP_FLAG and \
next_first_line['bbox'][0] == next_x0_min:  # The previous line fills the full layout width and has no ending punctuation, and the next line starts with no leading indent.
"""The merge condition holds: join the previous page's last paragraph with the next page's first paragraph."""
# Merge the next page's first paragraph into the previous page's last paragraph and tag every moved span as cross_page.
for line in next_first_para:
for span in line["spans"]:
span[CROSS_PAGE] = True
pre_last_para.extend(next_first_para)
# next_page_paras[0].pop(0)  # Drop the next page's first paragraph; it has been merged into the previous page's last paragraph.
next_page_paras[0][0]["lines"] = []
next_page_paras[0][0][LINES_DELETED] = True
return True
else:
return False
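# A small sketch of the cross-page tagging performed above (the data here is
# hypothetical): after a successful merge, every span moved from the next page
# carries span[CROSS_PAGE] == True, so downstream consumers can tell which
# spans originated on the following page, e.g.
#
#     merged_lines = pre_page_paras[-1][-1]["lines"]
#     moved = [s for line in merged_lines for s in line["spans"] if s.get(CROSS_PAGE)]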
def find_consecutive_true_regions(input_array):
"""Return (start, end) index pairs for every run of consecutive True values longer than one element."""
start_index = None  # start index of the current run of True values
regions = []  # collected (start, end) pairs for all qualifying runs
for i in range(len(input_array)):
# A True value while not inside a run: open a new run.
if input_array[i] and start_index is None:
start_index = i  # record where the run of True values starts
# A False value while inside a run: close the run.
elif not input_array[i] and start_index is not None:
# Keep the run only if it spans more than one element.
if i - start_index > 1:
regions.append((start_index, i - 1))
start_index = None  # reset for the next run
# If the array ends inside a run, emit that final run as well.
if start_index is not None and len(input_array) - start_index > 1:
regions.append((start_index, len(input_array) - 1))
return regions
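# A quick doctest-style check of the helper above (the inputs are illustrative):
#
#     >>> find_consecutive_true_regions([True, True, False, True, True, True])
#     [(0, 1), (3, 5)]
#     >>> find_consecutive_true_regions([True, False, True])
#     []
#
# Runs of length 1 are dropped on purpose (the `> 1` length checks).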
def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
"""
Find runs of consecutive center-aligned single-line text; if the lines in a run share the same height, merge them into one paragraph.
A line counts as centered when:
1. Its horizontal midpoint straddles the layout's center point.
2. There is blank space on both its left and right sides.
"""
global debug_able
for layout_i, layout_para in enumerate(page_paras):
layout_box = new_layout_bbox[layout_i]
single_line_paras_tag = []
for i in range(len(layout_para)):
# single_line_paras_tag.append(len(layout_para[i]) == 1 and layout_para[i][0]['spans'][0]['type'] == TEXT)
single_line_paras_tag.append(layout_para[i]['type'] == BlockType.Text and len(layout_para[i]["lines"]) == 1)
"""找出来连续的单行文本,如果连续行高度相同,那么合并为一个段落。"""
consecutive_single_line_indices = find_consecutive_true_regions(single_line_paras_tag)
if len(consecutive_single_line_indices) > 0:
"""检查这些行是否是高度相同的,居中的"""
for start, end in consecutive_single_line_indices:
# start += index_offset
# end += index_offset
line_hi = np.array([block["lines"][0]['bbox'][3] - block["lines"][0]['bbox'][1] for block in layout_para[start:end + 1]])
first_line_text = ''.join([__get_span_text(span) for span in layout_para[start]["lines"][0]['spans']])
if "Table" in first_line_text or "Figure" in first_line_text:
pass  # No-op placeholder: Table/Figure captions receive no special handling here yet.
# if debug_able:
#     logger.info(line_hi.std())
if line_hi.std() < 2:
"""The line heights match; now check whether the lines are centered."""
all_left_x0 = [block["lines"][0]['bbox'][0] for block in layout_para[start:end + 1]]
all_right_x1 = [block["lines"][0]['bbox'][2] for block in layout_para[start:end + 1]]
layout_center = (layout_box[0] + layout_box[2]) / 2
if all([x0 < layout_center < x1 for x0, x1 in zip(all_left_x0, all_right_x1)]) \
and not all([x0 == layout_box[0] for x0 in all_left_x0]) \
and not all([x1 == layout_box[2] for x1 in all_right_x1]):
merge_para = [block["lines"][0] for block in layout_para[start:end + 1]]
para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']])
# if debug_able:
# logger.info(para_text)
layout_para[start]["lines"] = merge_para
for i_para in range(start + 1, end + 1):
layout_para[i_para]["lines"] = []
layout_para[i_para][LINES_DELETED] = True
# layout_para[start:end + 1] = [merge_para]
# index_offset -= end - start
return
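# A numeric sketch of the centering test above (the values are hypothetical):
# for a layout bbox of (100, 0, 500, 800) the center is x == 300. Three
# one-line blocks with bboxes spanning (180, ..., 420), (200, ..., 400) and
# (190, ..., 415) all straddle x == 300, none is flush with x0 == 100 or
# x1 == 500, and their heights agree (std < 2), so their lines are merged
# into the first block and the later blocks are marked LINES_DELETED.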
def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang):
"""
Find runs of consecutive single-line text; if the first line is flush with the left edge and the following single-line paragraphs are aligned at an indent, merge them into one paragraph.
"""
pass
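# A possible shape for this helper, which is left as a stub above (this is
# only a hedged sketch, not an implemented behavior): reuse
# find_consecutive_true_regions over single-line text blocks, then merge a
# run when its first block starts at the layout's left edge and the
# remaining blocks share a common, larger x0 (i.e. a hanging indent).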
def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
"""
Split blocks into paragraphs based on line and layout information.
Start with a simple method that splits on end-of-line features.
"""
"""
Algorithm outline:
1. Scan every line in each layout and find the lines whose right end keeps a noticeable distance from the layout's right boundary.
2. Among those, find the lines ending with a period or another character that can mark a paragraph break.
3. Split paragraphs based on those end-of-line features.
4. Figures and tables currently occupy a line of their own and are not considered for splitting.
"""
blocks_group = __group_line_by_layout(blocks, layout_bboxes)  # group blocks by layout
layout_list_info = __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang)  # split paragraphs within each layout
blocks_group, page_list_info = __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info, page_num, lang)  # join list paragraphs across layouts
connected_layout_blocks = __connect_para_inter_layoutbox(blocks_group, new_layout_bbox)  # join ordinary paragraphs across layouts
return connected_layout_blocks, page_list_info
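# Note on the data shape (inferred from the code above): the returned
# connected_layout_blocks is still grouped by layout, i.e. a list of layouts
# where each layout is a list of blocks ([[block, ...], ...]); para_split
# flattens this one level at its very end.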
def para_split(pdf_info_dict, debug_mode, lang="en"):
global debug_able
debug_able = debug_mode
new_layout_of_pages = []  # list of lists; each element holds one page's layouts
all_page_list_info = []  # whether each page starts and/or ends with a list
for page_num, page in pdf_info_dict.items():
blocks = copy.deepcopy(page['preproc_blocks'])
layout_bboxes = page['layout_bboxes']
new_layout_bbox = __common_pre_proc(blocks, layout_bboxes)
new_layout_of_pages.append(new_layout_bbox)
splited_blocks, page_list_info = __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang)
all_page_list_info.append(page_list_info)
page['para_blocks'] = splited_blocks
# logger.info(f'page_list_info:\n{page_list_info}')
# logger.info(f'splited_blocks:\n{splited_blocks}')
"""连接页面与页面之间的可能合并的段落"""
pdf_infos = list(pdf_info_dict.values())
for page_num, page in enumerate(pdf_info_dict.values()):
if page_num == 0:
continue
pre_page_paras = pdf_infos[page_num - 1]['para_blocks']
next_page_paras = pdf_infos[page_num]['para_blocks']
pre_page_layout_bbox = new_layout_of_pages[page_num - 1]
next_page_layout_bbox = new_layout_of_pages[page_num]
is_conn = __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, lang)
# if debug_able:
#     if is_conn:
#         logger.info(f"Merged paragraphs across page {page_num - 1} and page {page_num}")
is_list_conn = __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, all_page_list_info[page_num - 1], all_page_list_info[page_num], page_num, lang)
# if debug_able:
#     if is_list_conn:
#         logger.info(f"Merged list paragraphs across page {page_num - 1} and page {page_num}")
"""接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接
1. 正文中有时出现一个行顶格,接下来几行缩进的情况。
2. 居中的一些连续单行,如果高度相同,那么可能是一个段落。
"""
for page_num, page in enumerate(pdf_info_dict.values()):
page_paras = page['para_blocks']
new_layout_bbox = new_layout_of_pages[page_num]
__connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang)
__merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang)
# Flatten the layout level so each page holds a single flat list of blocks.
for page_num, page in enumerate(pdf_info_dict.values()):
page_paras = page['para_blocks']
page_blocks = [block for layout in page_paras for block in layout]
page["para_blocks"] = page_blocks