Initial commit

c2e5c36f · 赵小蒙
Commit c2e5c36f authored Feb 29, 2024 by 赵小蒙
20 changed files
--- a/libs/json_compressor.py
+++ b/libs/json_compressor.py
+import json
+import brotli
+import base64
+
+class JsonCompressor:
+
+    @staticmethod
+    def compress_json(data):
+        """
+        Compress a json object and encode it with base64
+        """
+        json_str = json.dumps(data)
+        json_bytes = json_str.encode('utf-8')
+        compressed = brotli.compress(json_bytes, quality=6)
+        compressed_str = base64.b64encode(compressed).decode('utf-8')  # convert bytes to string
+        return compressed_str
+
+    @staticmethod
+    def decompress_json(compressed_str):
+        """
+        Decode the base64 string and decompress the json object
+        """
+        compressed = base64.b64decode(compressed_str.encode('utf-8'))  # convert string to bytes
+        decompressed_bytes = brotli.decompress(compressed)
+        json_str = decompressed_bytes.decode('utf-8')
+        data = json.loads(json_str)
+        return data
--- a/libs/language.py
+++ b/libs/language.py
+import pycld2 as cld2
+import regex
+import unicodedata
+
+
+RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")
+
+
+def remove_bad_chars(text):
+    return RE_BAD_CHARS.sub("", text)
+
+
+def detect_lang(text: str) -> str:
+    if len(text) == 0:
+        return ""
+
+    try:
+        _, _, details = cld2.detect(text)
+    except:
+        # cld2 doesn't like control characters
+        # https://github.com/mikemccand/chromium-compact-language-detector/issues/22#issuecomment-435904616
+        html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C',]])
+        _, _, details = cld2.detect(html_no_ctrl_chars)
+    lang = ""
+    try:
+        lang = details[0][1].lower()
+    except:
+        lang = ""
+    return lang
+
+
+if __name__ == '__main__':
+    print(detect_lang("This is a test."))
+    print(detect_lang("<html>This is a test</html>"))
+    print(detect_lang("这个是中文测试。"))
+    print(detect_lang("<html>这个是中文测试。</html>"))
\ No newline at end of file
--- a/libs/markdown_utils.py
+++ b/libs/markdown_utils.py
+import re
+
+
+def escape_special_markdown_char(pymu_blocks):
+    """
+    转义正文里对markdown语法有特殊意义的字符
+    """
+    special_chars = ["*", "`", "~", "$"]
+    for blk in pymu_blocks:
+        for line in blk['lines']:
+            for span in line['spans']:
+                for char in special_chars:
+                    span_text = span['text']
+                    span_type = span.get("_type", None)
+                    if span_type in ['inline-equation', 'interline-equation']:
+                        continue
+                    elif span_text:
+                        span['text'] = span['text'].replace(char, "\\" + char)
+
+    return pymu_blocks
--- a/libs/nlp_utils.py
+++ b/libs/nlp_utils.py
+import re
+from os import path
+
+from collections import Counter
+
+from loguru import logger
+
+# from langdetect import detect
+import spacy
+import en_core_web_sm
+import zh_core_web_sm
+
+from libs.language import detect_lang
+
+
+class NLPModels:
+    """
+    How to upload local models to s3:
+        - config aws cli:
+            doc\SETUP-CLI.md
+            doc\setup_cli.sh
+            app\config\__init__.py
+        - $ cd {local_dir_storing_models}
+        - $ ls models
+            en_core_web_sm-3.7.1/
+            zh_core_web_sm-3.7.0/
+        - $ aws s3 sync models/ s3://llm-infra/models --profile=p_project_norm
+        - $ aws s3 --profile=p_project_norm ls  s3://llm-infra/models/
+            PRE en_core_web_sm-3.7.1/
+            PRE zh_core_web_sm-3.7.0/
+    """
+
+    def __init__(self):
+        # if OS is windows, set "TMP_DIR" to "D:/tmp"
+
+        home_dir = path.expanduser("~")
+        self.default_local_path = path.join(home_dir, ".nlp_models")
+        self.default_shared_path = "/share/pdf_processor/nlp_models"
+        self.default_hdfs_path = "hdfs://pdf_processor/nlp_models"
+        self.default_s3_path = "s3://llm-infra/models"
+        self.nlp_models = self.nlp_models = {
+            "en_core_web_sm": {
+                "type": "spacy",
+                "version": "3.7.1",
+            },
+            "en_core_web_md": {
+                "type": "spacy",
+                "version": "3.7.1",
+            },
+            "en_core_web_lg": {
+                "type": "spacy",
+                "version": "3.7.1",
+            },
+            "zh_core_web_sm": {
+                "type": "spacy",
+                "version": "3.7.0",
+            },
+            "zh_core_web_md": {
+                "type": "spacy",
+                "version": "3.7.0",
+            },
+            "zh_core_web_lg": {
+                "type": "spacy",
+                "version": "3.7.0",
+            },
+        }
+        self.en_core_web_sm_model = en_core_web_sm.load()
+        self.zh_core_web_sm_model = zh_core_web_sm.load()
+
+    def load_model(self, model_name, model_type, model_version):
+        if (
+            model_name in self.nlp_models
+            and self.nlp_models[model_name]["type"] == model_type
+            and self.nlp_models[model_name]["version"] == model_version
+        ):
+            return spacy.load(model_name) if spacy.util.is_package(model_name) else None
+
+        else:
+            logger.error(f"Unsupported model name or version: {model_name} {model_version}")
+            return None
+
+    def detect_language(self, text, use_langdetect=False):
+        if len(text) == 0:
+            return None
+        if use_langdetect:
+            # print("use_langdetect")
+            # print(detect_lang(text))
+            # return detect_lang(text)
+            if detect_lang(text) == "zh":
+                return "zh"
+            else:
+                return "en"
+
+        if not use_langdetect:
+            en_count = len(re.findall(r"[a-zA-Z]", text))
+            cn_count = len(re.findall(r"[\u4e00-\u9fff]", text))
+
+            if en_count > cn_count:
+                return "en"
+
+            if cn_count > en_count:
+                return "zh"
+
+    def detect_entity_catgr_using_nlp(self, text, threshold=0.5):
+        """
+        Detect entity categories using NLP models and return the most frequent entity types.
+
+        Parameters
+        ----------
+        text : str
+            Text to be processed.
+
+        Returns
+        -------
+        str
+            The most frequent entity type.
+        """
+        lang = self.detect_language(text, use_langdetect=True)
+
+        if lang == "en":
+            nlp_model = self.en_core_web_sm_model
+        elif lang == "zh":
+            nlp_model = self.zh_core_web_sm_model
+        else:
+            # logger.error(f"Unsupported language: {lang}")
+            return {}
+
+        # Splitting text into smaller parts
+        text_parts = re.split(r"[,;，；、\s & |]+", text)
+
+        text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)]  # Remove non-words
+        text_combined = " ".join(text_parts)
+
+        try:
+            doc = nlp_model(text_combined)
+            entity_counts = Counter([ent.label_ for ent in doc.ents])
+            word_counts_in_entities = Counter()
+
+            for ent in doc.ents:
+                word_counts_in_entities[ent.label_] += len(ent.text.split())
+
+            total_words_in_entities = sum(word_counts_in_entities.values())
+            total_words = len([token for token in doc if not token.is_punct])
+
+            if total_words_in_entities == 0 or total_words == 0:
+                return None
+
+            entity_percentage = total_words_in_entities / total_words
+            if entity_percentage < 0.5:
+                return None
+
+            most_common_entity, word_count = word_counts_in_entities.most_common(1)[0]
+            entity_percentage = word_count / total_words_in_entities
+
+            if entity_percentage >= threshold:
+                return most_common_entity
+            else:
+                return None
+        except Exception as e:
+            logger.error(f"Error in entity detection: {e}")
+            return None
+
+
+def __main__():
+    nlpModel = NLPModels()
+
+    test_strings = [
+        "张三",
+        "张三, 李四，王五; 赵六",
+        "John Doe",
+        "Jane Smith",
+        "Lee, John",
+        "John Doe, Jane Smith; Alice Johnson，Bob Lee",
+        "孙七, Michael Jordan；赵八",
+        "David Smith  Michael O'Connor; Kevin ßáçøñ",
+        "李雷·韩梅梅, 张三·李四",
+        "Charles Robert Darwin, Isaac Newton",
+        "莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔",
+        "John Doe, Jane Smith; Alice Johnson",
+        "张三, 李四，王五; 赵六",
+        "Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG",
+        "Rachel Mills  &  William Barry  &  Susanne B. Haga",
+        "Claire Chabut* and Jean-François Bussières",
+        "1 Department of Chemistry, Northeastern University, Shenyang 110004, China 2 State Key Laboratory of Polymer Physics and Chemistry, Changchun Institute of Applied Chemistry, Chinese Academy of Sciences, Changchun 130022, China",
+        "Changchun",
+        "china",
+        "Rongjun Song, 1,2 Baoyan Zhang, 1 Baotong Huang, 2 Tao Tang 2",
+        "Synergistic Effect of Supported Nickel Catalyst with Intumescent Flame-Retardants on Flame Retardancy and Thermal Stability of Polypropylene",
+        "Synergistic Effect of Supported Nickel Catalyst with",
+        "Intumescent Flame-Retardants on Flame Retardancy",
+        "and Thermal Stability of Polypropylene",
+    ]
+
+    for test in test_strings:
+        print()
+        print(f"Original String: {test}")
+
+        result = nlpModel.detect_entity_catgr_using_nlp(test)
+        print(f"Detected entities: {result}")
+
+
+if __name__ == "__main__":
+    __main__()
--- a/libs/pdf_image_tools.py
+++ b/libs/pdf_image_tools.py
+import os
+from pathlib import Path
+from typing import Tuple
+import io
+
+# from app.common.s3 import get_s3_client
+from libs.commons import fitz
+from loguru import logger
+from libs.commons import parse_bucket_key, join_path
+
+
+def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_return_path=None, img_s3_client=None, upload_switch=True):
+    """
+    从第page_num页的page中，根据bbox进行裁剪出一张jpg图片，返回图片路径
+    save_path：需要同时支持s3和本地, 图片存放在save_path下，文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
+    """
+    # 拼接文件名
+    filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}.jpg"
+    # 拼接路径
+    image_save_path = join_path(save_parent_path, filename)
+    s3_img_path = join_path(s3_return_path, filename) if s3_return_path is not None else None
+    # 打印图片文件名
+    # print(f"Saved {image_save_path}")
+
+    #检查坐标
+    # x_check = int(bbox[2]) - int(bbox[0])
+    # y_check = int(bbox[3]) - int(bbox[1])
+    # if x_check <= 0 or y_check <= 0:
+    #
+    #     if image_save_path.startswith("s3://"):
+    #         logger.exception(f"传入图片坐标有误，x1<x0或y1<y0,{s3_img_path}")
+    #         return s3_img_path
+    #     else:
+    #         logger.exception(f"传入图片坐标有误，x1<x0或y1<y0,{image_save_path}")
+    #         return image_save_path
+
+
+    # 将坐标转换为fitz.Rect对象
+    rect = fitz.Rect(*bbox)
+    # 配置缩放倍数为3倍
+    zoom = fitz.Matrix(3, 3)
+    # 截取图片
+    pix = page.get_pixmap(clip=rect, matrix=zoom)
+
+    if image_save_path.startswith("s3://"):
+        if not upload_switch:
+            pass
+        else:
+            # 图片保存到s3
+            bucket_name, bucket_key = parse_bucket_key(image_save_path)
+            # 将字节流上传到s3
+            byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
+            file_obj = io.BytesIO(byte_data)
+            if img_s3_client is not None:
+                img_s3_client.upload_fileobj(file_obj, bucket_name, bucket_key)
+                # 每个图片上传任务都创建一个新的client
+                # img_s3_client_once = get_s3_client(image_save_path)
+                # img_s3_client_once.upload_fileobj(file_obj, bucket_name, bucket_key)
+            else:
+                logger.exception("must input img_s3_client")
+        return s3_img_path
+    else:
+        # 保存图片到本地
+        # 先检查一下image_save_path的父目录是否存在，如果不存在，就创建
+        parent_dir = os.path.dirname(image_save_path)
+        if not os.path.exists(parent_dir):
+            os.makedirs(parent_dir)
+        pix.save(image_save_path, jpg_quality=95)
+        # 为了直接能在markdown里看，这里把地址改为相对于mardown的地址
+        pth = Path(image_save_path)
+        image_save_path = f"{pth.parent.name}/{pth.name}"
+        return image_save_path
+
+
+def save_images_by_bboxes(book_name: str, page_num: int, page: fitz.Page, save_path: str,
+                            image_bboxes: list, images_overlap_backup:list, table_bboxes: list, equation_inline_bboxes: list,
+                            equation_interline_bboxes: list, img_s3_client) -> dict:
+    """
+    返回一个dict, key为bbox, 值是图片地址
+    """
+    image_info = []
+    image_backup_info = []
+    table_info = []
+    inline_eq_info = []
+    interline_eq_info = []
+
+    # 图片的保存路径组成是这样的： {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
+    s3_return_image_path = join_path(book_name, "images")
+    image_save_path = join_path(save_path, s3_return_image_path)
+
+    s3_return_table_path = join_path(book_name, "tables")
+    table_save_path = join_path(save_path, s3_return_table_path)
+
+    s3_return_equations_inline_path = join_path(book_name, "equations_inline")
+    equation_inline_save_path = join_path(save_path, s3_return_equations_inline_path)
+
+    s3_return_equation_interline_path = join_path(book_name, "equation_interline")
+    equation_interline_save_path = join_path(save_path, s3_return_equation_interline_path)
+
+
+    for bbox in image_bboxes:
+        if any([bbox[0]>=bbox[2], bbox[1]>=bbox[3]]):
+            logger.warning(f"image_bboxes: 错误的box, {bbox}")
+            continue
+        
+        image_path = cut_image(bbox, page_num, page, image_save_path, s3_return_image_path, img_s3_client)
+        image_info.append({"bbox": bbox, "image_path": image_path})
+        
+    for bbox in images_overlap_backup:
+        if any([bbox[0]>=bbox[2], bbox[1]>=bbox[3]]):
+            logger.warning(f"images_overlap_backup: 错误的box, {bbox}")
+            continue
+        image_path = cut_image(bbox, page_num, page, image_save_path, s3_return_image_path, img_s3_client)
+        image_backup_info.append({"bbox": bbox, "image_path": image_path})
+
+    for bbox in table_bboxes:
+        if any([bbox[0]>=bbox[2], bbox[1]>=bbox[3]]):
+            logger.warning(f"table_bboxes: 错误的box, {bbox}")
+            continue
+        image_path = cut_image(bbox, page_num, page, table_save_path, s3_return_table_path, img_s3_client)
+        table_info.append({"bbox": bbox, "image_path": image_path})
+
+    for bbox in equation_inline_bboxes:
+        if any([bbox[0]>=bbox[2], bbox[1]>=bbox[3]]):
+            logger.warning(f"equation_inline_bboxes: 错误的box, {bbox}")
+            continue
+        image_path = cut_image(bbox[:4], page_num, page, equation_inline_save_path, s3_return_equations_inline_path, img_s3_client, upload_switch=False)
+        inline_eq_info.append({'bbox':bbox[:4], "image_path":image_path, "latex_text":bbox[4]})
+
+    for bbox in equation_interline_bboxes:
+        if any([bbox[0]>=bbox[2], bbox[1]>=bbox[3]]):
+            logger.warning(f"equation_interline_bboxes: 错误的box, {bbox}")
+            continue
+        image_path = cut_image(bbox[:4], page_num, page, equation_interline_save_path, s3_return_equation_interline_path, img_s3_client, upload_switch=False)
+        interline_eq_info.append({"bbox":bbox[:4], "image_path":image_path, "latex_text":bbox[4]})
+
+    return image_info, image_backup_info,  table_info, inline_eq_info, interline_eq_info
\ No newline at end of file
--- a/libs/safe_filename.py
+++ b/libs/safe_filename.py
+import os
+
+
+def sanitize_filename(filename, replacement="_"):
+    if os.name == 'nt':
+        invalid_chars = '<>:"|?*'
+
+        for char in invalid_chars:
+            filename = filename.replace(char, replacement)
+
+    return filename
--- a/libs/textbase.py
+++ b/libs/textbase.py
+import math
+
+
+def __inc_dict_val(mp, key, val_inc:int):
+    if mp.get(key):
+        mp[key] = mp[key] + val_inc
+    else:
+        mp[key] = val_inc
+        
+    
+
+def get_text_block_base_info(block):
+    """
+    获取这个文本块里的字体的颜色、字号、字体
+    按照正文字数最多的返回
+    """
+    
+    counter = {}
+    
+    for line in block['lines']:
+        for span in line['spans']:
+            color = span['color']
+            size = round(span['size'], 2)
+            font = span['font']
+            
+            txt_len = len(span['text'])
+            __inc_dict_val(counter, (color, size, font), txt_len)
+            
+    
+    c, s, ft = max(counter, key=counter.get)
+    
+    return c, s, ft
+    
\ No newline at end of file
--- a/libs/vis_utils.py
+++ b/libs/vis_utils.py
+from libs.commons import fitz
+import os
+from loguru import logger
+from layout.bbox_sort import CONTENT_TYPE_IDX
+
+
+def draw_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, save_path: str):
+    """
+    在page上画出bbox，保存到save_path
+    """
+    # 检查文件是否存在
+    is_new_pdf = False
+    if os.path.exists(save_path):
+        # 打开现有的 PDF 文件
+        doc = fitz.open(save_path)
+    else:
+        # 创建一个新的空白 PDF 文件
+        is_new_pdf = True
+        doc = fitz.open('')
+
+    color_map = {
+        'image': fitz.pdfcolor["yellow"],
+        'text': fitz.pdfcolor['blue'],
+        "table": fitz.pdfcolor['green']
+    }
+    
+    for k, v in paras_dict.items():
+        page_idx = v['page_idx']
+        width = raw_pdf_doc[page_idx].rect.width
+        height = raw_pdf_doc[page_idx].rect.height
+        new_page = doc.new_page(width=width, height=height)
+
+        shape = new_page.new_shape()
+        for order, block in enumerate(v['preproc_blocks']):
+            rect = fitz.Rect(block['bbox'])
+            shape = new_page.new_shape()
+            shape.draw_rect(rect)
+            shape.finish(color=None, fill=color_map['text'], fill_opacity=0.2)
+            shape.finish()
+            shape.commit()
+            
+        for img in v['images']:
+            # 原始box画上去
+            rect = fitz.Rect(img['bbox'])
+            shape = new_page.new_shape()
+            shape.draw_rect(rect)
+            shape.finish(color=None, fill=fitz.pdfcolor['yellow'])
+            shape.finish()
+            shape.commit()
+
+        for img in v['image_backup']:
+            # 原始box画上去
+            rect = fitz.Rect(img['bbox'])
+            shape = new_page.new_shape()
+            shape.draw_rect(rect)
+            shape.finish(color=fitz.pdfcolor['yellow'],  fill=None)
+            shape.finish()
+            shape.commit()
+            
+        for tb in v['droped_text_block']:
+            # 原始box画上去
+            rect = fitz.Rect(tb['bbox'])
+            shape = new_page.new_shape()
+            shape.draw_rect(rect)
+            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.4)
+            shape.finish()
+            shape.commit()
+            
+        # TODO table
+        for tb in v['tables']:
+            rect = fitz.Rect(tb['bbox'])
+            shape = new_page.new_shape()
+            shape.draw_rect(rect)
+            shape.finish(color=None, fill=fitz.pdfcolor['green'], fill_opacity=0.2)
+            shape.finish()
+            shape.commit()
+
+
+    parent_dir = os.path.dirname(save_path)
+    if not os.path.exists(parent_dir):
+        os.makedirs(parent_dir)
+
+    if is_new_pdf:
+        doc.save(save_path)
+    else:
+        doc.saveIncr()
+    doc.close()
+    
+
+def debug_show_bbox(raw_pdf_doc: fitz.Document, page_idx: int, bboxes: list, droped_bboxes:list,  expect_drop_bboxes:list, save_path: str, expected_page_id:int):
+    """
+    以覆盖的方式写个临时的pdf，用于debug
+    """
+    if page_idx!=expected_page_id:
+        return
+        
+    if os.path.exists(save_path):
+        # 删除已经存在的文件
+        os.remove(save_path)
+    # 创建一个新的空白 PDF 文件
+    doc = fitz.open('')
+
+    width = raw_pdf_doc[page_idx].rect.width
+    height = raw_pdf_doc[page_idx].rect.height
+    new_page = doc.new_page(width=width, height=height)
+
+    shape = new_page.new_shape()
+    for bbox in bboxes:
+        # 原始box画上去
+        rect = fitz.Rect(*bbox[0:4])
+        shape = new_page.new_shape()
+        shape.draw_rect(rect)
+        shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
+        shape.finish()
+        shape.commit()
+        
+    for bbox in droped_bboxes:
+        # 原始box画上去
+        rect = fitz.Rect(*bbox[0:4])
+        shape = new_page.new_shape()
+        shape.draw_rect(rect)
+        shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
+        shape.finish()
+        shape.commit()
+        
+    for bbox in expect_drop_bboxes:
+        # 原始box画上去
+        rect = fitz.Rect(*bbox[0:4])
+        shape = new_page.new_shape()
+        shape.draw_rect(rect)
+        shape.finish(color=fitz.pdfcolor['red'], fill=None)
+        shape.finish()
+        shape.commit()
+
+    # shape.insert_textbox(fitz.Rect(200, 0, 600, 20), f"total bboxes: {len(bboxes)}", fontname="helv", fontsize=12,
+    #                      color=(0, 0, 0))
+    # shape.finish(color=fitz.pdfcolor['black'])
+    # shape.commit()
+
+    parent_dir = os.path.dirname(save_path)
+    if not os.path.exists(parent_dir):
+        os.makedirs(parent_dir)
+
+    doc.save(save_path)
+    doc.close()
+    
+
+def debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
+    save_path = "./tmp/debug.pdf"
+    if os.path.exists(save_path):
+        # 删除已经存在的文件
+        os.remove(save_path)
+    # 创建一个新的空白 PDF 文件
+    doc = fitz.open('')
+
+    width = page.rect.width
+    height = page.rect.height
+    new_page = doc.new_page(width=width, height=height)
+    
+    shape = new_page.new_shape()
+    for bbox in bboxes1:
+        # 原始box画上去
+        rect = fitz.Rect(*bbox[0:4])
+        shape = new_page.new_shape()
+        shape.draw_rect(rect)
+        shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
+        shape.finish()
+        shape.commit()
+        
+    for bbox in bboxes2:
+        # 原始box画上去
+        rect = fitz.Rect(*bbox[0:4])
+        shape = new_page.new_shape()
+        shape.draw_rect(rect)
+        shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
+        shape.finish()
+        shape.commit()
+        
+    for bbox in bboxes3:
+        # 原始box画上去
+        rect = fitz.Rect(*bbox[0:4])
+        shape = new_page.new_shape()
+        shape.draw_rect(rect)
+        shape.finish(color=fitz.pdfcolor['red'], fill=None)
+        shape.finish()
+        shape.commit()
+        
+    parent_dir = os.path.dirname(save_path)
+    if not os.path.exists(parent_dir):
+        os.makedirs(parent_dir)
+
+    doc.save(save_path)
+    doc.close() 
+    
+    
+    
+    
+def draw_layout_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, header, footer, pdf_path: str):
+    """
+    在page上画出bbox，保存到save_path
+    """
+    # 检查文件是否存在
+    is_new_pdf = False
+    if os.path.exists(pdf_path):
+        # 打开现有的 PDF 文件
+        doc = fitz.open(pdf_path)
+    else:
+        # 创建一个新的空白 PDF 文件
+        is_new_pdf = True
+        doc = fitz.open('')
+
+    for k, v in paras_dict.items():
+        page_idx = v['page_idx']
+        layouts = v['layout_bboxes']
+        page = doc[page_idx]
+        shape = page.new_shape()
+        for order, layout in enumerate(layouts):
+            border_offset = 1
+            rect_box = layout['layout_bbox']
+            layout_label = layout['layout_label']
+            fill_color = fitz.pdfcolor['pink'] if layout_label=='U' else None
+            rect_box = [rect_box[0]+1, rect_box[1]-border_offset, rect_box[2]-1, rect_box[3]+border_offset]
+            rect = fitz.Rect(*rect_box)
+            shape.draw_rect(rect)
+            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.4)
+            """
+            draw order text on layout box
+            """
+            font_size = 10
+            shape.insert_text((rect_box[0] + 1, rect_box[1] + font_size), f"{order}", fontsize=font_size, color=(0, 0, 0))
+        
+        """画上footer header"""
+        if header:
+            shape.draw_rect(fitz.Rect(header))
+            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)
+        if footer:
+            shape.draw_rect(fitz.Rect(footer))
+            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)
+        
+        shape.commit()
+    
+    if is_new_pdf:
+        doc.save(pdf_path)
+    else:
+        doc.saveIncr()
+    doc.close()
+        
+
+@DeprecationWarning
+def draw_layout_on_page(raw_pdf_doc: fitz.Document,  page_idx: int, page_layout: list, pdf_path: str):
+    """
+    把layout的box用红色边框花在pdf_path的page_idx上
+    """
+    def draw(shape, layout, fill_color=fitz.pdfcolor['pink']):
+        border_offset = 1
+        rect_box = layout['layout_bbox']
+        layout_label = layout['layout_label']
+        sub_layout = layout['sub_layout']
+        if len(sub_layout)==0:
+            fill_color = fill_color if layout_label=='U' else None
+            rect_box = [rect_box[0]+1, rect_box[1]-border_offset, rect_box[2]-1, rect_box[3]+border_offset]
+            rect = fitz.Rect(*rect_box)
+            shape.draw_rect(rect)
+            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.2)
+            # if layout_label=='U':
+            #     bad_boxes = layout.get("bad_boxes", [])
+            #     for bad_box in bad_boxes:
+            #         rect = fitz.Rect(*bad_box)
+            #         shape.draw_rect(rect)
+            #         shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['red'], fill_opacity=0.2)
+        # else:
+        #     rect = fitz.Rect(*rect_box)
+        #     shape.draw_rect(rect)
+        #     shape.finish(color=fitz.pdfcolor['blue'])
+        
+        for sub_layout in sub_layout:
+            draw(shape, sub_layout)
+        shape.commit()
+        
+    
+    # 检查文件是否存在
+    is_new_pdf = False
+    if os.path.exists(pdf_path):
+        # 打开现有的 PDF 文件
+        doc = fitz.open(pdf_path)
+    else:
+        # 创建一个新的空白 PDF 文件
+        is_new_pdf = True
+        doc = fitz.open('')
+
+    page = doc[page_idx]
+    shape = page.new_shape()
+    for order, layout in enumerate(page_layout):
+        draw(shape, layout, fitz.pdfcolor['yellow'])
+
+    # shape.insert_textbox(fitz.Rect(200, 0, 600, 20), f"total bboxes: {len(layout)}", fontname="helv", fontsize=12,
+    #                      color=(0, 0, 0))
+    # shape.finish(color=fitz.pdfcolor['black'])
+    # shape.commit()
+
+    parent_dir = os.path.dirname(pdf_path)
+    if not os.path.exists(parent_dir):
+        os.makedirs(parent_dir)
+
+    if is_new_pdf:
+        doc.save(pdf_path)
+    else:
+        doc.saveIncr()
+    doc.close()
+    
\ No newline at end of file
--- a/mkcontent.py
+++ b/mkcontent.py
+import re
+import math
+from loguru import logger
+
+from libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
+
+
+def mk_nlp_markdown(para_dict: dict):
+    """
+    对排序后的bboxes拼接内容
+    """
+    content_lst = []
+    for _, page_info in para_dict.items():
+        para_blocks = page_info.get("para_blocks")
+        if not para_blocks:
+            continue
+
+        for block in para_blocks:
+            item = block["paras"]
+            for _, p in item.items():
+                para_text = p["para_text"]
+                is_title = p["is_para_title"]
+                title_level = p['para_title_level']
+                md_title_prefix = "#"*title_level
+                if is_title:
+                    content_lst.append(f"{md_title_prefix} {para_text}")
+                else:
+                    content_lst.append(para_text)
+
+    content_text = "\n\n".join(content_lst)
+
+    return content_text
+
+
+
+# 找到目标字符串在段落中的索引
+def __find_index(paragraph, target):
+    index = paragraph.find(target)
+    if index != -1:
+        return index
+    else:
+        return None
+
+
+def __insert_string(paragraph, target, postion):
+    new_paragraph = paragraph[:postion] + target + paragraph[postion:] 
+    return new_paragraph
+
+
+def __insert_after(content, image_content, target):
+    """
+    在content中找到target，将image_content插入到target后面
+    """
+    index = content.find(target)
+    if index != -1:
+        content = content[:index+len(target)] + "\n\n" + image_content + "\n\n" + content[index+len(target):]
+    else:
+        logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
+    return content
+
+def __insert_before(content, image_content, target):
+    """
+    在content中找到target，将image_content插入到target前面
+    """
+    index = content.find(target)
+    if index != -1:
+        content = content[:index] + "\n\n" + image_content + "\n\n" + content[index:]
+    else:
+        logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
+    return content
+
+
+
+def mk_mm_markdown(para_dict: dict):
+    """拼装多模态markdown"""
+    content_lst = []
+    for _, page_info in para_dict.items():
+        page_lst = [] # 一个page内的段落列表
+        para_blocks = page_info.get("para_blocks")
+        pymu_raw_blocks = page_info.get("preproc_blocks")  
+        
+        all_page_images = []
+        all_page_images.extend(page_info.get("images",[]))
+        all_page_images.extend(page_info.get("image_backup", []) )
+        all_page_images.extend(page_info.get("tables",[]))
+        all_page_images.extend(page_info.get("table_backup",[]) )
+        
+        if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
+            for img in all_page_images:
+                page_lst.append(f"![]({img['image_path']})") # TODO 图片顺序
+            page_md = "\n\n".join(page_lst)
+            
+        else:
+            for block in para_blocks:
+                item = block["paras"]
+                for _, p in item.items():
+                    para_text = p["para_text"]
+                    is_title = p["is_para_title"]
+                    title_level = p['para_title_level']
+                    md_title_prefix = "#"*title_level
+                    if is_title:
+                        page_lst.append(f"{md_title_prefix} {para_text}")
+                    else:
+                        page_lst.append(para_text)
+                        
+            """拼装成一个页面的文本"""
+            page_md = "\n\n".join(page_lst)
+            """插入图片"""
+            for img in all_page_images:
+                imgbox = img['bbox']
+                img_content = f"![]({img['image_path']})"
+                # 先看在哪个block内
+                for block in pymu_raw_blocks:
+                    bbox = block['bbox']
+                    if bbox[0]-1 <= imgbox[0] < bbox[2]+1 and bbox[1]-1 <= imgbox[1] < bbox[3]+1:# 确定在block内
+                        for l in block['lines']:
+                            line_box = l['bbox']
+                            if line_box[0]-1 <= imgbox[0] < line_box[2]+1 and line_box[1]-1 <= imgbox[1] < line_box[3]+1: # 在line内的，插入line前面
+                                line_txt = "".join([s['text'] for s in l['spans']])
+                                page_md = __insert_before(page_md, img_content, line_txt)
+                                break
+                            break
+                        else:# 在行与行之间
+                            # 找到图片x0,y0与line的x0,y0最近的line
+                            min_distance = 100000
+                            min_line = None
+                            for l in block['lines']:
+                                line_box = l['bbox']
+                                distance = math.sqrt((line_box[0] - imgbox[0])**2 + (line_box[1] - imgbox[1])**2)
+                                if distance < min_distance:
+                                    min_distance = distance
+                                    min_line = l
+                            if min_line:
+                                line_txt = "".join([s['text'] for s in min_line['spans']])
+                                img_h = imgbox[3] - imgbox[1]
+                                if min_distance<img_h: # 文字在图片前面
+                                    page_md = __insert_after(page_md, img_content, line_txt)
+                                else:
+                                    page_md = __insert_before(page_md, img_content, line_txt)
+                            else:
+                                logger.error(f"Can't find the location of image {img['image_path']} in the markdown file")
+                else:# 应当在两个block之间
+                    # 找到上方最近的block，如果上方没有就找大下方最近的block
+                    top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
+                    if top_txt_block:
+                        line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
+                        page_md = __insert_after(page_md, img_content, line_txt)
+                    else:
+                        bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, imgbox)
+                        if bottom_txt_block:
+                            line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
+                            page_md = __insert_before(page_md, img_content, line_txt)
+                        else:
+                            logger.error(f"Can't find the location of image {img['image_path']} in the markdown file")
+                    
+        content_lst.append(page_md)
+                    
+    """拼装成全部页面的文本"""
+    content_text = "\n\n".join(content_lst)
+
+    return content_text
+    
+    
+@DeprecationWarning
+def mk_mm_markdown_1(para_dict: dict):
+    """
+    得到images和tables变量
+    """
+    image_all_list = []
+    
+    for _, page_info in para_dict.items():
+        images = page_info.get("images",[])
+        tables = page_info.get("tables",[])
+        image_backup = page_info.get("image_backup", [])  
+        table_backup = page_info.get("table_backup",[]) 
+        all_page_images = []
+        all_page_images.extend(images)
+        all_page_images.extend(image_backup)
+        all_page_images.extend(tables)
+        all_page_images.extend(table_backup)
+        
+        pymu_raw_blocks = page_info.get("pymu_raw_blocks")  
+
+        # 提取每个图片所在位置
+        for image_info in all_page_images:
+            x0_image, y0_image, x1_image, y1_image = image_info['bbox'][:4]
+            image_path = image_info['image_path']
+            
+            # 判断图片处于原始PDF中哪个模块之间
+            image_internal_dict = {}
+            image_external_dict = {}
+            between_dict = {}
+            for block in pymu_raw_blocks:
+                x0, y0, x1, y1 = block['bbox'][:4]
+
+                # 在某个模块内部
+                if x0 <= x0_image < x1 and y0 <= y0_image < y1:
+                    image_internal_dict['bbox'] = [x0_image, y0_image, x1_image, y1_image]
+                    image_internal_dict['path'] = image_path
+                    
+                    # 确定图片在哪句文本之前
+                    y_pre = 0
+                    for line in block['lines']:
+                        x0, y0, x1, y1 = line['spans'][0]['bbox']
+                        if x0 <= x0_image < x1 and y_pre <= y0_image < y0: 
+                            text = line['spans']['text']
+                            image_internal_dict['text'] = text
+                            image_internal_dict['markdown_image'] = f'![image_path]({image_path})'
+                            break
+                        else:
+                            y_pre = y0
+                # 在某两个模块之间
+                elif x0 <= x0_image < x1:
+                    distance = math.sqrt((x1_image - x0)**2 + (y1_image - y0)**2)
+                    between_dict[block['number']] = distance
+            
+            # 找到与定位点距离最小的文本block
+            if between_dict:
+                min_key = min(between_dict, key=between_dict.get)
+                spans_list = []
+                for span in pymu_raw_blocks[min_key]['lines']: 
+                    for text_piece in span['spans']:
+                        # 防止索引定位文本内容过多
+                        if len(spans_list) < 60:
+                            spans_list.append(text_piece['text'])
+                text1 = ''.join(spans_list)
+                
+                image_external_dict['bbox'] = [x0_image, y0_image, x1_image, y1_image]
+                image_external_dict['path'] = image_path 
+                image_external_dict['text'] = text1
+                image_external_dict['markdown_image'] = f'![image_path]({image_path})'
+
+            # 将内部图片或外部图片存入当页所有图片的列表
+            if len(image_internal_dict) != 0:
+                image_all_list.append(image_internal_dict)
+            elif len(image_external_dict) != 0:
+                image_all_list.append(image_external_dict)
+            else:
+                logger.error(f"Can't find the location of image {image_path} in the markdown file")
+
+    content_text = mk_nlp_markdown(para_dict)
+
+    for image_info_extract in image_all_list:
+        loc = __find_index(content_text, image_info_extract['text'])
+        if loc is not None:
+            content_text = __insert_string(content_text, image_info_extract['markdown_image'], loc)
+        else:
+            logger.error(f"Can't find the location of image {image_info_extract['path']} in the markdown file")
+
+    return content_text
\ No newline at end of file
--- a/para/__init__.py
+++ b/para/__init__.py
--- a/para/block_continuation_processor.py
+++ b/para/block_continuation_processor.py
+import os
+import sys
+import unicodedata
+
+from para.commons import *
+
+
+if sys.version_info[0] >= 3:
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+class BlockContinuationProcessor:
+    """
+    This class is used to process the blocks to detect block continuations.
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def __is_similar_font_type(self, font_type1, font_type2, prefix_length_ratio=0.3):
+        """
+        This function checks if the two font types are similar.
+        Definition of similar font types: the two font types have a common prefix,
+        and the length of the common prefix is at least a certain ratio of the length of the shorter font type.
+
+        Parameters
+        ----------
+        font_type1 : str
+            font type 1
+        font_type2 : str
+            font type 2
+        prefix_length_ratio : float
+            minimum ratio of the common prefix length to the length of the shorter font type
+
+        Returns
+        -------
+        bool
+            True if the two font types are similar, False otherwise.
+        """
+
+        if isinstance(font_type1, list):
+            font_type1 = font_type1[0] if font_type1 else ""
+        if isinstance(font_type2, list):
+            font_type2 = font_type2[0] if font_type2 else ""
+
+        if font_type1 == font_type2:
+            return True
+
+        # Find the length of the common prefix
+        common_prefix_length = len(os.path.commonprefix([font_type1, font_type2]))
+
+        # Calculate the minimum prefix length based on the ratio
+        min_prefix_length = int(min(len(font_type1), len(font_type2)) * prefix_length_ratio)
+
+        return common_prefix_length >= min_prefix_length
+
+    def __is_same_block_font(self, block1, block2):
+        """
+        This function compares the font of block1 and block2
+
+        Parameters
+        ----------
+        block1 : dict
+            block1
+        block2 : dict
+            block2
+
+        Returns
+        -------
+        is_same : bool
+            True if block1 and block2 have the same font, else False
+        """
+        block_1_font_type = safe_get(block1, "block_font_type", "")
+        block_1_font_size = safe_get(block1, "block_font_size", 0)
+        block_1_avg_char_width = safe_get(block1, "avg_char_width", 0)
+
+        block_2_font_type = safe_get(block2, "block_font_type", "")
+        block_2_font_size = safe_get(block2, "block_font_size", 0)
+        block_2_avg_char_width = safe_get(block2, "avg_char_width", 0)
+
+        if isinstance(block_1_font_size, list):
+            block_1_font_size = block_1_font_size[0] if block_1_font_size else 0
+        if isinstance(block_2_font_size, list):
+            block_2_font_size = block_2_font_size[0] if block_2_font_size else 0
+
+        block_1_text = safe_get(block1, "text", "")
+        block_2_text = safe_get(block2, "text", "")
+
+        if block_1_avg_char_width == 0 or block_2_avg_char_width == 0:
+            return False
+
+        if not block_1_text or not block_2_text:
+            return False
+        else:
+            text_len_ratio = len(block_2_text) / len(block_1_text)
+            if text_len_ratio < 0.2:
+                avg_char_width_condition = (
+                    abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
+                    < 0.5
+                )
+            else:
+                avg_char_width_condition = (
+                    abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
+                    < 0.2
+                )
+
+        block_font_size_condtion = abs(block_1_font_size - block_2_font_size) < 1
+
+        return (
+            self.__is_similar_font_type(block_1_font_type, block_2_font_type)
+            and avg_char_width_condition
+            and block_font_size_condtion
+        )
+
+    def _is_alphabet_char(self, char):
+        if (char >= "\u0041" and char <= "\u005a") or (char >= "\u0061" and char <= "\u007a"):
+            return True
+        else:
+            return False
+
+    def _is_chinese_char(self, char):
+        if char >= "\u4e00" and char <= "\u9fa5":
+            return True
+        else:
+            return False
+
+    def _is_other_letter_char(self, char):
+        try:
+            cat = unicodedata.category(char)
+            if cat == "Lu" or cat == "Ll":
+                return not self._is_alphabet_char(char) and not self._is_chinese_char(char)
+        except TypeError:
+            print("The input to the function must be a single character.")
+        return False
+
+    def _is_year(self, s: str):
+        try:
+            number = int(s)
+            return 1900 <= number <= 2099
+        except ValueError:
+            return False
+
+    def __is_para_font_consistent(self, para_1, para_2):
+        """
+        This function compares the font of para1 and para2
+
+        Parameters
+        ----------
+        para1 : dict
+            para1
+        para2 : dict
+            para2
+
+        Returns
+        -------
+        is_same : bool
+            True if para1 and para2 have the same font, else False
+        """
+        if para_1 is None or para_2 is None:
+            return False
+
+        para_1_font_type = safe_get(para_1, "para_font_type", "")
+        para_1_font_size = safe_get(para_1, "para_font_size", 0)
+        para_1_font_color = safe_get(para_1, "para_font_color", "")
+
+        para_2_font_type = safe_get(para_2, "para_font_type", "")
+        para_2_font_size = safe_get(para_2, "para_font_size", 0)
+        para_2_font_color = safe_get(para_2, "para_font_color", "")
+
+        if isinstance(para_1_font_type, list):  # get the most common font type
+            para_1_font_type = max(set(para_1_font_type), key=para_1_font_type.count)
+        if isinstance(para_2_font_type, list):
+            para_2_font_type = max(set(para_2_font_type), key=para_2_font_type.count)
+        if isinstance(para_1_font_size, list):  # compute average font type
+            para_1_font_size = sum(para_1_font_size) / len(para_1_font_size)
+        if isinstance(para_2_font_size, list):  # compute average font type
+            para_2_font_size = sum(para_2_font_size) / len(para_2_font_size)
+
+        return (
+            self.__is_similar_font_type(para_1_font_type, para_2_font_type)
+            and abs(para_1_font_size - para_2_font_size) < 1.5
+            # and para_font_color1 == para_font_color2
+        )
+
+    def _is_para_puncs_consistent(self, para_1, para_2):
+        """
+        This function determines whether para1 and para2 are originally from the same paragraph by checking the puncs of para1(former) and para2(latter)
+
+        Parameters
+        ----------
+        para1 : dict
+            para1
+        para2 : dict
+            para2
+
+        Returns
+        -------
+        is_same : bool
+            True if para1 and para2 are from the same paragraph by using the puncs, else False
+        """
+        para_1_text = safe_get(para_1, "para_text", "").strip()
+        para_2_text = safe_get(para_2, "para_text", "").strip()
+
+        para_1_bboxes = safe_get(para_1, "para_bbox", [])
+        para_1_font_sizes = safe_get(para_1, "para_font_size", 0)
+
+        para_2_bboxes = safe_get(para_2, "para_bbox", [])
+        para_2_font_sizes = safe_get(para_2, "para_font_size", 0)
+
+        # print_yellow("    Features of determine puncs_consistent:")
+        # print(f"    para_1_text: {para_1_text}")
+        # print(f"    para_2_text: {para_2_text}")
+        # print(f"    para_1_bboxes: {para_1_bboxes}")
+        # print(f"    para_2_bboxes: {para_2_bboxes}")
+        # print(f"    para_1_font_sizes: {para_1_font_sizes}")
+        # print(f"    para_2_font_sizes: {para_2_font_sizes}")
+
+        if is_nested_list(para_1_bboxes):
+            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes[-1]
+        else:
+            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes
+
+        if is_nested_list(para_2_bboxes):
+            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes[0]
+            para_2_font_sizes = para_2_font_sizes[0]  # type: ignore
+        else:
+            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes
+
+        right_align_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
+        are_two_paras_right_aligned = abs(x1_1 - x1_2) < right_align_threshold
+
+        left_indent_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
+        is_para1_left_indent_than_papa2 = x0_1 - x0_2 > left_indent_threshold
+        is_para2_left_indent_than_papa1 = x0_2 - x0_1 > left_indent_threshold
+
+        # Check if either para_text1 or para_text2 is empty
+        if not para_1_text or not para_2_text:
+            return False
+
+        # Define the end puncs for a sentence to end and hyphen
+        end_puncs = [".", "?", "!", "。", "？", "！", "…"]
+        hyphen = ["-", "—"]
+
+        # Check if para_text1 ends with either hyphen or non-end punctuation or spaces
+        para_1_end_with_hyphen = para_1_text and para_1_text[-1] in hyphen
+        para_1_end_with_end_punc = para_1_text and para_1_text[-1] in end_puncs
+        para_1_end_with_space = para_1_text and para_1_text[-1] == " "
+        para_1_not_end_with_end_punc = para_1_text and para_1_text[-1] not in end_puncs
+
+        # print_yellow(f"    para_1_end_with_hyphen: {para_1_end_with_hyphen}")
+        # print_yellow(f"    para_1_end_with_end_punc: {para_1_end_with_end_punc}")
+        # print_yellow(f"    para_1_not_end_with_end_punc: {para_1_not_end_with_end_punc}")
+        # print_yellow(f"    para_1_end_with_space: {para_1_end_with_space}")
+
+        if para_1_end_with_hyphen:  # If para_text1 ends with hyphen
+            # print_red(f"para_1 is end with hyphen.")
+            para_2_is_consistent = para_2_text and (
+                para_2_text[0] in hyphen
+                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
+                or (self._is_chinese_char(para_2_text[0]))
+                or (self._is_other_letter_char(para_2_text[0]))
+            )
+            if para_2_is_consistent:
+                # print(f"para_2 is consistent.\n")
+                return True
+            else:
+                # print(f"para_2 is not consistent.\n")
+                pass
+
+        elif para_1_end_with_end_punc:  # If para_text1 ends with ending punctuations
+            # print_red(f"para_1 is end with end_punc.")
+            para_2_is_consistent = (
+                para_2_text
+                and (
+                    para_2_text[0] == " "
+                    or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].isupper())
+                    or (self._is_chinese_char(para_2_text[0]))
+                    or (self._is_other_letter_char(para_2_text[0]))
+                )
+                and not is_para2_left_indent_than_papa1
+            )
+            if para_2_is_consistent:
+                # print(f"para_2 is consistent.\n")
+                return True
+            else:
+                # print(f"para_2 is not consistent.\n")
+                pass
+
+        elif para_1_not_end_with_end_punc:  # If para_text1 is not end with ending punctuations
+            # print_red(f"para_1 is NOT end with end_punc.")
+            para_2_is_consistent = para_2_text and (
+                para_2_text[0] == " "
+                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
+                or (self._is_alphabet_char(para_2_text[0]))
+                or (self._is_year(para_2_text[0:4]))
+                or (are_two_paras_right_aligned or is_para1_left_indent_than_papa2)
+                or (self._is_chinese_char(para_2_text[0]))
+                or (self._is_other_letter_char(para_2_text[0]))
+            )
+            if para_2_is_consistent:
+                # print(f"para_2 is consistent.\n")
+                return True
+            else:
+                # print(f"para_2 is not consistent.\n")
+                pass
+
+        elif para_1_end_with_space:  # If para_text1 ends with space
+            # print_red(f"para_1 is end with space.")
+            para_2_is_consistent = para_2_text and (
+                para_2_text[0] == " "
+                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
+                or (self._is_chinese_char(para_2_text[0]))
+                or (self._is_other_letter_char(para_2_text[0]))
+            )
+            if para_2_is_consistent:
+                # print(f"para_2 is consistent.\n")
+                return True
+            else:
+                pass
+                # print(f"para_2 is not consistent.\n")
+
+        return False
+
+    def _is_block_consistent(self, block1, block2):
+        """
+        This function determines whether block1 and block2 are originally from the same block
+
+        Parameters
+        ----------
+        block1 : dict
+            block1s
+        block2 : dict
+            block2
+
+        Returns
+        -------
+        is_same : bool
+            True if block1 and block2 are from the same block, else False
+        """
+        return self.__is_same_block_font(block1, block2)
+
+    def _is_para_continued(self, para1, para2):
+        """
+        This function determines whether para1 and para2 are originally from the same paragraph
+
+        Parameters
+        ----------
+        para1 : dict
+            para1
+        para2 : dict
+            para2
+
+        Returns
+        -------
+        is_same : bool
+            True if para1 and para2 are from the same paragraph, else False
+        """
+        is_para_font_consistent = self.__is_para_font_consistent(para1, para2)
+        is_para_puncs_consistent = self._is_para_puncs_consistent(para1, para2)
+
+        return is_para_font_consistent and is_para_puncs_consistent
+
+    def _are_boundaries_of_block_consistent(self, block1, block2):
+        """
+        This function checks if the boundaries of block1 and block2 are consistent
+
+        Parameters
+        ----------
+        block1 : dict
+            block1
+
+        block2 : dict
+            block2
+
+        Returns
+        -------
+        is_consistent : bool
+            True if the boundaries of block1 and block2 are consistent, else False
+        """
+
+        last_line_of_block1 = block1["lines"][-1]
+        first_line_of_block2 = block2["lines"][0]
+
+        spans_of_last_line_of_block1 = last_line_of_block1["spans"]
+        spans_of_first_line_of_block2 = first_line_of_block2["spans"]
+
+        font_type_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["font"].lower()
+        font_size_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["size"]
+        font_color_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["color"]
+        font_flags_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["flags"]
+
+        font_type_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["font"].lower()
+        font_size_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["size"]
+        font_color_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["color"]
+        font_flags_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["flags"]
+
+        return (
+            self.__is_similar_font_type(font_type_of_last_line_of_block1, font_type_of_first_line_of_block2)
+            and abs(font_size_of_last_line_of_block1 - font_size_of_first_line_of_block2) < 1
+            # and font_color_of_last_line_of_block1 == font_color_of_first_line_of_block2
+            and font_flags_of_last_line_of_block1 == font_flags_of_first_line_of_block2
+        )
+
+    def _get_last_paragraph(self, block):
+        """
+        Retrieves the last paragraph from a block.
+
+        Parameters
+        ----------
+        block : dict
+            The block from which to retrieve the paragraph.
+
+        Returns
+        -------
+        dict
+            The last paragraph of the block.
+        """
+        if block["paras"]:
+            last_para_key = list(block["paras"].keys())[-1]
+            return block["paras"][last_para_key]
+        else:
+            return None
+
+    def _get_first_paragraph(self, block):
+        """
+        Retrieves the first paragraph from a block.
+
+        Parameters
+        ----------
+        block : dict
+            The block from which to retrieve the paragraph.
+
+        Returns
+        -------
+        dict
+            The first paragraph of the block.
+        """
+        if block["paras"]:
+            first_para_key = list(block["paras"].keys())[0]
+            return block["paras"][first_para_key]
+        else:
+            return None
+
+    def should_merge_next_para(self, curr_para, next_para):
+        if self._is_para_continued(curr_para, next_para):
+            return True
+        else:
+            return False
+
+    def batch_tag_paras(self, pdf_dict):
+        the_last_page_id = len(pdf_dict) - 1
+
+        for curr_page_idx, (curr_page_id, curr_page_content) in enumerate(pdf_dict.items()):
+            if curr_page_id.startswith("page_") and curr_page_content.get("para_blocks", []):
+                para_blocks_of_curr_page = curr_page_content["para_blocks"]
+                next_page_idx = curr_page_idx + 1
+                next_page_id = f"page_{next_page_idx}"
+                next_page_content = pdf_dict.get(next_page_id, {})
+
+                for i, current_block in enumerate(para_blocks_of_curr_page):
+                    for para_id, curr_para in current_block["paras"].items():
+                        curr_para["curr_para_location"] = [
+                            curr_page_idx,
+                            current_block["block_id"],
+                            int(para_id.split("_")[-1]),
+                        ]
+                        curr_para["next_para_location"] = None  # 默认设置为None
+                        curr_para["merge_next_para"] = False  # 默认设置为False
+
+                    next_block = para_blocks_of_curr_page[i + 1] if i < len(para_blocks_of_curr_page) - 1 else None
+
+                    if next_block:
+                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
+                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
+
+                        next_block_first_para_key = list(next_block["paras"].keys())[0]
+                        next_blk_first_para = next_block["paras"][next_block_first_para_key]
+
+                        if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
+                            curr_blk_last_para["next_para_location"] = [
+                                curr_page_idx,
+                                next_block["block_id"],
+                                int(next_block_first_para_key.split("_")[-1]),
+                            ]
+                            curr_blk_last_para["merge_next_para"] = True
+                    else:
+                        # Handle the case where the next block is in a different page
+                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
+                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
+
+                        while not next_page_content.get("para_blocks", []) and next_page_idx <= the_last_page_id:
+                            next_page_idx += 1
+                            next_page_id = f"page_{next_page_idx}"
+                            next_page_content = pdf_dict.get(next_page_id, {})
+
+                        if next_page_content.get("para_blocks", []):
+                            next_blk_first_para_key = list(next_page_content["para_blocks"][0]["paras"].keys())[0]
+                            next_blk_first_para = next_page_content["para_blocks"][0]["paras"][next_blk_first_para_key]
+
+                            if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
+                                curr_blk_last_para["next_para_location"] = [
+                                    next_page_idx,
+                                    next_page_content["para_blocks"][0]["block_id"],
+                                    int(next_blk_first_para_key.split("_")[-1]),
+                                ]
+                                curr_blk_last_para["merge_next_para"] = True
+
+        return pdf_dict
+
+    def find_block_by_id(self, para_blocks, block_id):
+        for block in para_blocks:
+            if block.get("block_id") == block_id:
+                return block
+        return None
+
+    def batch_merge_paras(self, pdf_dict):
+        for page_id, page_content in pdf_dict.items():
+            if page_id.startswith("page_") and page_content.get("para_blocks", []):
+                para_blocks_of_page = page_content["para_blocks"]
+
+                for i in range(len(para_blocks_of_page)):
+                    current_block = para_blocks_of_page[i]
+                    paras = current_block["paras"]
+
+                    for para_id, curr_para in list(paras.items()):
+                        # 跳过标题段落
+                        if curr_para.get("is_para_title"):
+                            continue
+
+                        while curr_para.get("merge_next_para"):
+                            next_para_location = curr_para.get("next_para_location")
+                            if not next_para_location:
+                                break
+
+                            next_page_idx, next_block_id, next_para_id = next_para_location
+                            next_page_id = f"page_{next_page_idx}"
+                            next_page_content = pdf_dict.get(next_page_id)
+                            if not next_page_content:
+                                break
+
+                            next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
+                            if not next_block:
+                                break
+
+                            next_para = next_block["paras"].get(f"para_{next_para_id}")
+                            if not next_para or next_para.get("is_para_title"):
+                                break
+
+                            # 合并段落文本
+                            curr_para_text = curr_para.get("para_text", "")
+                            next_para_text = next_para.get("para_text", "")
+                            curr_para["para_text"] = curr_para_text + " " + next_para_text
+
+                            # 更新 next_para_location
+                            curr_para["next_para_location"] = next_para.get("next_para_location")
+
+                            # 将下一个段落文本置为空，表示已被合并
+                            next_para["para_text"] = ""
+
+                            # 更新 merge_next_para 标记
+                            curr_para["merge_next_para"] = next_para.get("merge_next_para", False)
+
+        return pdf_dict
--- a/para/block_termination_processor.py
+++ b/para/block_termination_processor.py
+import sys
+
+from libs.commons import fitz
+
+from termcolor import cprint
+
+from para.commons import *
+
+
+if sys.version_info[0] >= 3:
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+
+class BlockTerminationProcessor:
+    def __init__(self) -> None:
+        pass
+
+    def _is_consistent_lines(
+        self,
+        curr_line,
+        prev_line,
+        next_line,
+        consistent_direction,  # 0 for prev, 1 for next, 2 for both
+    ):
+        """
+        This function checks if the line is consistent with its neighbors
+
+        Parameters
+        ----------
+        curr_line : dict
+            current line
+        prev_line : dict
+            previous line
+        next_line : dict
+            next line
+        consistent_direction : int
+            0 for prev, 1 for next, 2 for both
+
+        Returns
+        -------
+        bool
+            True if the line is consistent with its neighbors, False otherwise.
+        """
+
+        curr_line_font_size = curr_line["spans"][0]["size"]
+        curr_line_font_type = curr_line["spans"][0]["font"].lower()
+
+        if consistent_direction == 0:
+            if prev_line:
+                prev_line_font_size = prev_line["spans"][0]["size"]
+                prev_line_font_type = prev_line["spans"][0]["font"].lower()
+                return curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type
+            else:
+                return False
+
+        elif consistent_direction == 1:
+            if next_line:
+                next_line_font_size = next_line["spans"][0]["size"]
+                next_line_font_type = next_line["spans"][0]["font"].lower()
+                return curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
+            else:
+                return False
+
+        elif consistent_direction == 2:
+            if prev_line and next_line:
+                prev_line_font_size = prev_line["spans"][0]["size"]
+                prev_line_font_type = prev_line["spans"][0]["font"].lower()
+                next_line_font_size = next_line["spans"][0]["size"]
+                next_line_font_type = next_line["spans"][0]["font"].lower()
+                return (curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type) and (
+                    curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
+                )
+            else:
+                return False
+
+        else:
+            return False
+
+    def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height):
+        """
+        This function checks if the line is a regular line
+
+        Parameters
+        ----------
+        curr_line_bbox : list
+            bbox of the current line
+        prev_line_bbox : list
+            bbox of the previous line
+        next_line_bbox : list
+            bbox of the next line
+        avg_char_width : float
+            average of char widths
+        X0 : float
+            median of x0 values, which represents the left average boundary of the page
+        X1 : float
+            median of x1 values, which represents the right average boundary of the page
+        avg_line_height : float
+            average of line heights
+
+        Returns
+        -------
+        bool
+            True if the line is a regular line, False otherwise.
+        """
+        horizontal_ratio = 0.5
+        vertical_ratio = 0.5
+        horizontal_thres = horizontal_ratio * avg_char_width
+        vertical_thres = vertical_ratio * avg_line_height
+
+        x0, y0, x1, y1 = curr_line_bbox
+
+        x0_near_X0 = abs(x0 - X0) < horizontal_thres
+        x1_near_X1 = abs(x1 - X1) < horizontal_thres
+
+        prev_line_is_end_of_para = prev_line_bbox and (abs(prev_line_bbox[2] - X1) > avg_char_width)
+
+        sufficient_spacing_above = False
+        if prev_line_bbox:
+            vertical_spacing_above = y1 - prev_line_bbox[3]
+            sufficient_spacing_above = vertical_spacing_above > vertical_thres
+
+        sufficient_spacing_below = False
+        if next_line_bbox:
+            vertical_spacing_below = next_line_bbox[1] - y0
+            sufficient_spacing_below = vertical_spacing_below > vertical_thres
+
+        return (
+            (sufficient_spacing_above or sufficient_spacing_below)
+            or (not x0_near_X0 and not x1_near_X1)
+            or prev_line_is_end_of_para
+        )
+
+    def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size):
+        """
+        This function checks if the line is a possible start of a paragraph
+
+        Parameters
+        ----------
+        curr_line : dict
+            current line
+        prev_line : dict
+            previous line
+        next_line : dict
+            next line
+        X0 : float
+            median of x0 values, which represents the left average boundary of the page
+        X1 : float
+            median of x1 values, which represents the right average boundary of the page
+        avg_char_width : float
+            average of char widths
+        avg_line_height : float
+            average of line heights
+
+        Returns
+        -------
+        bool
+            True if the line is a possible start of a paragraph, False otherwise.
+        """
+        start_confidence = 0.5  # Initial confidence of the line being a start of a paragraph
+        decision_path = []  # Record the decision path
+
+        curr_line_bbox = curr_line["bbox"]
+        prev_line_bbox = prev_line["bbox"] if prev_line else None
+        next_line_bbox = next_line["bbox"] if next_line else None
+
+        indent_ratio = 1
+
+        vertical_ratio = 1.5
+        vertical_thres = vertical_ratio * avg_font_size
+
+        left_horizontal_ratio = 0.5
+        left_horizontal_thres = left_horizontal_ratio * avg_char_width
+
+        right_horizontal_ratio = 2.5
+        right_horizontal_thres = right_horizontal_ratio * avg_char_width
+
+        x0, y0, x1, y1 = curr_line_bbox
+
+        indent_condition = x0 > X0 + indent_ratio * avg_char_width
+        if indent_condition:
+            start_confidence += 0.2
+            decision_path.append("indent_condition_met")
+
+        x0_near_X0 = abs(x0 - X0) < left_horizontal_thres
+        if x0_near_X0:
+            start_confidence += 0.1
+            decision_path.append("x0_near_X0")
+
+        x1_near_X1 = abs(x1 - X1) < right_horizontal_thres
+        if x1_near_X1:
+            start_confidence += 0.1
+            decision_path.append("x1_near_X1")
+
+        if prev_line is None:
+            prev_line_is_end_of_para = True
+            start_confidence += 0.2
+            decision_path.append("no_prev_line")
+        else:
+            prev_line_is_end_of_para, _, _ = self._is_possible_end_of_para(prev_line, next_line, X0, X1, avg_char_width)
+            if prev_line_is_end_of_para:
+                start_confidence += 0.1
+                decision_path.append("prev_line_is_end_of_para")
+
+        sufficient_spacing_above = False
+        if prev_line_bbox:
+            vertical_spacing_above = y1 - prev_line_bbox[3]
+            sufficient_spacing_above = vertical_spacing_above > vertical_thres
+            if sufficient_spacing_above:
+                start_confidence += 0.2
+                decision_path.append("sufficient_spacing_above")
+
+        sufficient_spacing_below = False
+        if next_line_bbox:
+            vertical_spacing_below = next_line_bbox[1] - y0
+            sufficient_spacing_below = vertical_spacing_below > vertical_thres
+            if sufficient_spacing_below:
+                start_confidence += 0.2
+                decision_path.append("sufficient_spacing_below")
+
+        is_regular_line = self._is_regular_line(
+            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_font_size
+        )
+        if is_regular_line:
+            start_confidence += 0.1
+            decision_path.append("is_regular_line")
+
+        is_start_of_para = (
+            (sufficient_spacing_above or sufficient_spacing_below)
+            or (indent_condition)
+            or (not indent_condition and x0_near_X0 and x1_near_X1 and not is_regular_line)
+            or prev_line_is_end_of_para
+        )
+        return (is_start_of_para, start_confidence, decision_path)
+
+    def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width):
+        """
+        This function checks if the line is a possible end of a paragraph
+
+        Parameters
+        ----------
+        curr_line : dict
+            current line
+        next_line : dict
+            next line
+        X0 : float
+            median of x0 values, which represents the left average boundary of the page
+        X1 : float
+            median of x1 values, which represents the right average boundary of the page
+        avg_char_width : float
+            average of char widths
+
+        Returns
+        -------
+        bool
+            True if the line is a possible end of a paragraph, False otherwise.
+        """
+
+        end_confidence = 0.5  # Initial confidence of the line being a end of a paragraph
+        decision_path = []  # Record the decision path
+
+        curr_line_bbox = curr_line["bbox"]
+        next_line_bbox = next_line["bbox"] if next_line else None
+
+        left_horizontal_ratio = 0.5
+        right_horizontal_ratio = 0.5
+
+        x0, _, x1, y1 = curr_line_bbox
+        next_x0, next_y0, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
+
+        x0_near_X0 = abs(x0 - X0) < left_horizontal_ratio * avg_char_width
+        if x0_near_X0:
+            end_confidence += 0.1
+            decision_path.append("x0_near_X0")
+
+        x1_smaller_than_X1 = x1 < X1 - right_horizontal_ratio * avg_char_width
+        if x1_smaller_than_X1:
+            end_confidence += 0.1
+            decision_path.append("x1_smaller_than_X1")
+
+        next_line_is_start_of_para = (
+            next_line_bbox
+            and (next_x0 > X0 + left_horizontal_ratio * avg_char_width)
+            and (not is_line_left_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width, direction=1))
+        )
+        if next_line_is_start_of_para:
+            end_confidence += 0.2
+            decision_path.append("next_line_is_start_of_para")
+
+        is_line_left_aligned_from_neighbors_bool = is_line_left_aligned_from_neighbors(
+            curr_line_bbox, None, next_line_bbox, avg_char_width
+        )
+        if is_line_left_aligned_from_neighbors_bool:
+            end_confidence += 0.1
+            decision_path.append("line_is_left_aligned_from_neighbors")
+
+        is_line_right_aligned_from_neighbors_bool = is_line_right_aligned_from_neighbors(
+            curr_line_bbox, None, next_line_bbox, avg_char_width
+        )
+        if not is_line_right_aligned_from_neighbors_bool:
+            end_confidence += 0.1
+            decision_path.append("line_is_not_right_aligned_from_neighbors")
+
+        is_end_of_para = end_with_punctuation(curr_line["text"]) and (
+            (x0_near_X0 and x1_smaller_than_X1)
+            or (is_line_left_aligned_from_neighbors_bool and not is_line_right_aligned_from_neighbors_bool)
+        )
+
+        return (is_end_of_para, end_confidence, decision_path)
+
+    def _cut_paras_per_block(
+        self,
+        block,
+    ):
+        """
+        Segments the lines of one block into paragraphs and stores them on the block.
+
+        Parameters
+        ----------
+        block : dict
+            A block with at least the keys "bbox", "text" and "lines". The
+            optional keys "X0", "X1", "avg_char_width", "avg_char_height",
+            "avg_font_size", "is_block_title" and "block_title_level" are read
+            with defaults.
+
+        Returns
+        -------
+        block : dict
+            The same block object, with a new "paras" entry mapping
+            "para_<n>" keys to the paragraph properties built by
+            ``_construct_para``.
+
+        """
+
+        def _construct_para(lines, is_block_title, para_title_level):
+            """
+            Construct a paragraph from given lines.
+
+            Returns a dict with the paragraph bbox, the line texts joined by a
+            single space, the dominant font type, the mean font size, the most
+            frequent font color and the title flags passed in.
+            """
+
+            # Mean span font size over all lines (0 when there are no spans).
+            font_sizes = [span["size"] for line in lines for span in line["spans"]]
+            avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0
+
+            # Color that occurs on the largest number of spans.
+            font_colors = [span["color"] for line in lines for span in line["spans"]]
+            most_common_font_color = max(set(font_colors), key=font_colors.count) if font_colors else None
+
+            # font_types = [span["font"] for line in lines for span in line["spans"]]
+            # most_common_font_type = max(set(font_types), key=font_types.count) if font_types else None
+
+            # Accumulate the horizontal extent covered by each font type.
+            font_type_lengths = {}
+            for line in lines:
+                for span in line["spans"]:
+                    font_type = span["font"]
+                    bbox_width = span["bbox"][2] - span["bbox"][0]
+                    if font_type in font_type_lengths:
+                        font_type_lengths[font_type] += bbox_width
+                    else:
+                        font_type_lengths[font_type] = bbox_width
+
+            # get the font type with the longest bbox width
+            most_common_font_type = max(font_type_lengths, key=font_type_lengths.get) if font_type_lengths else None  # type: ignore
+
+            para_bbox = calculate_para_bbox(lines)
+            para_text = " ".join(line["text"] for line in lines)
+
+            return {
+                "para_bbox": para_bbox,
+                "para_text": para_text,
+                "para_font_type": most_common_font_type,
+                "para_font_size": avg_font_size,
+                "para_font_color": most_common_font_color,
+                "is_para_title": is_block_title,
+                "para_title_level": para_title_level,
+            }
+
+        # block_bbox, block_text and avg_char_height are read but not used below.
+        block_bbox = block["bbox"]
+        block_text = block["text"]
+        block_lines = block["lines"]
+
+        X0 = safe_get(block, "X0", 0)
+        X1 = safe_get(block, "X1", 0)
+        avg_char_width = safe_get(block, "avg_char_width", 0)
+        avg_char_height = safe_get(block, "avg_char_height", 0)
+        avg_font_size = safe_get(block, "avg_font_size", 0)
+
+        is_block_title = safe_get(block, "is_block_title", False)
+        para_title_level = safe_get(block, "block_title_level", 0)
+
+        # Segment into paragraphs
+        # para_ranges collects inclusive (start_index, end_index) pairs into block_lines.
+        para_ranges = []
+        in_paragraph = False
+        start_idx_of_para = None
+
+        # Create the processed paragraphs
+        processed_paras = {}
+        para_bboxes = []
+        end_idx_of_para = 0
+
+        for line_index, line in enumerate(block_lines):
+            curr_line = line
+            prev_line = block_lines[line_index - 1] if line_index > 0 else None
+            next_line = block_lines[line_index + 1] if line_index < len(block_lines) - 1 else None
+
+            """
+            Start processing paragraphs.
+            """
+
+            # Check if the line is the start of a paragraph
+            is_start_of_para, start_confidence, decision_path = self._is_possible_start_of_para(
+                curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size
+            )
+            # A start is only honoured while no paragraph is open.
+            if not in_paragraph and is_start_of_para:
+                in_paragraph = True
+                start_idx_of_para = line_index
+
+                # print_green(">>> Start of a paragraph")
+                # print("    curr_line_text: ", curr_line["text"])
+                # print("    start_confidence: ", start_confidence)
+                # print("    decision_path: ", decision_path)
+
+            # Check if the line is the end of a paragraph
+            is_end_of_para, end_confidence, decision_path = self._is_possible_end_of_para(
+                curr_line, next_line, X0, X1, avg_char_width
+            )
+            # An open paragraph is closed on an end line, and always on the last line of the block.
+            if in_paragraph and (is_end_of_para or not next_line):
+                para_ranges.append((start_idx_of_para, line_index))
+                start_idx_of_para = None
+                in_paragraph = False
+
+                # print_red(">>> End of a paragraph")
+                # print("    curr_line_text: ", curr_line["text"])
+                # print("    end_confidence: ", end_confidence)
+                # print("    decision_path: ", decision_path)
+
+        # Add the last paragraph if it is not added
+        # NOTE(review): the loop closes any open paragraph on the last line
+        # (next_line is None there), so this branch looks unreachable - confirm.
+        if in_paragraph and start_idx_of_para is not None:
+            para_ranges.append((start_idx_of_para, len(block_lines) - 1))
+
+        # Process the matched paragraphs
+        for para_index, (start_idx, end_idx) in enumerate(para_ranges):
+            matched_lines = block_lines[start_idx : end_idx + 1]
+            para_properties = _construct_para(matched_lines, is_block_title, para_title_level)
+            para_key = f"para_{len(processed_paras)}"
+            processed_paras[para_key] = para_properties
+            para_bboxes.append(para_properties["para_bbox"])
+            # Index of the first line after the most recently built paragraph.
+            end_idx_of_para = end_idx + 1
+
+        # Deal with the remaining lines
+        # Only the lines after the last matched paragraph are gathered here, as one extra paragraph.
+        if end_idx_of_para < len(block_lines):
+            unmatched_lines = block_lines[end_idx_of_para:]
+            unmatched_properties = _construct_para(unmatched_lines, is_block_title, para_title_level)
+            unmatched_key = f"para_{len(processed_paras)}"
+            processed_paras[unmatched_key] = unmatched_properties
+            para_bboxes.append(unmatched_properties["para_bbox"])
+
+        # The input block is mutated in place and returned.
+        block["paras"] = processed_paras
+
+        return block
+
+    def batch_process_blocks(self, pdf_dict):
+        """
+        Parses the blocks of all pages.
+
+        Parameters
+        ----------
+        pdf_dict : dict
+            PDF dictionary.
+        filter_blocks : list
+            List of bounding boxes to filter.
+
+        Returns
+        -------
+        result_dict : dict
+            Result dictionary.
+
+        """
+
+        num_paras = 0
+
+        for page_id, page in pdf_dict.items():
+            if page_id.startswith("page_"):
+                para_blocks = []
+                if "para_blocks" in page.keys():
+                    input_blocks = page["para_blocks"]
+                    for input_block in input_blocks:
+                        new_block = self._cut_paras_per_block(input_block)
+                        para_blocks.append(new_block)
+                        num_paras += len(new_block["paras"])
+
+                page["para_blocks"] = para_blocks
+
+        pdf_dict["statistics"]["num_paras"] = num_paras
+        return pdf_dict
--- a/para/commons.py
+++ b/para/commons.py
+import sys
+
+from libs.commons import fitz
+from termcolor import cprint
+
+
+# Switch stdout to UTF-8 at import time (Python 3 only).
+# NOTE(review): TextIOWrapper.reconfigure() exists from Python 3.7 on and only
+# on real text streams; the guard does not cover 3.0-3.6 or a replaced stdout.
+if sys.version_info[0] >= 3:
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+def open_pdf(pdf_path):
+    """
+    Open a PDF with ``fitz.open`` and return the resulting document.
+
+    Any exception is reported on stdout and then re-raised to the caller.
+    """
+    try:
+        pdf_document = fitz.open(pdf_path)  # type: ignore
+        return pdf_document
+    except Exception as e:
+        # Message reads: "Cannot open PDF file: <path>. Reason: <error>"
+        print(f"无法打开PDF文件：{pdf_path}。原因是：{e}")
+        raise e
+
+
+def print_green_on_red(text):
+    """Print ``text`` via ``cprint`` in bold green on a red background, ending with two newlines."""
+    cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")
+
+
+def print_green(text):
+    """Print an empty line, then ``text`` via ``cprint`` in bold green, ending with two newlines."""
+    print()
+    cprint(text, "green", attrs=["bold"], end="\n\n")
+
+
+def print_red(text):
+    """Print an empty line, then ``text`` via ``cprint`` in bold red, ending with two newlines."""
+    print()
+    cprint(text, "red", attrs=["bold"], end="\n\n")
+
+
+def print_yellow(text):
+    """Print an empty line, then ``text`` via ``cprint`` in bold yellow, ending with two newlines."""
+    print()
+    cprint(text, "yellow", attrs=["bold"], end="\n\n")
+
+
+def safe_get(dict_obj, key, default):
+    val = dict_obj.get(key)
+    if val is None:
+        return default
+    else:
+        return val
+
+
+def is_bbox_overlap(bbox1, bbox2):
+    """
+    This function checks if bbox1 and bbox2 overlap or not
+
+    Parameters
+    ----------
+    bbox1 : list
+        bbox1
+    bbox2 : list
+        bbox2
+
+    Returns
+    -------
+    bool
+        True if bbox1 and bbox2 overlap, else False
+    """
+    x0_1, y0_1, x1_1, y1_1 = bbox1
+    x0_2, y0_2, x1_2, y1_2 = bbox2
+
+    if x0_1 > x1_2 or x0_2 > x1_1:
+        return False
+    if y0_1 > y1_2 or y0_2 > y1_1:
+        return False
+
+    return True
+
+
+def is_in_bbox(bbox1, bbox2):
+    """
+    This function checks if bbox1 is in bbox2
+
+    Parameters
+    ----------
+    bbox1 : list
+        bbox1
+    bbox2 : list
+        bbox2
+
+    Returns
+    -------
+    bool
+        True if bbox1 is in bbox2, else False
+    """
+    x0_1, y0_1, x1_1, y1_1 = bbox1
+    x0_2, y0_2, x1_2, y1_2 = bbox2
+
+    if x0_1 >= x0_2 and y0_1 >= y0_2 and x1_1 <= x1_2 and y1_1 <= y1_2:
+        return True
+    else:
+        return False
+
+
+def calculate_para_bbox(lines):
+    """
+    This function calculates the minimum bbox of the paragraph
+
+    Parameters
+    ----------
+    lines : list
+        lines
+
+    Returns
+    -------
+    para_bbox : list
+        bbox of the paragraph
+    """
+    x0 = min(line["bbox"][0] for line in lines)
+    y0 = min(line["bbox"][1] for line in lines)
+    x1 = max(line["bbox"][2] for line in lines)
+    y1 = max(line["bbox"][3] for line in lines)
+    return [x0, y0, x1, y1]
+
+
+def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
+    """
+    This function checks if the line is right aligned from its neighbors
+
+    Parameters
+    ----------
+    curr_line_bbox : list
+        bbox of the current line
+    prev_line_bbox : list
+        bbox of the previous line
+    next_line_bbox : list
+        bbox of the next line
+    avg_char_width : float
+        average of char widths
+    direction : int
+        0 for prev, 1 for next, 2 for both
+
+    Returns
+    -------
+    bool
+        True if the line is right aligned from its neighbors, False otherwise.
+    """
+    horizontal_ratio = 0.5
+    horizontal_thres = horizontal_ratio * avg_char_width
+
+    _, _, x1, _ = curr_line_bbox
+    _, _, prev_x1, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
+    _, _, next_x1, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
+
+    if direction == 0:
+        return abs(x1 - prev_x1) < horizontal_thres
+    elif direction == 1:
+        return abs(x1 - next_x1) < horizontal_thres
+    elif direction == 2:
+        return abs(x1 - prev_x1) < horizontal_thres and abs(x1 - next_x1) < horizontal_thres
+    else:
+        return False
+
+
+def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
+    """
+    This function checks if the line is left aligned from its neighbors
+
+    Parameters
+    ----------
+    curr_line_bbox : list
+        bbox of the current line
+    prev_line_bbox : list
+        bbox of the previous line
+    next_line_bbox : list
+        bbox of the next line
+    avg_char_width : float
+        average of char widths
+    direction : int
+        0 for prev, 1 for next, 2 for both
+
+    Returns
+    -------
+    bool
+        True if the line is left aligned from its neighbors, False otherwise.
+    """
+    horizontal_ratio = 0.5
+    horizontal_thres = horizontal_ratio * avg_char_width
+
+    x0, _, _, _ = curr_line_bbox
+    prev_x0, _, _, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0)
+    next_x0, _, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
+
+    if direction == 0:
+        return abs(x0 - prev_x0) < horizontal_thres
+    elif direction == 1:
+        return abs(x0 - next_x0) < horizontal_thres
+    elif direction == 2:
+        return abs(x0 - prev_x0) < horizontal_thres and abs(x0 - next_x0) < horizontal_thres
+    else:
+        return False
+
+
+def end_with_punctuation(line_text):
+    """
+    This function checks if the line ends with punctuation marks
+    """
+
+    english_end_puncs = [".", "?", "!"]
+    chinese_end_puncs = ["。", "？", "！"]
+    end_puncs = english_end_puncs + chinese_end_puncs
+
+    last_non_space_char = None
+    for ch in line_text[::-1]:
+        if not ch.isspace():
+            last_non_space_char = ch
+            break
+
+    if last_non_space_char is None:
+        return False
+
+    return last_non_space_char in end_puncs
+
+
+def is_nested_list(lst):
+    if isinstance(lst, list):
+        return any(isinstance(sub, list) for sub in lst)
+    return False
--- a/para/denoise.py
+++ b/para/denoise.py
+import sys
+import math
+
+from collections import defaultdict
+from para.commons import *
+
+# Switch stdout to UTF-8 at import time (Python 3 only).
+# NOTE(review): TextIOWrapper.reconfigure() exists from Python 3.7 on and only
+# on real text streams; the guard does not cover 3.0-3.6 or a replaced stdout.
+if sys.version_info[0] >= 3:
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+class HeaderFooterProcessor:
+    def __init__(self) -> None:
+        pass
+
+    def get_most_common_bboxes(self, bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
+        """
+        This function gets the most common bboxes from the bboxes
+
+        Parameters
+        ----------
+        bboxes : list
+            bboxes
+        page_height : float
+            height of the page
+        position : str, optional
+            "top" or "bottom", by default "top"
+        threshold : float, optional
+            threshold, by default 0.25
+        num_bboxes : int, optional
+            number of bboxes to return, by default 3
+        min_frequency : int, optional
+            minimum frequency of the bbox, by default 2
+
+        Returns
+        -------
+        common_bboxes : list
+            common bboxes
+        """
+        # Filter bbox by position
+        if position == "top":
+            filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold]
+        else:
+            filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)]
+
+        # Find the most common bbox
+        bbox_count = defaultdict(int)
+        for bbox in filtered_bboxes:
+            bbox_count[tuple(bbox)] += 1
+
+        # Get the most frequently occurring bbox, but only consider it when the frequency exceeds min_frequency
+        common_bboxes = [
+            bbox for bbox, count in sorted(bbox_count.items(), key=lambda item: item[1], reverse=True) if count >= min_frequency
+        ][:num_bboxes]
+        return common_bboxes
+
+    def detect_footer_header(self, result_dict, similarity_threshold=0.5):
+        """
+        This function detects the header and footer of the document.
+
+        Parameters
+        ----------
+        result_dict : dict
+            result dictionary
+
+        Returns
+        -------
+        result_dict : dict
+            result dictionary
+        """
+
+        def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
+            return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)
+
+        def is_single_line_block(block):
+            # Determine based on the width and height of the block
+            block_width = block["X1"] - block["X0"]
+            block_height = block["bbox"][3] - block["bbox"][1]
+
+            # If the height of the block is close to the average character height and the width is large, it is considered a single line
+            return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3
+
+        # Traverse all blocks in the document
+        single_preproc_blocks = 0
+        total_blocks = 0
+        single_preproc_blocks = 0
+
+        for page_id, blocks in result_dict.items():
+            if page_id.startswith("page_"):
+                for block_key, block in blocks.items():
+                    if block_key.startswith("block_"):
+                        total_blocks += 1
+                        if is_single_line_block(block):
+                            single_preproc_blocks += 1
+
+        # If there are no blocks, skip the header and footer detection
+        if total_blocks == 0:
+            print("No blocks found. Skipping header/footer detection.")
+            return result_dict
+
+        # If most of the blocks are single-line, skip the header and footer detection
+        if single_preproc_blocks / total_blocks > 0.5:  # 50% of the blocks are single-line
+            return result_dict
+
+        # Collect the bounding boxes of all blocks
+        all_bboxes = []
+        all_texts = []
+
+        for page_id, blocks in result_dict.items():
+            if page_id.startswith("page_"):
+                for block_key, block in blocks.items():
+                    if block_key.startswith("block_"):
+                        all_bboxes.append(block["bbox"])
+
+        # Get the height of the page
+        page_height = max(bbox[3] for bbox in all_bboxes)
+
+        # Get the most common bbox lists for headers and footers
+        common_header_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="top") if all_bboxes else []
+        common_footer_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="bottom") if all_bboxes else []
+
+        # Detect and mark headers and footers
+        for page_id, blocks in result_dict.items():
+            if page_id.startswith("page_"):
+                for block_key, block in blocks.items():
+                    if block_key.startswith("block_"):
+                        bbox = block["bbox"]
+                        text = block["text"]
+
+                        is_header = compare_bbox_with_list(bbox, common_header_bboxes)
+                        is_footer = compare_bbox_with_list(bbox, common_footer_bboxes)
+
+                        block["is_header"] = int(is_header)
+                        block["is_footer"] = int(is_footer)
+
+        return result_dict
+
+
+class NonHorizontalTextProcessor:
+    def __init__(self) -> None:
+        pass
+
+    def detect_non_horizontal_texts(self, result_dict):
+        """
+        This function detects watermarks and vertical margin notes in the document.
+
+        Watermarks are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
+        If these conditions are met, the blocks are highly likely to be watermarks, as opposed to headers or footers, which can change from page to page.
+        If the direction of these blocks is not horizontal, they are definitely considered to be watermarks.
+
+        Vertical margin notes are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
+        If these conditions are met, the blocks are highly likely to be vertical margin notes, which typically appear on the left and right sides of the page.
+        If the direction of these blocks is vertical, they are definitely considered to be vertical margin notes.
+
+
+        Parameters
+        ----------
+        result_dict : dict
+            The result dictionary.
+
+        Returns
+        -------
+        result_dict : dict
+            The updated result dictionary.
+        """
+        # Dictionary to store information about potential watermarks
+        potential_watermarks = {}
+        potential_margin_notes = {}
+
+        for page_id, page_content in result_dict.items():
+            if page_id.startswith("page_"):
+                for block_id, block_data in page_content.items():
+                    if block_id.startswith("block_"):
+                        if "dir" in block_data:
+                            coordinates_text = (block_data["bbox"], block_data["text"])  # Tuple of coordinates and text
+
+                            angle = math.atan2(block_data["dir"][1], block_data["dir"][0])
+                            angle = abs(math.degrees(angle))
+
+                            if angle > 5 and angle < 85:  # Check if direction is watermarks
+                                if coordinates_text in potential_watermarks:
+                                    potential_watermarks[coordinates_text] += 1
+                                else:
+                                    potential_watermarks[coordinates_text] = 1
+
+                            if angle > 85 and angle < 105:  # Check if direction is vertical
+                                if coordinates_text in potential_margin_notes:
+                                    potential_margin_notes[coordinates_text] += 1  # Increment count
+                                else:
+                                    potential_margin_notes[coordinates_text] = 1  # Initialize count
+
+        # Identify watermarks by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
+        watermark_threshold = len(result_dict) // 2
+        watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold}
+
+        # Identify margin notes by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
+        margin_note_threshold = len(result_dict) // 2
+        margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold}
+
+        # Add watermark information to the result dictionary
+        for page_id, blocks in result_dict.items():
+            if page_id.startswith("page_"):
+                for block_id, block_data in blocks.items():
+                    coordinates_text = (block_data["bbox"], block_data["text"])
+                    if coordinates_text in watermarks:
+                        block_data["is_watermark"] = 1
+                    else:
+                        block_data["is_watermark"] = 0
+
+                    if coordinates_text in margin_notes:
+                        block_data["is_vertical_margin_note"] = 1
+                    else:
+                        block_data["is_vertical_margin_note"] = 0
+
+        return result_dict
+
+
+class NoiseRemover:
+    """
+    This class filters out the blocks that have been flagged as data noise
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def skip_data_noises(self, result_dict):
+        """
+        This function builds a copy of the result dictionary without noise blocks
+
+        Only keys starting with "page_" are treated as pages and only keys starting with
+        "block_" are treated as blocks; everything else is left out of the result.
+        A block is noise when any of its overlap / header / footer / watermark /
+        vertical margin note / block title flags is truthy. Pages that end up with
+        no block at all are left out as well.
+
+        Parameters
+        ----------
+        result_dict : dict
+            mapping of page id to a mapping of block id to block data
+
+        Returns
+        -------
+        filtered_result_dict : dict
+            new dictionary holding only the surviving pages and blocks
+        """
+        noise_flags = (
+            "is_overlap",
+            "is_header",
+            "is_footer",
+            "is_watermark",
+            "is_vertical_margin_note",
+            "is_block_title",
+        )
+
+        def is_noise(block):
+            # A missing flag counts as "not set"
+            for flag in noise_flags:
+                if block.get(flag, 0):
+                    return True
+            return False
+
+        cleaned = {}
+        for page_id, blocks in result_dict.items():
+            if not page_id.startswith("page_"):
+                continue
+
+            kept = {
+                block_id: block
+                for block_id, block in blocks.items()
+                if block_id.startswith("block_") and not is_noise(block)
+            }
+            if kept:
+                cleaned[page_id] = kept
+
+        return cleaned
--- a/para/draw.py
+++ b/para/draw.py
+import sys
+
+from libs.commons import fitz
+
+from para.commons import *
+
+
+# Switch stdout to UTF-8 when running on Python 3. This runs at import time.
+# NOTE(review): presumably so non-ASCII text prints regardless of the locale - confirm.
+if sys.version_info[0] >= 3:
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+class DrawAnnos:
+    """
+    This class draws annotations on the pdf file
+
+    ----------------------------------------
+                Color Code
+    ----------------------------------------
+        Red: (1, 0, 0)
+        Green: (0, 1, 0)
+        Blue: (0, 0, 1)
+        Yellow: (1, 1, 0) - mix of red and green
+        Cyan: (0, 1, 1) - mix of green and blue
+        Magenta: (1, 0, 1) - mix of red and blue
+        White: (1, 1, 1) - red, green and blue full intensity
+        Black: (0, 0, 0) - no color component whatsoever
+        Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
+        Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def __is_nested_list(self, lst):
+        """
+        This function returns True if the given value is a list holding at least one list.
+        """
+        return isinstance(lst, list) and any(isinstance(item, list) for item in lst)
+
+    def __valid_rect(self, bbox):
+        """
+        This function returns True if bbox is a flat (x0, y0, x1, y1) box with positive width and height.
+
+        None, boxes with fewer than four values and nested lists are reported as invalid
+        instead of raising.
+        """
+        if bbox is None or len(bbox) < 4:
+            return False
+        if isinstance(bbox[0], list):
+            return False  # It's a nested list, hence it can't be valid rect
+        return bbox[0] < bbox[2] and bbox[1] < bbox[3]
+
+    def __draw_rect(self, page, bbox, color, width):
+        """
+        This function adds one rectangle annotation with the given stroke color and border width.
+        """
+        anno = page.add_rect_annot(fitz.Rect(bbox))
+        anno.set_colors(stroke=color)
+        anno.set_border(width=width)
+        anno.update()
+
+    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
+        """
+        This function draws the nested boxes
+
+        Parameters
+        ----------
+        page : fitz.Page
+            page
+        nested_bbox : list
+            nested bbox
+        color : tuple
+            color, by default (0, 1, 1)    # draw with cyan color for combined paragraph
+        """
+        if self.__is_nested_list(nested_bbox):  # If it's a nested list
+            for bbox in nested_bbox:
+                self.__draw_nested_boxes(page, bbox, color)  # Recursively call the function
+        elif self.__valid_rect(nested_bbox):  # If valid rectangle
+            self.__draw_rect(page, nested_bbox, color, 1)
+
+    def __draw_bbox(self, page, bbox, nested_color, flat_color):
+        """
+        This function draws a bbox: nested bboxes with more than one entry are drawn box by box
+        in nested_color (border width 1), a flat valid bbox is drawn in flat_color (border width 0.5).
+        """
+        if self.__is_nested_list(bbox) and len(bbox) > 1:
+            self.__draw_nested_boxes(page, bbox, nested_color)
+        elif self.__valid_rect(bbox):
+            self.__draw_rect(page, bbox, flat_color, 0.5)
+
+    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
+        """
+        This function draws the paragraph boxes found in pdf_dic on the pdf and saves the result
+
+        Every paragraph is drawn in green (flat bbox) or cyan (nested bbox of a combined
+        paragraph). Paragraphs whose "is_para_title" is truthy are additionally drawn in blue.
+
+        Parameters
+        ----------
+        input_pdf_path : str
+            path to the input pdf file
+        pdf_dic : dict
+            pdf dictionary with "page_<n>" keys holding "para_blocks"; None is treated as empty.
+            Pages that are missing from the dictionary are left without annotations.
+        output_pdf_path : str
+            path to the output pdf file; when None, ".pdf" in the input path is replaced by "_anno.pdf"
+        """
+        pdf_doc = open_pdf(input_pdf_path)
+
+        # Close the document even if drawing or saving fails
+        try:
+            if pdf_dic is None:
+                pdf_dic = {}
+
+            if output_pdf_path is None:
+                output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf")
+
+            for page_id, page in enumerate(pdf_doc):  # type: ignore
+                # A page without an entry (or with a None entry) simply has nothing to draw
+                page_data = pdf_dic.get(f"page_{page_id}") or {}
+                for para_block in page_data.get("para_blocks") or []:
+                    paras = para_block.get("paras") or {}
+                    for para_content in paras.values():
+                        para_bbox = para_content["para_bbox"]
+
+                        # cyan for combined paragraph, green for normal paragraph
+                        self.__draw_bbox(page, para_bbox, (0, 1, 1), (0, 1, 0))
+
+                        if para_content.get("is_para_title"):
+                            # blue for both combined and normal title, drawn on top of the paragraph box
+                            self.__draw_bbox(page, para_bbox, (0, 0, 1), (0, 0, 1))
+
+            pdf_doc.save(output_pdf_path)
+        finally:
+            pdf_doc.close()
--- a/para/exceptions.py
+++ b/para/exceptions.py
+class DenseSingleLineBlockException(Exception):
+    """
+    Exception type used for the dense single line-block case.
+
+    The message is kept on the `message` attribute and is also what both
+    str() and repr() of the exception return.
+    """
+
+    def __init__(self, message="DenseSingleLineBlockException"):
+        super().__init__(message)
+        self.message = message
+
+    def __str__(self):
+        return "{}".format(self.message)
+
+    def __repr__(self):
+        # repr deliberately shows the bare message, same as str
+        return "{}".format(self.message)
+
+
+class TitleDetectionException(Exception):
+    """
+    Exception type used for title detection.
+
+    The message is kept on the `message` attribute and is also what both
+    str() and repr() of the exception return.
+    """
+
+    def __init__(self, message="TitleDetectionException"):
+        super().__init__(message)
+        self.message = message
+
+    def __str__(self):
+        return "{}".format(self.message)
+
+    def __repr__(self):
+        # repr deliberately shows the bare message, same as str
+        return "{}".format(self.message)
+
+
+class TitleLevelException(Exception):
+    """
+    Exception type used for title level.
+
+    The message is kept on the `message` attribute and is also what both
+    str() and repr() of the exception return.
+    """
+
+    def __init__(self, message="TitleLevelException"):
+        super().__init__(message)
+        self.message = message
+
+    def __str__(self):
+        return "{}".format(self.message)
+
+    def __repr__(self):
+        # repr deliberately shows the bare message, same as str
+        return "{}".format(self.message)
+
+
+class ParaSplitException(Exception):
+    """
+    Exception type used for paragraph splitting.
+
+    The message is kept on the `message` attribute and is also what both
+    str() and repr() of the exception return.
+    """
+
+    def __init__(self, message="ParaSplitException"):
+        super().__init__(message)
+        self.message = message
+
+    def __str__(self):
+        return "{}".format(self.message)
+
+    def __repr__(self):
+        # repr deliberately shows the bare message, same as str
+        return "{}".format(self.message)
+
+
+class ParaMergeException(Exception):
+    """
+    Exception type used for paragraph merging.
+
+    The message is kept on the `message` attribute and is also what both
+    str() and repr() of the exception return.
+    """
+
+    def __init__(self, message="ParaMergeException"):
+        super().__init__(message)
+        self.message = message
+
+    def __str__(self):
+        return "{}".format(self.message)
+
+    def __repr__(self):
+        # repr deliberately shows the bare message, same as str
+        return "{}".format(self.message)
+
+
+class DiscardByException:
+    """
+    This class decides whether a pdf should be discarded, one check per exception type.
+
+    Every check returns the message of the given exception when the pdf should be
+    discarded and None otherwise.
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException):
+        """
+        This function flags a pdf whose pages are dominated by single-line blocks
+
+        A page counts as dense when more than 90% of its "preproc_blocks" have exactly
+        one line. The pdf is flagged when more than 10% of its pages are dense.
+
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary; only keys starting with "page_" are counted as pages
+        exception : DenseSingleLineBlockException
+            exception whose message is returned when the pdf is flagged
+
+        Returns
+        -------
+        error_message : str or None
+            exception.message when flagged, None otherwise (also when there is no page)
+        """
+        total_pages = 0
+        dense_pages = 0
+
+        for page_id, page in pdf_dic.items():
+            if not page_id.startswith("page_"):
+                continue
+            total_pages += 1
+
+            if "preproc_blocks" not in page:
+                continue
+            preproc_blocks = page["preproc_blocks"]
+
+            single_line_count = sum(1 for block in preproc_blocks if len(block["lines"]) == 1)
+
+            if len(preproc_blocks) > 0 and single_line_count / len(preproc_blocks) > 0.9:
+                dense_pages += 1
+
+        if total_pages == 0:
+            return None
+
+        # Low ratio means basically, whenever this is the case, it is discarded
+        return exception.message if dense_pages / total_pages > 0.1 else None
+
+    def discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException):
+        """
+        This function is the title detection check; it is currently disabled and never discards
+
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary (unused)
+        exception : TitleDetectionException
+            exception carrying the message (unused)
+
+        Returns
+        -------
+        error_message : None
+        """
+        return None
+
+    def discard_by_title_level(self, pdf_dic, exception: TitleLevelException):
+        """
+        This function is the title level check; it is currently disabled and never discards
+
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary (unused)
+        exception : TitleLevelException
+            exception carrying the message (unused)
+
+        Returns
+        -------
+        error_message : None
+        """
+        return None
+
+    def discard_by_split_para(self, pdf_dic, exception: ParaSplitException):
+        """
+        This function is the paragraph split check; it is currently disabled and never discards
+
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary (unused)
+        exception : ParaSplitException
+            exception carrying the message (unused)
+
+        Returns
+        -------
+        error_message : None
+        """
+        return None
+
+    def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException):
+        """
+        This function is the paragraph merge check; it is currently disabled and never discards
+
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary (unused)
+        exception : ParaMergeException
+            exception carrying the message (unused)
+
+        Returns
+        -------
+        error_message : None
+        """
+        return None
--- a/para/layout_match_processor.py
+++ b/para/layout_match_processor.py
+import sys
+import math
+from para.commons import *
+
+
+# Switch stdout to UTF-8 when running on Python 3. This runs at import time.
+# NOTE(review): presumably so non-ASCII text prints regardless of the locale - confirm.
+if sys.version_info[0] >= 3:
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+class LayoutFilterProcessor:
+    """
+    This class tags every paragraph block with whether it lies inside one of the layout bboxes
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def batch_process_blocks(self, pdf_dict):
+        """
+        This function sets "in_layout" on every block of "para_blocks", page by page
+
+        "in_layout" is 1 when the block bbox is inside at least one of the page's layout
+        bboxes (as decided by is_in_bbox) and 0 otherwise. Pages that lack "layout_bboxes"
+        or "para_blocks", or where either of them is None, are left untouched.
+
+        Parameters
+        ----------
+        pdf_dict : dict
+            pdf dictionary; only keys starting with "page_" are processed
+
+        Returns
+        -------
+        pdf_dict : dict
+            the same dictionary, updated in place
+        """
+        for page_id, blocks in pdf_dict.items():
+            if not page_id.startswith("page_"):
+                continue
+            if "layout_bboxes" not in blocks or "para_blocks" not in blocks:
+                continue
+
+            layout_bbox_objs = blocks["layout_bboxes"]
+            if layout_bbox_objs is None:
+                continue
+
+            # Round each of x0, y0, x1, y1 up to the next integer
+            # NOTE(review): ceil moves x0/y0 inwards, so the box is not enlarged on its
+            # top-left side - confirm whether floor was intended for x0/y0.
+            layout_bboxes = [
+                [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)]
+                for x0, y0, x1, y1 in (bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs)
+            ]
+
+            para_blocks = blocks["para_blocks"]
+            if para_blocks is None:
+                continue
+
+            # Decide per block over ALL layout bboxes. Resetting the flag once per layout
+            # bbox would let only the last layout bbox count.
+            for para_block in para_blocks:
+                para_bbox = para_block["bbox"]
+                inside_any = any(is_in_bbox(para_bbox, lb_bbox) for lb_bbox in layout_bboxes)
+                para_block["in_layout"] = 1 if inside_any else 0
+
+            blocks["para_blocks"] = para_blocks
+
+        return pdf_dict
--- a/para/para_pipeline.py
+++ b/para/para_pipeline.py
+import os
+import sys
+import json
+
+from para.commons import *
+
+from para.raw_processor import RawBlockProcessor
+from para.layout_match_processor import LayoutFilterProcessor
+from para.stats import BlockStatisticsCalculator
+from para.stats import DocStatisticsCalculator
+from para.title_processor import TitleProcessor
+from para.block_termination_processor import BlockTerminationProcessor
+from para.block_continuation_processor import BlockContinuationProcessor
+from para.draw import DrawAnnos
+from para.exceptions import (
+    DenseSingleLineBlockException,
+    TitleDetectionException,
+    TitleLevelException,
+    ParaSplitException,
+    ParaMergeException,
+    DiscardByException,
+)
+
+
+# Switch stdout to UTF-8 when running on Python 3. This runs at import time.
+# NOTE(review): presumably so non-ASCII text prints regardless of the locale - confirm.
+if sys.version_info[0] >= 3:
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+class ParaProcessPipeline:
+    """
+    This class chains the paragraph processors into one pipeline
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None):
+        """
+        This function processes the paragraphs, including:
+        1. Combine spans into natural lines and create "para_blocks" for each page
+        2. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
+        3. Compute statistics for each block and for the whole document
+        4. Detect titles in the document
+        5. Divide the level of the titles
+        6. Detect and split paragraphs inside each block
+        7. Detect and combine paragraphs from different blocks into one paragraph
+        8. Run the discard checks and collect the first error message, if any
+        9. In debug mode: dump the final dictionary to json, draw annotations on the pdf
+           and remove intermediate "stage" json files from the output directory
+
+        Parameters
+        ----------
+        pdf_info_dict : dict
+            pdf dictionary.
+            Notice: data noises, including overlap blocks, header, footer, watermark, vertical margin note have been removed already.
+        para_debug_mode : str or None
+            None disables all debug output. Any other value enables the json dump, the
+            annotation drawing and the cleanup; "full" additionally dumps one json per stage.
+        input_pdf_path : str or None
+            path to the input pdf file, needed for drawing the annotations
+        output_pdf_path : str or None
+            path to the output pdf file; its directory also receives "pdf_dic.json"
+
+        Returns
+        -------
+        pdf_dic : dict
+            result dictionary
+        error_info : str or None
+            message of the first discard check that fired, None when none did
+        """
+
+        output_json_file = ""
+        output_dir = ""
+
+        if input_pdf_path is not None:
+            input_pdf_path = os.path.abspath(input_pdf_path)
+
+        if output_pdf_path is not None:
+            output_dir = os.path.dirname(output_pdf_path)
+            # os.path.join keeps the file relative when the output path has no directory part
+            # (string formatting with "/" would point at the filesystem root instead)
+            output_json_file = os.path.join(output_dir, "pdf_dic.json")
+
+        def __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode):
+            """
+            Save the pdf_dic of one stage to a json file and return its path (None on failure)
+            """
+            if output_pdf_path is None:
+                # Without an output path there is no file name to derive the dump name from
+                return None
+
+            output_pdf_file_name = os.path.basename(output_pdf_path)
+            # NOTE(review): hard-coded, Windows-style location; the cleanup at the end of the
+            # pipeline looks into the output directory, not here - confirm the intended place.
+            output_dir = "\\tmp\\pdf_parse"
+            output_pdf_file_name = output_pdf_file_name.replace(".pdf", f"_stage_{stage}.json")
+            pdf_dic_json_fpath = os.path.join(output_dir, output_pdf_file_name)
+
+            os.makedirs(output_dir, exist_ok=True)
+
+            if para_debug_mode == "full":
+                with open(pdf_dic_json_fpath, "w", encoding="utf-8") as f:
+                    json.dump(pdf_dic, f, indent=2, ensure_ascii=False)
+
+            # Validate the output already exists
+            if not os.path.exists(pdf_dic_json_fpath):
+                print_red(f"Failed to save the pdf_dic to {pdf_dic_json_fpath}")
+                return None
+
+            print_green(f"Succeed to save the pdf_dic to {pdf_dic_json_fpath}")
+            return pdf_dic_json_fpath
+
+        # ---- Preprocess the lines of block ----
+
+        # Combine spans into a natural line
+        rawBlockProcessor = RawBlockProcessor()
+        pdf_dic = rawBlockProcessor.batch_process_blocks(pdf_info_dict)
+
+        # Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
+        layoutFilter = LayoutFilterProcessor()
+        pdf_dic = layoutFilter.batch_process_blocks(pdf_dic)
+
+        # Compute statistics for each block
+        blockStatisticsCalculator = BlockStatisticsCalculator()
+        pdf_dic = blockStatisticsCalculator.batch_process_blocks(pdf_dic)
+
+        # Compute statistics for all blocks(namely this pdf document)
+        docStatisticsCalculator = DocStatisticsCalculator()
+        pdf_dic = docStatisticsCalculator.calc_stats_of_doc(pdf_dic)
+
+        # Dump the first three stages of pdf_dic to a json file
+        if para_debug_mode == "full":
+            __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode)
+
+        # ---- Detect titles in the document ----
+        doc_statistics = pdf_dic["statistics"]
+        titleProcessor = TitleProcessor(doc_statistics)
+        pdf_dic = titleProcessor.batch_process_blocks_detect_titles(pdf_dic)
+
+        if para_debug_mode == "full":
+            __save_pdf_dic(pdf_dic, output_pdf_path, stage="1", para_debug_mode=para_debug_mode)
+
+        # ---- Detect and divide the level of the titles ----
+        titleProcessor = TitleProcessor()
+        pdf_dic = titleProcessor.batch_process_blocks_recog_title_level(pdf_dic)
+
+        if para_debug_mode == "full":
+            __save_pdf_dic(pdf_dic, output_pdf_path, stage="2", para_debug_mode=para_debug_mode)
+
+        # ---- Detect and split paragraphs inside each block ----
+        blockInnerParasProcessor = BlockTerminationProcessor()
+        pdf_dic = blockInnerParasProcessor.batch_process_blocks(pdf_dic)
+
+        if para_debug_mode == "full":
+            __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode=para_debug_mode)
+
+        # ---- Detect and combine paragraphs from different blocks into one paragraph ----
+        blockContinuationProcessor = BlockContinuationProcessor()
+        pdf_dic = blockContinuationProcessor.batch_tag_paras(pdf_dic)
+        pdf_dic = blockContinuationProcessor.batch_merge_paras(pdf_dic)
+
+        if para_debug_mode == "full":
+            __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode=para_debug_mode)
+
+        # ---- Discard pdf files by checking exceptions and return the error info to the caller ----
+        discardByException = DiscardByException()
+
+        is_discard_by_single_line_block = discardByException.discard_by_single_line_block(
+            pdf_dic, exception=DenseSingleLineBlockException()
+        )
+        is_discard_by_title_detection = discardByException.discard_by_title_detection(
+            pdf_dic, exception=TitleDetectionException()
+        )
+        is_discard_by_title_level = discardByException.discard_by_title_level(pdf_dic, exception=TitleLevelException())
+        is_discard_by_split_para = discardByException.discard_by_split_para(pdf_dic, exception=ParaSplitException())
+        is_discard_by_merge_para = discardByException.discard_by_merge_para(pdf_dic, exception=ParaMergeException())
+
+        # The first check (in this order) that produced a message wins
+        error_info = next(
+            (
+                info
+                for info in (
+                    is_discard_by_single_line_block,
+                    is_discard_by_title_detection,
+                    is_discard_by_title_level,
+                    is_discard_by_split_para,
+                    is_discard_by_merge_para,
+                )
+                if info is not None
+            ),
+            None,
+        )
+
+        # ---- Dump the final pdf_dic to a json file ----
+        # Done before the error return so that discarded pdfs can be inspected as well.
+        # Skipped when no output path was given, since there is no place to write to.
+        if para_debug_mode is not None and output_json_file:
+            with open(output_json_file, "w", encoding="utf-8") as f:
+                json.dump(pdf_dic, f, ensure_ascii=False, indent=4)
+
+        if error_info is not None:
+            return pdf_dic, error_info
+
+        # ---- Draw the annotations ----
+        if para_debug_mode is not None and input_pdf_path is not None:
+            drawAnnos = DrawAnnos()
+            drawAnnos.draw_annos(input_pdf_path, pdf_dic, output_pdf_path)
+
+        # ---- Remove the intermediate "stage" json files from the output directory ----
+        if para_debug_mode is not None and output_pdf_path is not None:
+            cleanup_dir = output_dir or "."  # an output path without directory part means the current directory
+            if os.path.isdir(cleanup_dir):
+                for fname in os.listdir(cleanup_dir):
+                    if fname.endswith(".json") and "stage" in fname:
+                        os.remove(os.path.join(cleanup_dir, fname))
+
+        return pdf_dic, error_info
--- a/para/raw_processor.py
+++ b/para/raw_processor.py
+from para.commons import *
+
+class RawBlockProcessor:
+    """
+    This class turns the raw "preproc_blocks" of every page into "para_blocks"
+    """
+
+    def __init__(self) -> None:
+        self.y_tolerance = 2  # max difference of top / bottom for two raw lines to count as one natural line
+        self.pdf_dic = {}
+
+    def __span_flags_decomposer(self, span_flags):
+        """
+        Make font flags human readable.
+
+        Parameters
+        ----------
+        span_flags : int
+            span flags, read as a bit field
+
+        Returns
+        -------
+        l : dict
+            decomposed flags, one boolean per property
+        """
+        is_serifed = bool(span_flags & 2**2)
+        is_monospaced = bool(span_flags & 2**3)
+
+        return {
+            "is_superscript": bool(span_flags & 2**0),  # bit 0: superscript
+            "is_italic": bool(span_flags & 2**1),  # bit 1: italic
+            "is_serifed": is_serifed,  # bit 2 set: serif font
+            "is_sans_serifed": not is_serifed,  # bit 2 clear: sans-serif font
+            "is_monospaced": is_monospaced,  # bit 3 set: monospaced font
+            "is_proportional": not is_monospaced,  # bit 3 clear: proportional font
+            "is_bold": bool(span_flags & 2**4),  # bit 4: bold
+        }
+
+    def __make_new_lines(self, raw_lines):
+        """
+        This function combines consecutive raw lines that sit on the same row into one natural line.
+
+        Two consecutive raw lines are on the same row when both their top and their bottom
+        differ by at most self.y_tolerance from the line built so far. Every span gets a
+        "decomposed_flags" entry added in place.
+
+        Parameters
+        ----------
+        raw_lines : list
+            raw lines, each with "bbox", "spans" and optionally "dir"
+
+        Returns
+        -------
+        new_lines : list
+            new lines, each with "bbox", "text", "dir" and "spans"
+        """
+        new_lines = []
+        new_line = None
+
+        for raw_line in raw_lines:
+            raw_line_bbox = raw_line["bbox"]
+            raw_line_spans = raw_line["spans"]
+            raw_line_text = "".join(span["text"] for span in raw_line_spans)
+            # A missing or empty direction counts as (0, 0), also when the line gets merged
+            raw_line_dir = raw_line.get("dir") or (0, 0)
+
+            for span in raw_line_spans:
+                span["decomposed_flags"] = self.__span_flags_decomposer(span["flags"])
+            # Own list per line, so that merging never grows the span list of a raw input line
+            line_spans = list(raw_line_spans)
+
+            on_same_row = (
+                new_line is not None
+                and abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
+                and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
+            )
+
+            if on_same_row:
+                new_line["bbox"] = (
+                    min(new_line["bbox"][0], raw_line_bbox[0]),  # left
+                    new_line["bbox"][1],  # top
+                    max(new_line["bbox"][2], raw_line_bbox[2]),  # right
+                    raw_line_bbox[3],  # bottom
+                )
+                new_line["text"] += " " + raw_line_text
+                new_line["spans"].extend(line_spans)
+                new_line["dir"] = (
+                    new_line["dir"][0] + raw_line_dir[0],
+                    new_line["dir"][1] + raw_line_dir[1],
+                )
+            else:
+                if new_line is not None:
+                    new_lines.append(new_line)
+                new_line = {
+                    "bbox": raw_line_bbox,
+                    "text": raw_line_text,
+                    "dir": raw_line_dir,
+                    "spans": line_spans,
+                }
+
+        if new_line is not None:
+            new_lines.append(new_line)
+
+        return new_lines
+
+    def __make_new_block(self, raw_block):
+        """
+        This function makes a new block.
+
+        Parameters
+        ----------
+        raw_block : dict
+            a raw block with "number", "bbox" and "lines"
+
+        Returns
+        -------
+        new_block : dict
+
+        Schema of new_block:
+        {
+            "block_id": "block_1",
+            "bbox": [0, 0, 100, 100],
+            "text": "This is a block.",
+            "lines": [
+                {
+                    "bbox": [0, 0, 100, 100],
+                    "text": "This is a line.",
+                    "spans": [
+                        {
+                            "text": "This is a span.",
+                            "font": "Times New Roman",
+                            "size": 12,
+                            "color": "#000000",
+                        }
+                    ],
+                }
+            ],
+        }
+        """
+        raw_lines = raw_block["lines"]
+
+        # The block text is taken from the raw spans, before the lines are combined
+        block_text = " ".join(span["text"] for line in raw_lines for span in line["spans"])
+
+        return {
+            "block_id": raw_block["number"],
+            "bbox": raw_block["bbox"],
+            "text": block_text,
+            "lines": self.__make_new_lines(raw_lines),
+        }
+
+    def batch_process_blocks(self, pdf_dic):
+        """
+        This function processes the blocks in batch.
+
+        For every key starting with "page_", the page's "preproc_blocks" are converted and
+        stored under "para_blocks". A page without "preproc_blocks" gets an empty list.
+
+        Parameters
+        ----------
+        pdf_dic : dict
+            pdf dictionary. The schema of the raw blocks can refer to the value of key "preproc_blocks", demo file is app/pdf_toolbox/test/preproc_2_parasplit_example.json.
+
+        Returns
+        -------
+        pdf_dic : dict
+            the same dictionary, updated in place
+        """
+
+        for page_id, blocks in pdf_dic.items():
+            if not page_id.startswith("page_"):
+                continue
+
+            blocks["para_blocks"] = [
+                self.__make_new_block(raw_block) for raw_block in blocks.get("preproc_blocks", [])
+            ]
+
+        return pdf_dic
+
--- a/para/stats.py
+++ b/para/stats.py
+import sys
+from collections import Counter
+import numpy as np
+
+from para.commons import *
+
+
+# Switch stdout to UTF-8 when running on Python 3. This runs at import time.
+# NOTE(review): presumably so non-ASCII text prints regardless of the locale - confirm.
+if sys.version_info[0] >= 3:
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+class BlockStatisticsCalculator:
+    """
+    Compute per-block layout and font statistics from the lines of each block
+    stored under the "para_blocks" key of every page.
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def __calc_stats_of_new_lines(self, new_lines):
+        """
+        This function calculates the statistics of one block from its lines.
+
+        Parameters
+        ----------
+        new_lines : list
+            Lines of the block. Every line is a dict with the keys "bbox", "text"
+            and "spans" and optionally "dir"; every span is a dict with the keys
+            "bbox", "font" and "size".
+
+        Returns
+        -------
+        X0 : float
+            Median of the left edges (bbox[0]) of the lines, 0 without lines
+        X1 : float
+            Median of the right edges (bbox[2]) of the lines, 0 without lines
+        avg_char_width : float
+            Mean over the lines of (line width / number of non-whitespace chars),
+            0 when no line has such a char
+        avg_char_height : float
+            Mean of the span sizes of all lines of the block, 0 without spans
+        max_freq_font_type : value of span["font"] or None
+            Font of the widest span of the block, None when no span has a
+            positive width
+        avg_font_size : float or None
+            Mean of the span sizes, None without spans
+        direction : tuple
+            (mean of dir[0], mean of dir[1]) over the lines that have a "dir",
+            (0, 0) when no line has one
+        median_font_size : float or None
+            Median of the span sizes, None without spans
+        """
+        x0_values = []
+        x1_values = []
+        char_widths = []
+        char_heights = []
+
+        block_font_sizes = []
+        block_directions = []
+
+        # The font of the widest span is reported as the font of the block.
+        max_span_length = 0
+        max_span_font_type = None
+
+        for line in new_lines:
+            line_bbox = line["bbox"]
+            line_spans = line["spans"]
+
+            x0_values.append(line_bbox[0])
+            x1_values.append(line_bbox[2])
+
+            # Whitespace is not counted when estimating the width of a char.
+            num_chars = sum(1 for ch in line["text"] if not ch.isspace())
+            if num_chars > 0:
+                char_widths.append((line_bbox[2] - line_bbox[0]) / num_chars)
+
+            if "dir" in line:
+                block_directions.append(line["dir"])
+
+            for span in line_spans:
+                span_size = span["size"]
+                block_font_sizes.append(span_size)
+                # Accumulate over all lines: rebinding the list for each line
+                # would leave only the spans of the last line in the average.
+                char_heights.append(span_size)
+
+                span_length = span["bbox"][2] - span["bbox"][0]
+                if span_length > max_span_length:
+                    max_span_length = span_length
+                    max_span_font_type = span["font"]
+
+        X0 = np.median(x0_values) if x0_values else 0
+        X1 = np.median(x1_values) if x1_values else 0
+        avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0
+        avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0
+
+        max_freq_font_type = max_span_font_type
+
+        avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None
+        median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None
+
+        if block_directions:
+            num_directions = len(block_directions)
+            avg_dir_horizontal = sum(direction[0] for direction in block_directions) / num_directions
+            avg_dir_vertical = sum(direction[1] for direction in block_directions) / num_directions
+        else:
+            avg_dir_horizontal = 0
+            avg_dir_vertical = 0
+
+        return (
+            X0,
+            X1,
+            avg_char_width,
+            avg_char_height,
+            max_freq_font_type,
+            avg_font_size,
+            (avg_dir_horizontal, avg_dir_vertical),
+            median_font_size,
+        )
+
+    def __make_new_block(self, input_block):
+        """
+        Build a new block dict that carries the statistics of the input block.
+
+        Parameters
+        ----------
+        input_block : dict
+            Block with the keys "block_id", "bbox", "text" and "lines".
+
+        Returns
+        -------
+        new_block : dict
+            New dict with "block_id", "bbox", "text" and "lines" taken from the
+            input block (the "lines" list is the same object, not a copy) plus
+            "dir", "X0", "X1", "avg_char_width", "avg_char_height",
+            "block_font_type", "block_font_size" and "median_font_size".
+        """
+        raw_lines = input_block["lines"]
+        (
+            block_avg_left_boundary,
+            block_avg_right_boundary,
+            block_avg_char_width,
+            block_avg_char_height,
+            block_font_type,
+            block_font_size,
+            block_direction,
+            block_median_font_size,
+        ) = self.__calc_stats_of_new_lines(raw_lines)
+
+        return {
+            "block_id": input_block["block_id"],
+            "bbox": input_block["bbox"],
+            "text": input_block["text"],
+            "dir": block_direction,
+            "X0": block_avg_left_boundary,
+            "X1": block_avg_right_boundary,
+            "avg_char_width": block_avg_char_width,
+            "avg_char_height": block_avg_char_height,
+            "block_font_type": block_font_type,
+            "block_font_size": block_font_size,
+            "lines": raw_lines,
+            "median_font_size": block_median_font_size,
+        }
+
+    def batch_process_blocks(self, pdf_dic):
+        """
+        This function replaces the "para_blocks" of every page by blocks that
+        carry their statistics.
+
+        Parameters
+        ----------
+        pdf_dic : dict
+            Document dictionary. Only entries whose key starts with "page_" are
+            processed; each page value may hold a "para_blocks" list.
+
+        Returns
+        -------
+        pdf_dic : dict
+            The same dictionary object, modified in place. A page without
+            "para_blocks" gets an empty list under that key.
+        """
+        for page_id, blocks in pdf_dic.items():
+            if not page_id.startswith("page_"):
+                continue
+
+            input_blocks = blocks.get("para_blocks", [])
+            blocks["para_blocks"] = [self.__make_new_block(input_block) for input_block in input_blocks]
+
+        return pdf_dic
+
+
+class DocStatisticsCalculator:
+    """
+    Compute document-level statistics over the "para_blocks" of all pages.
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def calc_stats_of_doc(self, pdf_dict):
+        """
+        This function computes the statistics of the document
+
+        Parameters
+        ----------
+        pdf_dict : dict
+            Document dictionary. Only entries whose key starts with "page_" are
+            read; each page value may hold a "para_blocks" list of block dicts.
+
+        Returns
+        -------
+        pdf_dict : dict
+            The same dictionary object with the computed statistics stored under
+            the key "statistics": page/block/paragraph/title counts, counts of
+            the blocks flagged as header, footer, watermark or vertical margin
+            note, the two most common (font type, font size) pairs with their
+            counts, and the average block text length.
+        """
+        # Block list of every "page_*" entry. A page without "para_blocks" yields
+        # an empty list, so it is still counted as a page instead of raising
+        # KeyError in the counting pass below.
+        blocks_per_page = [
+            page.get("para_blocks", [])
+            for page_id, page in pdf_dict.items()
+            if page_id.startswith("page_")
+        ]
+        all_blocks = [block for page_blocks in blocks_per_page for block in page_blocks]
+
+        total_num_blocks = len(all_blocks)
+        total_text_length = sum(len(block.get("text", "")) for block in all_blocks)
+        avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0
+
+        # Only blocks whose text is at least half the average length take part in
+        # the font ranking. safe_get is provided by para.commons.
+        font_list = [
+            (
+                safe_get(block, "block_font_type", ""),
+                safe_get(block, "block_font_size", 0),
+            )
+            for block in all_blocks
+            if len(block.get("text", "")) >= avg_text_length * 0.5
+        ]
+
+        # Entries are ((font_type, font_size), count); the placeholder is used
+        # when fewer than one / two distinct pairs exist.
+        no_font = (("", 0), 0)
+        top_fonts = Counter(font_list).most_common(2)
+        most_common_font = top_fonts[0] if top_fonts else no_font
+        second_most_common_font = top_fonts[1] if len(top_fonts) > 1 else no_font
+
+        statistics = {
+            "num_pages": len(blocks_per_page),
+            "num_blocks": total_num_blocks,
+            "num_paras": 0,
+            "num_titles": 0,
+            "num_header_blocks": 0,
+            "num_footer_blocks": 0,
+            "num_watermark_blocks": 0,
+            "num_vertical_margin_note_blocks": 0,
+            "most_common_font_type": most_common_font[0][0],
+            "most_common_font_size": most_common_font[0][1],
+            "number_of_most_common_font": most_common_font[1],
+            "second_most_common_font_type": second_most_common_font[0][0],
+            "second_most_common_font_size": second_most_common_font[0][1],
+            "number_of_second_most_common_font": second_most_common_font[1],
+            "avg_text_length": avg_text_length,
+        }
+
+        for block_data in all_blocks:
+            if "paras" in block_data:
+                statistics["num_paras"] += len(block_data["paras"])
+
+            # Titles are flagged per line, the other markers per block.
+            for line in block_data.get("lines", []):
+                if line.get("is_title", 0):
+                    statistics["num_titles"] += 1
+
+            if block_data.get("is_header", 0):
+                statistics["num_header_blocks"] += 1
+            if block_data.get("is_footer", 0):
+                statistics["num_footer_blocks"] += 1
+            if block_data.get("is_watermark", 0):
+                statistics["num_watermark_blocks"] += 1
+            if block_data.get("is_vertical_margin_note", 0):
+                statistics["num_vertical_margin_note_blocks"] += 1
+
+        pdf_dict["statistics"] = statistics
+
+        return pdf_dict
+
+