Initial commit

c2e5c36f · 赵小蒙 · c2e5c36f · c2e5c36f · c2e5c36f · c2e5c36f
Commit c2e5c36f authored Feb 29, 2024 by 赵小蒙
20 changed files
--- a/README.md
+++ b/README.md
+# pdf_toolbox
+pdf 解析基础函数
+## pdf是否是文字类型/扫描类型的区分
+```shell
+cat s3_pdf_path.example.pdf | parallel --colsep ' ' -j 10 "python pdf_meta_scan.py --s3-pdf-path {2} --s3-profile {1} >> {/}.jsonl"
+find dir/to/jsonl/ -type f -name "*.jsonl" | parallel -j 10 "python pdf_classify_by_type.py --json_file {} >> {/}.jsonl"
+```
+```shell
+# 如果单独运行脚本，合并到code-clean之后需要运行，参考如下：
+python -m pdf_meta_scan --s3-pdf-path "D:\pdf_files\内容排序测试_pdf\p3_图文混排 5.pdf" --s3-profile s2
+```
+## pdf
--- a/__init__.py
+++ b/__init__.py
--- a/check_inline_formula.py
+++ b/check_inline_formula.py
+# 最终版：把那种text_block有重叠，且inline_formula位置在重叠部分的，认定整个页面都有问题，所有的inline_formula都改成no_check
+from libs.commons import fitz
+def check_inline_formula(page, inline_formula_boxes):
+    """
+    Label every inline-formula box of a page by comparing it with the page's text blocks and spans.
+    :param page: the current page as loaded by fitz
+    :param inline_formula_boxes: list of boxes; the first four items of each box are read as (x1, y1, x2, y2)
+    :return: list of labels, in the same order as inline_formula_boxes. Each label is one of:
+        - nocheck_inline_formula: the box lies inside no text block, or touches no span on its line
+        - wrong_text_block: the box lies inside more than one text block
+        - false_inline_formula: the box touches exactly one span whose width differs from the box width
+          by at least half of the box width
+        - true_inline_formula: the box touches several spans, or a single span of similar width
+    """
+    text_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
+    labels = []
+    for box in inline_formula_boxes:
+        x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
+        box_width = x2 - x1
+        center_y = ((y2 - y1) / 2) + y1
+        def _touches(span_bbox):
+            # Two rectangles touch unless one lies entirely on one side of the other.
+            s_x1, s_y1, s_x2, s_y2 = span_bbox
+            return ((s_x1 >= x1 or s_x2 >= x1) and (s_x1 <= x2 or s_x2 <= x2)
+                    and (s_y1 >= y1 or s_y2 >= y1) and (s_y1 <= y2 or s_y2 <= y2))
+        containing_blocks = 0
+        for block in text_blocks:
+            outer = block['bbox']
+            # Only blocks that fully contain the formula box are inspected.
+            if not ((y1 >= outer[1] and y2 <= outer[3]) and (x1 >= outer[0] and x2 <= outer[2])):
+                continue
+            containing_blocks += 1
+            # Spans are taken from the line(s) whose vertical range covers the box centre.
+            hits = [
+                span['bbox']
+                for line in block['lines']
+                if line['bbox'][1] <= center_y <= line['bbox'][3]
+                for span in line['spans']
+                if _touches(span['bbox'])
+            ]
+            if len(hits) == 0:
+                label = "nocheck_inline_formula"
+            elif len(hits) == 1:
+                span_width = hits[0][2] - hits[0][0]
+                if abs(span_width - box_width) < box_width*0.5:
+                    label = "true_inline_formula"
+                else:
+                    label = "false_inline_formula"
+            else:
+                label = "true_inline_formula"
+        # The block count overrides the per-block verdict when it is not exactly one.
+        if containing_blocks == 0:
+            label = "nocheck_inline_formula"
+        elif containing_blocks > 1:
+            label = "wrong_text_block"
+        labels.append(label)
+    return labels
--- a/download.py
+++ b/download.py
+import json
+import os
+import subprocess
+from tqdm import tqdm
+from libs.commons import join_path
+# Download the per-page layout-model output (page_<n>.json) of every validation sample
+# from S3 into a local directory named after the PDF.
+with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json', 'r') as f:
+    samples = json.load(f)
+pdf_model_dir = 's3://llm-pdf-text/eval_1k/layout_res/'
+save_root_path = '/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_val_docxchain/'
+labels = []
+det_res = []
+edit_distance_list = []
+for sample in tqdm(samples):
+    pdf_name = sample['pdf_name']
+    page_num = sample['page']
+    pdf_model_path = join_path(pdf_model_dir, pdf_name)
+    model_output_json = join_path(pdf_model_path, f"page_{page_num}.json")  # page numbers in the model output start at 1
+    save_path = join_path(save_root_path, pdf_name)
+    os.makedirs(save_path, exist_ok=True)
+    # Pass the arguments as a list so that names containing spaces or shell
+    # metacharacters are handed to aws verbatim instead of being parsed by a shell.
+    # As before, a non-zero exit status of the copy is not treated as fatal.
+    subprocess.run([
+        "aws", "--profile", "langchao", "--endpoint-url=http://10.140.85.161:80",
+        "s3", "cp", model_output_json, save_path,
+    ])
--- a/draw_bbox.py
+++ b/draw_bbox.py
+from libs.commons import fitz  # PyMuPDF
+# Path of the PDF to annotate
+pdf_path = "D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018.pdf"
+doc = fitz.open(pdf_path)  # Open the PDF
+# Boxes to draw, one list per page; each entry is [x0, y0, x1, y1, <unused fifth value>]
+data = [[[-2, 0, 603, 80, 24]], [[-3, 0, 602, 80, 24]]]
+# Pair pages with their box list; zip stops at the shorter of the two, so a document
+# with more pages than entries in data no longer fails with an IndexError.
+for page, page_data in zip(doc, data):
+    for x0, y0, x1, y1, _ in page_data:
+        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
+        page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=1.5, overlay=True)  # Draw the rectangle
+# Save the annotated copy next to the input, then release the document
+doc.save("D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018_new.pdf")
+doc.close()
\ No newline at end of file
--- a/filter/__init__.py
+++ b/filter/__init__.py
--- a/filter/pdf_classify_by_type.py
+++ b/filter/pdf_classify_by_type.py
--- a/filter/pdf_meta_scan.py
+++ b/filter/pdf_meta_scan.py
+"""
+输入： s3路径，每行一个
+输出： pdf文件元信息，包括每一页上的所有图片的长宽高，bbox位置
+"""
+import math
+import sys
+import click
+from libs.commons import read_file, mymax, get_top_percent_list
+import json
+from libs.commons import fitz
+from loguru import logger
+from collections import Counter
+from libs.drop_reason import DropReason
+from libs.language import detect_lang
+# Upper bound on how many leading pages the image, text-layout and language passes examine.
+scan_max_page = 50
+# Floor for the "junk image" occurrence threshold, also used as the minimum per-page image
+# count in the special-case checks of get_image_info.
+junk_limit_min = 10
+def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_pts):
+    """
+    Return the area ratios of the pages whose largest image covers more than 60% of the page.
+    :param result: per-page lists of image entries shaped (x0, y0, x1, y1, id)
+    :param page_width_pts: page width; truncated to int before use
+    :param page_height_pts: page height; truncated to int before use
+    :return: list of ratios (largest image area / page area), keeping only values above 0.6
+    """
+    page_area = int(page_width_pts) * int(page_height_pts)
+    large_ratios = []
+    for page_imgs in result:
+        # mymax is a project helper; presumably max() with a fallback for empty input — TODO confirm
+        biggest = mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_imgs])
+        ratio = biggest / page_area
+        if ratio > 0.6:
+            large_ratios.append(ratio)
+    return large_ratios
+def process_image(page, junk_img_bojids=None):
+    """
+    Collect the displayed bounding boxes of the images on one page.
+    :param page: page object providing get_images() and get_image_rects()
+    :param junk_img_bojids: optional collection of image ids to skip (e.g. watermarks or
+        header/footer images that repeat across the document); None means "skip nothing"
+    :return: list of [x0, y0, x1, y1, img_bojid] with integer coordinates, without duplicates
+        and without zero-width or zero-height boxes
+    """
+    # A None default replaces the former mutable [] default, which was shared across calls.
+    junk = () if junk_img_bojids is None else junk_img_bojids
+    page_result = []
+    seen = set()
+    for img in page.get_images():
+        img_bojid = img[0]  # first field of the image entry; repeated ids across pages hint at junk images
+        if img_bojid in junk:
+            continue
+        recs = page.get_image_rects(img, transform=True)
+        if not recs:
+            continue
+        # Only the first placement is used; with transform=True each entry is a (rect, matrix) pair.
+        x0, y0, x1, y1 = map(int, recs[0][0])
+        entry = (x0, y0, x1, y1, img_bojid)
+        if entry in seen:  # identical boxes may be reported more than once
+            continue
+        if x1 - x0 == 0 or y1 - y0 == 0:  # an image with zero width or height is invisible
+            continue
+        seen.add(entry)
+        page_result.append(list(entry))
+    return page_result
+def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
+    """
+    Collect the image boxes of the leading pages and decide which image ids count as junk.
+    Each page contributes a list of [x0, y0, x1, y1, img_bojid] entries as produced by process_image.
+    NOTE(review): despite the `-> list` annotation, the function returns a 2-tuple
+    (per-page image list, junk_img_bojids).
+    :param doc: the opened PDF document
+    :param page_width_pts: page width, forwarded to calculate_max_image_area_per_page
+    :param page_height_pts: page height, forwarded to calculate_max_image_area_per_page
+    :return: (result, junk_img_bojids), where result covers at most scan_max_page pages
+    """
+    # Count how often every img_bojid occurs across all pages
+    img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
+    # An id is junk when it occurs on at least half of the pages ...
+    junk_limit = max(len(doc)*0.5, junk_limit_min)  # ... but never with fewer than junk_limit_min occurrences, which exempts short documents
+    junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
+    # TODO: use only the first ten pages for this decision; junk images should satisfy two conditions:
+    # they occur often enough AND they cover a large enough share of the page with similar sizes.
+    # There are two kinds of scanned PDFs and one kind of text PDF that may be misjudged here:
+    # Scanned type 1: every page stores all scanned-page images; images are large, one is shown per page.
+    # Scanned type 2: the number of stored images grows page by page; images are large, one is shown per page;
+    #   the junk list must be cleared and the first 50 pages processed for classification.
+    # Text type 1: every page stores all images; images are small relative to the page and a page may show
+    #   zero or several of them; sample the first 10 pages and clear the junk list if the pattern matches.
+    imgs_len_list = [len(page.get_images()) for page in doc]
+    special_limit_pages = 10
+    # The decision below is always based on the first ten pages
+    result = []
+    break_loop = False
+    for i, page in enumerate(doc):
+        if break_loop:
+            break
+        if i >= special_limit_pages:
+            break
+        page_result = process_image(page)  # junk_img_bojids is deliberately not passed: all images of these pages are needed for the analysis
+        result.append(page_result)
+        for item in result:
+            if not any(item):  # a page without images suggests a text PDF; check whether it is the special text type
+                if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:  # special text type: clear the junk list
+                    junk_img_bojids = []
+                else:  # ordinary text PDF that does contain junk images: keep the junk list
+                    pass
+                break_loop = True
+                break
+    if not break_loop:
+        # get_top_percent_list is a project helper; presumably it keeps the top 80% of the values — TODO confirm
+        top_eighty_percent = get_top_percent_list(imgs_len_list, 0.8)
+        # Check whether all of those values are equal
+        if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
+        # # 如果前10页跑完都有图，根据每页图片数量是否相等判断是否需要清除junklist
+        # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
+            # All sampled pages have images and the per-page counts agree: use the share of the page
+            # covered by the largest image to decide whether the junk list must be cleared
+            max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
+            if len(max_image_area_per_page) < 0.8 * special_limit_pages:  # not mostly large images: probably a text PDF, clear the junk list
+                junk_img_bojids = []
+            else:  # at least 80% large images with equal, high per-page counts: scanned type 1, keep the junk list
+                pass
+        else:  # per-page image counts differ: clear the junk list and process the first 50 pages in full
+            junk_img_bojids = []
+    # Final pass: collect the image info of the first scan_max_page pages
+    result = []
+    for i, page in enumerate(doc):
+        if i >= scan_max_page:
+            break
+        page_result = process_image(page, junk_img_bojids)
+        # logger.info(f"page {i} img_len: {len(page_result)}")
+        result.append(page_result)
+    return result, junk_img_bojids
+def get_pdf_page_size_pts(doc: fitz.Document):
+    """
+    Return the median page width and median page height over at most the first 50 pages.
+    Widths and heights are ranked independently, so a few landscape pages inside a portrait
+    document do not swap the reported width and height.
+    :param doc: the opened PDF document (must contain at least one page)
+    :return: (median_width, median_height)
+    """
+    sample_count: int = min(len(doc), 50)
+    rects = [doc[idx].rect for idx in range(sample_count)]
+    widths = sorted(rect.width for rect in rects)
+    heights = sorted(rect.height for rect in rects)
+    # The upper median is used for even-sized samples.
+    return widths[len(widths) // 2], heights[len(heights) // 2]
+def get_pdf_textlen_per_page(doc: fitz.Document):
+    """
+    Return, for every page of the document, the length of its plain-text extraction.
+    :param doc: the opened PDF document
+    :return: list with one character count per page
+    """
+    # The "text" extraction is measured as one string; block- or word-level extraction is not used.
+    return [len(page.get_text("text")) for page in doc]
+def get_pdf_text_layout_per_page(doc: fitz.Document):
+    """
+    Classify the dominant text direction of each examined page.
+    Args:
+        doc (fitz.Document): the PDF document; only pages with an index below scan_max_page are examined.
+    Returns:
+        List[str]: one entry per examined page: "vertical", "horizontal", or "unknow" when no line was counted.
+    """
+    layouts = []
+    for page_no, page in enumerate(doc):
+        if page_no >= scan_max_page:
+            break
+        n_vertical = 0
+        n_horizontal = 0
+        page_dict = page.get_text("dict")
+        for block in page_dict.get("blocks", []):
+            for line in block.get("lines", []):
+                left, top, right, bottom = line['bbox']
+                line_area = (right - left) * (bottom - top)
+                sizes = [span['size'] for span in line['spans'] if 'size' in span]
+                # Lines without any font size fall back to an assumed size of 10.
+                mean_size = sum(sizes) / len(sizes) if sizes else 10
+                # A box no larger than one glyph (size squared) cannot reveal a direction.
+                if line_area <= mean_size ** 2:
+                    continue
+                # The direction is taken from the line's wmode (1 = vertical, 0 = horizontal);
+                # an earlier variant based on the line's 'dir' vector is no longer used.
+                wmode = line.get('wmode')
+                if wmode == 1:
+                    n_vertical += 1
+                elif wmode == 0:
+                    n_horizontal += 1
+        if n_vertical == 0 and n_horizontal == 0:
+            # Nothing was counted on this page, so its layout cannot be decided.
+            layouts.append("unknow")
+        elif n_vertical > n_horizontal:
+            layouts.append("vertical")
+        else:
+            # Ties count as horizontal.
+            layouts.append("horizontal")
+    return layouts
+class PageSvgsTooManyError(Exception):
+    """Custom exception raised for a PDF in which a single page holds too many SVG drawings."""
+    def __init__(self, message="Page SVGs are too many"):
+        super().__init__(message)
+        # Kept as an attribute so callers can read the text without going through args.
+        self.message = message
+def get_svgs_per_page(doc: fitz.Document):
+    """
+    Count the drawings on every page of the document.
+    :param doc: the opened PDF document
+    :return: list with one drawing count per page
+    :raises PageSvgsTooManyError: as soon as a page holds 3000 or more drawings
+    """
+    counts = []
+    for page in doc:
+        # get_cdrawings() is used instead of get_drawings() because it is more efficient.
+        drawing_count = len(page.get_cdrawings())
+        if drawing_count >= 3000:
+            raise PageSvgsTooManyError()
+        counts.append(drawing_count)
+    return counts
+def get_imgs_per_page(doc: fitz.Document):
+    """
+    Count the images reported by get_images() on every page of the document.
+    :param doc: the opened PDF document
+    :return: list with one image count per page
+    """
+    return [len(page.get_images()) for page in doc]
+def get_language(doc: fitz.Document):
+    """
+    Determine the language of a PDF document by majority vote over its leading pages.
+    Args:
+        doc (fitz.Document): the PDF document; only pages with an index below scan_max_page are examined.
+    Returns:
+        str: the value detect_lang returned most often (the earliest such value on ties).
+    """
+    per_page_langs = []
+    for page_no, page in enumerate(doc):
+        if page_no >= scan_max_page:
+            break
+        # Detection runs on the page's full plain-text extraction.
+        per_page_langs.append(detect_lang(page.get_text("text")))
+    tally = Counter(per_page_langs)
+    return max(tally, key=tally.get)
+def pdf_meta_scan(s3_pdf_path: str, pdf_bytes: bytes):
+    """
+    Scan an in-memory PDF and collect the metadata used to judge it: whether it is encrypted or
+    needs a password, page size, page count, and per-page image/text/drawing statistics.
+    :param s3_pdf_path: path of the PDF; used in the log message and echoed back as "pdf_path"
+    :param pdf_bytes: raw bytes of the PDF file
+    :return: {"need_drop": True, "drop_reason": DropReason.EMPTY_PDF} for a document without pages,
+        otherwise a dict with the collected metadata
+    :raises PageSvgsTooManyError: propagated from get_svgs_per_page
+    """
+    doc = fitz.open("pdf", pdf_bytes)
+    # try/finally guarantees the document is released on every exit path, including exceptions;
+    # previously it was never closed.
+    try:
+        is_needs_password = doc.needs_pass
+        is_encrypted = doc.is_encrypted
+        total_page = len(doc)
+        if total_page == 0:
+            logger.warning(f"drop this pdf: {s3_pdf_path}, drop_reason: {DropReason.EMPTY_PDF}")
+            return {"need_drop": True, "drop_reason": DropReason.EMPTY_PDF}
+        page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
+        svgs_per_page = get_svgs_per_page(doc)
+        imgs_per_page = get_imgs_per_page(doc)
+        image_info_per_page, junk_img_bojids = get_image_info(doc, page_width_pts, page_height_pts)
+        text_len_per_page = get_pdf_textlen_per_page(doc)
+        text_layout_per_page = get_pdf_text_layout_per_page(doc)
+        text_language = get_language(doc)
+        # Everything is gathered into one JSON-style record.
+        return {
+            "pdf_path": s3_pdf_path,
+            "is_needs_password": is_needs_password,
+            "is_encrypted": is_encrypted,
+            "total_page": total_page,
+            "page_width_pts": int(page_width_pts),
+            "page_height_pts": int(page_height_pts),
+            "image_info_per_page": image_info_per_page,
+            "text_len_per_page": text_len_per_page,
+            "text_layout_per_page": text_layout_per_page,
+            "text_language": text_language,
+            "svgs_per_page": svgs_per_page,
+            "imgs_per_page": imgs_per_page,  # number of images on every page
+            "junk_img_bojids": junk_img_bojids,  # ids of the images treated as junk
+            "metadata": doc.metadata
+        }
+    finally:
+        doc.close()
+# Command-line entry point: reads one PDF via read_file and runs pdf_meta_scan on it.
+# The docstring below is left untouched because click shows it as the command's help text.
+@click.command()
+@click.option('--s3-pdf-path', help='s3上pdf文件的路径')
+@click.option('--s3-profile', help='s3上的profile')
+def main(s3_pdf_path: str, s3_profile: str):
+    """
+    """
+    try:
+        file_content = read_file(s3_pdf_path, s3_profile)
+        # NOTE(review): the scan result is discarded here and nothing is written to stdout,
+        # although the README redirects this script's output into a .jsonl file — confirm intent.
+        pdf_meta_scan(s3_pdf_path, file_content)
+    except Exception as e:
+        # Any failure is reported on stderr with the offending path and logged with its traceback.
+        print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
+        logger.exception(e)
+# Run the click command when executed as a script. The commented lines below are sample local
+# paths and an ad-hoc text-layout check that are currently disabled.
+if __name__ == '__main__':
+    main()
+    # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
+    # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
+    # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
+    # "D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_18600000/libgen.scimag18645000-18645999.zip_10.1021/om3006239.pdf"
+    # file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","")
+    # file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
+    # doc = fitz.open("pdf", file_content)
+    # text_layout_lst = get_pdf_text_layout_per_page(doc)
+    # print(text_layout_lst)
\ No newline at end of file
--- a/layout/__init__.py
+++ b/layout/__init__.py
--- a/layout/bbox_sort.py
+++ b/layout/bbox_sort.py
--- a/layout/layout_det_utils.py
+++ b/layout/layout_det_utils.py
+from layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX
+from libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect
+def find_all_left_bbox_direct(this_bbox, all_bboxes) -> list:
+    """
+    Return the nearest box on the left side of this_bbox, or None when there is none.
+    A box qualifies when it ends at or before this_bbox's left edge and shares part of its
+    vertical range (no extension lines are used), or when _left_intersect reports that it
+    intersects this_bbox. Among the qualifying boxes the one with the largest right edge is
+    returned; the extended x1 is preferred whenever it is truthy.
+    """
+    def _qualifies(box):
+        b_y0, b_y1 = box[Y0_IDX], box[Y1_IDX]
+        t_y0, t_y1 = this_bbox[Y0_IDX], this_bbox[Y1_IDX]
+        if box[X1_IDX] <= this_bbox[X0_IDX] and (
+                b_y0 < t_y0 < b_y1 or b_y0 < t_y1 < b_y1
+                or t_y0 < b_y0 < t_y1 or t_y0 < b_y1 < t_y1
+                or (b_y0 == t_y0 and b_y1 == t_y1)):
+            return True
+        # Boxes that cut into this_bbox from the left also count.
+        return _left_intersect(box[:4], this_bbox[:4])
+    candidates = [box for box in all_bboxes if _qualifies(box)]
+    if not candidates:
+        return None
+    # Horizontally closest means the largest right edge.
+    return max(candidates, key=lambda b: b[X1_EXT_IDX] or b[X1_IDX])
+def find_all_right_bbox_direct(this_bbox, all_bboxes) -> list:
+    """
+    Return the nearest box on the right side of this_bbox, or None when there is none.
+    A box qualifies when it starts at or after this_bbox's right edge and shares part of its
+    vertical range, or when _right_intersect reports an intersection with this_bbox. Among the
+    qualifying boxes the one with the smallest left edge is returned; the extended x0 is
+    preferred whenever it is truthy.
+    """
+    def _qualifies(box):
+        b_y0, b_y1 = box[Y0_IDX], box[Y1_IDX]
+        t_y0, t_y1 = this_bbox[Y0_IDX], this_bbox[Y1_IDX]
+        if box[X0_IDX] >= this_bbox[X1_IDX] and (
+                t_y0 < b_y0 < t_y1 or t_y0 < b_y1 < t_y1
+                or b_y0 < t_y0 < b_y1 or b_y0 < t_y1 < b_y1
+                or (b_y0 == t_y0 and b_y1 == t_y1)):
+            return True
+        return _right_intersect(this_bbox[:4], box[:4])
+    candidates = [box for box in all_bboxes if _qualifies(box)]
+    if not candidates:
+        return None
+    return min(candidates, key=lambda b: b[X0_EXT_IDX] or b[X0_IDX])
+def find_all_top_bbox_direct(this_bbox, all_bboxes) -> list:
+    """
+    Return the nearest box directly above this_bbox, or None when there is none.
+    A box qualifies when it ends at or above this_bbox's top edge and the two x-ranges overlap:
+    an x-edge of one box lies strictly inside the other's x-range, or both x-ranges are equal.
+    The qualifying box with the largest bottom edge is returned; the extended y1 is preferred
+    whenever it is truthy.
+    """
+    def _overlaps_in_x(box):
+        b_x0, b_x1 = box[X0_IDX], box[X1_IDX]
+        t_x0, t_x1 = this_bbox[X0_IDX], this_bbox[X1_IDX]
+        return (b_x0 < t_x0 < b_x1 or b_x0 < t_x1 < b_x1
+                or t_x0 < b_x0 < t_x1 or t_x0 < b_x1 < t_x1
+                or (b_x0 == t_x0 and b_x1 == t_x1))
+    above = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and _overlaps_in_x(box)]
+    if not above:
+        return None
+    return max(above, key=lambda b: b[Y1_EXT_IDX] or b[Y1_IDX])
+def find_all_bottom_bbox_direct(this_bbox, all_bboxes) -> list:
+    """
+    Return the nearest box directly below this_bbox, or None when there is none.
+    A box qualifies when it starts at or below this_bbox's bottom edge and the two x-ranges
+    overlap: an x-edge of one box lies strictly inside the other's x-range, or both x-ranges are
+    equal. The qualifying box with the smallest top edge is returned.
+    """
+    def _overlaps_in_x(box):
+        b_x0, b_x1 = box[X0_IDX], box[X1_IDX]
+        t_x0, t_x1 = this_bbox[X0_IDX], this_bbox[X1_IDX]
+        return (t_x0 < b_x0 < t_x1 or t_x0 < b_x1 < t_x1
+                or b_x0 < t_x0 < b_x1 or b_x0 < t_x1 < b_x1
+                or (b_x0 == t_x0 and b_x1 == t_x1))
+    below = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and _overlaps_in_x(box)]
+    if not below:
+        return None
+    return min(below, key=lambda b: b[Y0_IDX])
+# ===================================================================================================================
+def find_bottom_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
+    """
+    Return the nearest box directly below this_bbox, preferring the right-most one, or None.
+    A box qualifies when it starts at or below this_bbox's bottom edge and the two x-ranges
+    overlap: an x-edge of one box lies strictly inside the other's x-range, or both x-ranges are
+    equal. Among the boxes sharing the smallest top edge, the one with the largest x1 is returned.
+    """
+    def _overlaps_in_x(box):
+        b_x0, b_x1 = box[X0_IDX], box[X1_IDX]
+        t_x0, t_x1 = this_bbox[X0_IDX], this_bbox[X1_IDX]
+        return (t_x0 < b_x0 < t_x1 or t_x0 < b_x1 < t_x1
+                or b_x0 < t_x0 < b_x1 or b_x0 < t_x1 < b_x1
+                or (b_x0 == t_x0 and b_x1 == t_x1))
+    below = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and _overlaps_in_x(box)]
+    if not below:
+        return None
+    # Keep only the boxes whose top edge is closest to this_bbox ...
+    nearest_y0 = min(box[Y0_IDX] for box in below)
+    nearest = [box for box in below if box[Y0_IDX] == nearest_y0]
+    # ... and among those take the one reaching furthest to the right.
+    return max(nearest, key=lambda b: b[X1_IDX])
+def find_bottom_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
+    """
+    Return the nearest box directly below this_bbox, preferring the left-most one, or None.
+    A box qualifies when it starts at or below this_bbox's bottom edge and the two x-ranges
+    overlap: an x-edge of one box lies strictly inside the other's x-range, or both x-ranges are
+    equal. Among the boxes sharing the smallest top edge, the one with the smallest x0 is returned.
+    """
+    def _overlaps_in_x(box):
+        b_x0, b_x1 = box[X0_IDX], box[X1_IDX]
+        t_x0, t_x1 = this_bbox[X0_IDX], this_bbox[X1_IDX]
+        return (t_x0 < b_x0 < t_x1 or t_x0 < b_x1 < t_x1
+                or b_x0 < t_x0 < b_x1 or b_x0 < t_x1 < b_x1
+                or (b_x0 == t_x0 and b_x1 == t_x1))
+    below = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and _overlaps_in_x(box)]
+    if not below:
+        return None
+    # Keep only the boxes whose top edge is closest to this_bbox ...
+    nearest_y0 = min(box[Y0_IDX] for box in below)
+    nearest = [box for box in below if box[Y0_IDX] == nearest_y0]
+    # ... and among those take the one starting furthest to the left.
+    return min(nearest, key=lambda b: b[X0_IDX])
+def find_top_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
+    """
+    Return the nearest box directly above this_bbox, preferring the left-most one, or None.
+    A box qualifies when it ends at or above this_bbox's top edge and the two x-ranges overlap:
+    an x-edge of one box lies strictly inside the other's x-range, or both x-ranges are equal.
+    Among the boxes sharing the largest bottom edge, the one with the smallest x0 is returned.
+    """
+    def _overlaps_in_x(box):
+        b_x0, b_x1 = box[X0_IDX], box[X1_IDX]
+        t_x0, t_x1 = this_bbox[X0_IDX], this_bbox[X1_IDX]
+        return (b_x0 < t_x0 < b_x1 or b_x0 < t_x1 < b_x1
+                or t_x0 < b_x0 < t_x1 or t_x0 < b_x1 < t_x1
+                or (b_x0 == t_x0 and b_x1 == t_x1))
+    above = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and _overlaps_in_x(box)]
+    if not above:
+        return None
+    # Keep only the boxes whose bottom edge is closest to this_bbox ...
+    nearest_y1 = max(box[Y1_IDX] for box in above)
+    nearest = [box for box in above if box[Y1_IDX] == nearest_y1]
+    # ... and among those take the one starting furthest to the left.
+    return min(nearest, key=lambda b: b[X0_IDX])
+def find_top_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
+    """
+    Return the nearest box directly above this_bbox, preferring the right-most one, or None.
+    A box qualifies when it ends at or above this_bbox's top edge and the two x-ranges overlap:
+    an x-edge of one box lies strictly inside the other's x-range, or both x-ranges are equal.
+    Among the boxes sharing the largest bottom edge, the one with the largest x1 is returned.
+    """
+    def _overlaps_in_x(box):
+        b_x0, b_x1 = box[X0_IDX], box[X1_IDX]
+        t_x0, t_x1 = this_bbox[X0_IDX], this_bbox[X1_IDX]
+        return (b_x0 < t_x0 < b_x1 or b_x0 < t_x1 < b_x1
+                or t_x0 < b_x0 < t_x1 or t_x0 < b_x1 < t_x1
+                or (b_x0 == t_x0 and b_x1 == t_x1))
+    above = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and _overlaps_in_x(box)]
+    if not above:
+        return None
+    # Keep only the boxes whose bottom edge is closest to this_bbox ...
+    nearest_y1 = max(box[Y1_IDX] for box in above)
+    nearest = [box for box in above if box[Y1_IDX] == nearest_y1]
+    # ... and among those take the one reaching furthest to the right.
+    return max(nearest, key=lambda b: b[X1_IDX])
+# ===================================================================================================================
+def get_left_edge_bboxes(all_bboxes) -> list:
+    """
+    Return the left-most boxes: those for which find_all_left_bbox_direct finds no left neighbour.
+    """
+    edge_boxes = []
+    for box in all_bboxes:
+        if find_all_left_bbox_direct(box, all_bboxes) is None:
+            edge_boxes.append(box)
+    return edge_boxes
+def get_right_edge_bboxes(all_bboxes) -> list:
+    """
+    Return the right-most boxes: those for which find_all_right_bbox_direct finds no right neighbour.
+    """
+    edge_boxes = []
+    for box in all_bboxes:
+        if find_all_right_bbox_direct(box, all_bboxes) is None:
+            edge_boxes.append(box)
+    return edge_boxes
+def fix_vertical_bbox_pos(bboxes:list):
+    """
+    Resolve slight vertical overlaps between boxes by pushing the lower box's top edge down.
+    The list is sorted in place from top to bottom. For every box, the first later box for which
+    _is_bottom_full_overlap holds gets its y0 set to the upper box's y1 plus 2. According to the
+    original note the overlap requires full containment or identity in x, not a partial x-overlap.
+    :param bboxes: list of mutable boxes; both the list order and the boxes are modified
+    :return: the same list object
+    """
+    bboxes.sort(key=lambda b: b[Y0_IDX])
+    for idx, upper in enumerate(bboxes):
+        for lower in bboxes[idx + 1:]:
+            if not _is_bottom_full_overlap(upper[:4], lower[:4]):
+                continue
+            lower[Y0_IDX] = upper[Y1_IDX] + 2  # 2 is an empirical gap
+            # Only the first overlapping box below is adjusted for each upper box.
+            break
+    return bboxes
--- a/layout/layout_sort.py
+++ b/layout/layout_sort.py
--- a/layout/layout_spiler_recog.py
+++ b/layout/layout_spiler_recog.py
+"""
+找到能分割布局的水平的横线、色块
+"""
+import os,  fitz
+from libs.boxbase import _is_in_or_part_overlap
+def __rect_filter_by_width(rect, page_w, page_h):
+    """
+    Return True when rect strictly straddles the vertical centre line of the page.
+    :param rect: box whose items 0 and 2 are its left and right x coordinates
+    :param page_w: page width
+    :param page_h: page height (not used)
+    """
+    centre_x = page_w/2
+    return True if rect[0] < centre_x < rect[2] else False
+def __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
+    """
+    Return True when rect touches neither an image box nor a table box.
+    Image boxes are checked before table boxes, each with _is_in_or_part_overlap.
+    """
+    if any(_is_in_or_part_overlap(rect, box) for box in image_bboxes):
+        return False
+    return not any(_is_in_or_part_overlap(rect, box) for box in table_bboxes)
+def __debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
+    """
+    Debug helper: draw three groups of boxes on a blank page of the same size as `page` and save
+    the result to ./tmp/debug.pdf, replacing any previous file.
+    :param page: page whose width and height define the blank page
+    :param bboxes1: boxes drawn with a red outline and a translucent blue fill
+    :param bboxes2: boxes drawn without outline and with a translucent yellow fill
+    :param bboxes3: boxes drawn with a red outline and no fill
+    """
+    save_path = "./tmp/debug.pdf"
+    # Remove the output of a previous run.
+    if os.path.exists(save_path):
+        os.remove(save_path)
+    # Start from a new, empty PDF with one page of matching size.
+    doc = fitz.open('')
+    new_page = doc.new_page(width=page.rect.width, height=page.rect.height)
+    shape = new_page.new_shape()
+    def _draw_group(bboxes, **finish_kwargs):
+        # Every box gets its own shape, which is styled and then committed to the page.
+        for bbox in bboxes:
+            rect = fitz.Rect(*bbox[0:4])
+            box_shape = new_page.new_shape()
+            box_shape.draw_rect(rect)
+            box_shape.finish(**finish_kwargs)
+            box_shape.finish()
+            box_shape.commit()
+    _draw_group(bboxes1, color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
+    _draw_group(bboxes2, color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
+    _draw_group(bboxes3, color=fitz.pdfcolor['red'], fill=None)
+    # Make sure the target directory exists before saving.
+    parent_dir = os.path.dirname(save_path)
+    if not os.path.exists(parent_dir):
+        os.makedirs(parent_dir)
+    doc.save(save_path)
+    doc.close()
def get_spilter_of_page(page, image_bboxes, table_bboxes):
    """Collect filled drawing rectangles that can act as layout splitters.

    Every item of ``page.get_cdrawings()`` with a truthy ``fill`` other than
    pure white ``(1.0, 1.0, 1.0)`` is kept when it passes both
    ``__rect_filter_by_width`` and ``__rect_filter_by_pos``.

    Returns:
        A list of ``[x0, y0, x1, y1]`` lists. Boxes whose height is zero or
        negative get ``y1`` reset to ``y0 + 1``, because such boxes can make
        the layout computation loop forever.
    """
    white = (1.0, 1.0, 1.0)
    spilter_bbox = []
    for drawing in page.get_cdrawings():
        fill_color = drawing.get('fill')
        if not fill_color or fill_color == white:
            continue
        rect = drawing['rect']
        if not __rect_filter_by_width(rect, page.rect.width, page.rect.height):
            continue
        if not __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
            continue
        box = list(rect)
        # normalise degenerate (zero / negative height) boxes to height 1
        if box[3] - box[1] <= 0:
            box[3] = box[1] + 1
        spilter_bbox.append(box)
    #__debug_show_page(page, spilter_bbox, [], [])
    return spilter_bbox
--- a/layout/mcol_sort.py
+++ b/layout/mcol_sort.py
+"""
+This is an advanced PyMuPDF utility for detecting multi-column pages.
+It can be used in a shell script, or its main function can be imported and
+invoked as described below.
+Features
+---------
+- Identify text belonging to (a variable number of) columns on the page.
+- Text with different background color is handled separately, allowing for
+  easier treatment of side remarks, comment boxes, etc.
+- Uses text block detection capability to identify text blocks and
+  uses the block bboxes as primary structuring principle.
+- Supports ignoring footers via a footer margin parameter.
+- Returns re-created text boundary boxes (integer coordinates), sorted ascending
+  by the top, then by the left coordinates.
+Restrictions
+-------------
+- Only supporting horizontal, left-to-right text
+- Returns a list of text boundary boxes - not the text itself. The caller is
+  expected to extract text from within the returned boxes.
+- Text written above images is ignored altogether (option).
+- This utility works as expected in most cases. The following situation cannot
+  be handled correctly:
+    * overlapping (non-disjoint) text blocks
+    * image captions are not recognized and are handled like normal text
+Usage
+------
+- As a CLI shell command use
+  python multi_column.py input.pdf footer_margin
+  Where footer margin is the height of the bottom stripe to ignore on each page.
+  This code is intended to be modified according to your need.
+- Use in a Python script as follows:
+  ----------------------------------------------------------------------------------
+  from multi_column import column_boxes
+  # for each page execute
+  bboxes = column_boxes(page, footer_margin=50, no_image_text=True)
+  # bboxes is a list of fitz.IRect objects, that are sorted ascending by their y0,
+  # then x0 coordinates. Their text content can be extracted by all PyMuPDF
+  # get_text() variants, like for instance the following:
+  for rect in bboxes:
+      print(page.get_text(clip=rect, sort=True))
+  ----------------------------------------------------------------------------------
+"""
+import os
+import sys
+from libs.commons import fitz
def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
    """Determine bboxes which wrap a column.

    Args:
        page: page object; this function calls its get_drawings(),
            get_images(), get_image_rects() and get_text() methods and reads
            page.rect (presumably a PyMuPDF Page - confirm against callers).
        footer_margin: height removed from the bottom of the page rectangle
            before text is extracted.
        header_margin: height removed from the top of the page rectangle
            before text is extracted.
        no_image_text: when true, text blocks lying inside an image bbox are
            skipped.

    Returns:
        The joined text bboxes after clean_nblocks(), or an empty list when
        no usable text bbox was found.
    """
    paths = page.get_drawings()
    bboxes = []
    # path rectangles
    path_rects = []
    # image bboxes
    img_bboxes = []
    # bboxes of non-horizontal text
    # avoid when expanding horizontal text boxes
    vert_bboxes = []
    # compute relevant page area
    # NOTE(review): unary plus looks like the PyMuPDF "copy the rect" idiom, so
    # page.rect itself should stay untouched - confirm.
    clip = +page.rect
    clip.y1 -= footer_margin  # Remove footer area
    clip.y0 += header_margin  # Remove header area
    def can_extend(temp, bb, bboxlist):
        """Determines whether rectangle 'temp' can be extended by 'bb'
        without intersecting any of the rectangles contained in 'bboxlist'.
        Items of bboxlist may be None if they have been removed.
        Also reads 'vert_bboxes' from the enclosing scope: any intersection of
        'temp' with a vertical-text bbox makes the result False.
        Returns:
            True if 'temp' has no intersections with items of 'bboxlist'.
        """
        for b in bboxlist:
            # an item is harmless when it is removed (None), is 'bb' itself,
            # or does not overlap 'temp'
            if not intersects_bboxes(temp, vert_bboxes) and (
                b == None or b == bb or (temp & b).is_empty
            ):
                continue
            return False
        return True
    def in_bbox(bb, bboxes):
        """Return 1-based number if a bbox contains bb, else return 0."""
        for i, bbox in enumerate(bboxes):
            if bb in bbox:
                return i + 1
        return 0
    def intersects_bboxes(bb, bboxes):
        """Return True if a bbox intersects bb, else return False."""
        for bbox in bboxes:
            if not (bb & bbox).is_empty:
                return True
        return False
    def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
        """Extend a bbox to the right page border.
        Whenever there is no text to the right of a bbox, enlarge it up
        to the right page border.
        Args:
            bboxes: (list[IRect]) bboxes to check
            width: (int) page width
            path_bboxes: (list[IRect]) bboxes with a background color
            vert_bboxes: (list[IRect]) bboxes with vertical text
            img_bboxes: (list[IRect]) bboxes of images
        Returns:
            Potentially modified bboxes.
        """
        for i, bb in enumerate(bboxes):
            # do not extend text with background color
            if in_bbox(bb, path_bboxes):
                continue
            # do not extend text in images
            if in_bbox(bb, img_bboxes):
                continue
            # temp extends bb to the right page border
            temp = +bb
            temp.x1 = width
            # do not cut through colored background or images
            if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
                continue
            # also, do not intersect other text bboxes
            check = can_extend(temp, bb, bboxes)
            if check:
                bboxes[i] = temp  # replace with enlarged bbox
        return [b for b in bboxes if b != None]
    def clean_nblocks(nblocks):
        """Do some elementary cleaning."""
        # 1. remove any duplicate blocks.
        blen = len(nblocks)
        if blen < 2:
            return nblocks
        start = blen - 1
        # walk backwards so deleting an item does not shift the ones still to visit
        # NOTE(review): at i == 0, nblocks[i - 1] is nblocks[-1], i.e. the first
        # item is compared with the last one - confirm this is intended.
        for i in range(start, -1, -1):
            bb1 = nblocks[i]
            bb0 = nblocks[i - 1]
            if bb0 == bb1:
                del nblocks[i]
        # 2. repair sequence in special cases:
        # consecutive bboxes with almost same bottom value are sorted ascending
        # by x-coordinate.
        y1 = nblocks[0].y1  # first bottom coordinate
        i0 = 0  # its index
        i1 = -1  # index of last bbox with same bottom
        # Iterate over bboxes, identifying segments with approx. same bottom value.
        # Replace every segment by its sorted version.
        for i in range(1, len(nblocks)):
            b1 = nblocks[i]
            if abs(b1.y1 - y1) > 10:  # different bottom
                if i1 > i0:  # segment length > 1? Sort it!
                    nblocks[i0 : i1 + 1] = sorted(
                        nblocks[i0 : i1 + 1], key=lambda b: b.x0
                    )
                y1 = b1.y1  # store new bottom value
                i0 = i  # store its start index
            i1 = i  # store current index
        if i1 > i0:  # segment waiting to be sorted
            nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
        return nblocks
    # extract vector graphics
    for p in paths:
        path_rects.append(p["rect"].irect)
    # same list object under a second name; the sort below reorders both
    path_bboxes = path_rects
    # sort path bboxes by ascending top, then left coordinates
    path_bboxes.sort(key=lambda b: (b.y0, b.x0))
    # bboxes of images on page, no need to sort them
    for item in page.get_images():
        img_bboxes.extend(page.get_image_rects(item[0]))
    # blocks of text on page
    blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
        clip=clip,
    )["blocks"]
    # Make block rectangles, ignoring non-horizontal text
    for b in blocks:
        bbox = fitz.IRect(b["bbox"])  # bbox of the block
        # ignore text written upon images
        if no_image_text and in_bbox(bbox, img_bboxes):
            continue
        # confirm first line to be horizontal
        line0 = b["lines"][0]  # get first line
        if line0["dir"] != (1, 0):  # only accept horizontal text
            vert_bboxes.append(bbox)
            continue
        # rebuild the block bbox from its lines, skipping lines whose stripped
        # text has at most one character
        srect = fitz.EMPTY_IRECT()
        for line in b["lines"]:
            lbbox = fitz.IRect(line["bbox"])
            text = "".join([s["text"].strip() for s in line["spans"]])
            if len(text) > 1:
                srect |= lbbox
        bbox = +srect
        if not bbox.is_empty:
            bboxes.append(bbox)
    # Sort text bboxes by ascending background, top, then left coordinates
    bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))
    # Extend bboxes to the right where possible
    bboxes = extend_right(
        bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
    )
    # immediately return if no text found
    if bboxes == []:
        return []
    # --------------------------------------------------------------------
    # Join bboxes to establish some column structure
    # --------------------------------------------------------------------
    # the final block bboxes on page
    nblocks = [bboxes[0]]  # pre-fill with first bbox
    bboxes = bboxes[1:]  # remaining old bboxes
    for i, bb in enumerate(bboxes):  # iterate old bboxes
        check = False  # indicates unwanted joins
        # check if bb can extend one of the new blocks
        for j in range(len(nblocks)):
            nbb = nblocks[j]  # a new block
            # never join across columns
            if bb == None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
                continue
            # never join across different background colors
            if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
                continue
            temp = bb | nbb  # temporary extension of new block
            check = can_extend(temp, nbb, nblocks)
            if check == True:
                break
        if not check:  # bb cannot be used to extend any of the new bboxes
            nblocks.append(bb)  # so add it to the list
            j = len(nblocks) - 1  # index of it
            temp = nblocks[j]  # new bbox added
        # check if some remaining bbox is contained in temp
        # NOTE(review): when this check fails right after the append above, bb
        # is appended a second time; step 1 of clean_nblocks() drops adjacent
        # equal entries afterwards.
        check = can_extend(temp, bb, bboxes)
        if check == False:
            nblocks.append(bb)
        else:
            nblocks[j] = temp
        # mark bb as consumed so later can_extend() calls skip it
        bboxes[i] = None
    # do some elementary cleaning
    nblocks = clean_nblocks(nblocks)
    # return identified text bboxes
    return nblocks
if __name__ == "__main__":
    # Debugging driver only: draws a red border and the sequence number for
    # every bbox returned by column_boxes(), then saves the result next to the
    # input under the name "<input>-blocks.pdf".
    # Usage: <script> input.pdf [footer_margin] [header_margin]
    cli_args = sys.argv
    filename = cli_args[1]
    # both margins are optional and default to 50
    footer_margin = int(cli_args[2]) if len(cli_args) > 2 else 50
    header_margin = int(cli_args[3]) if len(cli_args) > 3 else 50
    doc = fitz.open(filename)
    red = fitz.pdfcolor["red"]
    for page in doc:
        # remove any geometry issues
        page.wrap_contents()
        bboxes = column_boxes(page, footer_margin=footer_margin, header_margin=header_margin)
        # canvas for the rectangles and their labels
        shape = page.new_shape()
        for seq_no, rect in enumerate(bboxes):
            shape.draw_rect(rect)
            shape.insert_text(rect.tl + (5, 15), str(seq_no), color=red)
        # stroke everything in red and store it on the page
        shape.finish(color=red)
        shape.commit()
    doc.ez_save(filename.replace(".pdf", "-blocks.pdf"))
\ No newline at end of file
--- a/libs/__init__.py
+++ b/libs/__init__.py
--- a/libs/boxbase.py
+++ b/libs/boxbase.py
+from loguru import logger
def _is_in_or_part_overlap(box1, box2) -> bool:
    """Return True when the two bboxes overlap or one contains the other.

    Boxes are ``(x0, y0, x1, y1)``. Boxes that merely touch along an edge
    count as overlapping. ``None`` for either box yields False.
    """
    if box1 is None or box2 is None:
        return False
    ax0, ay0, ax1, ay1 = box1
    bx0, by0, bx1, by1 = box2
    # box1 entirely left of / right of box2
    if ax1 < bx0 or ax0 > bx1:
        return False
    # box1 entirely above / below box2
    if ay1 < by0 or ay0 > by1:
        return False
    return True
def _is_in(box1, box2) -> bool:
    """Return True when ``box1`` lies completely inside ``box2``.

    Boxes are ``(x0, y0, x1, y1)``; shared edges count as inside, so a box is
    inside itself. ``None`` for either box yields False, matching the other
    overlap predicates in this module instead of raising ``TypeError``.
    """
    if box1 is None or box2 is None:
        return False
    x0_1, y0_1, x1_1, y1_1 = box1
    x0_2, y0_2, x1_2, y1_2 = box2
    return (x0_1 >= x0_2 and  # left edge of box1 not outside box2
            y0_1 >= y0_2 and  # top edge of box1 not outside box2
            x1_1 <= x1_2 and  # right edge of box1 not outside box2
            y1_1 <= y1_2)     # bottom edge of box1 not outside box2
def _is_part_overlap(box1, box2) -> bool:
    """Return True when the boxes overlap and ``box1`` is not inside ``box2``.

    Note that only "box1 inside box2" is excluded (via ``_is_in(box1, box2)``);
    the reverse containment is not checked here. ``None`` for either box
    yields False.
    """
    if box1 is None or box2 is None:
        return False
    if not _is_in_or_part_overlap(box1, box2):
        return False
    return not _is_in(box1, box2)
def _left_intersect(left_box, right_box):
    """Check whether ``left_box`` reaches across the left edge of ``right_box``.

    True when the left edge of ``right_box`` lies strictly between the left and
    right edges of ``left_box`` and the top or the bottom of ``right_box``
    falls within the vertical extent of ``left_box``. ``None`` yields False.
    """
    if left_box is None or right_box is None:
        return False
    lx0, ly0, lx1, ly1 = left_box
    rx0, ry0, _, ry1 = right_box
    crosses_left_edge = lx1 > rx0 and lx0 < rx0
    top_within = ly0 <= ry0 <= ly1
    bottom_within = ly0 <= ry1 <= ly1
    return crosses_left_edge and (top_within or bottom_within)
def _right_intersect(left_box, right_box):
    """Check whether ``left_box`` reaches across the right edge of ``right_box``.

    True when the right edge of ``right_box`` lies strictly between the left
    and right edges of ``left_box`` and the top or the bottom of ``right_box``
    falls within the vertical extent of ``left_box``. ``None`` yields False.
    """
    if left_box is None or right_box is None:
        return False
    lx0, ly0, lx1, ly1 = left_box
    _, ry0, rx1, ry1 = right_box
    crosses_right_edge = lx0 < rx1 and lx1 > rx1
    top_within = ly0 <= ry0 <= ly1
    bottom_within = ly0 <= ry1 <= ly1
    return crosses_right_edge and (top_within or bottom_within)
def _is_vertical_full_overlap(box1, box2, x_torlence=2):
    """Check for full containment along x combined with any overlap along y.

    Along x, one box must span the other (each side may be off by up to
    ``x_torlence``); partial x-overlap does not qualify. Along y, the two
    boxes only need to overlap (touching counts).
    """
    left1, top1, right1, bottom1 = box1
    left2, top2, right2, bottom2 = box2
    box1_spans_box2 = left1 - x_torlence <= left2 and right1 + x_torlence >= right2
    box2_spans_box1 = left2 - x_torlence <= left1 and right2 + x_torlence >= right1
    if not (box1_spans_box2 or box2_spans_box1):
        return False
    # overlapping along y means neither box is entirely above the other
    return not (bottom1 < top2 or top1 > bottom2)
def _is_bottom_full_overlap(box1, box2, y_tolerance=2):
    """Check for a slight overlap between the bottom of box1 and the top of box2.

    The vertical overlap ``box1.y1 - box2.y0`` must be positive and smaller
    than ``y_tolerance``. Along x, one box must lie within the other's extent
    widened by a fixed margin of 2 on both sides. ``None`` yields False.
    """
    if box1 is None or box2 is None:
        return False
    ax0, _, ax1, ay1 = box1
    bx0, by0, bx1, _ = box2
    margin = 2

    def _within(value, low, high):
        # is value inside [low - margin, high + margin]?
        return low - margin <= value <= high + margin

    box2_in_box1 = _within(bx0, ax0, ax1) and _within(bx1, ax0, ax1)
    box1_in_box2 = _within(ax0, bx0, bx1) and _within(ax1, bx0, bx1)
    is_xdir_full_overlap = box2_in_box1 or box1_in_box2
    return by0 < ay1 and 0 < (ay1 - by0) < y_tolerance and is_xdir_full_overlap
def _is_left_overlap(box1, box2,):
    """Check whether the left edge of box2 falls inside box1's x-range while
    the boxes overlap enough vertically.

    The vertical condition holds when the shared y-length is at least half the
    height of either box (a zero-height box contributes a ratio of 0); it does
    not matter which box is higher up. ``None`` yields False.
    """
    if box1 is None or box2 is None:
        return False
    ax0, ay0, ax1, ay1 = box1
    bx0, by0, _, by1 = box2
    shared_y = max(0, min(ay1, by1) - max(ay0, by0))
    height_a = ay1 - ay0
    height_b = by1 - by0
    share_of_a = 1.0 * shared_y / height_a if height_a != 0 else 0
    share_of_b = 1.0 * shared_y / height_b if height_b != 0 else 0
    enough_vertical_overlap = share_of_a >= 0.5 or share_of_b >= 0.5
    return ax0 <= bx0 <= ax1 and enough_vertical_overlap
def calculate_iou(bbox1, bbox2):
    """Compute the intersection-over-union of two ``(x0, y0, x1, y1)`` boxes.

    Returns:
        ``0.0`` when the boxes do not intersect, or when the union area is
        zero (both boxes degenerate) - the latter used to raise
        ``ZeroDivisionError``. Otherwise intersection area / union area.
    """
    # coordinates of the intersection rectangle
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])
    if x_right < x_left or y_bottom < y_top:
        return 0.0
    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
    # union = sum of both areas minus the part counted twice
    union_area = bbox1_area + bbox2_area - intersection_area
    if union_area == 0:
        # two zero-area boxes: there is nothing to divide by
        return 0.0
    return intersection_area / float(union_area)
def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
    """Return the overlap area divided by the area of the smaller box.

    Boxes are ``(x0, y0, x1, y1)``. Non-intersecting boxes give ``0.0``; when
    the smaller box has zero area the result is ``0``.
    """
    left = max(bbox1[0], bbox2[0])
    top = max(bbox1[1], bbox2[1])
    right = min(bbox1[2], bbox2[2])
    bottom = min(bbox1[3], bbox2[3])
    if x_or_y_disjoint := (right < left or bottom < top):
        return 0.0
    overlap = (right - left) * (bottom - top)
    area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    area2 = (bbox2[3] - bbox2[1]) * (bbox2[2] - bbox2[0])
    smaller_area = min(area1, area2)
    return 0 if smaller_area == 0 else overlap / smaller_area
def get_bbox_in_boundry(bboxes:list, boundry:tuple)-> list:
    """Return, in their original order, the boxes lying entirely within ``boundry``.

    ``boundry`` is ``(x0, y0, x1, y1)``; a box sitting exactly on the boundary
    edge is kept.
    """
    left, top, right, bottom = boundry

    def _fits(box):
        return box[0] >= left and box[1] >= top and box[2] <= right and box[3] <= bottom

    return list(filter(_fits, bboxes))
def is_vbox_on_side(bbox, width, height, side_threshold=0.2):
    """Tell whether ``bbox`` sits in the left or right margin band of the page.

    The band on each side is ``side_threshold * width`` wide. Only the x-extent
    of ``bbox`` (``bbox[0]`` and ``bbox[2]``) is examined; ``height`` is
    accepted but not used.
    """
    box_left, box_right = bbox[0], bbox[2]
    in_left_band = box_right <= width * side_threshold
    in_right_band = box_left >= width * (1 - side_threshold)
    return in_left_band or in_right_band
def find_top_nearest_text_bbox(pymu_blocks, obj_bbox):
    """Find the closest text block located above ``obj_bbox``.

    A block qualifies when its bottom is above the top of ``obj_bbox`` (up to
    4 units of overlap allowed), it is not inside ``obj_bbox`` according to
    ``_is_in``, and its x-range meets the x-range of ``obj_bbox`` within the
    same 4-unit margin.

    Returns:
        The qualifying block (a dict with a ``'bbox'`` key) whose bottom
        ``y1`` is largest - the first such block on ties - or None.
    """
    tolerance_margin = 4

    def _is_above(bbox):
        return obj_bbox[1] - bbox[3] >= -tolerance_margin and not _is_in(bbox, obj_bbox)

    def _x_ranges_meet(bbox):
        return (obj_bbox[0] - tolerance_margin <= bbox[0] <= obj_bbox[2] + tolerance_margin
                or obj_bbox[0] - tolerance_margin <= bbox[2] <= obj_bbox[2] + tolerance_margin
                or bbox[0] - tolerance_margin <= obj_bbox[0] <= bbox[2] + tolerance_margin
                or bbox[0] - tolerance_margin <= obj_bbox[2] <= bbox[2] + tolerance_margin)

    candidates = [blk for blk in pymu_blocks
                  if _is_above(blk['bbox']) and _x_ranges_meet(blk['bbox'])]
    if not candidates:
        return None
    # nearest above == largest bottom coordinate
    return max(candidates, key=lambda blk: blk['bbox'][3])
def find_bottom_nearest_text_bbox(pymu_blocks, obj_bbox):
    """Find the closest text block located below ``obj_bbox``.

    A block qualifies when its top is below the bottom of ``obj_bbox`` (up to
    2 units of overlap allowed), it is not inside ``obj_bbox`` according to
    ``_is_in``, and its x-range meets the x-range of ``obj_bbox`` within a
    2-unit margin.

    Returns:
        The qualifying block (a dict with a ``'bbox'`` key) whose top ``y0``
        is smallest - the first such block on ties - or None.
    """
    margin = 2

    def _is_below(bbox):
        return bbox[1] - obj_bbox[3] >= -margin and not _is_in(bbox, obj_bbox)

    def _x_ranges_meet(bbox):
        return (obj_bbox[0] - margin <= bbox[0] <= obj_bbox[2] + margin
                or obj_bbox[0] - margin <= bbox[2] <= obj_bbox[2] + margin
                or bbox[0] - margin <= obj_bbox[0] <= bbox[2] + margin
                or bbox[0] - margin <= obj_bbox[2] <= bbox[2] + margin)

    candidates = [blk for blk in pymu_blocks
                  if _is_below(blk['bbox']) and _x_ranges_meet(blk['bbox'])]
    if not candidates:
        return None
    # nearest below == smallest top coordinate
    return min(candidates, key=lambda blk: blk['bbox'][1])
def find_left_nearest_text_bbox(pymu_blocks, obj_bbox):
    """Find the closest text block located to the left of ``obj_bbox``.

    A block qualifies when its right edge is left of the left edge of
    ``obj_bbox`` (up to 2 units of overlap allowed), it is not inside
    ``obj_bbox`` according to ``_is_in``, and its y-range meets the y-range of
    ``obj_bbox`` within a 2-unit margin.

    Returns:
        The qualifying block (a dict with a ``'bbox'`` key) whose right edge
        ``x1`` is largest - the first such block on ties - or None.
    """
    margin = 2

    def _is_on_left(bbox):
        return obj_bbox[0] - bbox[2] >= -margin and not _is_in(bbox, obj_bbox)

    def _y_ranges_meet(bbox):
        return (obj_bbox[1] - margin <= bbox[1] <= obj_bbox[3] + margin
                or obj_bbox[1] - margin <= bbox[3] <= obj_bbox[3] + margin
                or bbox[1] - margin <= obj_bbox[1] <= bbox[3] + margin
                or bbox[1] - margin <= obj_bbox[3] <= bbox[3] + margin)

    candidates = [blk for blk in pymu_blocks
                  if _is_on_left(blk['bbox']) and _y_ranges_meet(blk['bbox'])]
    if not candidates:
        return None
    # nearest on the left == largest right edge
    return max(candidates, key=lambda blk: blk['bbox'][2])
def find_right_nearest_text_bbox(pymu_blocks, obj_bbox):
    """Find the closest text block located to the right of ``obj_bbox``.

    A block qualifies when its left edge is right of the right edge of
    ``obj_bbox`` (up to 2 units of overlap allowed), it is not inside
    ``obj_bbox`` according to ``_is_in``, and its y-range meets the y-range of
    ``obj_bbox`` within a 2-unit margin.

    Returns:
        The qualifying block (a dict with a ``'bbox'`` key) whose left edge
        ``x0`` is smallest - the first such block on ties - or None.
    """
    margin = 2

    def _is_on_right(bbox):
        return bbox[0] - obj_bbox[2] >= -margin and not _is_in(bbox, obj_bbox)

    def _y_ranges_meet(bbox):
        return (obj_bbox[1] - margin <= bbox[1] <= obj_bbox[3] + margin
                or obj_bbox[1] - margin <= bbox[3] <= obj_bbox[3] + margin
                or bbox[1] - margin <= obj_bbox[1] <= bbox[3] + margin
                or bbox[1] - margin <= obj_bbox[3] <= bbox[3] + margin)

    candidates = [blk for blk in pymu_blocks
                  if _is_on_right(blk['bbox']) and _y_ranges_meet(blk['bbox'])]
    if not candidates:
        return None
    # nearest on the right == smallest left edge
    return min(candidates, key=lambda blk: blk['bbox'][0])
--- a/libs/calc_span_stats.py
+++ b/libs/calc_span_stats.py
+import os
+import csv
+import json
+import pandas as pd
+from pandas import DataFrame as df
+from matplotlib import pyplot as plt
+from termcolor import cprint
+"""
+Execute this script in the following way:
+1. Make sure there are pdf_dic.json files under the directory code-clean/tmp/unittest/md/, such as the following:
+    code-clean/tmp/unittest/md/scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178/pdf_dic.json
+2. Under the directory code-clean, execute the following command:
+    $ python -m libs.calc_span_stats
+"""
def print_green_on_red(text):
    """Print ``text`` in bold green on a red background, followed by a blank line."""
    cprint(text, color="green", on_color="on_red", attrs=["bold"], end="\n\n")
def print_green(text):
    """Print an empty line, then ``text`` in bold green, followed by a blank line."""
    print()
    cprint(text, color="green", attrs=["bold"], end="\n\n")
def print_red(text):
    """Print an empty line, then ``text`` in bold red, followed by a blank line."""
    print()
    cprint(text, color="red", attrs=["bold"], end="\n\n")
def safe_get(dict_obj, key, default):
    """Look up ``key`` in ``dict_obj``, falling back to ``default``.

    Unlike ``dict.get(key, default)``, the fallback is also used when the key
    is present but its value is ``None``.
    """
    value = dict_obj.get(key)
    return default if value is None else value
class SpanStatsCalc:
    """Collect per-span font statistics from a parsed-PDF dictionary."""
    def draw_charts(self, span_stats: pd.DataFrame, fig_num: int, save_path: str):
        """Placeholder: only requests figure ``fig_num`` (20x20 inches) from
        pyplot; nothing is drawn or saved yet."""
        plt.figure(fig_num, figsize=(20, 20))
    def calc_stats_per_dict(self, pdf_dict) -> pd.DataFrame:
        """Flatten every span of every ``page_*`` entry into one table row.

        Only keys starting with ``"page_"`` whose value contains
        ``"para_blocks"`` are visited; each para block is expected to carry
        ``"lines"`` and each line ``"spans"``. Missing or ``None`` span fields
        fall back to defaults through ``safe_get``.

        Returns:
            A DataFrame with one row per span: a running ``span_id``, the
            ``page_id``, text/font fields, and the decoded font flags as 0/1
            integers. Empty when no span was found.
        """
        rows = []
        for page_id, blocks in pdf_dict.items():
            if not page_id.startswith("page_"):
                continue
            if "para_blocks" not in blocks.keys():
                continue
            for para_block in blocks["para_blocks"]:
                for line in para_block["lines"]:
                    for span in line["spans"]:
                        decoded = safe_get(span, "decomposed_flags", {})
                        rows.append(
                            {
                                # span ids are consecutive, starting at 0
                                "span_id": len(rows),
                                "page_id": page_id,
                                "span_text": safe_get(span, "text", ""),
                                "span_font_name": safe_get(span, "font", ""),
                                "span_font_size": safe_get(span, "size", 0),
                                "span_font_color": safe_get(span, "color", ""),
                                "span_font_flags": safe_get(span, "flags", 0),
                                # decoded font flags, stored as 0/1
                                "span_is_superscript": int(safe_get(decoded, "is_superscript", False)),
                                "span_is_italic": int(safe_get(decoded, "is_italic", False)),
                                "span_is_serifed": int(safe_get(decoded, "is_serifed", False)),
                                "span_is_sans_serifed": int(safe_get(decoded, "is_sans_serifed", False)),
                                "span_is_monospaced": int(safe_get(decoded, "is_monospaced", False)),
                                "span_is_proportional": int(safe_get(decoded, "is_proportional", False)),
                                "span_is_bold": int(safe_get(decoded, "is_bold", False)),
                            }
                        )
        return pd.DataFrame(rows)
def __find_pdf_dic_files(
    jf_name="pdf_dic.json",
    base_code_name="code-clean",
    tgt_base_dir_name="tmp",
    unittest_dir_name="unittest",
    md_dir_name="md",
    book_names=("scihub",),  # other possible values: "zlib", "arxiv" and so on
):
    """Locate every ``jf_name`` file below the project's unit-test output tree.

    The project root is the part of this module's directory path up to and
    including the first occurrence of ``base_code_name``. For each entry of
    ``book_names`` the tree
    ``<root>/<tgt_base_dir_name>/<unittest_dir_name>/<md_dir_name>/<book>``
    is walked recursively.

    Returns:
        Paths of all matching files; an empty list when ``base_code_name``
        does not occur in the module path or the derived root does not exist.
    """
    curr_dir = os.path.dirname(__file__)
    # first occurrence of the project directory name inside the module path
    anchor = curr_dir.find(base_code_name)
    if anchor == -1:
        return []
    base_code_dir_name = curr_dir[: anchor + len(base_code_name)]
    if not os.path.exists(base_code_dir_name):
        return []
    pdf_dict_files = []
    for book_name in book_names:
        search_dir_name = os.path.join(
            base_code_dir_name, tgt_base_dir_name, unittest_dir_name, md_dir_name, book_name
        )
        for root, _dirs, files in os.walk(search_dir_name):
            pdf_dict_files.extend(os.path.join(root, name) for name in files if name == jf_name)
    return pdf_dict_files
def combine_span_texts(group_df, span_stats):
    """Render every span of ``group_df`` together with its two neighbours.

    For each row, the texts of the previous span, the span itself and the next
    span are looked up in ``span_stats`` by index label (label - 1 / label + 1;
    a missing neighbour becomes an empty string). Each of the three lines is
    prefixed with the same right-arrow marker.

    Returns:
        One three-line snippet per row, snippets separated by a blank line.
    """
    arrow = "→ → → "

    def _text_at(span_id):
        return span_stats.at[span_id, "span_text"] if span_id in span_stats.index else ""

    snippets = []
    for span_id, row in group_df.iterrows():
        context = (_text_at(span_id - 1), row["span_text"], _text_at(span_id + 1))
        snippets.append("\n".join([arrow + piece for piece in context]))
    return "\n\n".join(snippets)
# pd.set_option("display.max_colwidth", None)  # set to None to display the full text of a cell
pd.set_option("display.max_rows", None)  # set to None so printed DataFrames show more rows
def main():
    """Write span statistics (two CSV files and one chart) next to every pdf_dic.json.

    For each file found by ``__find_pdf_dic_files()``:
      * ``span_stats_raw.csv``      - one row per span;
      * ``span_stats_final.csv``    - superscript spans grouped by font name,
        size and color, with a count and the surrounding span texts;
      * ``span_stats_combined.png`` - four bar charts of those counts.
    Files without any superscript span only get the raw CSV.
    """
    span_stats_calc = SpanStatsCalc()
    for pdf_dict_file in __find_pdf_dic_files():
        print("-" * 100)
        print_green_on_red(f"Processing {pdf_dict_file}")
        # keep the file open only while parsing it
        with open(pdf_dict_file, "r", encoding="utf-8") as f:
            pdf_dict = json.load(f)
        raw_df = span_stats_calc.calc_stats_per_dict(pdf_dict)
        save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_raw.csv")
        raw_df.to_csv(save_path, index=False)
        # a document without spans yields a frame without columns, so guard
        # before touching "span_is_superscript"
        if raw_df.empty:
            print("No superscript span found!")
            continue
        filtered_df = raw_df[raw_df["span_is_superscript"] == 1]
        if filtered_df.empty:
            print("No superscript span found!")
            continue
        filtered_grouped_df = filtered_df.groupby(["span_font_name", "span_font_size", "span_font_color"])
        combined_span_texts = filtered_grouped_df.apply(combine_span_texts, span_stats=raw_df)  # type: ignore
        final_df = filtered_grouped_df.size().reset_index(name="count")
        final_df["span_texts"] = combined_span_texts.reset_index(level=[0, 1, 2], drop=True)
        print(final_df)
        final_df["span_texts"] = final_df["span_texts"].apply(lambda x: x.replace("\n", "\r\n"))
        save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_final.csv")
        # UTF-8 with BOM, and every field wrapped in double quotes
        final_df.to_csv(save_path, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)
        # 2x2 chart layout
        fig, axs = plt.subplots(2, 2, figsize=(15, 10))
        try:
            # counts by font name, by font size and by font color
            final_df.groupby("span_font_name")["count"].sum().plot(kind="bar", ax=axs[0, 0], title="By Font Name")
            final_df.groupby("span_font_size")["count"].sum().plot(kind="bar", ax=axs[0, 1], title="By Font Size")
            final_df.groupby("span_font_color")["count"].sum().plot(kind="bar", ax=axs[1, 0], title="By Font Color")
            # counts by the combination of all three
            grouped = final_df.groupby(["span_font_name", "span_font_size", "span_font_color"])
            grouped["count"].sum().unstack().plot(kind="bar", ax=axs[1, 1], title="Combined Grouping")
            fig.tight_layout()
            save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_combined.png")
            fig.savefig(save_path)
        finally:
            # close (not just clear) the figure; otherwise one figure per
            # processed file stays registered with pyplot
            plt.close(fig)
+# Invoke main() only when this file is executed directly, not when it is imported.
+if __name__ == "__main__":
+    main()
--- a/libs/commons.py
+++ b/libs/commons.py
+import datetime
+import os, re, configparser
+import time
+import boto3
+from loguru import logger
+from boto3.s3.transfer import TransferConfig
+from botocore.config import Config
+import fitz  # 1.23.9中已经切换到rebase
+# import fitz_new as fitz  # 使用rebased的新版pymupdf库
+def get_delta_time(input_time):
+    return round(time.time() - input_time, 2)
+def join_path(*args):
+    return '/'.join(s.rstrip('/') for s in args)
+# Global error-log location, defined here so that demos can reference the same path
+error_log_path = "s3://llm-pdf-text/err_logs/"
+# json_dump_path = "s3://pdf_books_temp/json_dump/" # this path is for temporary local testing only and must not be committed to main
+json_dump_path = "s3://llm-pdf-text/json_dump/"
+def get_top_percent_list(num_list, percent):
+    """
+    获取列表中前百分之多少的元素
+    :param num_list:
+    :param percent:
+    :return:
+    """
+    if len(num_list) == 0:
+        top_percent_list = []
+    else:
+        # 对imgs_len_list排序
+        sorted_imgs_len_list = sorted(num_list, reverse=True)
+        # 计算 percent 的索引
+        top_percent_index = int(len(sorted_imgs_len_list) * percent)
+        # 取前80%的元素
+        top_percent_list = sorted_imgs_len_list[:top_percent_index]
+    return top_percent_list
+def formatted_time(time_stamp):
+    dt_object = datetime.datetime.fromtimestamp(time_stamp)
+    output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
+    return output_time
+def mymax(alist: list):
+    if len(alist) == 0:
+        return 0  # 空是0， 0*0也是0大小q
+    else:
+        return max(alist)
+def parse_aws_param(profile):
+    if isinstance(profile, str):
+        # 解析配置文件
+        config_file = join_path(os.path.expanduser("~"), ".aws", "config")
+        credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
+        config = configparser.ConfigParser()
+        config.read(credentials_file)
+        config.read(config_file)
+        # 获取 AWS 账户相关信息
+        ak = config.get(profile, "aws_access_key_id")
+        sk = config.get(profile, "aws_secret_access_key")
+        if profile == "default":
+            s3_str = config.get(f"{profile}", "s3")
+        else:
+            s3_str = config.get(f"profile {profile}", "s3")
+        end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
+        if end_match:
+            endpoint = end_match.group(1)
+        else:
+            raise ValueError(f"aws 配置文件中没有找到 endpoint_url")
+        style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
+        if style_match:
+            addressing_style = style_match.group(1)
+        else:
+            addressing_style = "path"
+    elif isinstance(profile, dict):
+        ak = profile["ak"]
+        sk = profile["sk"]
+        endpoint = profile["endpoint"]
+        addressing_style = "auto"
+    return ak, sk, endpoint, addressing_style
+def parse_bucket_key(s3_full_path: str):
+    """
+    输入 s3://bucket/path/to/my/file.txt
+    输出 bucket, path/to/my/file.txt
+    """
+    s3_full_path = s3_full_path.strip()
+    if s3_full_path.startswith("s3://"):
+        s3_full_path = s3_full_path[5:]
+    if s3_full_path.startswith("/"):
+        s3_full_path = s3_full_path[1:]
+    bucket, key = s3_full_path.split("/", 1)
+    return bucket, key
+def read_file(pdf_path: str, s3_profile):
+    if pdf_path.startswith("s3://"):
+        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
+        cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
+                           config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
+        bucket_name, bucket_key = parse_bucket_key(pdf_path)
+        res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
+        file_content = res["Body"].read()
+        return file_content
+    else:
+        with open(pdf_path, "rb") as f:
+            return f.read()
+def list_dir(dir_path:str, s3_profile:str):
+    """
+    列出dir_path下的所有文件
+    """
+    ret = []
+    if dir_path.startswith("s3"):
+        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
+        s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
+        bucket, path = s3info[0][0], s3info[0][1]
+        try:
+            cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
+                                            config=Config(s3={'addressing_style': addressing_style}))
+            def list_obj_scluster():
+                marker = None
+                while True:
+                    list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
+                    if marker:
+                        list_kwargs['Marker'] = marker
+                    response = cli.list_objects(**list_kwargs)
+                    contents = response.get("Contents", [])
+                    yield from contents
+                    if not response.get("IsTruncated") or len(contents)==0:
+                        break
+                    marker = contents[-1]['Key']
+            for info in list_obj_scluster():
+                file_path = info['Key']
+                #size = info['Size']
+                if path!="":
+                    afile = file_path[len(path):]
+                    if afile.endswith(".json"):
+                        ret.append(f"s3://{bucket}/{file_path}")
+            return ret
+        except Exception as e:
+            logger.exception(e)
+            exit(-1)
+    else: #本地的目录，那么扫描本地目录并返会这个目录里的所有jsonl文件
+        for root, dirs, files in os.walk(dir_path):
+            for file in files:
+                if file.endswith(".json"):
+                    ret.append(join_path(root, file))
+        ret.sort()
+        return ret
+def get_img_s3_client(save_path:str, image_s3_config:str):
+    """
+    """
+    if save_path.startswith("s3://"):  # 放这里是为了最少创建一个s3 client
+        ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
+        img_s3_client = boto3.client(
+            service_name="s3",
+            aws_access_key_id=ak,
+            aws_secret_access_key=sk,
+            endpoint_url=end_point,
+            config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
+        )
+    else:
+        img_s3_client = None
+    return img_s3_client
+# def get_s3_object(path):
+#     src_cli_config = Config(**{
+#
+#         "connect_timeout": 60,
+#         "read_timeout": 20,
+#         "max_pool_connections": 500,
+#         "s3": {
+#             "addressing_style": "path",
+#         },
+#         "retries": {
+#             "max_attempts": 3,
+#         }
+#     })
+#     full_path = f"{bucket_name}/{bucket_prefix}/{path}"
+#     try:
+#         src_cli = boto3.session.Session().client("s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=endpoint, region_name='', config=src_cli_config)
+#         res = src_cli.get_object(Bucket=bucket_name, Key=f"{bucket_prefix}/{path}")
+#         file_content = res["Body"].read()
+#         return file_content
+#     except Exception as e:
+#         logger.error(f"get_s3_object({full_path}) error: {e}")
+#         return b''
+# Manual smoke test: list the .json objects below a hard-coded S3 prefix and print them.
+if __name__=="__main__":
+    s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
+    s3_profile = "langchao"
+    ret = list_dir(s3_path, s3_profile)
+    print(ret)
\ No newline at end of file
--- a/libs/drop_reason.py
+++ b/libs/drop_reason.py
+class DropReason:
+    TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap" # 文字块有水平互相覆盖，导致无法准确定位文字顺序
+    COMPLICATED_LAYOUT = "complicated_layout" # 复杂的布局，暂时不支持
+    TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns" # 目前不支持分栏超过2列的
+    COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box" # 含有带色块的PDF，色块会改变阅读顺序，目前不支持带底色文字块的PDF。
+    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = "high_computational_load_by_imgs" # 含特殊图片，计算量太大，从而丢弃
+    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs" # 特殊的SVG图，计算量太大，从而丢弃
+    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages" # 计算量超过负荷，当前方法下计算量消耗过大
+    MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result" # 版面分析失败
+    Exception = "exception" # 解析中发生异常
+    ENCRYPTED = "encrypted" # PDF是加密的
+    EMPTY_PDF = "total_page=0" # PDF页面总数为0
+    NOT_IS_TEXT_PDF = "not_is_text_pdf" # 不是文字版PDF，无法直接解析
+    DENSE_SINGLE_LINE_BLOCK = "dense_single_line_block" # 无法清晰的分段
+    TITLE_DETECTION_FAILED = "title_detection_failed" # 探测标题失败
+    TITLE_LEVEL_FAILED = "title_level_failed" # 分析标题级别失败（例如一级、二级、三级标题）
+    PARA_SPLIT_FAILED = "para_split_failed" # 识别段落失败
+    PARA_MERGE_FAILED = "para_merge_failed" # 段落合并失败
+    NOT_ALLOW_LANGUAGE = "not_allow_language" # 不支持的语种
+    SPECIAL_PDF = "special_pdf"
+    PSEUDO_SINGLE_COLUMN = "pseudo_single_column" # 无法精确判断文字分栏
+    CAN_NOT_DETECT_PAGE_LAYOUT="can_not_detect_page_layout" # 无法分析页面的版面
\ No newline at end of file
--- a/libs/drop_tag.py
+++ b/libs/drop_tag.py
+# Drop-tag value; the name suggests a header text block on a colored background - TODO confirm against callers
+COLOR_BG_HEADER_TXT_BLOCK = "color_background_header_txt_block"
\ No newline at end of file