Commit c2e5c36f authored by 赵小蒙

Initial commit
"""
去掉正文的引文引用marker
https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d
"""
import re
from loguru import logger
from libs.nlp_utils import NLPModels
__NLP_MODEL = NLPModels()
def check_1(spans, cur_span_i):
"""寻找前一个char,如果是句号,逗号,那么就是角标"""
if cur_span_i==0:
return False # not a citation marker
pre_span = spans[cur_span_i-1]
pre_char = pre_span['chars'][-1]['c']
if pre_char in ['。', ',', '.', ',']:
return True
return False
def check_2(spans, cur_span_i):
"""检查前面一个span的最后一个单词,如果长度大于5,全都是字母,并且不含大写,就是角标"""
pattern = r'\b[A-Z]\.\s[A-Z][a-z]*\b' # 形如A. Bcde, L. Bcde, 人名的缩写
if cur_span_i==0 and len(spans)>1:
next_span = spans[cur_span_i+1]
next_txt = "".join([c['c'] for c in next_span['chars']])
result = __NLP_MODEL.detect_entity_catgr_using_nlp(next_txt)
if result in ["PERSON", "GPE", "ORG"]:
return True
if re.findall(pattern, next_txt):
return True
return False # not a citation marker
elif cur_span_i==0 and len(spans)==1: # the marker would occupy the whole line? delete with caution
return False
# if this span is the last span
if cur_span_i==len(spans)-1:
pre_span = spans[cur_span_i-1]
pre_txt = "".join([c['c'] for c in pre_span['chars']])
pre_word = pre_txt.split(' ')[-1]
result = __NLP_MODEL.detect_entity_catgr_using_nlp(pre_txt)
if result in ["PERSON", "GPE", "ORG"]:
return True
if re.findall(pattern, pre_txt):
return True
return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
else: # neither the first nor the last span: attach the marker to whichever neighboring word is closer
pre_span = spans[cur_span_i-1]
next_span = spans[cur_span_i+1]
cur_span = spans[cur_span_i]
# find the nearest word in the previous and next spans
pre_distance = 10000 # a very large number
next_distance = 10000 # a very large number
for c in pre_span['chars'][::-1]:
if c['c'].isalpha():
pre_distance = cur_span['bbox'][0] - c['bbox'][2]
break
for c in next_span['chars']:
if c['c'].isalpha():
next_distance = c['bbox'][0] - cur_span['bbox'][2]
break
if pre_distance<next_distance:
belong_to_span = pre_span
else:
belong_to_span = next_span
txt = "".join([c['c'] for c in belong_to_span['chars']])
pre_word = txt.split(' ')[-1]
result = __NLP_MODEL.detect_entity_catgr_using_nlp(txt)
if result in ["PERSON", "GPE", "ORG"]:
return True
if re.findall(pattern, txt):
return True
return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
def check_3(spans, cur_span_i):
"""上标里有[], 有*, 有-, 有逗号"""
# 如[2-3],[22]
# 如 2,3,4
cur_span_txt = ''.join(c['c'] for c in spans[cur_span_i]['chars']).strip()
bad_char = ['[', ']', '*', ',']
if any([c in cur_span_txt for c in bad_char]) and any(character.isdigit() for character in cur_span_txt):
return True
# e.g. 2-3, a-b
patterns = [r'\d+-\d+', r'[a-zA-Z]-[a-zA-Z]', r'[a-zA-Z],[a-zA-Z]']
for pattern in patterns:
match = re.match(pattern, cur_span_txt)
if match is not None:
return True
return False
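# Illustrative sketch (not part of the original code): a few superscript strings and
# whether check_3-style matching would flag them as citation markers. The _demo_spans
# helper below is hypothetical and only builds the minimal span/char structure check_3 expects.
def _demo_check_3():
    def _demo_spans(txt):
        return [{'chars': [{'c': ch} for ch in txt]}]
    samples = ["[2-3]", "2,3,4", "2-3", "a-b", "hello"]
    return {s: check_3(_demo_spans(s), 0) for s in samples}
    # expected roughly: {'[2-3]': True, '2,3,4': True, '2-3': True, 'a-b': True, 'hello': False}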
def remove_citation_marker(with_char_text_blcoks):
for blk in with_char_text_blcoks:
for line in blk['lines']:
# if the line has fewer than 2 spans, skip it; a citation marker never occupies a whole line by itself
if len(line['spans'])<=1:
continue
# use the tallest span in the line as the baseline for position comparison
max_hi_span = line['spans'][0]['bbox']
min_font_sz = 10000
for s in line['spans']:
if max_hi_span[3]-max_hi_span[1]<s['bbox'][3]-s['bbox'][1]:
max_hi_span = s['bbox']
if min_font_sz>s['size']:
min_font_sz = s['size']
base_span_mid_y = (max_hi_span[3]+max_hi_span[1])/2
span_to_del = []
for i, span in enumerate(line['spans']):
span_hi = span['bbox'][3]-span['bbox'][1]
span_mid_y = (span['bbox'][3]+span['bbox'][1])/2
span_font_sz = span['size']
if (base_span_mid_y-span_mid_y)/span_hi>0.2 or (base_span_mid_y-span_mid_y>0 and abs(span_font_sz-min_font_sz)/min_font_sz<0.1):
"""
1. 它的前一个char如果是句号或者逗号的话,那么肯定是角标而不是公式
2. 如果这个角标的前面是一个单词(长度大于5)而不是任何大写或小写的短字母的话 应该也是角标
3. 上标里有数字和逗号或者数字+星号的组合,方括号,一般肯定就是角标了
4. 这个角标属于前文还是后文要根据距离来判断,如果距离前面的文本太近,那么就是前面的角标,否则就是后面的角标
"""
if check_1(line['spans'], i) or check_2(line['spans'], i) or check_3(line['spans'], i):
"""删除掉这个角标:删除这个span, 同时还要更新line的text"""
span_to_del.append(span)
if len(span_to_del)>0:
for span in span_to_del:
line['spans'].remove(span)
line['text'] = ''.join([c['c'] for s in line['spans'] for c in s['chars']])
return with_char_text_blcoks
def construct_page_component(page_id, image_info, table_info, text_blocks_preproc, layout_bboxes, inline_eq_info, interline_eq_info, raw_pymu_blocks,
removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup,layout_tree,
page_w, page_h, footnote_bboxes_tmp):
"""
"""
return_dict = {}
return_dict['para_blocks'] = {}
return_dict['preproc_blocks'] = text_blocks_preproc
return_dict['images'] = image_info
return_dict['tables'] = table_info
return_dict['interline_equations'] = interline_eq_info
return_dict['inline_equations'] = inline_eq_info
return_dict['layout_bboxes'] = layout_bboxes
return_dict['pymu_raw_blocks'] = raw_pymu_blocks
return_dict['global_statistic'] = {}
return_dict['droped_text_block'] = removed_text_blocks
return_dict['droped_image_block'] = removed_image_blocks
return_dict['droped_table_block'] = []
return_dict['image_backup'] = images_backup
return_dict['table_backup'] = []
return_dict['page_idx'] = page_id
return_dict['page_size'] = [page_w, page_h]
return_dict['_layout_tree'] = layout_tree # kept to help analyze and debug the layout
return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp
return return_dict
from collections import defaultdict
from loguru import logger
from libs.boxbase import _is_in, calculate_iou
def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)
def is_single_line_block(block):
# Determine based on the width and height of the block
block_width = block["X1"] - block["X0"]
block_height = block["bbox"][3] - block["bbox"][1]
# If the height of the block is close to the average character height and the width is large, it is considered a single line
return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3
def get_most_common_bboxes(bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
"""
This function gets the most common bboxes from the bboxes
Parameters
----------
bboxes : list
bboxes
page_height : float
height of the page
position : str, optional
"top" or "bottom", by default "top"
threshold : float, optional
threshold, by default 0.25
num_bboxes : int, optional
number of bboxes to return, by default 3
min_frequency : int, optional
minimum frequency of the bbox, by default 2
Returns
-------
common_bboxes : list
common bboxes
"""
# Filter bbox by position
if position == "top":
filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold]
else:
filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)]
# Find the most common bbox
bbox_count = defaultdict(int)
for bbox in filtered_bboxes:
bbox_count[tuple(bbox)] += 1
# Get the most frequently occurring bbox, but only consider it when the frequency exceeds min_frequency
common_bboxes = [
bbox for bbox, count in sorted(bbox_count.items(), key=lambda item: item[1], reverse=True) if count >= min_frequency
][:num_bboxes]
return common_bboxes
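# Minimal usage sketch (illustrative only, numbers are made up): with synthetic bboxes on a
# page of height 1000, a bbox repeated near the top is returned as a common header candidate.
def _demo_get_most_common_bboxes():
    bboxes = [[50, 20, 500, 40]] * 3 + [[50, 700, 500, 720]]
    return get_most_common_bboxes(bboxes, page_height=1000, position="top")
    # expected: [(50, 20, 500, 40)] -- it appears 3 times within the top 25% of the page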
def detect_footer_header2(result_dict, similarity_threshold=0.5):
"""
This function detects the header and footer of the document.
Parameters
----------
result_dict : dict
result dictionary
Returns
-------
result_dict : dict
result dictionary
"""
# Traverse all blocks in the document
single_line_blocks = 0
total_blocks = 0
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_key, block in blocks.items():
if block_key.startswith("block_"):
total_blocks += 1
if is_single_line_block(block):
single_line_blocks += 1
# If there are no blocks, skip the header and footer detection
if total_blocks == 0:
print("No blocks found. Skipping header/footer detection.")
return result_dict
# If most of the blocks are single-line, skip the header and footer detection
if single_line_blocks / total_blocks > 0.5: # 50% of the blocks are single-line
# print("Skipping header/footer detection for text-dense document.")
return result_dict
# Collect the bounding boxes of all blocks
all_bboxes = []
all_texts = []
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_key, block in blocks.items():
if block_key.startswith("block_"):
all_bboxes.append(block["bbox"])
# Get the height of the page
page_height = max(bbox[3] for bbox in all_bboxes)
# Get the most common bbox lists for headers and footers
common_header_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="top") if all_bboxes else []
common_footer_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="bottom") if all_bboxes else []
# Detect and mark headers and footers
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_key, block in blocks.items():
if block_key.startswith("block_"):
bbox = block["bbox"]
text = block["text"]
is_header = compare_bbox_with_list(bbox, common_header_bboxes)
is_footer = compare_bbox_with_list(bbox, common_footer_bboxes)
block["is_header"] = int(is_header)
block["is_footer"] = int(is_footer)
return result_dict
def __get_page_size(page_sizes:list):
"""
页面大小可能不一样
"""
w = sum([w for w,h in page_sizes])/len(page_sizes)
h = sum([h for w,h in page_sizes])/len(page_sizes)
return w, h
def __calculate_iou(bbox1, bbox2):
iou = calculate_iou(bbox1, bbox2)
return iou
def __is_same_pos(box1, box2, iou_threshold):
iou = __calculate_iou(box1, box2)
return iou >= iou_threshold
def get_most_common_bbox(bboxes:list, page_size:list, page_cnt:int, page_range_threshold=0.2, iou_threshold=0.9):
"""
common bbox必须大于page_cnt的1/3
"""
min_occurance_cnt = max(3, page_cnt//4)
header_det_bbox = []
footer_det_bbox = []
hdr_same_pos_group = []
btn_same_pos_group = []
page_w, page_h = __get_page_size(page_size)
top_y, bottom_y = page_h*page_range_threshold, page_h*(1-page_range_threshold)
top_bbox = [b for b in bboxes if b[3]<top_y]
bottom_bbox = [b for b in bboxes if b[1]>bottom_y]
# group the bboxes by position: two bboxes count as the same position if their IOU > iou_threshold
for i in range(0, len(top_bbox)):
hdr_same_pos_group.append([top_bbox[i]])
for j in range(i+1, len(top_bbox)):
if __is_same_pos(top_bbox[i], top_bbox[j], iou_threshold):
#header_det_bbox = [min(top_bbox[i][0], top_bbox[j][0]), min(top_bbox[i][1], top_bbox[j][1]), max(top_bbox[i][2], top_bbox[j][2]), max(top_bbox[i][3],top_bbox[j][3])]
hdr_same_pos_group[i].append(top_bbox[j])
for i in range(0, len(bottom_bbox)):
btn_same_pos_group.append([bottom_bbox[i]])
for j in range(i+1, len(bottom_bbox)):
if __is_same_pos(bottom_bbox[i], bottom_bbox[j], iou_threshold):
#footer_det_bbox = [min(bottom_bbox[i][0], bottom_bbox[j][0]), min(bottom_bbox[i][1], bottom_bbox[j][1]), max(bottom_bbox[i][2], bottom_bbox[j][2]), max(bottom_bbox[i][3],bottom_bbox[j][3])]
btn_same_pos_group[i].append(bottom_bbox[j])
# keep only the groups that appear on enough pages
hdr_same_pos_group = [g for g in hdr_same_pos_group if len(g)>=min_occurance_cnt]
btn_same_pos_group = [g for g in btn_same_pos_group if len(g)>=min_occurance_cnt]
# flatten the two list-of-lists
hdr_same_pos_group = [bbox for g in hdr_same_pos_group for bbox in g]
btn_same_pos_group = [bbox for g in btn_same_pos_group for bbox in g]
# take the maximum box[3] in hdr_same_pos_group and the minimum box[1] in btn_same_pos_group
hdr_same_pos_group.sort(key=lambda b:b[3])
btn_same_pos_group.sort(key=lambda b:b[1])
hdr_y = hdr_same_pos_group[-1][3] if hdr_same_pos_group else 0
btn_y = btn_same_pos_group[0][1] if btn_same_pos_group else page_h
header_det_bbox = [0, 0, page_w, hdr_y]
footer_det_bbox = [0, btn_y, page_w, page_h]
# logger.warning(f"header: {header_det_bbox}, footer: {footer_det_bbox}")
return header_det_bbox, footer_det_bbox, page_w, page_h
def drop_footer_header(pdf_info_dict:dict):
"""
启用规则探测,在全局的视角上通过统计的方法。
"""
header = []
footer = []
all_text_bboxes = [blk['bbox'] for _, val in pdf_info_dict.items() for blk in val['preproc_blocks']]
image_bboxes = [img['bbox'] for _, val in pdf_info_dict.items() for img in val['images']] + [img['bbox'] for _, val in pdf_info_dict.items() for img in val['image_backup']]
page_size = [val['page_size'] for _, val in pdf_info_dict.items()]
page_cnt = len(pdf_info_dict.keys()) # total number of pages
header, footer, page_w, page_h = get_most_common_bbox(all_text_bboxes+image_bboxes, page_size, page_cnt)
""""
把范围扩展到页面水平的整个方向上
"""
if header:
header = [0, 0, page_w, header[3]+1]
if footer:
footer = [0, footer[1]-1, page_w, page_h]
# once the header/footer ranges are known, remove text and image content inside them on every page
# remove text blocks
for _, page_info in pdf_info_dict.items():
header_text_blk = []
footer_text_blk = []
for blk in page_info['preproc_blocks']:
blk_bbox = blk['bbox']
if header and blk_bbox[3]<=header[3]:
blk['tag'] = "header"
header_text_blk.append(blk)
elif footer and blk_bbox[1]>=footer[1]:
blk['tag'] = "footer"
footer_text_blk.append(blk)
# move them into droped_text_block
page_info['droped_text_block'].extend(header_text_blk)
page_info['droped_text_block'].extend(footer_text_blk)
for blk in header_text_blk:
page_info['preproc_blocks'].remove(blk)
for blk in footer_text_blk:
page_info['preproc_blocks'].remove(blk)
"""接下来把footer、header上的图片也删除掉。图片包括正常的和backup的"""
header_image = []
footer_image = []
for image_info in page_info['images']:
img_bbox = image_info['bbox']
if header and img_bbox[3]<=header[3]:
image_info['tag'] = "header"
header_image.append(image_info)
elif footer and img_bbox[1]>=footer[1]:
image_info['tag'] = "footer"
footer_image.append(image_info)
page_info['droped_image_block'].extend(header_image)
page_info['droped_image_block'].extend(footer_image)
for img in header_image:
page_info['images'].remove(img)
for img in footer_image:
page_info['images'].remove(img)
"""接下来吧backup的图片也删除掉"""
header_image = []
footer_image = []
for image_info in page_info['image_backup']:
img_bbox = image_info['bbox']
if header and img_bbox[3]<=header[3]:
image_info['tag'] = "header"
header_image.append(image_info)
elif footer and img_bbox[1]>=footer[1]:
image_info['tag'] = "footer"
footer_image.append(image_info)
page_info['droped_image_block'].extend(header_image)
page_info['droped_image_block'].extend(footer_image)
for img in header_image:
page_info['image_backup'].remove(img)
for img in footer_image:
page_info['image_backup'].remove(img)
return header, footer
"""
对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果
"""
import fitz
import json
import os
from pathlib import Path
from loguru import logger
TYPE_INLINE_EQUATION = "inline-equation"
TYPE_INTERLINE_EQUATION = "interline-equation"
def combine_chars_to_pymudict(block_dict, char_dict):
"""
把block级别的pymupdf 结构里加入char结构
"""
# 因为block_dict 被裁剪过,因此先把他和char_dict文字块对齐,才能进行补充
char_map = {tuple(item['bbox']):item for item in char_dict}
for i in range(len(block_dict)): # block
block = block_dict[i]
key = block['bbox']
char_dict_item = char_map[tuple(key)]
char_dict_map = {tuple(item['bbox']):item for item in char_dict_item['lines']}
for j in range(len(block['lines'])):
lines = block['lines'][j]
with_char_lines = char_dict_map[lines['bbox']]
for k in range(len(lines['spans'])):
spans = lines['spans'][k]
try:
spans['chars'] = with_char_lines['spans'][k]['chars']
except Exception:
# on a mismatch, log the offending line and leave this span without char data
logger.error(char_dict[i]['lines'][j])
return block_dict
def calculate_overlap_area_2_minbox_area_ratio(bbox1, min_bbox):
"""
计算box1和box2的重叠面积占最小面积的box的比例
"""
# Determine the coordinates of the intersection rectangle
x_left = max(bbox1[0], min_bbox[0])
y_top = max(bbox1[1], min_bbox[1])
x_right = min(bbox1[2], min_bbox[2])
y_bottom = min(bbox1[3], min_bbox[3])
if x_right < x_left or y_bottom < y_top:
return 0.0
# The area of overlap area
intersection_area = (x_right - x_left) * (y_bottom - y_top)
min_box_area = (min_bbox[3]-min_bbox[1])*(min_bbox[2]-min_bbox[0])
if min_box_area==0:
return 0
else:
return intersection_area / min_box_area
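# Illustrative sketch (coordinates are made up): a small box fully inside a big box gives
# ratio 1.0; a box shifted halfway out of it gives 0.5.
def _demo_overlap_ratio():
    big = [0, 0, 100, 100]
    small_inside = [10, 10, 20, 20]   # fully contained -> 1.0
    half_out = [90, 0, 110, 100]      # only half of it overlaps big -> 0.5
    return (calculate_overlap_area_2_minbox_area_ratio(big, small_inside),
            calculate_overlap_area_2_minbox_area_ratio(big, half_out))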
def _is_xin(bbox1, bbox2):
area1 = abs(bbox1[2]-bbox1[0])*abs(bbox1[3]-bbox1[1])
area2 = abs(bbox2[2]-bbox2[0])*abs(bbox2[3]-bbox2[1])
if area1<area2:
ratio = calculate_overlap_area_2_minbox_area_ratio(bbox2, bbox1)
else:
ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
return ratio>0.6
def remove_text_block_in_interline_equation_bbox(interline_bboxes, text_blocks):
"""消除掉整个块都在行间公式块内部的文本块"""
for eq_bbox in interline_bboxes:
removed_txt_blk = []
for text_blk in text_blocks:
text_bbox = text_blk['bbox']
if calculate_overlap_area_2_minbox_area_ratio(eq_bbox['bbox'], text_bbox)>=0.7:
removed_txt_blk.append(text_blk)
for blk in removed_txt_blk:
text_blocks.remove(blk)
return text_blocks
def _is_in_or_part_overlap(box1, box2) -> bool:
"""
两个bbox是否有部分重叠或者包含
"""
if box1 is None or box2 is None:
return False
x0_1, y0_1, x1_1, y1_1 = box1
x0_2, y0_2, x1_2, y1_2 = box2
return not (x1_1 < x0_2 or # box1 is to the left of box2
x0_1 > x1_2 or # box1 is to the right of box2
y1_1 < y0_2 or # box1 is above box2
y0_1 > y1_2) # box1 is below box2
def remove_text_block_overlap_interline_equation_bbox(interline_eq_bboxes, pymu_block_list):
"""消除掉行行内公式有部分重叠的文本块的内容。
同时重新计算消除重叠之后文本块的大小"""
deleted_block = []
for text_block in pymu_block_list:
deleted_line = []
for line in text_block['lines']:
deleted_span = []
for span in line['spans']:
deleted_chars = []
for char in span['chars']:
if any([_is_in_or_part_overlap(char['bbox'], eq_bbox['bbox']) for eq_bbox in interline_eq_bboxes]):
deleted_chars.append(char)
# if the span has no chars left, delete the span
for char in deleted_chars:
span['chars'].remove(char)
# recompute the span's bbox
if len(span['chars'])==0: # delete this span
deleted_span.append(span)
else:
span['bbox'] = min([b['bbox'][0] for b in span['chars']]),min([b['bbox'][1] for b in span['chars']]),max([b['bbox'][2] for b in span['chars']]), max([b['bbox'][3] for b in span['chars']])
# drop the emptied spans
for span in deleted_span:
line['spans'].remove(span)
if len(line['spans'])==0: # delete this line
deleted_line.append(line)
else:
line['bbox'] = min([b['bbox'][0] for b in line['spans']]),min([b['bbox'][1] for b in line['spans']]),max([b['bbox'][2] for b in line['spans']]), max([b['bbox'][3] for b in line['spans']])
# check whether this block can be deleted
for line in deleted_line:
text_block['lines'].remove(line)
if len(text_block['lines'])==0: # delete the block
deleted_block.append(text_block)
else:
text_block['bbox'] = min([b['bbox'][0] for b in text_block['lines']]),min([b['bbox'][1] for b in text_block['lines']]),max([b['bbox'][2] for b in text_block['lines']]), max([b['bbox'][3] for b in text_block['lines']])
# finally remove the emptied text blocks
for block in deleted_block:
pymu_block_list.remove(block)
if len(pymu_block_list)==0:
return []
return pymu_block_list
def insert_interline_equations_textblock(interline_eq_bboxes, pymu_block_list):
"""在行间公式对应的地方插上一个伪造的block"""
for eq in interline_eq_bboxes:
bbox = eq['bbox']
latex_content = eq['latex_text']
text_block = {
"number": len(pymu_block_list),
"type": 0,
"bbox": bbox,
"lines": [
{
"spans": [
{
"size": 9.962599754333496,
"_type": TYPE_INTERLINE_EQUATION,
"flags": 4,
"font": TYPE_INTERLINE_EQUATION,
"color": 0,
"ascender": 0.9409999847412109,
"descender": -0.3050000071525574,
"text": f"\n$$\n{latex_content}\n$$\n",
"origin": [
bbox[0],
bbox[1]
],
"bbox": bbox
}
],
"wmode": 0,
"dir": [
1.0,
0.0
],
"bbox": bbox
}
]
}
pymu_block_list.append(text_block)
def x_overlap_ratio(box1, box2):
a, _, c, _ = box1
e, _, g, _ = box2
# overlap width
overlap_x = max(min(c, g) - max(a, e), 0)
# width of box2 (the denominator: the ratio measures how much of box2 is covered)
width1 = g - e
# overlap ratio
overlap_ratio = overlap_x / width1 if width1 != 0 else 0
return overlap_ratio
def __is_x_dir_overlap(bbox1, bbox2):
return not (bbox1[2]<bbox2[0] or bbox1[0]>bbox2[2])
def __y_overlap_ratio(box1, box2):
""""""
_, b, _, d = box1
_, f, _, h = box2
# 计算重叠高度
overlap_y = max(min(d, h) - max(b, f), 0)
# 计算box1的高度
height1 = d - b
# 计算重叠比例
overlap_ratio = overlap_y / height1 if height1 != 0 else 0
return overlap_ratio
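# Illustrative sketch (made-up coordinates): x_overlap_ratio measures how much of the second
# box is covered horizontally, __y_overlap_ratio how much of the first box is covered vertically.
def _demo_overlap_ratios():
    eq_box = [100, 50, 200, 70]
    char_box = [190, 50, 210, 70]   # only the left half of the char overlaps the equation box
    line_box = [100, 45, 400, 75]   # the line fully covers the equation box vertically
    return x_overlap_ratio(eq_box, char_box), __y_overlap_ratio(eq_box, line_box)
    # expected: (0.5, 1.0)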
def replace_line_v2(eqinfo, line):
"""
扫描这一行所有的和公式框X方向重叠的char,然后计算char的左、右x0, x1,位于这个区间内的span删除掉。
最后与这个x0,x1有相交的span0, span1内部进行分割。
"""
first_overlap_span = -1
first_overlap_span_idx = -1
last_overlap_span = -1
delete_chars = []
for i in range(0, len(line['spans'])):
if line['spans'][i].get("_type", None) is not None:
continue # skip: this is already an inserted fake equation span
for char in line['spans'][i]['chars']:
if __is_x_dir_overlap(eqinfo['bbox'], char['bbox']):
line_txt = ""
for span in line['spans']:
span_txt = "<span>"
for ch in span['chars']:
span_txt = span_txt + ch['c']
span_txt = span_txt + "</span>"
line_txt = line_txt + span_txt
if first_overlap_span_idx == -1:
first_overlap_span = line['spans'][i]
first_overlap_span_idx = i
last_overlap_span = line['spans'][i]
delete_chars.append(char)
# check the first and last char: do they belong more to the equation or to the normal span?
if len(delete_chars)>0:
ch0_bbox = delete_chars[0]['bbox']
if x_overlap_ratio(eqinfo['bbox'], ch0_bbox)<0.51:
delete_chars.remove(delete_chars[0])
if len(delete_chars)>0:
ch0_bbox = delete_chars[-1]['bbox']
if x_overlap_ratio(eqinfo['bbox'], ch0_bbox)<0.51:
delete_chars.remove(delete_chars[-1])
# compute the actual x0, x1 of the chars in the deleted x-range
if len(delete_chars):
x0, x1 = min([b['bbox'][0] for b in delete_chars]), max([b['bbox'][2] for b in delete_chars])
else:
logger.debug(f"行内公式替换没有发生,尝试下一行匹配, eqinfo={eqinfo}")
return False
# delete the spans located between x0 and x1
delete_span = []
for span in line['spans']:
span_box = span['bbox']
if x0<=span_box[0] and span_box[2]<=x1:
delete_span.append(span)
for span in delete_span:
line['spans'].remove(span)
equation_span = {
"size": 9.962599754333496,
"_type": TYPE_INLINE_EQUATION,
"flags": 4,
"font": TYPE_INLINE_EQUATION,
"color": 0,
"ascender": 0.9409999847412109,
"descender": -0.3050000071525574,
"text": "",
"origin": [
337.1410153102337,
216.0205245153934
],
"bbox": [
337.1410153102337,
216.0205245153934,
390.4496373892022,
228.50171037628277
]
}
#equation_span = line['spans'][0].copy()
equation_span['text'] = f" ${eqinfo['latex_text']}$ "
equation_span['bbox'] = [x0, equation_span['bbox'][1], x1, equation_span['bbox'][3]]
equation_span['origin'] = [equation_span['bbox'][0], equation_span['bbox'][1]]
equation_span['chars'] = delete_chars
equation_span['_type'] = TYPE_INLINE_EQUATION
equation_span['_eq_bbox'] = eqinfo['bbox']
line['spans'].insert(first_overlap_span_idx+1, equation_span) # insert the equation span
# logger.info(f"==>text is 【{line_txt}】, equation is 【{eqinfo['latex_text']}】")
# split the first and last overlapping spans and put the remaining parts back in place
first_span_chars = [char for char in first_overlap_span['chars'] if (char['bbox'][2]+char['bbox'][0])/2<x0]
tail_span_chars = [char for char in last_overlap_span['chars'] if (char['bbox'][0]+char['bbox'][2])/2>x1]
if len(first_span_chars)>0:
first_overlap_span['chars'] = first_span_chars
first_overlap_span['text'] = ''.join([char['c'] for char in first_span_chars])
first_overlap_span['bbox'] = (first_overlap_span['bbox'][0], first_overlap_span['bbox'][1], max([chr['bbox'][2] for chr in first_span_chars]), first_overlap_span['bbox'][3])
# first_overlap_span['_type'] = "first"
else:
# drop it
if first_overlap_span not in delete_span:
line['spans'].remove(first_overlap_span)
if len(tail_span_chars)>0:
if last_overlap_span==first_overlap_span: # in this case a new span must be inserted
tail_span_txt = ''.join([char['c'] for char in tail_span_chars])
last_span_to_insert = last_overlap_span.copy()
last_span_to_insert['chars'] = tail_span_chars
last_span_to_insert['text'] = ''.join([char['c'] for char in tail_span_chars])
last_span_to_insert['bbox'] = (min([chr['bbox'][0] for chr in tail_span_chars]), last_overlap_span['bbox'][1], last_overlap_span['bbox'][2], last_overlap_span['bbox'][3])
# insert right after the equation span
equation_idx = line['spans'].index(equation_span)
line['spans'].insert(equation_idx+1, last_span_to_insert) # insert the tail span after the equation
else: # modify the original span in place
last_overlap_span['chars'] = tail_span_chars
last_overlap_span['text'] = ''.join([char['c'] for char in tail_span_chars])
last_overlap_span['bbox'] = (min([chr['bbox'][0] for chr in tail_span_chars]), last_overlap_span['bbox'][1], last_overlap_span['bbox'][2], last_overlap_span['bbox'][3])
else:
# drop it
if last_overlap_span not in delete_span and last_overlap_span!=first_overlap_span:
line['spans'].remove(last_overlap_span)
remain_txt = ""
for span in line['spans']:
span_txt = "<span>"
for char in span['chars']:
span_txt = span_txt + char['c']
span_txt = span_txt + "</span>"
remain_txt = remain_txt + span_txt
# logger.info(f"<== succ replace, text is 【{remain_txt}】, equation is 【{eqinfo['latex_text']}】")
return True
def replace_eq_blk(eqinfo, text_block):
"""替换行内公式"""
for line in text_block['lines']:
line_bbox = line['bbox']
if _is_xin(eqinfo['bbox'], line_bbox) or __y_overlap_ratio(eqinfo['bbox'], line_bbox)>0.6: # locate the line; the y-direction overlap ratio is used because sometimes a line is narrower than the equation box (tall line, narrow equation)
replace_succ = replace_line_v2(eqinfo, line)
if not replace_succ: # sometimes the line height reported by the API is wrong, so the span-level replacement fails and we retry on the next line
continue
else:
break
else:
return False
return True
def replace_inline_equations(inline_equation_bboxes, raw_text_blocks):
"""替换行内公式"""
for eqinfo in inline_equation_bboxes:
eqbox = eqinfo['bbox']
for blk in raw_text_blocks:
if _is_xin(eqbox, blk['bbox']):
if not replace_eq_blk(eqinfo, blk):
logger.error(f"行内公式没有替换成功:{eqinfo} ")
else:
break
return raw_text_blocks
def remove_chars_in_text_blocks(text_blocks):
"""删除text_blocks里的char"""
for blk in text_blocks:
for line in blk['lines']:
for span in line['spans']:
_ = span.pop("chars", "no such key")
return text_blocks
def replace_equations_in_textblock(raw_text_blocks, inline_equation_bboxes, interline_equation_bboxes):
"""
替换行间和和行内公式为latex
"""
raw_text_blocks = remove_text_block_in_interline_equation_bbox(interline_equation_bboxes, raw_text_blocks) # step 1: drop text blocks inside interline-equation boxes
raw_text_blocks = remove_text_block_overlap_interline_equation_bbox(interline_equation_bboxes, raw_text_blocks) # step 2: remove content covered by interline-equation boxes
insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks)
raw_text_blocks = replace_inline_equations(inline_equation_bboxes, raw_text_blocks)
return raw_text_blocks
def draw_block_on_pdf_with_txt_replace_eq_bbox(json_path, pdf_path):
"""
"""
new_pdf = f"{Path(pdf_path).parent}/{Path(pdf_path).stem}.step3-消除行内公式text_block.pdf"
with open(json_path, "r", encoding='utf-8') as f:
obj = json.loads(f.read())
if os.path.exists(new_pdf):
os.remove(new_pdf)
new_doc = fitz.open('')
doc = fitz.open(pdf_path)
new_doc = fitz.open(pdf_path)
for i in range(len(new_doc)):
page = new_doc[i]
inline_equation_bboxes = obj[f"page_{i}"]['inline_equations']
interline_equation_bboxes = obj[f"page_{i}"]['interline_equations']
raw_text_blocks = obj[f'page_{i}']['preproc_blocks']
raw_text_blocks = remove_text_block_in_interline_equation_bbox(interline_equation_bboxes, raw_text_blocks) # step 1: drop text blocks inside interline-equation boxes
raw_text_blocks = remove_text_block_overlap_interline_equation_bbox(interline_equation_bboxes, raw_text_blocks) # step 2: remove content covered by interline-equation boxes
insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks)
raw_text_blocks = replace_inline_equations(inline_equation_bboxes, raw_text_blocks)
# to check whether equations are duplicated, color the background of each span according to its type
color_map = [fitz.pdfcolor['blue'],fitz.pdfcolor['green']]
j = 0
for blk in raw_text_blocks:
for i,line in enumerate(blk['lines']):
# line_box = line['bbox']
# shape = page.new_shape()
# shape.draw_rect(line_box)
# shape.finish(color=fitz.pdfcolor['red'], fill=color_map[j%2], fill_opacity=0.3)
# shape.commit()
# j = j+1
for i, span in enumerate(line['spans']):
shape_page = page.new_shape()
span_type = span.get('_type')
color = fitz.pdfcolor['blue']
if span_type=='first':
color = fitz.pdfcolor['blue']
elif span_type=='tail':
color = fitz.pdfcolor['green']
elif span_type==TYPE_INLINE_EQUATION:
color = fitz.pdfcolor['black']
else:
color = None
b = span['bbox']
shape_page.draw_rect(b)
shape_page.finish(color=None, fill=color, fill_opacity=0.3)
shape_page.commit()
new_doc.save(new_pdf)
logger.info(f"save ok {new_pdf}")
final_json = json.dumps(obj, ensure_ascii=False,indent=2)
with open("equations_test/final_json.json", "w") as f:
f.write(final_json)
return new_pdf
if __name__=="__main__":
# draw_block_on_pdf_with_txt_replace_eq_bbox(new_json_path, equation_color_pdf)
pass
import re
from libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, _is_in, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox
from loguru import logger
from libs.textbase import get_text_block_base_info
def fix_image_vertical(image_bboxes:list, text_blocks:list):
"""
修正图片的位置
如果图片与文字block发生一定重叠(也就是图片切到了一部分文字),那么减少图片边缘,让文字和图片不再重叠。
只对垂直方向进行。
"""
for image_bbox in image_bboxes:
for text_block in text_blocks:
text_bbox = text_block["bbox"]
if _is_part_overlap(text_bbox, image_bbox) and any([text_bbox[0]>=image_bbox[0] and text_bbox[2]<=image_bbox[2], text_bbox[0]<=image_bbox[0] and text_bbox[2]>=image_bbox[2]]):
if text_bbox[1] < image_bbox[1]: # the text is above the image
image_bbox[1] = text_bbox[3]+1
elif text_bbox[3]>image_bbox[3]: # the text is below the image
image_bbox[3] = text_bbox[1]-1
return image_bboxes
def __merge_if_common_edge(bbox1, bbox2):
x_min_1, y_min_1, x_max_1, y_max_1 = bbox1
x_min_2, y_min_2, x_max_2, y_max_2 = bbox2
# check for a common horizontal edge
if y_min_1 == y_min_2 or y_max_1 == y_max_2:
# make sure the x ranges of the two boxes overlap
if max(x_min_1, x_min_2) <= min(x_max_1, x_max_2):
return [min(x_min_1, x_min_2), min(y_min_1, y_min_2), max(x_max_1, x_max_2), max(y_max_1, y_max_2)]
# check for a common vertical edge
if x_min_1 == x_min_2 or x_max_1 == x_max_2:
# make sure the y ranges of the two boxes overlap
if max(y_min_1, y_min_2) <= min(y_max_1, y_max_2):
return [min(x_min_1, x_min_2), min(y_min_1, y_min_2), max(x_max_1, x_max_2), max(y_max_1, y_max_2)]
# no common edge
return None
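# Illustrative sketch (coordinates are made up): two boxes sharing the same top edge and
# overlapping in x are merged into their bounding box; disjoint boxes are not merged.
def _demo_merge_if_common_edge():
    merged = __merge_if_common_edge([0, 10, 50, 60], [40, 10, 90, 60])
    not_merged = __merge_if_common_edge([0, 10, 50, 60], [60, 100, 90, 150])
    return merged, not_merged
    # expected: ([0, 10, 90, 60], None)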
def fix_seperated_image(image_bboxes:list):
"""
如果2个图片有一个边重叠,那么合并2个图片
"""
new_images = []
droped_img_idx = []
for i in range(0, len(image_bboxes)):
for j in range(i+1, len(image_bboxes)):
new_img = __merge_if_common_edge(image_bboxes[i], image_bboxes[j])
if new_img is not None:
new_images.append(new_img)
droped_img_idx.append(i)
droped_img_idx.append(j)
break
for i in range(0, len(image_bboxes)):
if i not in droped_img_idx:
new_images.append(image_bboxes[i])
return new_images
def __check_img_title_pattern(text):
"""
检查文本段是否是表格的标题
"""
patterns = [r"^(fig|figure).*", r"^(scheme).*"]
text = text.strip()
for pattern in patterns:
match = re.match(pattern, text, re.IGNORECASE)
if match:
return True
return False
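# Illustrative sketch: strings that the figure-caption pattern accepts and rejects.
def _demo_check_img_title_pattern():
    return [__check_img_title_pattern(t) for t in
            ["Figure 3. Overview of the model", "Fig. 2: results", "Scheme 1", "Table 2"]]
    # expected: [True, True, True, False]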
def __get_fig_caption_text(text_block):
txt = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
line_cnt = len(text_block['lines'])
txt = txt.replace("Ž . ", '')
return txt, line_cnt
def __find_and_extend_bottom_caption(text_block, pymu_blocks, image_box):
"""
继续向下方寻找和图片caption字号,字体,颜色一样的文字框,合并入caption。
text_block是已经找到的图片catpion(这个caption可能不全,多行被划分到多个pymu block里了)
"""
combined_image_caption_text_block = list(text_block.copy()['bbox'])
base_font_color, base_font_size, base_font_type = get_text_block_base_info(text_block)
while True:
tb_add = find_bottom_nearest_text_bbox(pymu_blocks, combined_image_caption_text_block)
if not tb_add:
break
tb_font_color, tb_font_size, tb_font_type = get_text_block_base_info(tb_add)
if tb_font_color==base_font_color and tb_font_size==base_font_size and tb_font_type==base_font_type:
combined_image_caption_text_block[0] = min(combined_image_caption_text_block[0], tb_add['bbox'][0])
combined_image_caption_text_block[2] = max(combined_image_caption_text_block[2], tb_add['bbox'][2])
combined_image_caption_text_block[3] = tb_add['bbox'][3]
else:
break
image_box[0] = min(image_box[0], combined_image_caption_text_block[0])
image_box[1] = min(image_box[1], combined_image_caption_text_block[1])
image_box[2] = max(image_box[2], combined_image_caption_text_block[2])
image_box[3] = max(image_box[3], combined_image_caption_text_block[3])
text_block['_image_caption'] = True
def include_img_title(pymu_blocks, image_bboxes: list):
"""
向上方和下方寻找符合图片title的文本block,合并到图片里
如果图片上下都有fig的情况怎么办?寻找标题距离最近的那个。
---
增加对左侧和右侧图片标题的寻找
"""
for tb in image_bboxes:
# prefer the caption below the image
max_find_cnt = 3 # look at most 3 blocks in each direction before giving up
temp_box = tb.copy()
while max_find_cnt>0:
text_block_btn = find_bottom_nearest_text_bbox(pymu_blocks, temp_box)
if text_block_btn:
txt, line_cnt = __get_fig_caption_text(text_block_btn)
if len(txt.strip())>0:
if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt<3: # line_cnt<3 skips sub-captions, or text below the image that the detection model failed to include in the image box
max_find_cnt = max_find_cnt - 1
temp_box[3] = text_block_btn['bbox'][3]
continue
else:
break
else:
temp_box[3] = text_block_btn['bbox'][3] # keep the width, extend downward
max_find_cnt = max_find_cnt - 1
else:
break
max_find_cnt = 3 # look at most 3 blocks upward before giving up
temp_box = tb.copy()
while max_find_cnt>0:
text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box)
if text_block_top:
txt, line_cnt = __get_fig_caption_text(text_block_top)
if len(txt.strip())>0:
if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt <3:
max_find_cnt = max_find_cnt - 1
temp_box[1] = text_block_top['bbox'][1]
continue
else:
break
else:
b = text_block_top['bbox']
temp_box[1] = b[1] # keep the width, extend upward
max_find_cnt = max_find_cnt - 1
else:
break
if text_block_btn and text_block_top and text_block_btn.get("_image_caption", False) is False and text_block_top.get("_image_caption", False) is False :
btn_text, _ = __get_fig_caption_text(text_block_btn)
top_text, _ = __get_fig_caption_text(text_block_top)
if __check_img_title_pattern(btn_text) and __check_img_title_pattern(top_text):
# pick the one closer to the image
btn_text_distance = text_block_btn['bbox'][1] - tb[3]
top_text_distance = tb[1] - text_block_top['bbox'][3]
if btn_text_distance<top_text_distance: # the caption is below the image
__find_and_extend_bottom_caption(text_block_btn, pymu_blocks, tb)
else:
text_block = text_block_top
tb[0] = min(tb[0], text_block['bbox'][0])
tb[1] = min(tb[1], text_block['bbox'][1])
tb[2] = max(tb[2], text_block['bbox'][2])
tb[3] = max(tb[3], text_block['bbox'][3])
text_block_btn['_image_caption'] = True
continue
text_block = text_block_btn # find_bottom_nearest_text_bbox(pymu_blocks, tb)
if text_block and text_block.get("_image_caption", False) is False:
first_text_line, _ = __get_fig_caption_text(text_block)
if __check_img_title_pattern(first_text_line):
# after finding the caption pattern, keep searching in the same direction for textblocks with the same color, size and font
__find_and_extend_bottom_caption(text_block, pymu_blocks, tb)
continue
text_block = text_block_top # find_top_nearest_text_bbox(pymu_blocks, tb)
if text_block and text_block.get("_image_caption", False) is False:
first_text_line, _ = __get_fig_caption_text(text_block)
if __check_img_title_pattern(first_text_line):
tb[0] = min(tb[0], text_block['bbox'][0])
tb[1] = min(tb[1], text_block['bbox'][1])
tb[2] = max(tb[2], text_block['bbox'][2])
tb[3] = max(tb[3], text_block['bbox'][3])
text_block['_image_caption'] = True
continue
"""向左、向右寻找,暂时只寻找一次"""
left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
if left_text_block and left_text_block.get("_image_caption", False) is False:
first_text_line, _ = __get_fig_caption_text(left_text_block)
if __check_img_title_pattern(first_text_line):
tb[0] = min(tb[0], left_text_block['bbox'][0])
tb[1] = min(tb[1], left_text_block['bbox'][1])
tb[2] = max(tb[2], left_text_block['bbox'][2])
tb[3] = max(tb[3], left_text_block['bbox'][3])
left_text_block['_image_caption'] = True
continue
right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
if right_text_block and right_text_block.get("_image_caption", False) is False:
first_text_line, _ = __get_fig_caption_text(right_text_block)
if __check_img_title_pattern(first_text_line):
tb[0] = min(tb[0], right_text_block['bbox'][0])
tb[1] = min(tb[1], right_text_block['bbox'][1])
tb[2] = max(tb[2], right_text_block['bbox'][2])
tb[3] = max(tb[3], right_text_block['bbox'][3])
right_text_block['_image_caption'] = True
continue
return image_bboxes
def combine_images(image_bboxes:list):
"""
合并图片,如果图片有重叠,那么合并
"""
new_images = []
droped_img_idx = []
for i in range(0, len(image_bboxes)):
for j in range(i+1, len(image_bboxes)):
if j not in droped_img_idx and _is_in_or_part_overlap(image_bboxes[i], image_bboxes[j]):
# merge
image_bboxes[i][0], image_bboxes[i][1],image_bboxes[i][2],image_bboxes[i][3] = min(image_bboxes[i][0], image_bboxes[j][0]), min(image_bboxes[i][1], image_bboxes[j][1]), max(image_bboxes[i][2], image_bboxes[j][2]), max(image_bboxes[i][3], image_bboxes[j][3])
droped_img_idx.append(j)
for i in range(0, len(image_bboxes)):
if i not in droped_img_idx:
new_images.append(image_bboxes[i])
return new_images
import collections
def get_main_text_font(pdf_docs):
font_names = collections.Counter()
for page in pdf_docs:
blocks = page.get_text('dict')['blocks']
if blocks is not None:
for block in blocks:
lines = block.get('lines')
if lines is not None:
for line in lines:
span_font = [(span['font'], len(span['text'])) for span in line['spans'] if
'font' in span and len(span['text']) > 0]
if span_font:
# main_text_font should be based on the font with the most characters, not on span-level counts
# font_names.append(font_name for font_name in span_font)
# block_fonts.append(font_name for font_name in span_font)
for font, count in span_font:
font_names[font] += count
main_text_font = font_names.most_common(1)[0][0]
return main_text_font
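# Hypothetical usage sketch ("sample.pdf" is a placeholder path, not from this repo); the
# function expects an iterable of fitz (PyMuPDF) pages supporting get_text('dict'):
#   doc = fitz.open("sample.pdf")
#   main_font = get_main_text_font(doc)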
from libs.commons import fitz
from libs.boxbase import _is_in, _is_in_or_part_overlap
from libs.drop_reason import DropReason
def __area(box):
return (box[2] - box[0]) * (box[3] - box[1])
def __is_contain_color_background_rect(page:fitz.Page, text_blocks, image_bboxes) -> bool:
"""
检查page是包含有颜色背景的矩形
"""
color_bg_rect = []
p_width, p_height = page.rect.width, page.rect.height
# first collect the candidate colored background rectangles
blocks = page.get_cdrawings()
for block in blocks:
if 'fill' in block and block['fill']: # skip transparent (unfilled) blocks
fill = list(block['fill'])
fill[0], fill[1], fill[2] = int(fill[0]), int(fill[1]), int(fill[2])
if (fill[0], fill[1], fill[2]) == (1, 1, 1): # skip pure white fills
continue
rect = block['rect']
# skip very small rectangles
if __area(rect) < 10*10:
continue
# skip color patches that belong to svg images
if any([_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes]):
continue
color_bg_rect.append(rect)
# find the largest background rectangle
if len(color_bg_rect) > 0:
max_rect = max(color_bg_rect, key=lambda x:__area(x))
max_rect_int = (int(max_rect[0]), int(max_rect[1]), int(max_rect[2]), int(max_rect[3]))
# TODO: check whether the largest background rectangle contains more than 3 lines of text, or 50 characters
if max_rect[2]-max_rect[0] > 0.2*p_width and max_rect[3]-max_rect[1] > 0.1*p_height: # the rectangle is large enough
# check whether any text block falls inside this rectangle
for text_block in text_blocks:
box = text_block['bbox']
box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
if _is_in(box_int, max_rect_int):
return True
return False
def __is_table_overlap_text_block(text_blocks, table_bbox):
"""
检查table_bbox是否覆盖了text_blocks里的文本块
TODO
"""
for text_block in text_blocks:
box = text_block['bbox']
if _is_in_or_part_overlap(table_bbox, box):
return True
return False
def pdf_filter(page:fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
"""
return:(True|False, err_msg)
True, 如果pdf符合要求
False, 如果pdf不符合要求
"""
if __is_contain_color_background_rect(page, text_blocks, image_bboxes):
return False, {"need_drop": True, "drop_reason": DropReason.COLOR_BACKGROUND_TEXT_BOX}
return True, None
from libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
from loguru import logger
from libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
def __area(box):
return (box[2] - box[0]) * (box[3] - box[1])
def rectangle_position_determination(rect, p_width):
"""
判断矩形是否在页面中轴线附近。
Args:
rect (list): 矩形坐标,格式为[x1, y1, x2, y2]。
p_width (int): 页面宽度。
Returns:
bool: 若矩形在页面中轴线附近则返回True,否则返回False。
"""
# x coordinate of the page's vertical center line
x_axis = p_width / 2
# does the rectangle span the center line?
is_span = rect[0] < x_axis and rect[2] > x_axis
if is_span:
return True
else:
# distance from the rectangle to the center line, measured from the nearer side
distance = rect[0] - x_axis if rect[0] > x_axis else x_axis - rect[2]
# is the distance to the center line less than 20% of the page width?
if distance < p_width * 0.2:
return True
else:
return False
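# Illustrative sketch (page width 600, so the center line is at x=300): a rectangle spanning
# the center line and one close to it pass; one far to the left does not.
def _demo_rectangle_position_determination():
    return (rectangle_position_determination([250, 0, 350, 20], 600),   # spans the center line
            rectangle_position_determination([310, 0, 400, 20], 600),   # 10 px to the right of it
            rectangle_position_determination([0, 0, 100, 20], 600))     # 200 px away
    # expected: (True, True, False)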
def remove_colored_strip_textblock(remain_text_blocks, page):
"""
根据页面中特定颜色和大小过滤文本块,将符合条件的文本块从remain_text_blocks中移除,并返回移除的文本块列表colored_strip_textblock。
Args:
remain_text_blocks (list): 剩余文本块列表。
page (Page): 页面对象。
Returns:
tuple: 剩余文本块列表和移除的文本块列表。
"""
colored_strip_textblocks = [] # 先构造一个空的返回
if len(remain_text_blocks) > 0:
p_width, p_height = page.rect.width, page.rect.height
blocks = page.get_cdrawings()
colored_strip_bg_rect = []
for block in blocks:
is_filled = 'fill' in block and block['fill'] and block['fill'] != (1.0, 1.0, 1.0) # skip transparent or white fills
rect = block['rect']
area_is_large_enough = __area(rect) > 100 # skip very small rectangles
rectangle_position_determination_result = rectangle_position_determination(rect, p_width)
in_upper_half_page = rect[3] < p_height * 0.3 # the rectangle sits in the upper part of the page: its bottom edge is above 30% of the page height
aspect_ratio_exceeds_4 = (rect[2] - rect[0]) > (rect[3] - rect[1]) * 4 # the width/height ratio exceeds 4
if is_filled and area_is_large_enough and rectangle_position_determination_result and in_upper_half_page and aspect_ratio_exceeds_4:
colored_strip_bg_rect.append(rect)
if len(colored_strip_bg_rect) > 0:
for colored_strip_block_bbox in colored_strip_bg_rect:
for text_block in remain_text_blocks:
text_bbox = text_block['bbox']
if _is_in(text_bbox, colored_strip_block_bbox) or (_is_in_or_part_overlap(text_bbox, colored_strip_block_bbox) and calculate_overlap_area_2_minbox_area_ratio(text_bbox, colored_strip_block_bbox) > 0.6):
logger.info(f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}')
text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
colored_strip_textblocks.append(text_block)
if len(colored_strip_textblocks) > 0:
for colored_strip_textblock in colored_strip_textblocks:
if colored_strip_textblock in remain_text_blocks:
remain_text_blocks.remove(colored_strip_textblock)
return remain_text_blocks, colored_strip_textblocks
import json
import math
from libs.boxbase import is_vbox_on_side
def detect_non_horizontal_texts(result_dict):
"""
This function detects watermarks and vertical margin notes in the document.
Watermarks are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
If these conditions are met, the blocks are highly likely to be watermarks, as opposed to headers or footers, which can change from page to page.
If the direction of these blocks is not horizontal, they are definitely considered to be watermarks.
Vertical margin notes are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
If these conditions are met, the blocks are highly likely to be vertical margin notes, which typically appear on the left and right sides of the page.
If the direction of these blocks is vertical, they are definitely considered to be vertical margin notes.
Parameters
----------
result_dict : dict
The result dictionary.
Returns
-------
result_dict : dict
The updated result dictionary.
"""
# Dictionary to store information about potential watermarks
potential_watermarks = {}
potential_margin_notes = {}
for page_id, page_content in result_dict.items():
if page_id.startswith("page_"):
for block_id, block_data in page_content.items():
if block_id.startswith("block_"):
if "dir" in block_data:
coordinates_text = (block_data["bbox"], block_data["text"]) # Tuple of coordinates and text
angle = math.atan2(block_data["dir"][1], block_data["dir"][0])
angle = abs(math.degrees(angle))
if angle > 5 and angle < 85: # Check if the direction indicates a rotated (watermark) block
if coordinates_text in potential_watermarks:
potential_watermarks[coordinates_text] += 1
else:
potential_watermarks[coordinates_text] = 1
if angle > 85 and angle < 105: # Check if direction is vertical
if coordinates_text in potential_margin_notes:
potential_margin_notes[coordinates_text] += 1 # Increment count
else:
potential_margin_notes[coordinates_text] = 1 # Initialize count
# Identify watermarks by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
watermark_threshold = len(result_dict) // 2
watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold}
# Identify margin notes by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
margin_note_threshold = len(result_dict) // 2
margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold}
# Add watermark information to the result dictionary
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_id, block_data in blocks.items():
coordinates_text = (block_data["bbox"], block_data["text"])
if coordinates_text in watermarks:
block_data["is_watermark"] = 1
else:
block_data["is_watermark"] = 0
if coordinates_text in margin_notes:
block_data["is_vertical_margin_note"] = 1
else:
block_data["is_vertical_margin_note"] = 0
return result_dict
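# Illustrative note: the "dir" vector from pymupdf maps to an angle via
# abs(math.degrees(math.atan2(dir[1], dir[0]))); e.g. dir=(0.707, 0.707) -> 45 degrees
# (watermark candidate), dir=(0.0, 1.0) -> 90 degrees (vertical margin-note candidate).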
"""
1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
"""
import string, re
def __is_a_word(sentence):
# a single Chinese character returns True
if re.fullmatch(r'[\u4e00-\u9fa5]', sentence):
return True
# a short alphanumeric token (at most 2 characters) also counts as a single word
elif re.fullmatch(r'[a-zA-Z0-9]+', sentence) and len(sentence) <=2:
return True
else:
return False
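# Illustrative sketch of __is_a_word: single Chinese characters and short alphanumeric
# tokens count as "a word"; longer tokens do not.
def _demo_is_a_word():
    return [__is_a_word(s) for s in ["中", "ab", "A1", "abc", "hello"]]
    # expected: [True, True, True, False, False]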
def __get_text_color(num):
"""获取字体的颜色RGB值"""
blue = num & 255
green = (num >> 8) & 255
red = (num >> 16) & 255
return red, green, blue
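# Illustrative sketch: this helper assumes the span color integer is packed as 0xRRGGBB,
# so 0xFF0000 decodes to red and 0xFFFFFF to white.
def _demo_get_text_color():
    return __get_text_color(0xFF0000), __get_text_color(0xFFFFFF)
    # expected: ((255, 0, 0), (255, 255, 255))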
def __is_empty_side_box(text_block):
"""
是否是边缘上的空白没有任何内容的block
"""
for line in text_block['lines']:
for span in line['spans']:
font_color = span['color']
r,g,b = __get_text_color(font_color)
if len(span['text'].strip())>0 and (r,g,b)!=(255,255,255):
return False
return True
def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
"""
返回删除了垂直,水印,旋转的textblock
删除的内容打上tag返回
"""
removed_text_block = []
for i, block in enumerate(pymu_text_block): # format: see test/assets/papre/pymu_textblocks.json
lines = block['lines']
block_bbox = block['bbox']
if not is_vbox_on_side(block_bbox, page_width, page_height, 0.2): # only consider boxes on the page margins
continue
if all([__is_a_word(line['spans'][0]["text"]) for line in lines if len(line['spans'])>0]) and len(lines)>1 and all([len(line['spans'])==1 for line in lines]):
is_box_valign = (len(set([int(line['spans'][0]['bbox'][0] ) for line in lines if len(line['spans'])>0]))==1) and (len([int(line['spans'][0]['bbox'][0] ) for line in lines if len(line['spans'])>0])>1) # check whether the lines all share the same x0, i.e. the characters are stacked vertically, and there are at least 2 of them
if is_box_valign:
block['tag'] = "vertical-text"
removed_text_block.append(block)
continue
for line in lines:
if line['dir']!=(1,0):
block['tag'] = "rotate"
removed_text_block.append(block) # if any line is not dir=(1,0), drop the whole block
break
for block in removed_text_block:
pymu_text_block.remove(block)
return pymu_text_block, removed_text_block
def get_side_boundry(rotate_bbox, page_width, page_height):
"""
根据rotate_bbox,返回页面的左右正文边界
"""
left_x = 0
right_x = page_width
for x in rotate_bbox:
box = x['bbox']
if box[2]<page_width/2:
left_x = max(left_x, box[2])
else:
right_x = min(right_x, box[0])
return left_x+1, right_x-1
def remove_side_blank_block(pymu_text_block, page_width, page_height):
"""
删除页面两侧的空白block
"""
removed_text_block = []
for i, block in enumerate(pymu_text_block): # format: see test/assets/papre/pymu_textblocks.json
block_bbox = block['bbox']
if not is_vbox_on_side(block_bbox, page_width, page_height, 0.2): # only consider boxes on the page margins
continue
if __is_empty_side_box(block):
block['tag'] = "empty-side-block"
removed_text_block.append(block)
continue
for block in removed_text_block:
pymu_text_block.remove(block)
return pymu_text_block, removed_text_block
"""
从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍
1. 首先去掉出现在图片上的bbox,图片包括表格和图片
2. 然后去掉出现在文字blcok上的图片bbox
"""
from libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap, calculate_iou, calculate_overlap_area_2_minbox_area_ratio
def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:list, inline_equations:list, text_raw_blocks:list):
"""
text_raw_blocks结构是从pymupdf里直接取到的结构,具体样例参考test/assets/papre/pymu_textblocks.json
当下采用一种粗暴的方式:
1. 去掉图片上的公式
2. 去掉table上的公式
2. 图片和文字block部分重叠,首先丢弃图片
3. 图片和图片重叠,修改图片的bbox,使得图片不重叠(暂时没这么做,先把图片都扔掉)
4. 去掉文字bbox里位于图片、表格上的文字(一定要完全在图、表内部)
5. 去掉表格上的文字
"""
text_block_removed = []
images_backup = []
# drop text blocks that lie on images
for image_box in images:
for text_block in text_raw_blocks:
text_bbox = text_block["bbox"]
if _is_in(text_bbox, image_box):
text_block['tag'] = "on-image"
text_block_removed.append(text_block)
# drop text blocks that lie on tables
for table_box in tables:
for text_block in text_raw_blocks:
text_bbox = text_block["bbox"]
if _is_in(text_bbox, table_box):
text_block['tag'] = "on-table"
text_block_removed.append(text_block)
for text_block in text_block_removed:
if text_block in text_raw_blocks:
text_raw_blocks.remove(text_block)
# step 1: drop equation boxes that appear on images
temp = []
for image_box in images:
for eq1 in interline_equations:
if _is_in_or_part_overlap(image_box, eq1[:4]):
temp.append(eq1)
for eq2 in inline_equations:
if _is_in_or_part_overlap(image_box, eq2[:4]):
temp.append(eq2)
for eq in temp:
if eq in interline_equations:
interline_equations.remove(eq)
if eq in inline_equations:
inline_equations.remove(eq)
# step 2: drop equation boxes that appear on tables
temp = []
for table_box in tables:
for eq1 in interline_equations:
if _is_in_or_part_overlap(table_box, eq1[:4]):
temp.append(eq1)
for eq2 in inline_equations:
if _is_in_or_part_overlap(table_box, eq2[:4]):
temp.append(eq2)
for eq in temp:
if eq in interline_equations:
interline_equations.remove(eq)
if eq in inline_equations:
inline_equations.remove(eq)
# if an image overlaps text, drop the image
for image_box in images:
for text_block in text_raw_blocks:
text_bbox = text_block["bbox"]
if _is_in_or_part_overlap(image_box, text_bbox):
images_backup.append(image_box)
break
for image_box in images_backup:
images.remove(image_box)
# if two images overlap, exclude both from layout computation for now
images_dup_index = []
for i in range(len(images)):
for j in range(i+1, len(images)):
if _is_in_or_part_overlap(images[i], images[j]):
images_dup_index.append(i)
images_dup_index.append(j)
dup_idx = set(images_dup_index)
for img_id in dup_idx:
images_backup.append(images[img_id])
images[img_id] = None
images = [img for img in images if img is not None]
# if an interline equation overlaps a text block, stash the text block temporarily so it does not affect layout computation; interline equations and text blocks are merged via IOU
# such text blocks are deleted while the interline equation keeps its size
# after the layout is computed, this part is merged back
text_block_removed_2 = []
# for text_block in text_raw_blocks:
# text_bbox = text_block["bbox"]
# for eq in interline_equations:
# ratio = calculate_overlap_area_2_minbox_area_ratio(text_bbox, eq[:4])
# if ratio>0.05:
# text_block['tag'] = "belong-to-interline-equation"
# text_block_removed_2.append(text_block)
# break
# for tb in text_block_removed_2:
# if tb in text_raw_blocks:
# text_raw_blocks.remove(tb)
# text_block_removed = text_block_removed + text_block_removed_2
return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2
def check_text_block_horizontal_overlap(text_blocks:list, header, footer) -> bool:
"""
检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。
因为这种情况大概率发生了公式没有被检测出来。
"""
if len(text_blocks)==0:
return False
page_min_y = 0
page_max_y = max(yy['bbox'][3] for yy in text_blocks)
def __max_y(lst:list):
if len(lst)>0:
return max([item[1] for item in lst])
return page_min_y
def __min_y(lst:list):
if len(lst)>0:
return min([item[3] for item in lst])
return page_max_y
clip_y0 = __max_y(header)
clip_y1 = __min_y(footer)
txt_bboxes = []
for text_block in text_blocks:
bbox = text_block["bbox"]
if bbox[1]>=clip_y0 and bbox[3]<=clip_y1:
txt_bboxes.append(bbox)
for i in range(len(txt_bboxes)):
for j in range(i+1, len(txt_bboxes)):
if _is_left_overlap(txt_bboxes[i], txt_bboxes[j]) or _is_left_overlap(txt_bboxes[j], txt_bboxes[i]):
return True
return False
"""
统计处需要跨页、全局性的数据
- 统计出字号从大到小
- 正文区域占比最高的前5
- 正文平均行间距
- 正文平均字间距
- 正文平均字符宽度
- 正文平均字符高度
"""
import os
import collections # counting utilities
import re # regular expressions
from libs.commons import fitz # PyMuPDF
import json
from libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox
## version 2
def get_merged_line(page):
"""
这个函数是为了从pymuPDF中提取出的矢量里筛出水平的横线,并且将断开的线段进行了合并。
:param page :fitz读取的当前页的内容
"""
drawings_bbox = []
drawings_line = []
drawings = page.get_drawings() # extract all vector drawings
for p in drawings:
drawings_bbox.append(p["rect"].irect) # (L, U, R, D)
lines = []
for L, U, R, D in drawings_bbox:
if abs(D - U) <= 3: # keep (nearly) horizontal lines
lines.append((L, U, R, D))
U_groups = []
visited = [False for _ in range(len(lines))]
for i, (L1, U1, R1, D1) in enumerate(lines):
if visited[i] == True:
continue
tmp_g = [(L1, U1, R1, D1)]
for j, (L2, U2, R2, D2) in enumerate(lines):
if i == j:
continue
if visited[j] == True:
continue
if max(U1, D1, U2, D2) - min(U1, D1, U2, D2) <= 5: # put lines at the same height into one group
tmp_g.append((L2, U2, R2, D2))
visited[j] = True
U_groups.append(tmp_g)
res = []
for group in U_groups:
group.sort(key = lambda LURD: (LURD[0], LURD[2]))
LL, UU, RR, DD = group[0]
for i, (L1, U1, R1, D1) in enumerate(group):
if (L1 - RR) >= 5:
cur_line = (LL, UU, RR, DD)
res.append(cur_line)
LL = L1
else:
RR = max(RR, R1)
cur_line = (LL, UU, RR, DD)
res.append(cur_line)
return res
def fix_tables(page: fitz.Page, table_bboxes: list, include_table_title: bool, scan_line_num: int):
"""
:param page :fitz读取的当前页的内容
:param table_bboxes: list类型,每一个元素是一个元祖 (L, U, R, D)
:param include_table_title: 是否将表格的标题也圈进来
:param scan_line_num: 在与表格框临近的上下几个文本框里扫描搜索标题
"""
drawings_lines = get_merged_line(page)
fix_table_bboxes = []
for table in table_bboxes:
(L, U, R, D) = table
fix_table_L = []
fix_table_U = []
fix_table_R = []
fix_table_D = []
width = R - L
width_range = width * 0.1 # only consider lines whose x extent deviates by at most 10% of the table width
height = D - U
height_range = height * 0.1 # only consider lines whose y position deviates by at most 10% of the table height
for line in drawings_lines:
if (L - width_range) <= line[0] <= (L + width_range) and (R - width_range) <= line[2] <= (R + width_range): # similar width
if (U - height_range) < line[1] < (U + height_range): # upper boundary, within the height tolerance
fix_table_U.append(line[1])
fix_table_L.append(line[0])
fix_table_R.append(line[2])
elif (D - height_range) < line[1] < (D + height_range): # lower boundary, within the height tolerance
fix_table_D.append(line[1])
fix_table_L.append(line[0])
fix_table_R.append(line[2])
if fix_table_U:
U = min(fix_table_U)
if fix_table_D:
D = max(fix_table_D)
if fix_table_L:
L = min(fix_table_L)
if fix_table_R:
R = max(fix_table_R)
if include_table_title: # the table caption should be included
text_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] # all text blocks
incolumn_text_blocks = [block for block in text_blocks if not ((block['bbox'][0] < L and block['bbox'][2] < L) or (block['bbox'][0] > R and block['bbox'][2] > R))] # filter out text that has no horizontal overlap with the table at all (e.g. text from another column)
upper_text_blocks = [block for block in incolumn_text_blocks if (U - block['bbox'][3]) > 0] # keep the text blocks above the table's top edge
sorted_filtered_text_blocks = sorted(upper_text_blocks, key=lambda x: (U - x['bbox'][3], x['bbox'][0])) # sort by the distance from the block's bottom edge to the table's top edge, ascending; at equal height, left before right
for idx in range(scan_line_num):
if idx+1 <= len(sorted_filtered_text_blocks):
line_temp = sorted_filtered_text_blocks[idx]['lines']
if line_temp:
text = line_temp[0]['spans'][0]['text'] # take the text of the first span
check_en = re.match('Table', text) # does it start with "Table" (English)?
check_ch = re.match('表', text) # does it start with "表" (Chinese)?
if check_en or check_ch:
if sorted_filtered_text_blocks[idx]['bbox'][1] < D: # guard against producing a negative bbox
U = sorted_filtered_text_blocks[idx]['bbox'][1]
fix_table_bboxes.append([L-2, U-2, R+2, D+2])
return fix_table_bboxes
def __check_table_title_pattern(text):
"""
检查文本段是否是表格的标题
"""
patterns = [r'^table\s\d+']
for pattern in patterns:
match = re.match(pattern, text, re.IGNORECASE)
if match:
return True
else:
return False
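# Illustrative sketch of the caption pattern: only strings starting with "table <number>"
# (case-insensitive) are accepted.
def _demo_check_table_title_pattern():
    return [__check_table_title_pattern(t) for t in
            ["Table 2: Results", "table 10", "Tabular data", "Figure 1"]]
    # expected: [True, True, False, False]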
def fix_table_text_block(pymu_blocks, table_bboxes: list):
"""
调整table, 如果table和上下的text block有相交区域,则将table的上下边界调整到text block的上下边界
例如 tmp/unittest/unittest_pdf/纯2列_ViLT_6_文字 表格.pdf
"""
for tb in table_bboxes:
(L, U, R, D) = tb
for block in pymu_blocks:
if _is_in_or_part_overlap((L, U, R, D), block['bbox']):
txt = " ".join(span['text'] for line in block['lines'] for span in line['spans'])
if not __check_table_title_pattern(txt) and block.get("_table", False) is False: # 如果是table的title,那么不调整。因为下一步会统一调整,如果这里进行了调整,后面的调整会造成调整到其他table的title上(在连续出现2个table的情况下)。
tb[0] = min(tb[0], block['bbox'][0])
tb[1] = min(tb[1], block['bbox'][1])
tb[2] = max(tb[2], block['bbox'][2])
tb[3] = max(tb[3], block['bbox'][3])
block['_table'] = True # mark it so other tables do not claim this block again
"""如果是个table的title,但是有部分重叠,那么修正这个title,使得和table不重叠"""
if _is_part_overlap(tb, block['bbox']) and __check_table_title_pattern(txt):
block['bbox'] = list(block['bbox'])
if block['bbox'][3] > U:
block['bbox'][3] = U-1
if block['bbox'][1] < D:
block['bbox'][1] = D+1
return table_bboxes
def __get_table_caption_text(text_block):
txt = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
line_cnt = len(text_block['lines'])
txt = txt.replace("Ž . ", '')
return txt, line_cnt
def include_table_title(pymu_blocks, table_bboxes: list):
"""
把表格的title也包含进来,扩展到table_bbox上
"""
for tb in table_bboxes:
max_find_cnt = 3 # look upward at most 3 times
temp_box = tb.copy()
while max_find_cnt>0:
text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box)
if text_block_top:
txt, line_cnt = __get_table_caption_text(text_block_top)
if len(txt.strip())>0:
if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
max_find_cnt = max_find_cnt -1
temp_box[1] = text_block_top['bbox'][1]
continue
else:
break
else:
temp_box[1] = text_block_top['bbox'][1] # keep the width, extend the box upward
max_find_cnt = max_find_cnt - 1
else:
break
max_find_cnt = 3 # then search downward, again at most 3 times
temp_box = tb.copy()
while max_find_cnt>0:
text_block_bottom = find_bottom_nearest_text_bbox(pymu_blocks, temp_box)
if text_block_bottom:
txt, line_cnt = __get_table_caption_text(text_block_bottom)
if len(txt.strip())>0:
if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
max_find_cnt = max_find_cnt - 1
temp_box[3] = text_block_bottom['bbox'][3]
continue
else:
break
else:
temp_box[3] = text_block_bottom['bbox'][3]
max_find_cnt = max_find_cnt - 1
else:
break
if text_block_top and text_block_bottom and text_block_top.get("_table_caption", False) is False and text_block_bottom.get("_table_caption", False) is False :
btn_text, _ = __get_table_caption_text(text_block_bottom)
top_text, _ = __get_table_caption_text(text_block_top)
if __check_table_title_pattern(btn_text) and __check_table_title_pattern(top_text): # there is a table caption both above and below
# keep the closer one
btn_text_distance = text_block_bottom['bbox'][1] - tb[3]
top_text_distance = tb[1] - text_block_top['bbox'][3]
text_block = text_block_bottom if btn_text_distance<top_text_distance else text_block_top
tb[0] = min(tb[0], text_block['bbox'][0])
tb[1] = min(tb[1], text_block['bbox'][1])
tb[2] = max(tb[2], text_block['bbox'][2])
tb[3] = max(tb[3], text_block['bbox'][3])
text_block['_table_caption'] = True # mark the chosen caption block (top or bottom)
continue
# otherwise fall back to checking the top block first, then the bottom block
text_block = text_block_top
if text_block and text_block.get("_table_caption", False) is False:
first_text_line = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
tb[0] = min(tb[0], text_block['bbox'][0])
tb[1] = min(tb[1], text_block['bbox'][1])
tb[2] = max(tb[2], text_block['bbox'][2])
tb[3] = max(tb[3], text_block['bbox'][3])
text_block['_table_caption'] = True
continue
text_block = text_block_bottom
if text_block and text_block.get("_table_caption", False) is False:
first_text_line, _ = __get_table_caption_text(text_block)
if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
tb[0] = min(tb[0], text_block['bbox'][0])
tb[1] = min(tb[1], text_block['bbox'][1])
tb[2] = max(tb[2], text_block['bbox'][2])
tb[3] = max(tb[3], text_block['bbox'][3])
text_block['_table_caption'] = True
continue
"""向左、向右寻找,暂时只寻找一次"""
left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
if left_text_block and left_text_block.get("_image_caption", False) is False:
first_text_line, _ = __get_table_caption_text(left_text_block)
if __check_table_title_pattern(first_text_line):
tb[0] = min(tb[0], left_text_block['bbox'][0])
tb[1] = min(tb[1], left_text_block['bbox'][1])
tb[2] = max(tb[2], left_text_block['bbox'][2])
tb[3] = max(tb[3], left_text_block['bbox'][3])
left_text_block['_image_caption'] = True
continue
right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
if right_text_block and right_text_block.get("_image_caption", False) is False:
first_text_line, _ = __get_table_caption_text(right_text_block)
if __check_table_title_pattern(first_text_line):
tb[0] = min(tb[0], right_text_block['bbox'][0])
tb[1] = min(tb[1], right_text_block['bbox'][1])
tb[2] = max(tb[2], right_text_block['bbox'][2])
tb[3] = max(tb[3], right_text_block['bbox'][3])
right_text_block['_image_caption'] = True
continue
return table_bboxes
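# A minimal, illustrative wiring of the passes above (it assumes `page` is a
# fitz.Page and `table_bboxes` already holds rough [L, U, R, D] candidates;
# nothing here is prescribed by the module itself):
#
#   pymu_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
#   table_bboxes = fix_table_text_block(pymu_blocks, table_bboxes)
#   table_bboxes = include_table_title(pymu_blocks, table_bboxes)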
import os
import sys
from pathlib import Path
import click
import json
from loguru import logger
from libs.commons import join_path, parse_aws_param, parse_bucket_key, read_file
from mkcontent import mk_nlp_markdown
from pdf2md import main
from pdf_parse_by_model import parse_pdf_by_model
@click.command()
@click.option("--pdf-file-path", help="s3上pdf文件的路径")
@click.option("--pdf-name", help="pdf name")
def main_shell(pdf_file_path: str, pdf_name: str):
with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json', 'r') as f:
samples = json.load(f)
for sample in samples:
pdf_file_path = sample['s3_path']
pdf_bin_file_profile = "outsider"
pdf_name = sample['pdf_name']
pdf_model_dir = f"s3://llm-pdf-text/eval_1k/layout_res/{pdf_name}"
pdf_model_profile = "langchao"
p = Path(pdf_file_path)
pdf_file_name = p.name # pdf file name, including the extension
#pdf_model_dir = join_path(pdf_model_parent_dir, pdf_file_name)
main(
pdf_file_path,
pdf_bin_file_profile,
pdf_model_dir,
pdf_model_profile,
debug_mode=True,
)
if __name__ == "__main__":
main_shell()
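# Note on usage (as the script stands above): although --pdf-file-path and
# --pdf-name are declared as CLI options, main_shell overwrites both from the
# hardcoded sample JSON, so the options can simply be omitted:
#
#   python <this_script>.py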
# from app.common import s3
import boto3
from botocore.client import Config
from spark import s3_buckets, s3_clusters, get_cluster_name, s3_users
import re
import random
from typing import Dict, Iterator, List, Tuple, Union
__re_s3_path = re.compile("^s3a?://([^/]+)(?:/(.*))?$")
def get_s3_config(path: Union[str, List[str]], outside=False):
paths = [path] if isinstance(path, str) else path
bucket_config = None
for p in paths:
bc = __get_s3_bucket_config(p)
if bucket_config in [bc, None]:
bucket_config = bc
continue
raise Exception(f"{paths} have different s3 config, cannot read together.")
if not bucket_config:
raise Exception("path is empty.")
return __get_s3_config(bucket_config, outside, prefer_ip=True)
def __get_s3_config(
bucket_config: tuple,
outside: bool,
prefer_ip=False,
prefer_auto=False,
):
cluster, user = bucket_config
cluster_config = s3_clusters[cluster]
if outside:
endpoint_key = "outside"
elif prefer_auto and "auto" in cluster_config:
endpoint_key = "auto"
elif cluster_config.get("cluster") == get_cluster_name():
endpoint_key = "inside"
else:
endpoint_key = "outside"
if prefer_ip and f"{endpoint_key}_ips" in cluster_config:
endpoint_key = f"{endpoint_key}_ips"
endpoints = cluster_config[endpoint_key]
endpoint = random.choice(endpoints)
return {"endpoint": endpoint, **s3_users[user]}
def split_s3_path(path: str):
"split bucket and key from path"
m = __re_s3_path.match(path)
if m is None:
return "", ""
return m.group(1), (m.group(2) or "")
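# Illustrative behaviour of the regex above (the paths are placeholders):
#
#   split_s3_path("s3://llm-pdf-text/eval_1k/layout_res/a.pdf")
#       -> ("llm-pdf-text", "eval_1k/layout_res/a.pdf")
#   split_s3_path("s3a://some-bucket")   -> ("some-bucket", "")
#   split_s3_path("not-an-s3-path")      -> ("", "")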
def __get_s3_bucket_config(path: str):
bucket = split_s3_path(path)[0] if path else ""
bucket_config = s3_buckets.get(bucket)
if not bucket_config:
bucket_config = s3_buckets.get("[default]")
assert bucket_config is not None
return bucket_config
def get_s3_client(path: Union[str, List[str]], outside=False):
s3_config = get_s3_config(path, outside)
try:
return boto3.client(
"s3",
aws_access_key_id=s3_config["ak"],
aws_secret_access_key=s3_config["sk"],
endpoint_url=s3_config["endpoint"],
config=Config(s3={"addressing_style": "path"}, retries={"max_attempts": 8, "mode": "standard"}),
)
except Exception:
# older versions of boto3/botocore do not support the retries "mode" param.
return boto3.client(
"s3",
aws_access_key_id=s3_config["ak"],
aws_secret_access_key=s3_config["sk"],
endpoint_url=s3_config["endpoint"],
config=Config(s3={"addressing_style": "path"}, retries={"max_attempts": 8}),
)
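# A minimal usage sketch (bucket and key below are placeholders, and it assumes
# the target bucket is present in the spark s3_buckets config):
#
#   path = "s3://some-bucket/some/key.pdf"
#   client = get_s3_client(path, outside=True)
#   bucket, key = split_s3_path(path)
#   body = client.get_object(Bucket=bucket, Key=key)["Body"].read()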
scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178
scihub/scihub_07400000/libgen.scimag07481000-07481999.zip_10.1007/s003960050343
scihub/scihub_11400000/libgen.scimag11451000-11451999.zip_10.1017/s0009838811000231
scihub/scihub_24400000/libgen.scimag24401000-24401999.zip_10.1016/j.toxicon.2014.02.018
scihub/scihub_27400000/libgen.scimag27441000-27441999.zip_10.2307/30122482
scihub/scihub_28400000/libgen.scimag28413000-28413999.zip_10.2307/1316224
scihub/scihub_31200000/libgen.scimag31207000-31207999.zip_10.1080/03639040600920622
scihub/scihub_31800000/libgen.scimag31824000-31824999.zip_10.1109/med.2012.6265668
scihub/scihub_32500000/libgen.scimag32539000-32539999.zip_10.1080/09540121003721000
scihub/scihub_42500000/libgen.scimag42522000-42522999.zip_10.1016/S1365-6937(15)30162-3
scihub/scihub_45900000/libgen.scimag45914000-45914999.zip_10.1055/s-0030-1256333
scihub/scihub_50900000/libgen.scimag50902000-50902999.zip_10.1007/s12274-016-1035-8
scihub/scihub_63900000/libgen.scimag63921000-63921999.zip_10.1063/1.4938050
scihub/scihub_65800000/libgen.scimag65832000-65832999.zip_10.1016/s0166-4115(08)62165-2
scihub/scihub_67300000/libgen.scimag67369000-67369999.zip_10.1096/fj.201700997R
scihub/scihub_67900000/libgen.scimag67967000-67967999.zip_10.1038/s41598-018-21867-z
scihub/scihub_77400000/libgen.scimag77447000-77447999.zip_10.1016/j.jid.2019.06.094