Merge pull request #740 from myhloli/para-split-v3

feat(list&index block): detect and merge list and index blocks

Merge pull request #740 from myhloli/para-split-v3
feat(list&index block): detect and merge list and index blocks
702b6ac9 · Xiaomeng Zhao · GitHub · c479245e · 1f1dd353 · 702b6ac9
Unverified commit 702b6ac9 — authored Oct 15, 2024 by Xiaomeng Zhao; committed by GitHub on Oct 15, 2024
7 changed files
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -8,6 +8,7 @@ from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.libs.ocr_content_type import BlockType, ContentType
+from magic_pdf.para.para_split_v3 import ListLineTag
 def __is_hyphen_at_line_end(line):
@@ -124,7 +125,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
    for para_block in paras_of_layout:
        para_text = ''
        para_type = para_block['type']
-        if para_type == BlockType.Text:
+        if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
            para_text = merge_para_with_text(para_block, parse_type=parse_type, lang=lang)
        elif para_type == BlockType.Title:
            para_text = f'# {merge_para_with_text(para_block, parse_type=parse_type, lang=lang)}'
@@ -177,22 +178,26 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
    return page_markdown
-def merge_para_with_text(para_block, parse_type="auto", lang=None):
+def detect_language(text):
+    en_pattern = r'[a-zA-Z]+'
-    def detect_language(text):
+    en_matches = re.findall(en_pattern, text)
-        en_pattern = r'[a-zA-Z]+'
+    en_length = sum(len(match) for match in en_matches)
-        en_matches = re.findall(en_pattern, text)
+    if len(text) > 0:
-        en_length = sum(len(match) for match in en_matches)
+        if en_length / len(text) >= 0.5:
-        if len(text) > 0:
+            return 'en'
-            if en_length / len(text) >= 0.5:
-                return 'en'
-            else:
-                return 'unknown'
        else:
-            return 'empty'
+            return 'unknown'
+    else:
+        return 'empty'
+def merge_para_with_text(para_block, parse_type="auto", lang=None):
    para_text = ''
-    for line in para_block['lines']:
+    for i, line in enumerate(para_block['lines']):
+        if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
+            para_text += '  \n'
        line_text = ''
        line_lang = ''
        for span in line['spans']:

--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
@@ -75,6 +75,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
    titles_list = []
    texts_list = []
    interequations_list = []
+    lists_list = []
+    indexs_list = []
    for page in pdf_info:
        page_dropped_list = []
@@ -83,6 +85,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
        titles = []
        texts = []
        interequations = []
+        lists = []
+        indexs = []
        for dropped_bbox in page['discarded_blocks']:
            page_dropped_list.append(dropped_bbox['bbox'])
@@ -115,6 +119,11 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
                texts.append(bbox)
            elif block['type'] == BlockType.InterlineEquation:
                interequations.append(bbox)
+            elif block['type'] == BlockType.List:
+                lists.append(bbox)
+            elif block['type'] == BlockType.Index:
+                indexs.append(bbox)
        tables_list.append(tables)
        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
@@ -126,6 +135,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)
+        lists_list.append(lists)
+        indexs_list.append(indexs)
    layout_bbox_list = []
@@ -160,6 +171,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
        draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
        draw_bbox_without_number(i, interequations_list, page, [0, 255, 0],
                                 True)
+        draw_bbox_without_number(i, lists_list, page, [40, 169, 92], True)
+        draw_bbox_without_number(i, indexs_list, page, [40, 169, 92], True)
        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False, draw_bbox=False)

--- a/magic_pdf/libs/ocr_content_type.py
+++ b/magic_pdf/libs/ocr_content_type.py
@@ -20,6 +20,8 @@ class BlockType:
    InterlineEquation = 'interline_equation'
    Footnote = 'footnote'
    Discarded = 'discarded'
+    List = 'list'
+    Index = 'index'
 class CategoryId:

--- a/magic_pdf/model/pdf_extract_kit.py
+++ b/magic_pdf/model/pdf_extract_kit.py
@@ -340,7 +340,7 @@ class CustomPEKModel:
        if torch.cuda.is_available():
            properties = torch.cuda.get_device_properties(self.device)
            total_memory = properties.total_memory / (1024 ** 3)  # 将字节转换为 GB
-            if total_memory <= 8:
+            if total_memory <= 10:
                gc_start = time.time()
                clean_memory()
                gc_time = round(time.time() - gc_start, 2)

--- a/magic_pdf/para/para_split_v3.py
+++ b/magic_pdf/para/para_split_v3.py
 import copy
+from loguru import logger
 from magic_pdf.libs.Constants import LINES_DELETED, CROSS_PAGE
+from magic_pdf.libs.ocr_content_type import BlockType, ContentType
 LINE_STOP_FLAG = ('.', '!', '?', '。', '！', '？', ')', '）', '"', '”', ':', '：', ';', '；')
+LIST_END_FLAG = ('.', '。', ';', '；')
+class ListLineTag:
+    IS_LIST_START_LINE = "is_list_start_line"
+    IS_LIST_END_LINE = "is_list_end_line"
 def __process_blocks(blocks):
@@ -38,7 +47,127 @@ def __process_blocks(blocks):
    return result
-def __merge_2_blocks(block1, block2):
+def __is_list_block(block):
+    # 一个block如果是list block 应该同时满足以下特征
+    # 1.block内有多个line 2.block 内有多个line左侧顶格写 3.block内有多个line 右侧不顶格（狗牙状）
+    # 1.block内有多个line 2.block 内有多个line左侧顶格写 3.多个line以endflag结尾
+    # 1.block内有多个line 2.block 内有多个line左侧顶格写 3.block内有多个line 左侧不顶格
+    if len(block['lines']) >= 3:
+        first_line = block['lines'][0]
+        line_height = first_line['bbox'][3] - first_line['bbox'][1]
+        block_weight = block['bbox_fs'][2] - block['bbox_fs'][0]
+        left_close_num = 0
+        left_not_close_num = 0
+        right_not_close_num = 0
+        lines_text_list = []
+        for line in block['lines']:
+            line_text = ""
+            for span in line['spans']:
+                span_type = span['type']
+                if span_type == ContentType.Text:
+                    line_text += span['content'].strip()
+            lines_text_list.append(line_text)
+            # 计算line左侧顶格数量是否大于2，是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
+            if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2:
+                left_close_num += 1
+            elif line['bbox'][0] - block['bbox_fs'][0] > line_height:
+                # logger.info(f"{line_text}, {block['bbox_fs']}, {line['bbox']}")
+                left_not_close_num += 1
+            # 计算右侧是否不顶格，拍脑袋用0.3block宽度做阈值
+            closed_area = 0.3 * block_weight
+            # closed_area = 5 * line_height
+            if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
+                right_not_close_num += 1
+        # 判断lines_text_list中的元素是否有超过80%都以LIST_END_FLAG结尾
+        line_end_flag = False
+        if len(lines_text_list) > 0:
+            num_end_count = 0
+            for line_text in lines_text_list:
+                if len(line_text) > 0:
+                    if line_text[-1] in LIST_END_FLAG:
+                        num_end_count += 1
+            if num_end_count / len(lines_text_list) >= 0.8:
+                line_end_flag = True
+        if left_close_num >= 2 and (right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2):
+            for line in block['lines']:
+                if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
+                    line[ListLineTag.IS_LIST_START_LINE] = True
+                if abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
+                    line[ListLineTag.IS_LIST_END_LINE] = True
+            return True
+        else:
+            return False
+    else:
+        return False
+def __is_index_block(block):
+    # 一个block如果是index block 应该同时满足以下特征
+    # 1.block内有多个line 2.block 内有多个line两侧均顶格写 3.line的开头或者结尾均为数字
+    if len(block['lines']) >= 3:
+        first_line = block['lines'][0]
+        line_height = first_line['bbox'][3] - first_line['bbox'][1]
+        left_close_num = 0
+        right_close_num = 0
+        lines_text_list = []
+        for line in block['lines']:
+            # 计算line左侧顶格数量是否大于2，是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
+            if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
+                left_close_num += 1
+            # 计算右侧是否不顶格
+            if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height / 2:
+                right_close_num += 1
+            line_text = ""
+            for span in line['spans']:
+                span_type = span['type']
+                if span_type == ContentType.Text:
+                    line_text += span['content'].strip()
+            lines_text_list.append(line_text)
+        # 判断lines_text_list中的元素是否有超过80%都以数字开头或都以数字结尾
+        line_num_flag = False
+        if len(lines_text_list) > 0:
+            num_start_count = 0
+            num_end_count = 0
+            for line_text in lines_text_list:
+                if len(line_text) > 0:
+                    if line_text[0].isdigit():
+                        num_start_count += 1
+                    if line_text[-1].isdigit():
+                        num_end_count += 1
+            if num_start_count / len(lines_text_list) >= 0.8 or num_end_count / len(lines_text_list) >= 0.8:
+                line_num_flag = True
+        if left_close_num >= 2 and right_close_num >= 2 and line_num_flag:
+            for line in block['lines']:
+                line[ListLineTag.IS_LIST_START_LINE] = True
+            return True
+        else:
+            return False
+    else:
+        return False
+def __merge_2_text_blocks(block1, block2):
    if len(block1['lines']) > 0:
        first_line = block1['lines'][0]
        line_height = first_line['bbox'][3] - first_line['bbox'][1]
@@ -59,17 +188,46 @@ def __merge_2_blocks(block1, block2):
    return block1, block2
+def __merge_2_list_blocks(block1, block2):
+    if block1['page_num'] != block2['page_num']:
+        for line in block1['lines']:
+            for span in line['spans']:
+                span[CROSS_PAGE] = True
+    block2['lines'].extend(block1['lines'])
+    block1['lines'] = []
+    block1[LINES_DELETED] = True
+    return block1, block2
 def __para_merge_page(blocks):
    page_text_blocks_groups = __process_blocks(blocks)
    for text_blocks_group in page_text_blocks_groups:
+        if len(text_blocks_group) > 0:
+            # 需要先在合并前对所有block判断是否为list block
+            for block in text_blocks_group:
+                if __is_list_block(block):
+                    block['type'] = BlockType.List
+                elif __is_index_block(block):
+                    block['type'] = BlockType.Index
        if len(text_blocks_group) > 1:
            # 倒序遍历
            for i in range(len(text_blocks_group)-1, -1, -1):
                current_block = text_blocks_group[i]
                # 检查是否有前一个块
                if i - 1 >= 0:
                    prev_block = text_blocks_group[i - 1]
-                    __merge_2_blocks(current_block, prev_block)
+                    if current_block['type'] == 'text' and prev_block['type'] == 'text':
+                        __merge_2_text_blocks(current_block, prev_block)
+                    if current_block['type'] == BlockType.List and prev_block['type'] == BlockType.List:
+                        __merge_2_list_blocks(current_block, prev_block)
+                    if current_block['type'] == BlockType.Index and prev_block['type'] == BlockType.Index:
+                        __merge_2_list_blocks(current_block, prev_block)
        else:
            continue

--- a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
@@ -108,7 +108,7 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
    '''将剩余的bbox做分离处理，防止后面分layout时出错'''
-    # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
+    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
    return all_bboxes, all_discarded_blocks

--- a/magic_pdf/pre_proc/ocr_dict_merge.py
+++ b/magic_pdf/pre_proc/ocr_dict_merge.py
@@ -49,8 +49,7 @@ def merge_spans_to_line(spans):
                continue
            # 如果当前的span与当前行的最后一个span在y轴上重叠，则添加到当前行
-            if __is_overlaps_y_exceeds_threshold(span['bbox'],
+            if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], 0.6):
-                                                 current_line[-1]['bbox']):
                current_line.append(span)
            else:
                # 否则，开始新行