Commit c9171d1f authored by zhougaofeng
Browse files

Update app.py, count_pdfs.py, LICENSE.md, magic-pdf.template.json,...

Update app.py, count_pdfs.py, LICENSE.md, magic-pdf.template.json, requirements.txt, requirements-docker.txt, requirements-qa.txt, update_version.py, setup.py, magic_pdf/__init__.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_server_72.py, magic_pdf/dict2md/tmp.py, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, 
magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, magic_pdf/libs/vis_utils.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, 
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, 
magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/pdf_client.py, magic_pdf/tools/common.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/cli.py, magic_pdf/tools/pdf_server.py files
parent 748e3b56
Pipeline #1783 canceled with stages
from loguru import logger
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
calculate_iou
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import BlockType
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
                                        title_blocks, interline_equation_blocks, page_w, page_h):
    """Flatten one page's detected blocks into bbox records ready for layout splitting.

    Every record is a 13-slot list
    ``[x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, score]``.

    Args:
        img_blocks: image blocks; each is read through ``['bbox']`` and ``["score"]``.
        table_blocks: table blocks, same access pattern.
        discarded_blocks: discarded blocks, same access pattern.
        text_blocks: text blocks, same access pattern.
        title_blocks: title blocks, same access pattern.
        interline_equation_blocks: interline-equation blocks, same access pattern.
        page_w: page width, used by the footnote width test.
        page_h: page height, used by the footnote position test.

    Returns:
        tuple: ``(all_bboxes, all_discarded_blocks, drop_reasons)``, where
        ``drop_reasons`` is the second value returned by
        ``remove_overlap_between_bbox_for_block``.
    """
    def _as_record(source, block_type):
        # Build the 13-slot record for a single source block.
        x0, y0, x1, y1 = source['bbox']
        return [x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, source["score"]]

    # The order below fixes the initial order of records in all_bboxes.
    typed_sources = (
        (img_blocks, BlockType.Image),
        (table_blocks, BlockType.Table),
        (text_blocks, BlockType.Text),
        (title_blocks, BlockType.Title),
        (interline_equation_blocks, BlockType.InterlineEquation),
    )
    all_bboxes = []
    for sources, block_type in typed_sources:
        for source in sources:
            all_bboxes.append(_as_record(source, block_type))

    # --- resolve nested / conflicting blocks ---
    # Text box vs. title box overlap: trust the text box.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # Any box overlapping a discarded box: trust the discarded box.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
    # Interline equation whose IoU with a text box is close to 1: trust the equation.
    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    # An interline equation that sits inside a much larger text box is dropped
    # later by the "large box contains small box" step below.

    # Every discarded block is kept in all_discarded_blocks. Only those wider
    # than a third of the page, taller than 10 and starting in the lower half
    # of the page are also added to all_bboxes as footnotes, so that they take
    # part in the layout computation.
    all_discarded_blocks = []
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        all_discarded_blocks.append(_as_record(discarded, BlockType.Discarded))
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
            all_bboxes.append(_as_record(discarded, BlockType.Footnote))

    # Whatever large-box-contains-small-box cases remain: absorb the small box.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)

    # Separate the remaining boxes so the later layout split does not fail.
    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
    return all_bboxes, all_discarded_blocks, drop_reasons
def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
    """Drop text records that almost coincide with an interline-equation record.

    A text record is removed when ``calculate_iou`` between its bbox and the
    bbox of any interline-equation record exceeds 0.8, i.e. the equation
    record is trusted over the text record.

    Args:
        all_bboxes: list of bbox records; slots 0-3 hold the bbox and slot 7
            the block type. The list is modified in place.

    Returns:
        The same ``all_bboxes`` list after removal.
    """
    text_records = [record for record in all_bboxes if record[7] == BlockType.Text]
    equation_records = [record for record in all_bboxes if record[7] == BlockType.InterlineEquation]

    doomed = []
    for equation in equation_records:
        for text in text_records:
            nearly_identical = calculate_iou(equation[:4], text[:4]) > 0.8
            # Membership is by value, so equal-valued records are queued once.
            if nearly_identical and text not in doomed:
                doomed.append(text)

    for text in doomed:
        all_bboxes.remove(text)
    return all_bboxes
def fix_text_overlap_title_blocks(all_bboxes):
    """Drop title records that almost coincide with a text record.

    A title record is removed when ``calculate_iou`` between its bbox and the
    bbox of any text record exceeds 0.8, i.e. the text record is trusted.

    Args:
        all_bboxes: list of bbox records; slots 0-3 hold the bbox and slot 7
            the block type. The list is modified in place.

    Returns:
        The same ``all_bboxes`` list after removal.
    """
    texts = [record for record in all_bboxes if record[7] == BlockType.Text]
    titles = [record for record in all_bboxes if record[7] == BlockType.Title]

    shadowed_titles = []
    for text in texts:
        text_box = text[:4]
        for title in titles:
            if calculate_iou(text_box, title[:4]) > 0.8 and title not in shadowed_titles:
                shadowed_titles.append(title)

    for title in shadowed_titles:
        all_bboxes.remove(title)
    return all_bboxes
def remove_need_drop_blocks(all_bboxes, discarded_blocks):
    """Remove every record that is mostly covered by a discarded block.

    A record is removed when ``calculate_overlap_area_in_bbox1_area_ratio``
    (record bbox first, discarded bbox second) exceeds 0.6 for at least one
    discarded block.

    Args:
        all_bboxes: list of bbox records (bbox in slots 0-3); modified in place.
        discarded_blocks: iterable of dicts providing a ``'bbox'``.

    Returns:
        The same ``all_bboxes`` list after removal.
    """
    doomed = []
    for record in all_bboxes:
        record_box = record[:4]
        covered = any(
            calculate_overlap_area_in_bbox1_area_ratio(record_box, discarded['bbox']) > 0.6
            for discarded in discarded_blocks
        )
        # Value-based membership keeps equal-valued records from being queued twice.
        if covered and record not in doomed:
            doomed.append(record)

    for record in doomed:
        all_bboxes.remove(record)
    return all_bboxes
def remove_overlaps_min_blocks(all_bboxes):
    """Absorb the smaller of two heavily overlapping records into the larger one.

    The smaller record is not simply deleted: the surviving record's bbox is
    first grown in place to the union of both boxes, and only then is the
    smaller record removed from the list.

    Args:
        all_bboxes: list of bbox records (bbox in slots 0-3); modified in place.

    Returns:
        The same ``all_bboxes`` list after merging and removal.
    """
    absorbed = []
    for first in all_bboxes:
        for second in all_bboxes:
            if first == second:
                continue
            # Presumably the helper returns the smaller box when the overlap
            # ratio passes 0.8, and None otherwise -- confirm against boxbase.
            smaller_box = get_minbox_if_overlap_by_ratio(first[:4], second[:4], 0.8)
            if smaller_box is None:
                continue
            # First record in the list whose bbox equals the returned box.
            victim = next((record for record in all_bboxes if record[:4] == smaller_box), None)
            if victim is None or victim in absorbed:
                continue
            survivor = second if first == victim else first
            left, top, right, bottom = survivor[:4]
            v_left, v_top, v_right, v_bottom = victim[:4]
            # Grow the survivor so that it covers the record about to be removed.
            survivor[:4] = [
                min(left, v_left),
                min(top, v_top),
                max(right, v_right),
                max(bottom, v_bottom),
            ]
            absorbed.append(victim)

    for victim in absorbed:
        all_bboxes.remove(victim)
    return all_bboxes
import fitz
from magic_pdf.layout.layout_sort import get_bboxes_layout
from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
from magic_pdf.libs.coordinate_transform import get_scale_ratio
def get_center_point(bbox):
    """Return the centre of a bounding box.

    Args:
        bbox (list): ``[x0, y0, x1, y1]`` -- top-left x, top-left y,
            bottom-right x, bottom-right y.

    Returns:
        list: ``[center_x, center_y]``.
    """
    center_x = (bbox[0] + bbox[2]) / 2
    center_y = (bbox[1] + bbox[3]) / 2
    return [center_x, center_y]
def get_area(bbox):
    """Return the area of a bounding box.

    Args:
        bbox (list): ``[x0, y0, x1, y1]`` -- top-left x, top-left y,
            bottom-right x, bottom-right y.

    Returns:
        The product of the box width and height.
    """
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width * height
def adjust_layouts(layout_bboxes, page_boundry, page_id):
    """Pull partially overlapping layout boxes apart, then sort them.

    For every pair flagged by ``_is_part_overlap`` the larger box (by area) is
    shrunk in place so that it stops one unit short of the smaller box. The
    edge to move is picked from the dominant axis of the centre-to-centre
    offset.

    Args:
        layout_bboxes: list of mutable ``[x0, y0, x1, y1]`` boxes; the boxes
            are modified in place.
        page_boundry: passed through to ``get_bboxes_layout``.
        page_id: passed through to ``get_bboxes_layout``.

    Returns:
        tuple: the two values produced by ``get_bboxes_layout``
        (``layout_bboxes, layout_tree``).
    """
    for index, box_a in enumerate(layout_bboxes):
        # Only look at boxes after the current one so each pair is seen once.
        for box_b in layout_bboxes[index + 1:]:
            if not _is_part_overlap(box_a, box_b):
                continue

            area_a = get_area(box_a)
            area_b = get_area(box_b)
            if area_a > area_b:
                bigger, smaller = box_a, box_b
            else:
                bigger, smaller = box_b, box_a

            bigger_cx, bigger_cy = get_center_point(bigger)
            smaller_cx, smaller_cy = get_center_point(smaller)
            offset_x = bigger_cx - smaller_cx
            offset_y = bigger_cy - smaller_cy

            if abs(offset_x) > abs(offset_y):
                # Side-by-side overlap: move the bigger box's left or right edge.
                if offset_x > 0 and bigger[0] < smaller[2]:
                    bigger[0] = smaller[2] + 1
                if offset_x < 0 and bigger[2] > smaller[0]:
                    bigger[2] = smaller[0] - 1
            else:
                # Stacked overlap: move the bigger box's top or bottom edge.
                if offset_y > 0 and bigger[1] < smaller[3]:
                    bigger[1] = smaller[3] + 1
                if offset_y < 0 and bigger[3] > smaller[1]:
                    bigger[3] = smaller[1] - 1

    # get_bboxes_layout is fed 13-slot records; only the bbox slots are filled.
    padded = [[box[0], box[1], box[2], box[3]] + [None] * 9 for box in layout_bboxes]
    layout_bboxes, layout_tree = get_bboxes_layout(padded, page_boundry, page_id)
    return layout_bboxes, layout_tree
def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
    """Turn model layout polygons into sorted, non-nested page layout boxes.

    Args:
        layout_info (list): sub-layout dicts; each has a ``'poly'`` of eight
            numbers, of which items 0/1 and 4/5 are used as (x0, y0) and (x1, y1).
        page (fitz.Page): the page; ``page.rect`` supplies the page boundary.
        ocr_page_info (dict): read for ``['page_info']['page_no']`` and passed
            to ``get_scale_ratio``.

    Returns:
        tuple: ``(layout_bboxes, layout_tree)`` as returned by ``adjust_layouts``.
    """
    page_id = ocr_page_info['page_info']['page_no'] - 1
    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page)

    # Rescale every polygon to an integer [x0, y0, x1, y1] box.
    scaled_boxes = []
    for sub_layout in layout_info:
        x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
        scaled_boxes.append([
            int(x0 / horizontal_scale_ratio),
            int(y0 / vertical_scale_ratio),
            int(x1 / horizontal_scale_ratio),
            int(y1 / vertical_scale_ratio),
        ])

    # Keep only the boxes that no other box contains (per _is_in).
    outermost_boxes = [
        box for index, box in enumerate(scaled_boxes)
        if not any(
            index != other_index and _is_in(box, other)
            for other_index, other in enumerate(scaled_boxes)
        )
    ]

    page_boundry = [0, 0, page.rect.width, page.rect.height]
    layout_bboxes, layout_tree = adjust_layouts(outermost_boxes, page_boundry, page_id)
    return layout_bboxes, layout_tree
from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
_is_in_or_part_overlap_with_area_ratio,
calculate_overlap_area_in_bbox1_area_ratio)
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
from loguru import logger
# Sort the spans inside every line from left to right
def line_sort_spans_by_left_to_right(lines):
    """Order each line's spans by x0 and wrap the line with its enclosing bbox.

    Args:
        lines: list of lines, each a non-empty list of span dicts with a
            ``'bbox'``. Every inner list is sorted in place.

    Returns:
        list: one ``{'bbox': [x0, y0, x1, y1], 'spans': line}`` dict per line,
        where the bbox encloses all spans of that line.
    """
    line_objects = []
    for line_spans in lines:
        line_spans.sort(key=lambda span: span['bbox'][0])
        boxes = [span['bbox'] for span in line_spans]
        enclosing_bbox = [
            min(box[0] for box in boxes),  # x0
            min(box[1] for box in boxes),  # y0
            max(box[2] for box in boxes),  # x1
            max(box[3] for box in boxes),  # y1
        ]
        line_objects.append({'bbox': enclosing_bbox, 'spans': line_spans})
    return line_objects
def merge_spans_to_line(spans):
    """Group spans into lines, scanning from top to bottom.

    Interline-equation, image and table spans always occupy a line of their
    own. Any other span joins the current line only when
    ``__is_overlaps_y_exceeds_threshold`` accepts it against the last span of
    that line.

    Args:
        spans: list of span dicts with ``'bbox'`` and ``'type'``; sorted in
            place by y0.

    Returns:
        list: lines, each a list of spans (not yet sorted left to right).
    """
    def _stands_alone(candidate):
        # Span types that must never share a line with anything else.
        return candidate['type'] in [
            ContentType.InterlineEquation, ContentType.Image, ContentType.Table
        ]

    if len(spans) == 0:
        return []

    spans.sort(key=lambda span: span['bbox'][1])
    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
        joins_current_line = (
            not _stands_alone(span)
            and not any(_stands_alone(member) for member in current_line)
            and __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'])
        )
        if joins_current_line:
            current_line.append(span)
        else:
            # Close the current line and start a new one with this span.
            lines.append(current_line)
            current_line = [span]

    # Flush the last open line (it always holds at least one span).
    lines.append(current_line)
    return lines
def merge_spans_to_line_by_layout(spans, layout_bboxes):
    """Assign spans to layouts, build lines per layout, and tag the leftovers.

    A span belongs to the first layout for which
    ``calculate_overlap_area_in_bbox1_area_ratio`` (span bbox first, layout
    bbox second) exceeds 0.6. Spans that fit no layout are tagged
    ``DropTag.NOT_IN_LAYOUT``.

    Args:
        spans: list of span dicts; assigned spans are removed from it in place.
        layout_bboxes: iterable of dicts providing ``'layout_bbox'``.

    Returns:
        tuple: ``(lines, dropped_spans)`` -- line objects produced by
        ``line_sort_spans_by_left_to_right`` and the tagged leftover spans.
    """
    spans_per_layout = []
    for layout in layout_bboxes:
        region = layout['layout_bbox']
        members = [
            span for span in spans
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], region) > 0.6
        ]
        if members:
            spans_per_layout.append(members)
            # An assigned span must not be offered to any later layout.
            for member in members:
                spans.remove(member)

    lines = []
    for members in spans_per_layout:
        lines.extend(merge_spans_to_line(members))

    # Sort the spans within each line and wrap each line with its bbox.
    lines = line_sort_spans_by_left_to_right(lines)

    dropped_spans = []
    for leftover in spans:
        leftover['tag'] = DropTag.NOT_IN_LAYOUT
        dropped_spans.append(leftover)

    return lines, dropped_spans
def merge_lines_to_block(lines):
    """Wrap every line in its own block.

    No block merging is performed yet: this only sets up the structure, so
    each block holds exactly one line and reuses that line's bbox.

    Args:
        lines: iterable of line dicts providing a ``'bbox'``.

    Returns:
        list: ``{'bbox': line['bbox'], 'lines': [line]}`` for each line.
    """
    return [{'bbox': line['bbox'], 'lines': [line]} for line in lines]
def sort_blocks_by_layout(all_bboxes, layout_bboxes):
    """Order block records by layout, and top to bottom inside each layout.

    A record joins the first layout for which
    ``calculate_overlap_area_in_bbox1_area_ratio`` (record bbox first, layout
    bbox second) exceeds 0.8. Footnote records are never assigned.

    Args:
        all_bboxes: list of bbox records (bbox in slots 0-3, type in slot 7);
            assigned records are removed from it in place.
        layout_bboxes: iterable of dicts providing ``'layout_bbox'``.

    Returns:
        list: all assigned records in their final order.
    """
    records_per_layout = []
    for layout in layout_bboxes:
        region = layout['layout_bbox']
        members = [
            record for record in all_bboxes
            if record[7] != BlockType.Footnote
            and calculate_overlap_area_in_bbox1_area_ratio(record[:4], region) > 0.8
        ]
        if members:
            records_per_layout.append(members)
            # An assigned record must not be offered to any later layout.
            for member in members:
                all_bboxes.remove(member)

    sort_blocks = []
    for members in records_per_layout:
        # Inside one layout, records run top to bottom by y0.
        members.sort(key=lambda record: record[1])
        sort_blocks.extend(members)
    return sort_blocks
def fill_spans_in_blocks(blocks, spans, radio):
    """Distribute spans into blocks according to their position.

    A span goes to the first block for which
    ``calculate_overlap_area_in_bbox1_area_ratio`` (span bbox first, block
    bbox second) exceeds ``radio``.

    Args:
        blocks: iterable of bbox records (bbox in slots 0-3, type in slot 7).
        spans: list of span dicts; claimed spans are removed from it in place.
        radio: overlap-ratio threshold a span has to exceed.

    Returns:
        tuple: ``(block_with_spans, spans)`` -- one
        ``{'type', 'bbox', 'spans'}`` dict per block, plus the input ``spans``
        list reduced to the spans that no block claimed.
    """
    block_with_spans = []
    for record in blocks:
        block_type = record[7]
        block_bbox = record[0:4]
        claimed = [
            span for span in spans
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > radio
        ]
        # Span bboxes are deliberately left untouched here: separating
        # overlapping span boxes would change them and break later fill steps.
        block_with_spans.append({
            'type': block_type,
            'bbox': block_bbox,
            'spans': claimed,
        })
        # A claimed span must not be offered to any later block.
        for span in claimed:
            spans.remove(span)

    return block_with_spans, spans
def fix_block_spans(block_with_spans, img_blocks, table_blocks):
    """Convert span-filled blocks into their final structure.

    1. Image and table blocks are nested because they carry captions and
       footnotes: their text spans are moved into caption / footnote
       sub-blocks of the matching image or table block.
    2. The temporary ``spans`` field is removed from every block.

    Blocks whose type is not image, table, text, title or interline equation
    are left out of the result.

    Args:
        block_with_spans: iterable of ``{'type', 'bbox', 'spans'}`` dicts.
        img_blocks: image block descriptions handed to ``fix_image_block``.
        table_blocks: table block descriptions handed to ``fix_table_block``.

    Returns:
        list: the converted blocks, in input order.
    """
    fix_blocks = []
    for block in block_with_spans:
        block_type = block['type']
        if block_type == BlockType.Image:
            fix_blocks.append(fix_image_block(block, img_blocks))
            continue
        if block_type == BlockType.Table:
            fix_blocks.append(fix_table_block(block, table_blocks))
            continue
        if block_type in [BlockType.Text, BlockType.Title]:
            fix_blocks.append(fix_text_block(block))
            continue
        if block_type == BlockType.InterlineEquation:
            fix_blocks.append(fix_interline_block(block))
        # Any other block type is skipped.
    return fix_blocks
def fix_discarded_block(discarded_block_with_spans):
    """Run ``fix_text_block`` over every discarded block.

    Args:
        discarded_block_with_spans: iterable of blocks carrying a ``'spans'`` field.

    Returns:
        list: the results of ``fix_text_block``, in input order.
    """
    return [fix_text_block(block) for block in discarded_block_with_spans]
def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
    """Build a block from the spans that fall inside ``block_bbox``.

    A span is taken when ``calculate_overlap_area_in_bbox1_area_ratio`` (span
    bbox first, ``block_bbox`` second) exceeds 0.6. ``spans`` itself is not
    modified.

    Args:
        spans: candidate span dicts.
        block_bbox: bbox of the block being built (e.g. a caption area).
        block_type: type stored in the resulting block.

    Returns:
        tuple: ``(block, block_spans)`` -- the ``{'bbox', 'type', 'lines'}``
        block and the list of spans that were placed into it.
    """
    block_spans = [
        span for span in spans
        if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.6
    ]
    # Group into lines, then order the spans of each line left to right.
    sorted_lines = line_sort_spans_by_left_to_right(merge_spans_to_line(block_spans))
    block = {'bbox': block_bbox, 'type': block_type, 'lines': sorted_lines}
    return block, block_spans
def make_body_block(span: dict, block_bbox: list, block_type: str):
    """Create a body block that holds a single line with a single span.

    Args:
        span: the span that forms the body (e.g. the image or table span).
        block_bbox: bbox used for both the block and its only line.
        block_type: type stored in the resulting block.

    Returns:
        dict: ``{'bbox', 'type', 'lines'}`` with exactly one line.
    """
    return {
        'bbox': block_bbox,
        'type': block_type,
        'lines': [{'bbox': block_bbox, 'spans': [span]}],
    }
def fix_image_block(block, img_blocks):
    """Give an image block its body / caption / footnote sub-blocks.

    Only the first entry of ``img_blocks`` accepted by
    ``_is_in_or_part_overlap_with_area_ratio(block['bbox'], entry['bbox'], 0.95)``
    is used. The temporary ``'spans'`` field is always removed.

    Args:
        block: ``{'type', 'bbox', 'spans'}`` dict; modified in place.
        img_blocks: iterable of dicts providing ``'bbox'``, ``'img_body_bbox'``,
            ``'img_caption_bbox'`` and ``'img_footnote_bbox'``.

    Returns:
        The same ``block`` with a ``'blocks'`` list and without ``'spans'``.
    """
    block['blocks'] = []

    matched = next(
        (
            candidate for candidate in img_blocks
            if _is_in_or_part_overlap_with_area_ratio(block['bbox'], candidate['bbox'], 0.95)
        ),
        None,
    )
    if matched is not None:
        # Body: the first image span whose bbox equals the body bbox.
        body_span = next(
            (
                span for span in block['spans']
                if span['type'] == ContentType.Image and matched['img_body_bbox'] == span['bbox']
            ),
            None,
        )
        if body_span is not None:
            block['blocks'].append(
                make_body_block(body_span, matched['img_body_bbox'], BlockType.ImageBody))
            # The body span must not be picked up by the caption or footnote.
            block['spans'].remove(body_span)

        # Caption and footnote exist only when their bbox is present.
        if matched['img_caption_bbox'] is not None:
            caption_block, _ = merge_spans_to_block(
                block['spans'], matched['img_caption_bbox'], BlockType.ImageCaption)
            block['blocks'].append(caption_block)
        if matched['img_footnote_bbox'] is not None:
            footnote_block, _ = merge_spans_to_block(
                block['spans'], matched['img_footnote_bbox'], BlockType.ImageFootnote)
            block['blocks'].append(footnote_block)

    del block['spans']
    return block
def fix_table_block(block, table_blocks):
    """Give a table block its body / caption / footnote sub-blocks.

    Only the first entry of ``table_blocks`` accepted by
    ``_is_in_or_part_overlap_with_area_ratio(block['bbox'], entry['bbox'], 0.95)``
    is used. The temporary ``'spans'`` field is always removed.

    Args:
        block: ``{'type', 'bbox', 'spans'}`` dict; modified in place.
        table_blocks: iterable of dicts providing ``'bbox'``,
            ``'table_body_bbox'``, ``'table_caption_bbox'`` and
            ``'table_footnote_bbox'``.

    Returns:
        The same ``block`` with a ``'blocks'`` list and without ``'spans'``.
    """
    block['blocks'] = []

    matched = next(
        (
            candidate for candidate in table_blocks
            if _is_in_or_part_overlap_with_area_ratio(block['bbox'], candidate['bbox'], 0.95)
        ),
        None,
    )
    if matched is not None:
        # Body: the first table span whose bbox equals the body bbox.
        body_span = next(
            (
                span for span in block['spans']
                if span['type'] == ContentType.Table and matched['table_body_bbox'] == span['bbox']
            ),
            None,
        )
        if body_span is not None:
            block['blocks'].append(
                make_body_block(body_span, matched['table_body_bbox'], BlockType.TableBody))
            # The body span must not be picked up by the caption or footnote.
            block['spans'].remove(body_span)

        if matched['table_caption_bbox'] is not None:
            caption_block, caption_spans = merge_spans_to_block(
                block['spans'], matched['table_caption_bbox'], BlockType.TableCaption)
            block['blocks'].append(caption_block)
            # Caption spans are consumed so the footnote cannot claim them again.
            for span in caption_spans:
                block['spans'].remove(span)

        if matched['table_footnote_bbox'] is not None:
            footnote_block, _ = merge_spans_to_block(
                block['spans'], matched['table_footnote_bbox'], BlockType.TableFootnote)
            block['blocks'].append(footnote_block)

    del block['spans']
    return block
def fix_text_block(block):
    """Replace a text block's ``'spans'`` with sorted ``'lines'``.

    Any equation span found inside a text block is treated as inline, so
    interline-equation spans are retyped to inline equations first.

    Args:
        block: dict carrying a ``'spans'`` list; modified in place.

    Returns:
        The same ``block`` with ``'lines'`` added and ``'spans'`` removed.
    """
    block_spans = block['spans']
    for span in block_spans:
        if span['type'] == ContentType.InterlineEquation:
            span['type'] = ContentType.InlineEquation

    grouped_lines = merge_spans_to_line(block_spans)
    block['lines'] = line_sort_spans_by_left_to_right(grouped_lines)
    del block['spans']
    return block
def fix_interline_block(block):
    """Replace an interline-equation block's ``'spans'`` with sorted ``'lines'``.

    Args:
        block: dict carrying a ``'spans'`` list; modified in place.

    Returns:
        The same ``block`` with ``'lines'`` added and ``'spans'`` removed.
    """
    grouped_lines = merge_spans_to_line(block['spans'])
    block['lines'] = line_sort_spans_by_left_to_right(grouped_lines)
    del block['spans']
    return block
from loguru import logger
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
__is_overlaps_y_exceeds_threshold, calculate_iou
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
def remove_overlaps_low_confidence_spans(spans):
    """Of two nearly coincident spans, drop the one with the lower score.

    Two spans count as coincident when ``calculate_iou`` of their bboxes
    exceeds 0.9. When the scores are equal, the second span of the pair is
    the one dropped.

    Args:
        spans: list of span dicts with ``'bbox'`` and ``'score'``; modified in place.

    Returns:
        tuple: ``(spans, dropped_spans)``; every dropped span is tagged
        ``DropTag.SPAN_OVERLAP``.
    """
    dropped_spans = []
    for left in spans:
        for right in spans:
            if left == right:
                continue
            # A span that is already dropped takes no further part in comparisons.
            if left in dropped_spans or right in dropped_spans:
                continue
            if calculate_iou(left['bbox'], right['bbox']) > 0.9:
                loser = left if left['score'] < right['score'] else right
                if loser is not None and loser not in dropped_spans:
                    dropped_spans.append(loser)

    for loser in dropped_spans:
        spans.remove(loser)
        loser['tag'] = DropTag.SPAN_OVERLAP

    return spans, dropped_spans
def remove_overlaps_min_spans(spans):
    """Of two heavily overlapping spans, drop the smaller one.

    For each pair, ``get_minbox_if_overlap_by_ratio(..., 0.65)`` either yields
    a bbox or None; the first span in ``spans`` whose bbox equals the yielded
    box is dropped.

    Args:
        spans: list of span dicts with a ``'bbox'``; modified in place.

    Returns:
        tuple: ``(spans, dropped_spans)``; every dropped span is tagged
        ``DropTag.SPAN_OVERLAP``.
    """
    dropped_spans = []
    for left in spans:
        for right in spans:
            if left == right:
                continue
            smaller_box = get_minbox_if_overlap_by_ratio(left['bbox'], right['bbox'], 0.65)
            if smaller_box is None:
                continue
            victim = next((span for span in spans if span['bbox'] == smaller_box), None)
            if victim is not None and victim not in dropped_spans:
                dropped_spans.append(victim)

    for victim in dropped_spans:
        spans.remove(victim)
        victim['tag'] = DropTag.SPAN_OVERLAP

    return spans, dropped_spans
def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
    """Remove the spans that fall mostly inside any of the given bboxes.

    A span is removed when ``calculate_overlap_area_in_bbox1_area_ratio``
    (span bbox first, removal bbox second) exceeds 0.5 for at least one
    removal bbox; all other spans are kept.

    Args:
        spans: list of span dicts with a ``'bbox'``; modified in place.
        need_remove_spans_bboxes: iterable of bboxes marking removal areas.

    Returns:
        The same ``spans`` list after removal.
    """
    doomed = []
    for span in spans:
        inside_removed_area = any(
            calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5
            for removed_bbox in need_remove_spans_bboxes
        )
        if inside_removed_area and span not in doomed:
            doomed.append(span)

    for span in doomed:
        spans.remove(span)
    return spans
def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
    """Remove spans per drop tag and tag each removed span with its reason.

    For every ``drop_tag -> bboxes`` entry a span is removed when
    ``calculate_overlap_area_in_bbox1_area_ratio`` (span bbox first) exceeds
    0.5 for one of the bboxes. For ``DropTag.FOOTNOTE`` a span is also removed
    when its centre lies below the bbox's bottom edge and horizontally within
    the bbox.

    Args:
        spans: list of span dicts with a ``'bbox'``; modified in place.
        need_remove_spans_bboxes_dict: mapping of drop tag to a list of bboxes.

    Returns:
        tuple: ``(spans, dropped_spans)``; each dropped span has ``'tag'`` set
        to the drop tag that removed it.
    """
    def _hits(span, drop_tag, removed_bbox):
        # True when this single removal bbox condemns the span.
        span_bbox = span['bbox']
        if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, removed_bbox) > 0.5:
            return True
        if drop_tag == DropTag.FOOTNOTE:
            center_y = (span_bbox[1] + span_bbox[3]) / 2
            if center_y > removed_bbox[3]:
                center_x = (span_bbox[0] + span_bbox[2]) / 2
                return removed_bbox[0] < center_x < removed_bbox[2]
        return False

    dropped_spans = []
    for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
        condemned = [
            span for span in spans
            if any(_hits(span, drop_tag, removed_bbox) for removed_bbox in removed_bboxes)
        ]
        for span in condemned:
            spans.remove(span)
            span['tag'] = drop_tag
            dropped_spans.append(span)

    return spans, dropped_spans
def adjust_bbox_for_standalone_block(spans):
    """Lower the top edge of standalone spans that have text to their left.

    For every interline-equation, image or table span: when a text or
    inline-equation span starts further left and is strictly covered
    vertically by the standalone span, the standalone span's y0 is set to
    that text span's y0.

    Args:
        spans: iterable of span dicts with ``'bbox'`` and ``'type'``; bboxes
            are modified in place.

    Returns:
        The same ``spans`` object.
    """
    for standalone in spans:
        if standalone['type'] not in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
            continue
        for text_span in spans:
            if text_span['type'] not in [ContentType.Text, ContentType.InlineEquation]:
                continue
            # The standalone bbox is re-read on every pass because an earlier
            # match may already have moved its y0.
            covers_vertically = (
                standalone['bbox'][1] < text_span['bbox'][1]
                and standalone['bbox'][3] > text_span['bbox'][3]
            )
            if covers_vertically and text_span['bbox'][0] < standalone['bbox'][0]:
                standalone['bbox'][1] = text_span['bbox'][1]
    return spans
def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
    """Group spans into lines and give all spans of a text line the same y range.

    Nothing is returned; results are delivered by mutating the arguments.

    Args:
        spans: span dicts with ``'bbox'`` and ``"type"``; sorted in place by y0,
            and the bboxes of spans that end up in a text line are rewritten.
        displayed_list: output list; receives every interline-equation, image
            and table span, in scan order.
        text_inline_lines: output list; receives ``(line_spans, (y0, y1))``
            tuples for the lines treated as text lines.
    """
    # Nothing to do for an empty span list.
    if len(spans) == 0:
        pass
    else:
        spans.sort(key=lambda span: span['bbox'][1])

        # NOTE(review): `lines` is filled below but never read or returned.
        lines = []
        current_line = [spans[0]]
        if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
            displayed_list.append(spans[0])

        # Vertical extent (y0, y1) currently attributed to the open line.
        line_first_y0 = spans[0]["bbox"][1]
        line_first_y = spans[0]["bbox"][3]
        # text_inline_lines is later used to search around interline equations.
        for span in spans[1:]:
            # If this span is an interline equation / image / table, or the open
            # line already contains one, the open line is closed.
            if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
                    s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in
                    current_line):
                # Record the standalone span for the caller.
                if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
                    displayed_list.append(span)
                # Start a new line.
                lines.append(current_line)
                # Only multi-span lines, or lines starting with text / inline
                # equation, are recorded as text lines.
                if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
                    text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
                current_line = [span]
                line_first_y0 = span["bbox"][1]
                line_first_y = span["bbox"][3]
                continue

            # If the span overlaps the last span of the open line on the y axis,
            # it joins that line.
            if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
                # A text span overrides the y range attributed to the line.
                # NOTE(review): compares against the literal "text" rather than
                # ContentType.Text used elsewhere -- confirm both are the same value.
                if span["type"] == "text":
                    line_first_y0 = span["bbox"][1]
                    line_first_y = span["bbox"][3]
                current_line.append(span)

            else:
                # Otherwise start a new line.
                lines.append(current_line)
                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
                current_line = [span]
                line_first_y0 = span["bbox"][1]
                line_first_y = span["bbox"][3]

        # Flush the last open line.
        if current_line:
            lines.append(current_line)
            if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
        for line in text_inline_lines:
            # Sort the spans of each text line by x0.
            current_line = line[0]
            current_line.sort(key=lambda span: span['bbox'][0])

        # Give every span of a text line that line's y0 / y1.
        for line in text_inline_lines:
            current_line, (line_first_y0, line_first_y) = line
            for span in current_line:
                span["bbox"][1] = line_first_y0
                span["bbox"][3] = line_first_y

        # Results are delivered through spans, displayed_list and text_inline_lines.
def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
    """Retype wrongly detected interline equations as inline equations.

    Each span of ``displayed_list`` is matched against the text lines in
    ``text_inline_lines``; a matching interline-equation span may be retyped to
    ``ContentType.InlineEquation`` and have its y0 / y1 set to the line's.

    Args:
        spans: returned unchanged by this function (the spans modified here are
            the dicts held in ``displayed_list``).
        displayed_list: standalone spans with ``"bbox"`` and ``"type"``.
        text_inline_lines: ``(line_spans, (y0, y1))`` tuples.

    Returns:
        The ``spans`` argument.
    """
    # `j` is shared by all displayed spans and never reset, so the text lines
    # are walked at most once overall.
    # NOTE(review): this presumably relies on both lists being in the same
    # top-to-bottom order -- confirm with the caller.
    j = 0
    for i in range(len(displayed_list)):
        span = displayed_list[i]
        span_y0, span_y = span["bbox"][1], span["bbox"][3]

        while j < len(text_inline_lines):
            text_line = text_inline_lines[j]
            y0, y1 = text_line[1]
            # The text line starts or ends inside the span's vertical range (or
            # lies entirely within it) and passes the y-overlap threshold helper.
            if (
                    span_y0 < y0 < span_y or span_y0 < y1 < span_y or span_y0 < y0 and span_y > y1
            ) and __is_overlaps_y_exceeds_threshold(
                span['bbox'], (0, y0, 0, y1)
            ):
                # Adjust the equation type.
                if span["type"] == ContentType.InterlineEquation:
                    # The matched text line is the last one: convert directly.
                    if j + 1 >= len(text_inline_lines):
                        span["type"] = ContentType.InlineEquation
                        span["bbox"][1] = y0
                        span["bbox"][3] = y1
                    else:
                        # Do not convert when the equation has several text lines
                        # beside it, or when it is 3x taller than the text line.
                        y0_next, y1_next = text_inline_lines[j + 1][1]
                        if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
                                y1 - y0) > span_y - span_y0:
                            span["type"] = ContentType.InlineEquation
                            span["bbox"][1] = y0
                            span["bbox"][3] = y1
                # This span is settled; go on with the next displayed span.
                break
            elif span_y < y0 or span_y0 < y0 < span_y and not __is_overlaps_y_exceeds_threshold(span['bbox'],
                                                                                                 (0, y0, 0, y1)):
                # The text line starts below the span, or starts inside it without
                # enough overlap: stop scanning for this span.
                break
            else:
                # Neither case applies: advance to the next text line.
                j += 1

    return spans
def get_qa_need_list(blocks):
    """Collect image, table and equation spans from a list of blocks.

    Args:
        blocks: Dicts shaped ``{"lines": [{"spans": [span, ...]}, ...]}`` where
            every span carries a ``"type"`` key.

    Returns:
        ``(images, tables, interline_equations, inline_equations)``: four lists
        holding the original span objects (not copies) in encounter order.
        Spans of any other type are ignored.
    """
    images, tables = [], []
    interline_equations, inline_equations = [], []

    every_span = (
        span
        for block in blocks
        for line in block["lines"]
        for span in line["spans"]
    )
    for span in every_span:
        span_type = span["type"]
        if span_type == ContentType.Image:
            images.append(span)
        elif span_type == ContentType.Table:
            tables.append(span)
        elif span_type == ContentType.InlineEquation:
            inline_equations.append(span)
        elif span_type == ContentType.InterlineEquation:
            interline_equations.append(span)

    return images, tables, interline_equations, inline_equations
def get_qa_need_list_v2(blocks):
    """Split blocks into image, table and interline-equation groups.

    Args:
        blocks: Block dicts, each with a ``"type"`` key.

    Returns:
        ``(images, tables, interline_equations)``: lists of the original block
        objects (not copies) in encounter order. Other block types are skipped.
    """
    images, tables, interline_equations = [], [], []
    for block in blocks:
        block_type = block["type"]
        if block_type == BlockType.Image:
            images.append(block)
        elif block_type == BlockType.Table:
            tables.append(block)
        elif block_type == BlockType.InterlineEquation:
            interline_equations.append(block)
    return images, tables, interline_equations
from magic_pdf.libs.commons import fitz
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
from magic_pdf.libs.drop_reason import DropReason
def __area(box):
    """Return width times height of an ``(x0, y0, x1, y1)`` box."""
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height
def __is_contain_color_background_rect(page:fitz.Page, text_blocks, image_bboxes) -> bool:
    """Check whether the page's largest coloured filled rectangle contains text.

    Args:
        page: Page whose vector drawings are read through ``get_cdrawings()``.
        text_blocks: Dicts with a ``"bbox"`` key.
        image_bboxes: Image boxes; filled rectangles touching any of them are
            ignored so colour patches inside SVG-style images do not count.

    Returns:
        True when the largest qualifying rectangle is wider than 20% of the
        page, taller than 10% of the page, and fully contains at least one
        text block (compared on integer-truncated coordinates).
    """
    color_bg_rect = []
    p_width, p_height = page.rect.width, page.rect.height

    for block in page.get_cdrawings():
        if 'fill' not in block or not block['fill']:  # no fill colour
            continue
        # Skip pure white fills. The previous code compared a list with a
        # tuple, which is never equal, so white rectangles were never skipped.
        fill = tuple(int(channel) for channel in block['fill'][:3])
        if fill == (1, 1, 1):
            continue
        rect = block['rect']
        # Ignore very small rectangles.
        if __area(rect) < 10*10:
            continue
        # Ignore colour patches that belong to an image.
        if any(_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes):
            continue
        color_bg_rect.append(rect)

    if not color_bg_rect:
        return False

    # Only the largest coloured rectangle is examined.
    max_rect = max(color_bg_rect, key=__area)
    is_wide = max_rect[2]-max_rect[0] > 0.2*p_width
    is_tall = max_rect[3]-max_rect[1] > 0.1*p_height
    if not (is_wide and is_tall):
        return False

    max_rect_int = (int(max_rect[0]), int(max_rect[1]), int(max_rect[2]), int(max_rect[3]))
    for text_block in text_blocks:
        box = text_block['bbox']
        box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
        if _is_in(box_int, max_rect_int):
            return True
    return False
def __is_table_overlap_text_block(text_blocks, table_bbox):
    """Return True when ``table_bbox`` contains or partly overlaps any text block.

    Args:
        text_blocks: Dicts with a ``"bbox"`` key.
        table_bbox: The table box to test against each text block.
    """
    return any(
        _is_in_or_part_overlap(table_bbox, text_block['bbox'])
        for text_block in text_blocks
    )
def pdf_filter(page:fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
    """Decide whether a page is acceptable for further processing.

    Args:
        page: The page to inspect.
        text_blocks: Text block dicts of the page.
        table_bboxes: Not used by this function.
        image_bboxes: Image boxes of the page.

    Returns:
        ``(True, None)`` when the page is acceptable, otherwise ``(False, info)``
        where ``info`` carries ``"_need_drop"`` and ``"_drop_reason"``.
    """
    has_colored_text_box = __is_contain_color_background_rect(page, text_blocks, image_bboxes)
    if not has_colored_text_box:
        return True, None

    drop_info = {"_need_drop": True, "_drop_reason": DropReason.COLOR_BACKGROUND_TEXT_BOX}
    return False, drop_info
\ No newline at end of file
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in, _is_part_overlap
from magic_pdf.libs.drop_reason import DropReason
def _remove_overlap_between_bbox(bbox1, bbox2):
    """Shrink two partly overlapping boxes so that they no longer overlap.

    The boxes are separated along the axis on which the overlap is thinner:
    a cut is placed at the floor-divided midpoint of the overlap and each box
    is pulled back by 0.25 on its side of the cut.

    Args:
        bbox1: First box as ``(x0, y0, x1, y1)``.
        bbox2: Second box as ``(x0, y0, x1, y1)``.

    Returns:
        ``(new_bbox1, new_bbox2, drop_reason)``. ``drop_reason`` is None when
        the boxes did not partly overlap (inputs returned unchanged) or when
        both shrunk boxes still have positive width and height. Otherwise the
        inputs are returned unchanged with ``DropReason.NEGATIVE_BBOX_AREA``.
    """
    if not _is_part_overlap(bbox1, bbox2):
        return bbox1, bbox2, None

    ix0, iy0, ix1, iy1 = bbox1
    x0, y0, x1, y1 = bbox2

    overlap_w = min(x1, ix1) - max(x0, ix0)
    overlap_h = min(y1, iy1) - max(y0, iy0)

    if overlap_h > overlap_w:
        # Overlap is taller than wide: separate the boxes horizontally.
        if x1 >= ix1:
            mid = (x0 + ix1) // 2
            ix1 = min(mid - 0.25, ix1)
            x0 = max(mid + 0.25, x0)
        else:
            mid = (ix0 + x1) // 2
            ix0 = max(mid + 0.25, ix0)
            x1 = min(mid - 0.25, x1)
    elif y1 >= iy1:
        # Otherwise separate vertically.
        mid = (y0 + iy1) // 2
        y0 = max(mid + 0.25, y0)
        iy1 = min(iy1, mid-0.25)
    else:
        mid = (iy0 + y1) // 2
        y1 = min(y1, mid-0.25)
        iy0 = max(mid + 0.25, iy0)

    both_positive = ix1 > ix0 and iy1 > iy0 and y1 > y0 and x1 > x0
    if not both_positive:
        return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA
    return [ix0, iy0, ix1, iy1], [x0, y0, x1, y1], None
def _remove_overlap_between_bboxes(arr):
    """Resolve containment and partial overlaps among scored boxes.

    First, every box contained in another box is dropped. The remaining boxes
    are then added one by one; each new box is shrunk against every box
    already kept. When shrinking would give a non-positive area, the box with
    the lower ``"score"`` is dropped and the reason is recorded.

    Args:
        arr: Dicts with ``"bbox"`` and ``"score"`` keys; ``"bbox"`` values are
            replaced in place with the shrunk boxes.

    Returns:
        ``(res, drop_reasons)``. ``res`` has the same length as ``arr`` and
        holds the surviving dict or None at each index. ``drop_reasons`` holds
        one drop reason per failed separation.
    """
    drop_reasons = []
    N = len(arr)
    keeps = [True] * N
    res = [None] * N

    # Pass 1: drop every box that lies inside some other box.
    for i in range(N):
        for j in range(N):
            if i == j:
                continue
            if _is_in(arr[i]["bbox"], arr[j]["bbox"]):
                keeps[i] = False

    # Pass 2: separate each surviving box from the boxes accepted before it.
    for idx, v in enumerate(arr):
        if not keeps[idx]:
            continue
        for i in range(N):
            if res[i] is None:
                continue

            bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(v["bbox"], res[i]["bbox"])
            if drop_reason is None:
                v["bbox"] = bbox1
                res[i]["bbox"] = bbox2
            else:
                # The boxes cannot be separated: keep the higher-scoring one.
                if v["score"] > res[i]["score"]:
                    keeps[i] = False
                    res[i] = None
                else:
                    keeps[idx] = False
                # Record the reason itself. The previous code appended the
                # ``drop_reasons`` list to itself, giving a self-referencing list.
                drop_reasons.append(drop_reason)
        if keeps[idx]:
            res[idx] = v
    return res, drop_reasons
def remove_overlap_between_bbox_for_span(spans):
    """Remove overlaps between span boxes and return the surviving spans.

    Args:
        spans: Span dicts with a ``"bbox"`` key; a missing ``"score"`` is
            treated as 0.1. Surviving spans get their ``"bbox"`` replaced.

    Returns:
        ``(kept_spans, drop_reasons)`` in the original span order.
    """
    candidates = [
        {"bbox": span["bbox"], "score": span.get("score", 0.1)}
        for span in spans
    ]
    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)

    kept_spans = []
    for span, entry in zip(spans, resolved):
        if entry is None:
            continue
        span["bbox"] = entry["bbox"]
        kept_spans.append(span)
    return kept_spans, drop_reasons
def remove_overlap_between_bbox_for_block(all_bboxes):
    """Remove overlaps between block boxes and return the surviving entries.

    Args:
        all_bboxes: Mutable sequences whose first four items are the box
            coordinates and whose last item is the score. The first four
            items of surviving entries are overwritten in place.

    Returns:
        ``(kept_entries, drop_reasons)`` in the original order.
    """
    candidates = [{"bbox": entry[:4], "score": entry[-1]} for entry in all_bboxes]
    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)

    kept_entries = []
    for entry, outcome in zip(all_bboxes, resolved):
        if outcome is None:
            continue
        entry[:4] = outcome["bbox"]
        kept_entries.append(entry)
    return kept_entries, drop_reasons
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
from loguru import logger
from magic_pdf.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
def __area(box):
    """Return the area of an ``(x0, y0, x1, y1)`` box as width times height."""
    horizontal_extent = box[2] - box[0]
    vertical_extent = box[3] - box[1]
    return horizontal_extent * vertical_extent
def rectangle_position_determination(rect, p_width):
    """Tell whether a rectangle lies near the vertical centre line of the page.

    Args:
        rect (list): Rectangle as ``[x1, y1, x2, y2]``.
        p_width (int): Page width.

    Returns:
        bool: True when the rectangle straddles the centre line, or when its
        nearer edge is closer to the centre line than 20% of the page width.
    """
    center_x = p_width / 2

    # A rectangle crossing the centre line always qualifies.
    if rect[0] < center_x and rect[2] > center_x:
        return True

    # Otherwise measure the gap on whichever side the rectangle sits.
    if rect[0] > center_x:
        gap = rect[0] - center_x
    else:
        gap = center_x - rect[2]
    return bool(gap < p_width * 0.2)
def remove_colored_strip_textblock(remain_text_blocks, page):
    """Remove text blocks that sit on a coloured strip near the top of the page.

    A drawing counts as a coloured strip when it has a non-white fill, an area
    above 100, lies near the page's vertical centre line, ends within the top
    30% of the page, and is more than four times as wide as it is tall. A text
    block is removed when it lies inside such a strip, or overlaps it with an
    overlap-to-smaller-box area ratio above 0.6.

    Args:
        remain_text_blocks (list): Text block dicts with a ``"bbox"`` key.
            Matching blocks are tagged and removed from this list in place.
        page (Page): Page object providing ``rect`` and ``get_cdrawings()``.

    Returns:
        tuple: ``(remain_text_blocks, colored_strip_textblocks)``. Each removed
        block appears once in the second list, even when it matches several
        strips (it used to be appended once per matching strip).
    """
    colored_strip_textblocks = []
    if len(remain_text_blocks) == 0:
        return remain_text_blocks, colored_strip_textblocks

    p_width, p_height = page.rect.width, page.rect.height

    colored_strip_bg_rect = []
    for block in page.get_cdrawings():
        is_filled = 'fill' in block and block['fill'] and block['fill'] != (1.0, 1.0, 1.0)  # has a non-white fill
        rect = block['rect']
        area_is_large_enough = __area(rect) > 100  # ignore very small rectangles
        near_center_line = rectangle_position_determination(rect, p_width)
        in_upper_part_of_page = rect[3] < p_height * 0.3  # bottom edge within the top 30%
        aspect_ratio_exceeds_4 = (rect[2] - rect[0]) > (rect[3] - rect[1]) * 4
        if is_filled and area_is_large_enough and near_center_line and in_upper_part_of_page and aspect_ratio_exceeds_4:
            colored_strip_bg_rect.append(rect)

    for colored_strip_block_bbox in colored_strip_bg_rect:
        for text_block in remain_text_blocks:
            # Skip blocks already collected for an earlier strip.
            if any(text_block is seen for seen in colored_strip_textblocks):
                continue
            text_bbox = text_block['bbox']
            if _is_in(text_bbox, colored_strip_block_bbox) or (_is_in_or_part_overlap(text_bbox, colored_strip_block_bbox) and calculate_overlap_area_2_minbox_area_ratio(text_bbox, colored_strip_block_bbox) > 0.6):
                logger.info(f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}')
                text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
                colored_strip_textblocks.append(text_block)

    for colored_strip_textblock in colored_strip_textblocks:
        if colored_strip_textblock in remain_text_blocks:
            remain_text_blocks.remove(colored_strip_textblock)

    return remain_text_blocks, colored_strip_textblocks
import re
from magic_pdf.libs.boxbase import _is_in_or_part_overlap
from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
                                   page_no_bboxs, page_w, page_h):
    """Strip header, footer and page-number content from one page.

    The header and footer bands are derived from the model boxes and widened
    by any page-number boxes. Spans falling in those bands are removed, then
    lines without spans, and blocks left without lines are moved to the
    removed list. Images and tables that do not touch the content area are
    separated out as well.

    Fixes over the previous version:
      * the per-line and per-block deletion lists were re-created inside the
        loops that filled them, so only the last span of a line and the last
        line of a block could ever be removed;
      * only blocks that started with no lines were moved to the removed
        list, although the original docstring says blocks emptied by the
        deletion should be moved too.

    Args:
        text_raw_blocks: Text block dicts (``"bbox"``, ``"lines"`` -> ``"spans"``).
            Modified in place; it is also re-sorted when ``page_no_bboxs`` is empty.
        image_bboxes: Image boxes.
        table_bboxes: Table boxes.
        header_bboxs: Header boxes as 4-tuples ``(x0, y0, x1, y1)``.
        footer_bboxs: Footer boxes as 4-tuples ``(x0, y0, x1, y1)``.
        page_no_bboxs: Page-number boxes.
        page_w: Page width.
        page_h: Page height.

    Returns:
        ``(image_bbox_remain, table_bbox_remain, text_block_remain,
        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)``.
    """
    def _union(bboxes):
        # Smallest box covering every box in ``bboxes``.
        return [
            min([x0 for x0, _, _, _ in bboxes]),
            min([y0 for _, y0, _, _ in bboxes]),
            max([x1 for _, _, x1, _ in bboxes]),
            max([y1 for _, _, _, y1 in bboxes]),
        ]

    header = _union(header_bboxs) if header_bboxs else []
    footer = _union(footer_bboxs) if footer_bboxs else []

    header_y0 = 0 if len(header) == 0 else header[3]
    footer_y0 = page_h if len(footer) == 0 else footer[1]
    if page_no_bboxs:
        # Page numbers in the top half push the header band down, those in
        # the bottom half pull the footer band up.
        top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
        btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]

        top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
        btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h

        header_y0 = max(header_y0, top_max_y0)
        footer_y0 = min(footer_y0, btn_min_y1)

    content_boundry = [0, header_y0, page_w, footer_y0]
    header = [0, 0, page_w, header_y0]
    footer = [0, footer_y0, page_w, page_h]

    def _in_header_or_footer(span):
        bbox = span['bbox']
        return (
            bbox[3] < header_y0
            or _is_in_or_part_overlap(bbox, header)
            or _is_in_or_part_overlap(bbox, footer)
        )

    # The bands are known; prune spans, then empty lines, then empty blocks.
    text_block_to_remove = []
    for blk in text_raw_blocks:
        for line in blk['lines']:
            line['spans'][:] = [span for span in line['spans'] if not _in_header_or_footer(span)]
        blk['lines'][:] = [line for line in blk['lines'] if line['spans']]
        if not blk['lines']:
            blk['tag'] = CONTENT_IN_FOOT_OR_HEADER
            text_block_to_remove.append(blk)

    # A page number can be small enough to overlap the content area slightly
    # and end up in the body text, so page numbers are handled per span.
    page_no_block_2_remove = []
    if page_no_bboxs:
        for pagenobox in page_no_bboxs:
            for block in text_raw_blocks:
                if not _is_in_or_part_overlap(pagenobox, block['bbox']):
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if _is_in_or_part_overlap(pagenobox, span['bbox']):
                            span['tag'] = PAGE_NO
                            # Drop the whole block when this span is all it holds.
                            if len(line['spans']) == 1 and len(block['lines']) == 1:
                                page_no_block_2_remove.append(block)
    elif len(text_raw_blocks) > 0:
        # No page-number boxes: test whether the lowest block is a page
        # number, i.e. one line with one span whose text has a digit and no
        # letters. Blocks emptied above are skipped so they cannot hide it.
        text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
        last_block = next((b for b in text_raw_blocks if b['lines']), None)
        if last_block is not None and len(last_block['lines']) == 1:
            last_line = last_block['lines'][0]
            if len(last_line['spans']) == 1:
                last_span = last_line['spans'][0]
                text = last_span['text']
                if text.strip() and not re.search('[a-zA-Z]', text) and re.search('[0-9]', text):
                    last_span['tag'] = PAGE_NO
                    page_no_block_2_remove.append(last_block)

    text_block_to_remove.extend(page_no_block_2_remove)

    for blk in text_block_to_remove:
        if blk in text_raw_blocks:
            text_raw_blocks.remove(blk)

    text_block_remain = text_raw_blocks
    image_bbox_to_remove = [bbox for bbox in image_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
    image_bbox_remain = [bbox for bbox in image_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
    table_bbox_to_remove = [bbox for bbox in table_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
    table_bbox_remain = [bbox for bbox in table_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]

    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
import math
from magic_pdf.libs.boxbase import is_vbox_on_side
from magic_pdf.libs.drop_tag import EMPTY_SIDE_BLOCK, ROTATE_TEXT, VERTICAL_TEXT
def detect_non_horizontal_texts(result_dict):
    """Flag repeated slanted blocks as watermarks and repeated vertical ones as margin notes.

    Blocks are identified by their ``(bbox, text)`` pair. For every entry
    under a ``"page_*"`` key whose own key starts with ``"block_"`` and that
    has a ``"dir"`` vector, the absolute text angle is computed. Pairs whose
    angle lies strictly between 5 and 85 degrees are counted as watermark
    candidates, and those strictly between 85 and 105 degrees as margin-note
    candidates. A candidate is confirmed when its count exceeds
    ``len(result_dict) // 2``.

    Parameters
    ----------
    result_dict : dict
        Maps page ids to dicts of block ids and block data. Block data needs
        ``"bbox"`` (hashable) and ``"text"``.

    Returns
    -------
    result_dict : dict
        The same dict. Every entry of every ``"page_*"`` dict gains
        ``"is_watermark"`` and ``"is_vertical_margin_note"`` set to 0 or 1.
    """
    watermark_counts = {}
    margin_note_counts = {}

    for page_id, page_content in result_dict.items():
        if not page_id.startswith("page_"):
            continue
        for block_id, block_data in page_content.items():
            if not block_id.startswith("block_") or "dir" not in block_data:
                continue
            key = (block_data["bbox"], block_data["text"])
            direction = block_data["dir"]
            angle = abs(math.degrees(math.atan2(direction[1], direction[0])))

            if 5 < angle < 85:  # slanted text
                watermark_counts[key] = watermark_counts.get(key, 0) + 1
            if 85 < angle < 105:  # vertical text
                margin_note_counts[key] = margin_note_counts.get(key, 0) + 1

    # Both kinds use the same threshold: more than half the top-level entries.
    threshold = len(result_dict) // 2
    watermarks = {key for key, count in watermark_counts.items() if count > threshold}
    margin_notes = {key for key, count in margin_note_counts.items() if count > threshold}

    for page_id, blocks in result_dict.items():
        if not page_id.startswith("page_"):
            continue
        for block_data in blocks.values():
            key = (block_data["bbox"], block_data["text"])
            block_data["is_watermark"] = int(key in watermarks)
            block_data["is_vertical_margin_note"] = int(key in margin_notes)

    return result_dict
"""
1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
"""
import re
def __is_a_word(sentence):
    """Return True for a single CJK character or an ASCII alphanumeric token of at most 2 characters."""
    # Exactly one character in the U+4E00..U+9FA5 range.
    if re.fullmatch(r'[\u4e00-\u9fa5]', sentence):
        return True
    # One or two ASCII letters or digits.
    is_short_alnum = re.fullmatch(r'[a-zA-Z0-9]+', sentence) and len(sentence) <=2
    return bool(is_short_alnum)
def __get_text_color(num):
    """Split a packed colour integer into its ``(red, green, blue)`` byte values."""
    red, green, blue = ((num >> shift) & 255 for shift in (16, 8, 0))
    return red, green, blue
def __is_empty_side_box(text_block):
    """Tell whether a text block shows no visible content.

    A span counts as visible when its text is non-blank and its colour is not
    pure white (255, 255, 255).

    Args:
        text_block: Dict shaped ``{"lines": [{"spans": [{"text", "color"}]}]}``
            where ``"color"`` is a packed RGB integer.

    Returns:
        False as soon as one visible span is found, True otherwise.
    """
    for line in text_block['lines']:
        for span in line['spans']:
            packed = span['color']
            # Unpack red, green and blue from the packed integer.
            rgb = ((packed >> 16) & 255, (packed >> 8) & 255, packed & 255)
            has_text = len(span['text'].strip()) > 0
            if has_text and rgb != (255, 255, 255):
                return False
    return True
def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
    """Remove vertical and rotated text blocks found at the page sides.

    Only blocks accepted by ``is_vbox_on_side(..., 0.2)`` are examined. Such
    a block is tagged ``VERTICAL_TEXT`` when it has more than one line, every
    line holds exactly one span that is a single word, and all those spans
    share the same integer x0. Otherwise it is tagged ``ROTATE_TEXT`` when
    any of its lines has a direction other than ``(1, 0)``.

    Args:
        pymu_text_block: Text block dicts; tagged blocks are removed in place.
        page_width: Page width.
        page_height: Page height.

    Returns:
        ``(pymu_text_block, removed_text_block)``.
    """
    removed_text_block = []
    for block in pymu_text_block:
        lines = block['lines']
        block_bbox = block['bbox']
        # Only blocks at the page sides are candidates.
        if not is_vbox_on_side(block_bbox, page_width, page_height, 0.2):
            continue

        first_spans = [line['spans'][0] for line in lines if len(line['spans'])>0]
        one_word_per_line = (
            all([__is_a_word(span["text"]) for span in first_spans])
            and len(lines)>1
            and all([len(line['spans'])==1 for line in lines])
        )
        if one_word_per_line:
            # Stacked vertically means every first span starts at the same x0,
            # and there are at least two of them.
            left_edges = [int(span['bbox'][0]) for span in first_spans]
            if len(set(left_edges))==1 and len(left_edges)>1:
                block['tag'] = VERTICAL_TEXT
                removed_text_block.append(block)
                continue

        # One non-horizontal line is enough to discard the whole block.
        if any(line['dir']!=(1,0) for line in lines):
            block['tag'] = ROTATE_TEXT
            removed_text_block.append(block)

    for block in removed_text_block:
        pymu_text_block.remove(block)

    return pymu_text_block, removed_text_block
def get_side_boundry(rotate_bbox, page_width, page_height):
    """Derive the left and right body-text limits from side boxes.

    Boxes whose right edge lies left of the page centre push the left limit
    to their right edge; all other boxes pull the right limit to their left
    edge.

    Args:
        rotate_bbox: Dicts with a ``"bbox"`` key.
        page_width: Page width.
        page_height: Not used.

    Returns:
        ``(left_x + 1, right_x - 1)``.
    """
    half_width = page_width/2
    left_edges = [0]
    right_edges = [page_width]
    for item in rotate_bbox:
        box = item['bbox']
        if box[2]<half_width:
            left_edges.append(box[2])
        else:
            right_edges.append(box[0])
    return max(left_edges)+1, min(right_edges)-1
def remove_side_blank_block(pymu_text_block, page_width, page_height):
    """Remove blank text blocks found at the page sides.

    Args:
        pymu_text_block: Text block dicts; blank side blocks are tagged
            ``EMPTY_SIDE_BLOCK`` and removed from this list in place.
        page_width: Page width.
        page_height: Page height.

    Returns:
        ``(pymu_text_block, removed_text_block)``.
    """
    removed_text_block = []
    for block in pymu_text_block:
        block_bbox = block['bbox']
        # Only blocks at the page sides are candidates.
        on_side = is_vbox_on_side(block_bbox, page_width, page_height, 0.2)
        if on_side and __is_empty_side_box(block):
            block['tag'] = EMPTY_SIDE_BLOCK
            removed_text_block.append(block)

    for block in removed_text_block:
        pymu_text_block.remove(block)

    return pymu_text_block, removed_text_block
\ No newline at end of file
"""
从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍
1. 首先去掉出现在图片上的bbox,图片包括表格和图片
2. 然后去掉出现在文字block上的图片bbox
"""
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
from magic_pdf.libs.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
                                  text_raw_blocks: list):
    """Resolve overlaps between images, tables, equations and text blocks.

    The steps run in this order:
      1. text blocks fully inside an image or a table are tagged and removed;
      2. equations touching an image, then those touching a table, are removed;
      3. images touching any remaining text block are moved to the backup list;
      4. images touching each other are all moved to the backup list.

    Args:
        images: Image boxes. Modified in place (step 3 removes entries, step 4
            replaces entries with None); use the returned list instead.
        tables: Table boxes.
        interline_equations: Sequences whose first four items are the box;
            modified in place.
        inline_equations: Same layout as ``interline_equations``; modified in place.
        text_raw_blocks: Text block dicts with a ``"bbox"`` key; modified in place.

    Returns:
        ``(images, tables, interline_equations, inline_equations,
        text_raw_blocks, text_block_removed, images_backup,
        text_block_removed_2)``. The last list is always empty.
    """
    text_block_removed = []
    images_backup = []

    # Step 1: text blocks lying fully inside an image, then inside a table.
    for container_boxes, tag in ((images, ON_IMAGE_TEXT), (tables, ON_TABLE_TEXT)):
        for container_box in container_boxes:
            for text_block in text_raw_blocks:
                if _is_in(text_block["bbox"], container_box):
                    text_block['tag'] = tag
                    text_block_removed.append(text_block)

    for text_block in text_block_removed:
        if text_block in text_raw_blocks:
            text_raw_blocks.remove(text_block)

    # Step 2: equations touching an image first, then those touching a table.
    for container_boxes in (images, tables):
        overlapped = []
        for container_box in container_boxes:
            for equations in (interline_equations, inline_equations):
                overlapped.extend(
                    eq for eq in equations if _is_in_or_part_overlap(container_box, eq[:4])
                )
        for eq in overlapped:
            if eq in interline_equations:
                interline_equations.remove(eq)
            if eq in inline_equations:
                inline_equations.remove(eq)

    # Step 3: an image touching any text block is dropped.
    for image_box in images:
        if any(_is_in_or_part_overlap(image_box, text_block["bbox"]) for text_block in text_raw_blocks):
            images_backup.append(image_box)
    for image_box in images_backup:
        images.remove(image_box)

    # Step 4: images touching each other are all set aside.
    images_dup_index = []
    image_count = len(images)
    for i in range(image_count):
        for j in range(i + 1, image_count):
            if _is_in_or_part_overlap(images[i], images[j]):
                images_dup_index.extend((i, j))

    for img_id in set(images_dup_index):
        images_backup.append(images[img_id])
        images[img_id] = None
    images = [img for img in images if img is not None]

    # Nothing fills this list at present; it is returned empty.
    text_block_removed_2 = []

    return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """Report whether any two text blocks in the clip range overlap horizontally.

    The clip range runs from the largest ``y0`` among ``header`` boxes (0 when
    there are none) to the smallest ``y1`` among ``footer`` boxes (the lowest
    text-block bottom when there are none). Only text blocks entirely within
    that range are compared.

    Args:
        text_blocks: Dicts with a ``"bbox"`` key.
        header: Header boxes as ``(x0, y0, x1, y1)``.
        footer: Footer boxes as ``(x0, y0, x1, y1)``.

    Returns:
        True when ``_is_left_overlap`` holds for some pair in either order.
    """
    if len(text_blocks) == 0:
        return False

    lowest_bottom = max(entry['bbox'][3] for entry in text_blocks)
    clip_y0 = max([item[1] for item in header]) if len(header) > 0 else 0
    clip_y1 = min([item[3] for item in footer]) if len(footer) > 0 else lowest_bottom

    txt_bboxes = [
        text_block["bbox"]
        for text_block in text_blocks
        if text_block["bbox"][1] >= clip_y0 and text_block["bbox"][3] <= clip_y1
    ]

    for i, first in enumerate(txt_bboxes):
        for second in txt_bboxes[i + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                return True
    return False
def check_useful_block_horizontal_overlap(useful_blocks: list) -> tuple:
    """Find the first pair of blocks that overlap horizontally.

    Blocks with a negative ``y0`` are left out of the comparison.

    Args:
        useful_blocks: Dicts with a ``"bbox"`` key.

    Returns:
        ``(True, smaller_bbox, bigger_bbox)`` for the first overlapping pair,
        ordered by area, or ``(False, None, None)`` when no pair overlaps.
        Empty input now returns ``(False, None, None)`` too; it used to return
        a bare ``False``, which broke callers that unpack three values.
    """
    if len(useful_blocks) == 0:
        return False, None, None

    page_min_y = 0
    page_max_y = max(entry['bbox'][3] for entry in useful_blocks)

    useful_bboxes = []
    for text_block in useful_blocks:
        bbox = text_block["bbox"]
        if bbox[1] >= page_min_y and bbox[3] <= page_max_y:
            useful_bboxes.append(bbox)

    for i in range(len(useful_bboxes)):
        for j in range(i + 1, len(useful_bboxes)):
            box_i, box_j = useful_bboxes[i], useful_bboxes[j]
            if _is_left_overlap(box_i, box_j) or _is_left_overlap(box_j, box_i):
                area_i = (box_i[2] - box_i[0]) * (box_i[3] - box_i[1])
                area_j = (box_j[2] - box_j[0]) * (box_j[3] - box_j[1])
                # Report the smaller box first.
                if area_i > area_j:
                    return True, box_j, box_i
                return True, box_i, box_j

    return False, None, None
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:  # blocks come from each page's 'preproc_blocks'
    """Insert a space before text that continues on the same visual row.

    Within each block, when a line's integer-truncated top and bottom match
    those of the line before it, a space is prepended to the text of that
    line's first span.

    Args:
        pdf_info_dict: Maps ``"page_<i>"`` for ``i`` in ``range(len(dict))`` to
            page dicts holding a ``"preproc_blocks"`` list.

    Returns:
        The same dict, with span texts modified in place.
    """
    for page_index in range(len(pdf_info_dict)):
        page_blocks = pdf_info_dict[f'page_{page_index}']['preproc_blocks']
        for block in page_blocks:
            prev_top, prev_bottom = 0, 0
            for line in block['lines']:
                _, top, _, bottom = line['bbox']
                same_row = int(top) == int(prev_top) and int(bottom) == int(prev_bottom)
                if same_row:
                    first_span = line['spans'][0]
                    first_span['text'] = ' ' + first_span['text']
                _, prev_top, _, prev_bottom = line['bbox']
    return pdf_info_dict
"""
统计出需要跨页、全局性的数据
- 统计出字号从大到小
- 正文区域占比最高的前5
- 正文平均行间距
- 正文平均字间距
- 正文平均字符宽度
- 正文平均字符高度
"""
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models
max_seq_len: 1024
length_aware: False
load_pretrained: True
pretrained: ./models/pytorch_model.bin
tokenizer_config:
path: ./models
datasets:
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
run:
runner: runner_iter
task: unimernet_train
batch_size_train: 64
batch_size_eval: 64
num_workers: 1
iters_per_inner_epoch: 2000
max_iters: 60000
seed: 42
output_dir: "../output/demo"
evaluate: True
test_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when train llm
generate_cfg:
temperature: 0.0
AUG:
DETR: true
CACHE_DIR: ~/cache/huggingface
CUDNN_BENCHMARK: false
DATALOADER:
ASPECT_RATIO_GROUPING: true
FILTER_EMPTY_ANNOTATIONS: false
NUM_WORKERS: 4
REPEAT_THRESHOLD: 0.0
SAMPLER_TRAIN: TrainingSampler
DATASETS:
PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000
PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000
PROPOSAL_FILES_TEST: []
PROPOSAL_FILES_TRAIN: []
TEST:
- scihub_train
TRAIN:
- scihub_train
GLOBAL:
HACK: 1.0
ICDAR_DATA_DIR_TEST: ''
ICDAR_DATA_DIR_TRAIN: ''
INPUT:
CROP:
ENABLED: true
SIZE:
- 384
- 600
TYPE: absolute_range
FORMAT: RGB
MASK_FORMAT: polygon
MAX_SIZE_TEST: 1333
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MIN_SIZE_TRAIN:
- 480
- 512
- 544
- 576
- 608
- 640
- 672
- 704
- 736
- 768
- 800
MIN_SIZE_TRAIN_SAMPLING: choice
RANDOM_FLIP: horizontal
MODEL:
ANCHOR_GENERATOR:
ANGLES:
- - -90
- 0
- 90
ASPECT_RATIOS:
- - 0.5
- 1.0
- 2.0
NAME: DefaultAnchorGenerator
OFFSET: 0.0
SIZES:
- - 32
- - 64
- - 128
- - 256
- - 512
BACKBONE:
FREEZE_AT: 2
NAME: build_vit_fpn_backbone
CONFIG_PATH: ''
DEVICE: cuda
FPN:
FUSE_TYPE: sum
IN_FEATURES:
- layer3
- layer5
- layer7
- layer11
NORM: ''
OUT_CHANNELS: 256
IMAGE_ONLY: true
KEYPOINT_ON: false
LOAD_PROPOSALS: false
MASK_ON: true
META_ARCHITECTURE: VLGeneralizedRCNN
PANOPTIC_FPN:
COMBINE:
ENABLED: true
INSTANCES_CONFIDENCE_THRESH: 0.5
OVERLAP_THRESH: 0.5
STUFF_AREA_LIMIT: 4096
INSTANCE_LOSS_WEIGHT: 1.0
PIXEL_MEAN:
- 127.5
- 127.5
- 127.5
PIXEL_STD:
- 127.5
- 127.5
- 127.5
PROPOSAL_GENERATOR:
MIN_SIZE: 0
NAME: RPN
RESNETS:
DEFORM_MODULATED: false
DEFORM_NUM_GROUPS: 1
DEFORM_ON_PER_STAGE:
- false
- false
- false
- false
DEPTH: 50
NORM: FrozenBN
NUM_GROUPS: 1
OUT_FEATURES:
- res4
RES2_OUT_CHANNELS: 256
RES5_DILATION: 1
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: true
WIDTH_PER_GROUP: 64
RETINANET:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
FOCAL_LOSS_ALPHA: 0.25
FOCAL_LOSS_GAMMA: 2.0
IN_FEATURES:
- p3
- p4
- p5
- p6
- p7
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.4
- 0.5
NMS_THRESH_TEST: 0.5
NORM: ''
NUM_CLASSES: 10
NUM_CONVS: 4
PRIOR_PROB: 0.01
SCORE_THRESH_TEST: 0.05
SMOOTH_L1_LOSS_BETA: 0.1
TOPK_CANDIDATES_TEST: 1000
ROI_BOX_CASCADE_HEAD:
BBOX_REG_WEIGHTS:
- - 10.0
- 10.0
- 5.0
- 5.0
- - 20.0
- 20.0
- 10.0
- 10.0
- - 30.0
- 30.0
- 15.0
- 15.0
IOUS:
- 0.5
- 0.6
- 0.7
ROI_BOX_HEAD:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 10.0
- 10.0
- 5.0
- 5.0
CLS_AGNOSTIC_BBOX_REG: true
CONV_DIM: 256
FC_DIM: 1024
NAME: FastRCNNConvFCHead
NORM: ''
NUM_CONV: 0
NUM_FC: 2
POOLER_RESOLUTION: 7
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
SMOOTH_L1_BETA: 0.0
TRAIN_ON_PRED_BOXES: false
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 512
IN_FEATURES:
- p2
- p3
- p4
- p5
IOU_LABELS:
- 0
- 1
IOU_THRESHOLDS:
- 0.5
NAME: CascadeROIHeads
NMS_THRESH_TEST: 0.5
NUM_CLASSES: 10
POSITIVE_FRACTION: 0.25
PROPOSAL_APPEND_GT: true
SCORE_THRESH_TEST: 0.05
ROI_KEYPOINT_HEAD:
CONV_DIMS:
- 512
- 512
- 512
- 512
- 512
- 512
- 512
- 512
LOSS_WEIGHT: 1.0
MIN_KEYPOINTS_PER_IMAGE: 1
NAME: KRCNNConvDeconvUpsampleHead
NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true
NUM_KEYPOINTS: 17
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
ROI_MASK_HEAD:
CLS_AGNOSTIC_MASK: false
CONV_DIM: 256
NAME: MaskRCNNConvUpsampleHead
NORM: ''
NUM_CONV: 4
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
RPN:
BATCH_SIZE_PER_IMAGE: 256
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
BOUNDARY_THRESH: -1
CONV_DIMS:
- -1
HEAD_NAME: StandardRPNHead
IN_FEATURES:
- p2
- p3
- p4
- p5
- p6
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.3
- 0.7
LOSS_WEIGHT: 1.0
NMS_THRESH: 0.7
POSITIVE_FRACTION: 0.5
POST_NMS_TOPK_TEST: 1000
POST_NMS_TOPK_TRAIN: 2000
PRE_NMS_TOPK_TEST: 1000
PRE_NMS_TOPK_TRAIN: 2000
SMOOTH_L1_BETA: 0.0
SEM_SEG_HEAD:
COMMON_STRIDE: 4
CONVS_DIM: 128
IGNORE_VALUE: 255
IN_FEATURES:
- p2
- p3
- p4
- p5
LOSS_WEIGHT: 1.0
NAME: SemSegFPNHead
NORM: GN
NUM_CLASSES: 10
VIT:
DROP_PATH: 0.1
IMG_SIZE:
- 224
- 224
NAME: layoutlmv3_base
OUT_FEATURES:
- layer3
- layer5
- layer7
- layer11
POS_TYPE: abs
WEIGHTS:
OUTPUT_DIR:
SCIHUB_DATA_DIR_TRAIN: ~/publaynet/layout_scihub/train
SEED: 42
SOLVER:
AMP:
ENABLED: true
BACKBONE_MULTIPLIER: 1.0
BASE_LR: 0.0002
BIAS_LR_FACTOR: 1.0
CHECKPOINT_PERIOD: 2000
CLIP_GRADIENTS:
CLIP_TYPE: full_model
CLIP_VALUE: 1.0
ENABLED: true
NORM_TYPE: 2.0
GAMMA: 0.1
GRADIENT_ACCUMULATION_STEPS: 1
IMS_PER_BATCH: 32
LR_SCHEDULER_NAME: WarmupCosineLR
MAX_ITER: 20000
MOMENTUM: 0.9
NESTEROV: false
OPTIMIZER: ADAMW
REFERENCE_WORLD_SIZE: 0
STEPS:
- 10000
WARMUP_FACTOR: 0.01
WARMUP_ITERS: 333
WARMUP_METHOD: linear
WEIGHT_DECAY: 0.05
WEIGHT_DECAY_BIAS: null
WEIGHT_DECAY_NORM: 0.0
TEST:
AUG:
ENABLED: false
FLIP: true
MAX_SIZE: 4000
MIN_SIZES:
- 400
- 500
- 600
- 700
- 800
- 900
- 1000
- 1100
- 1200
DETECTIONS_PER_IMAGE: 100
EVAL_PERIOD: 1000
EXPECTED_RESULTS: []
KEYPOINT_OKS_SIGMAS: []
PRECISE_BN:
ENABLED: false
NUM_ITER: 200
VERSION: 2
VIS_PERIOD: 0
config:
device: cpu
layout: True
formula: False
table_config:
model: TableMaster
is_table_recog_enable: False
max_time: 400
weights:
layout: Layout/model_final.pth
mfd: MFD/weights.pt
mfr: MFR/UniMERNet
struct_eqtable: TabRec/StructEqTable
TableMaster: TabRec/TableMaster
from abc import ABC, abstractmethod
class AbsReaderWriter(ABC):
    """Abstract interface for storage backends that read and write by path."""

    # Mode selectors accepted by read() and write().
    MODE_TXT = "text"
    MODE_BIN = "binary"

    @abstractmethod
    def read(self, path: str, mode=MODE_TXT):
        """Return the content stored at ``path``, interpreted according to ``mode``."""
        raise NotImplementedError()

    @abstractmethod
    def write(self, content: str, path: str, mode=MODE_TXT):
        """Store ``content`` at ``path``, interpreted according to ``mode``."""
        raise NotImplementedError()

    @abstractmethod
    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Return bytes from ``path``, starting at ``offset`` and bounded by ``limit``."""
        raise NotImplementedError()
import os
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from loguru import logger
class DiskReaderWriter(AbsReaderWriter):
    """Reader/writer backed by the local file system.

    Relative paths are resolved against ``parent_path``; absolute paths are
    used as given.
    """

    def __init__(self, parent_path, encoding="utf-8"):
        """Remember the base directory and the text encoding.

        Args:
            parent_path: Directory against which relative paths are resolved.
            encoding: Encoding used for text-mode reads and writes.
        """
        self.path = parent_path
        self.encoding = encoding

    def _resolve(self, path):
        # Absolute paths pass through; relative ones are joined to the base directory.
        return path if os.path.isabs(path) else os.path.join(self.path, path)

    def read(self, path, mode=AbsReaderWriter.MODE_TXT):
        """Read a whole file.

        Args:
            path: Absolute path, or path relative to the base directory.
            mode: ``AbsReaderWriter.MODE_TXT`` or ``AbsReaderWriter.MODE_BIN``.

        Returns:
            ``str`` in text mode, ``bytes`` in binary mode.

        Raises:
            FileNotFoundError: The file does not exist. This is a subclass of
                ``Exception``, which is what used to be raised.
            ValueError: ``mode`` is not one of the two supported modes.
        """
        abspath = self._resolve(path)
        if not os.path.exists(abspath):
            logger.error(f"file {abspath} not exists")
            raise FileNotFoundError(f"file {abspath} no exists")
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "r", encoding=self.encoding) as f:
                return f.read()
        elif mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "rb") as f:
                return f.read()
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, path, mode=AbsReaderWriter.MODE_TXT):
        """Write ``content`` to a file, creating parent directories as needed.

        Args:
            content: ``str`` in text mode, ``bytes`` in binary mode.
            path: Absolute path, or path relative to the base directory.
            mode: ``AbsReaderWriter.MODE_TXT`` or ``AbsReaderWriter.MODE_BIN``.

        Raises:
            ValueError: ``mode`` is not one of the two supported modes.
        """
        abspath = self._resolve(path)
        directory_path = os.path.dirname(abspath)
        # A bare filename has an empty dirname, and makedirs("") raises.
        # exist_ok avoids a race between an existence check and the creation.
        if directory_path:
            os.makedirs(directory_path, exist_ok=True)
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
                f.write(content)
        elif mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "wb") as f:
                f.write(content)
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def read_offset(self, path: str, offset=0, limit=None):
        """Read up to ``limit`` bytes starting at byte ``offset``.

        Args:
            path: Absolute path, or path relative to the base directory.
            offset: Byte position to start reading from.
            limit: Maximum number of bytes; None reads to the end of the file.

        Returns:
            The bytes read.
        """
        abspath = self._resolve(path)
        with open(abspath, "rb") as f:
            f.seek(offset)
            return f.read(limit)
if __name__ == "__main__":
    # Ad-hoc manual checks; flip the literal switches to choose which one runs.
    if 0:
        file_path = "io/test/example.txt"
        # Raw string: the backslashes are path separators. The plain literal
        # held invalid escape sequences; the resulting value is the same.
        drw = DiskReaderWriter(r"D:\projects\papayfork\Magic-PDF\magic_pdf")

        # Write content to the file.
        drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")

        # Read the content back.
        content = drw.read(path=file_path)
        if content:
            logger.info(f"从 {file_path} 读取的内容: {content}")
    if 1:
        # Expects a fixture file 1.txt containing b"ABCD!" in this directory.
        drw = DiskReaderWriter("/opt/data/pdf/resources/test/io/")
        content_bin = drw.read_offset("1.txt")
        assert content_bin == b"ABCD!"

        content_bin = drw.read_offset("1.txt", offset=1, limit=2)
        assert content_bin == b"BC"
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
import boto3
from loguru import logger
from botocore.config import Config
class S3ReaderWriter(AbsReaderWriter):
    """Reader/writer that stores objects in an S3-compatible service via boto3.

    Paths handed to the methods are either full ``s3://`` URIs, which are
    used as-is, or relative paths, which are joined onto ``parent_path``
    with ``join_path``.
    """

    def __init__(
        self,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = "auto",
        parent_path: str = "",
    ):
        """Create the boto3 client and remember the base path.

        Args:
            ak: Access key id handed to boto3.
            sk: Secret access key handed to boto3.
            endpoint_url: Endpoint URL of the S3 service.
            addressing_style: Value for the client's S3 ``addressing_style``
                option.
            parent_path: Prefix joined in front of relative paths.
        """
        self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
        self.path = parent_path

    def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
        """Build a boto3 S3 client with the given credentials and endpoint.

        Retries are configured with ``max_attempts=5`` in ``standard`` mode.
        """
        client_config = Config(
            s3={"addressing_style": addressing_style},
            retries={"max_attempts": 5, "mode": "standard"},
        )
        return boto3.client(
            service_name="s3",
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=client_config,
        )

    def read(self, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Fetch a whole object and return it as text or bytes.

        Args:
            s3_relative_path: Full ``s3://`` URI, or a path relative to
                ``self.path``.
            mode: ``AbsReaderWriter.MODE_TXT`` to decode the body with
                ``encoding``; ``AbsReaderWriter.MODE_BIN`` for raw bytes.
            encoding: Text encoding used in text mode.

        Raises:
            ValueError: If ``mode`` is unsupported. The check happens after
                the object has been downloaded.
        """
        is_full_uri = s3_relative_path.startswith("s3://")
        s3_path = s3_relative_path if is_full_uri else join_path(self.path, s3_relative_path)
        bucket_name, key = parse_bucket_key(s3_path)
        raw = self.client.get_object(Bucket=bucket_name, Key=key)["Body"].read()
        if mode == AbsReaderWriter.MODE_TXT:
            return raw.decode(encoding)  # bytes -> str
        if mode == AbsReaderWriter.MODE_BIN:
            return raw
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Upload ``content`` as an object.

        Args:
            content: ``str`` in text mode, bytes in binary mode.
            s3_relative_path: Full ``s3://`` URI, or a path relative to
                ``self.path``.
            mode: ``AbsReaderWriter.MODE_TXT`` to encode ``content`` with
                ``encoding``; ``AbsReaderWriter.MODE_BIN`` to send it as-is.
            encoding: Text encoding used in text mode.

        Raises:
            ValueError: If ``mode`` is unsupported (nothing is uploaded).
        """
        is_full_uri = s3_relative_path.startswith("s3://")
        s3_path = s3_relative_path if is_full_uri else join_path(self.path, s3_relative_path)
        if mode == AbsReaderWriter.MODE_TXT:
            payload = content.encode(encoding)  # str -> bytes
        elif mode == AbsReaderWriter.MODE_BIN:
            payload = content
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, key = parse_bucket_key(s3_path)
        self.client.put_object(Body=payload, Bucket=bucket_name, Key=key)
        logger.info(f"内容已写入 {s3_path} ")

    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Fetch a byte range of an object using an HTTP ``Range`` header.

        Args:
            path: Full ``s3://`` URI, or a path relative to ``self.path``.
            offset: First byte to read.
            limit: Number of bytes to read. Any falsy value (``None`` or
                ``0``) requests everything from ``offset`` to the end.

        Returns:
            The bytes of the requested range.
        """
        s3_path = path if path.startswith("s3://") else join_path(self.path, path)
        bucket_name, key = parse_bucket_key(s3_path)
        if limit:
            # Range end is inclusive, hence the -1.
            last_byte = offset + limit - 1
            range_header = f"bytes={offset}-{last_byte}"
        else:
            range_header = f"bytes={offset}-"
        response = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
        return response["Body"].read()
# Manual smoke tests for S3ReaderWriter; they need real credentials and a
# reachable S3 endpoint, and only run when this module is executed as a script.
if __name__ == "__main__":
    if 0:
        # Config the connection info
        ak = ""
        sk = ""
        endpoint_url = ""
        addressing_style = "auto"
        bucket_name = ""
        # Create an S3ReaderWriter object rooted at the configured bucket
        # (interpolate the variable instead of the literal text "bucket_name").
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, addressing_style, f"s3://{bucket_name}/"
        )
        # Write text data to S3
        text_data = "This is some text data"
        s3_reader_writer.write(
            text_data,
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_TXT,
        )
        # Read text data from S3
        text_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_TXT,
        )
        logger.info(f"Read text data from S3: {text_data_read}")
        # Write binary data to S3 (binary mode sends the payload as-is, so
        # it must be the bytes object, not the text string).
        binary_data = b"This is some binary data"
        s3_reader_writer.write(
            binary_data,
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_BIN,
        )
        # Read binary data from S3
        binary_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_BIN,
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")
        # Range read from S3
        binary_data_read = s3_reader_writer.read_offset(
            path=f"s3://{bucket_name}/ebook/test/test.json", offset=0, limit=10
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")
    if 1:
        import os
        import json

        # Connection settings and the object under test come from the
        # environment so no secrets live in the source.
        ak = os.getenv("AK", "")
        sk = os.getenv("SK", "")
        endpoint_url = os.getenv("ENDPOINT", "")
        bucket = os.getenv("S3_BUCKET", "")
        prefix = os.getenv("S3_PREFIX", "")
        key_basename = os.getenv("S3_KEY_BASENAME", "")
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, "auto", f"s3://{bucket}/{prefix}"
        )
        # Whole-object read: check the first and last ten bytes of the fixture.
        content_bin = s3_reader_writer.read_offset(key_basename)
        assert content_bin[:10] == b'{"track_id'
        assert content_bin[-10:] == b'r":null}}\n'
        # Ranged read of 426 bytes starting at byte 424.
        content_bin = s3_reader_writer.read_offset(key_basename, offset=424, limit=426)
        # NOTE(review): json.dumps on a str only yields a quoted/escaped JSON
        # string; if the intent was to parse the slice, json.loads is needed.
        jso = json.dumps(content_bin.decode("utf-8"))
        print(jso)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment