Commit 826086d2 authored by zhougaofeng's avatar zhougaofeng
Browse files

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc,...

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/__init__.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, 
magic_pdf/libs/vis_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pp_structure_v2.py, 
magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pipe/__init__.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/detect_para.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, 
magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/rw/__init__.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py files
parent 57aaa1cf
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in, _is_part_overlap
from magic_pdf.libs.drop_reason import DropReason
def _remove_overlap_between_bbox(bbox1, bbox2):
    """Shrink two partially overlapping bboxes apart along one axis.

    The axis with the *smaller* overlap is split at the midpoint of the
    overlapping edges, pulling each box back by 0.25 from the split line.
    Returns (new_bbox1, new_bbox2, drop_reason); drop_reason is None on
    success, or DropReason.NEGATIVE_BBOX_AREA when shrinking would create
    a degenerate (zero/negative area) box, in which case the original
    boxes are returned untouched.
    """
    if not _is_part_overlap(bbox1, bbox2):
        return bbox1, bbox2, None
    a_x0, a_y0, a_x1, a_y1 = bbox1
    b_x0, b_y0, b_x1, b_y1 = bbox2
    overlap_w = min(b_x1, a_x1) - max(b_x0, a_x0)
    overlap_h = min(b_y1, a_y1) - max(b_y0, a_y0)
    if overlap_h > overlap_w:
        # Overlap is taller than wide -> separate the boxes horizontally.
        if b_x1 >= a_x1:
            mid = (b_x0 + a_x1) // 2
            a_x1 = min(mid - 0.25, a_x1)
            b_x0 = max(mid + 0.25, b_x0)
        else:
            mid = (a_x0 + b_x1) // 2
            a_x0 = max(mid + 0.25, a_x0)
            b_x1 = min(mid - 0.25, b_x1)
    else:
        # Otherwise separate them vertically.
        if b_y1 >= a_y1:
            mid = (b_y0 + a_y1) // 2
            b_y0 = max(mid + 0.25, b_y0)
            a_y1 = min(a_y1, mid - 0.25)
        else:
            mid = (a_y0 + b_y1) // 2
            b_y1 = min(b_y1, mid - 0.25)
            a_y0 = max(mid + 0.25, a_y0)
    both_valid = a_x1 > a_x0 and a_y1 > a_y0 and b_y1 > b_y0 and b_x1 > b_x0
    if both_valid:
        return [a_x0, a_y0, a_x1, a_y1], [b_x0, b_y0, b_x1, b_y1], None
    return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA
def _remove_overlap_between_bboxes(arr):
    """Resolve pairwise overlaps among scored bboxes.

    Args:
        arr: list of ``{"bbox": [x0, y0, x1, y1], "score": float}`` dicts.

    Returns:
        (res, drop_reasons) where ``res[i]`` is the (possibly shrunk) entry
        kept at position i or None when it was dropped, and ``drop_reasons``
        collects one reason per overlap that could not be resolved by
        shrinking.
    """
    drop_reasons = []
    N = len(arr)
    keeps = [True] * N
    res = [None] * N
    # An entry fully contained in another entry is discarded outright.
    for i in range(N):
        for j in range(N):
            if i == j:
                continue
            if _is_in(arr[i]["bbox"], arr[j]["bbox"]):
                keeps[i] = False
    for idx, v in enumerate(arr):
        if not keeps[idx]:
            continue
        for i in range(N):
            if res[i] is None:
                continue
            bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(v["bbox"], res[i]["bbox"])
            if drop_reason is None:
                # Overlap resolved by shrinking: keep both with updated boxes.
                v["bbox"] = bbox1
                res[i]["bbox"] = bbox2
            else:
                # Unresolvable overlap: keep only the higher-scoring entry.
                if v["score"] > res[i]["score"]:
                    keeps[i] = False
                    res[i] = None
                else:
                    keeps[idx] = False
                # BUGFIX: record the reason itself; the original did
                # `drop_reasons.append(drop_reasons)`, appending the
                # accumulator list into itself instead of the reason.
                drop_reasons.append(drop_reason)
        if keeps[idx]:
            res[idx] = v
    return res, drop_reasons
def remove_overlap_between_bbox_for_span(spans):
    """De-overlap span bboxes in place and return the surviving spans.

    Spans without a "score" key default to 0.1. Returns (kept_spans,
    drop_reasons) as produced by _remove_overlap_between_bboxes.
    """
    candidates = [{"bbox": s["bbox"], "score": s.get("score", 0.1)} for s in spans]
    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)
    kept = []
    for span, entry in zip(spans, resolved):
        if entry is None:
            continue
        # Write the (possibly shrunk) bbox back onto the original span.
        span["bbox"] = entry["bbox"]
        kept.append(span)
    return kept, drop_reasons
def remove_overlap_between_bbox_for_block(all_bboxes):
    """De-overlap block bboxes in place and return the surviving blocks.

    Each entry of all_bboxes is a sequence whose first four items are the
    bbox and whose last item is the score. Returns (kept_blocks,
    drop_reasons).
    """
    candidates = [{"bbox": item[:4], "score": item[-1]} for item in all_bboxes]
    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)
    kept = []
    for block, entry in zip(all_bboxes, resolved):
        if entry is None:
            continue
        # Splice the adjusted coordinates back into the original record.
        block[:4] = entry["bbox"]
        kept.append(block)
    return kept, drop_reasons
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
from loguru import logger
from magic_pdf.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
def __area(box):
    """Return the area of a bbox given as (x0, y0, x1, y1)."""
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height
def rectangle_position_determination(rect, p_width):
    """Decide whether a rectangle sits near the page's vertical midline.

    Args:
        rect (list): rectangle as [x1, y1, x2, y2].
        p_width (int): page width.

    Returns:
        bool: True when the rectangle crosses the midline or its nearer
        edge lies within 20% of the page width from it.
    """
    x_axis = p_width / 2
    # Crossing the midline counts as "near" immediately.
    if rect[0] < x_axis and rect[2] > x_axis:
        return True
    # Otherwise measure the gap between the midline and the nearer edge.
    if rect[0] > x_axis:
        distance = rect[0] - x_axis
    else:
        distance = x_axis - rect[2]
    return distance < p_width * 0.2
def remove_colored_strip_textblock(remain_text_blocks, page):
    """Remove text blocks that sit on colored strip backgrounds.

    Scans the page's vector drawings for filled, non-white, wide-and-flat
    rectangles near the top of the page around the vertical midline, then
    pulls any text block (fully contained, or overlapping by more than 60%
    of the smaller box) out of remain_text_blocks.

    Args:
        remain_text_blocks (list): remaining text blocks (mutated in place).
        page (Page): pymupdf page object.

    Returns:
        tuple: (remain_text_blocks, removed colored_strip_textblocks).
    """
    colored_strip_textblocks = []
    if not remain_text_blocks:
        return remain_text_blocks, colored_strip_textblocks
    p_width, p_height = page.rect.width, page.rect.height
    strip_rects = []
    for drawing in page.get_cdrawings():
        fill = drawing['fill'] if 'fill' in drawing else None
        is_filled = bool(fill) and fill != (1.0, 1.0, 1.0)  # skip transparent/white fills
        rect = drawing['rect']
        big_enough = __area(rect) > 100  # skip tiny rectangles
        near_midline = rectangle_position_determination(rect, p_width)
        in_top_third = rect[3] < p_height * 0.3  # bottom edge above 30% of page height
        wide_and_flat = (rect[2] - rect[0]) > (rect[3] - rect[1]) * 4  # aspect ratio > 4
        if is_filled and big_enough and near_midline and in_top_third and wide_and_flat:
            strip_rects.append(rect)
    for colored_strip_block_bbox in strip_rects:
        for text_block in remain_text_blocks:
            text_bbox = text_block['bbox']
            covered = _is_in(text_bbox, colored_strip_block_bbox) or (
                _is_in_or_part_overlap(text_bbox, colored_strip_block_bbox)
                and calculate_overlap_area_2_minbox_area_ratio(text_bbox, colored_strip_block_bbox) > 0.6)
            if covered:
                logger.info(f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}')
                text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
                colored_strip_textblocks.append(text_block)
    for colored_strip_textblock in colored_strip_textblocks:
        if colored_strip_textblock in remain_text_blocks:
            remain_text_blocks.remove(colored_strip_textblock)
    return remain_text_blocks, colored_strip_textblocks
import re
from magic_pdf.libs.boxbase import _is_in_or_part_overlap
from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
                                   page_no_bboxs, page_w, page_h):
    """
    Remove headers, footers and page numbers from one page.

    Deletion happens at line level; if a text block becomes empty afterwards
    it is moved into the removal list. (The "headder" typo in the name is
    kept for caller compatibility.)

    Returns:
        (image_bbox_remain, table_bbox_remain, text_block_remain,
         text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)
    """
    header = []
    footer = []
    # Merge all model-detected header boxes into one bounding rectangle.
    if len(header) == 0:
        model_header = header_bboxs
        if model_header:
            x0 = min([x for x, _, _, _ in model_header])
            y0 = min([y for _, y, _, _ in model_header])
            x1 = max([x1 for _, _, x1, _ in model_header])
            y1 = max([y1 for _, _, _, y1 in model_header])
            header = [x0, y0, x1, y1]
    # Same merge for footer boxes.
    if len(footer) == 0:
        model_footer = footer_bboxs
        if model_footer:
            x0 = min([x for x, _, _, _ in model_footer])
            y0 = min([y for _, y, _, _ in model_footer])
            x1 = max([x1 for _, _, x1, _ in model_footer])
            y1 = max([y1 for _, _, _, y1 in model_footer])
            footer = [x0, y0, x1, y1]
    # Body content runs from the header's bottom edge to the footer's top edge.
    header_y0 = 0 if len(header) == 0 else header[3]
    footer_y0 = page_h if len(footer) == 0 else footer[1]
    # Page-number boxes tighten those boundaries further.
    if page_no_bboxs:
        top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
        btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]
        top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
        btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h
        header_y0 = max(header_y0, top_max_y0)
        footer_y0 = min(footer_y0, btn_min_y1)
    content_boundry = [0, header_y0, page_w, footer_y0]
    header = [0, 0, page_w, header_y0]
    footer = [0, footer_y0, page_w, page_h]
    """The header/footer boundaries are computed above; deletion starts below."""
    text_block_to_remove = []
    # First inspect every text block.
    for blk in text_raw_blocks:
        if len(blk['lines']) > 0:
            for line in blk['lines']:
                line_del = []
                for span in line['spans']:
                    span_del = []
                    if span['bbox'][3] < header_y0:
                        span_del.append(span)
                    elif _is_in_or_part_overlap(span['bbox'], header) or _is_in_or_part_overlap(span['bbox'], footer):
                        span_del.append(span)
                    # NOTE(review): nesting reconstructed from statement order
                    # (SOURCE indentation was stripped); spans appear to be
                    # removed while line['spans'] is being iterated -- confirm
                    # against the upstream file.
                    for span in span_del:
                        line['spans'].remove(span)
                if not line['spans']:
                    line_del.append(line)
                for line in line_del:
                    blk['lines'].remove(line)
        else:
            # if not blk['lines']:
            blk['tag'] = CONTENT_IN_FOOT_OR_HEADER
            text_block_to_remove.append(blk)
    """Sometimes a pageNo box is so small that it slightly overlaps content_boundry
    and lands in the body text, so page numbers are additionally removed at span
    granularity."""
    page_no_block_2_remove = []
    if page_no_bboxs:
        for pagenobox in page_no_bboxs:
            for block in text_raw_blocks:
                if _is_in_or_part_overlap(pagenobox, block['bbox']):  # delete the page number at span level
                    for line in block['lines']:
                        for span in line['spans']:
                            if _is_in_or_part_overlap(pagenobox, span['bbox']):
                                # span['text'] = ''
                                span['tag'] = PAGE_NO
                                # If this span is the only span of the only line,
                                # drop the whole block as well.
                                if len(line['spans']) == 1 and len(block['lines']) == 1:
                                    page_no_block_2_remove.append(block)
    else:
        # Fallback heuristic: the bottom-most block is a page number when it has
        # exactly one line with one span whose text contains digits, spaces or
        # symbols but no letters.
        if len(text_raw_blocks) > 0:
            text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
            last_block = text_raw_blocks[0]
            if len(last_block['lines']) == 1:
                last_line = last_block['lines'][0]
                if len(last_line['spans']) == 1:
                    last_span = last_line['spans'][0]
                    if last_span['text'].strip() and not re.search('[a-zA-Z]', last_span['text']) and re.search('[0-9]',
                                                                                                               last_span[
                                                                                                                   'text']):
                        last_span['tag'] = PAGE_NO
                        page_no_block_2_remove.append(last_block)
    for b in page_no_block_2_remove:
        text_block_to_remove.append(b)
    for blk in text_block_to_remove:
        if blk in text_raw_blocks:
            text_raw_blocks.remove(blk)
    text_block_remain = text_raw_blocks
    # Images and tables outside the content boundary are dropped as well.
    image_bbox_to_remove = [bbox for bbox in image_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
    image_bbox_remain = [bbox for bbox in image_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
    table_bbox_to_remove = [bbox for bbox in table_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
    table_bbox_remain = [bbox for bbox in table_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
import math
from magic_pdf.libs.boxbase import is_vbox_on_side
from magic_pdf.libs.drop_tag import EMPTY_SIDE_BLOCK, ROTATE_TEXT, VERTICAL_TEXT
def detect_non_horizontal_texts(result_dict):
    """
    Detect watermarks and vertical margin notes in the document.

    Both are found the same way: blocks whose (bbox, text) pair repeats on
    more than half of the top-level entries. The text direction then
    disambiguates the two: a tilted baseline (5-85 degrees) marks a
    watermark, a near-vertical baseline (85-105 degrees) marks a vertical
    margin note.

    Parameters
    ----------
    result_dict : dict
        Parse result; pages live under "page_*" keys, blocks under
        "block_*" keys. Blocks carry "bbox", "text" and optionally "dir"
        (a baseline direction vector).

    Returns
    -------
    result_dict : dict
        The same dictionary with "is_watermark" and
        "is_vertical_margin_note" flags (0/1) added to every block.
    """
    potential_watermarks = {}
    potential_margin_notes = {}
    # Pass 1: count how often each (bbox, text) pair appears with a tilted
    # or vertical baseline across pages.
    for page_id, page_content in result_dict.items():
        if page_id.startswith("page_"):
            for block_id, block_data in page_content.items():
                if block_id.startswith("block_"):
                    if "dir" in block_data:
                        coordinates_text = (block_data["bbox"], block_data["text"])
                        angle = math.atan2(block_data["dir"][1], block_data["dir"][0])
                        angle = abs(math.degrees(angle))
                        if angle > 5 and angle < 85:  # tilted -> watermark candidate
                            potential_watermarks[coordinates_text] = potential_watermarks.get(coordinates_text, 0) + 1
                        if angle > 85 and angle < 105:  # near-vertical -> margin-note candidate
                            potential_margin_notes[coordinates_text] = potential_margin_notes.get(coordinates_text, 0) + 1
    # A candidate is confirmed when it repeats on more than half of the
    # top-level entries (NOTE: len(result_dict) counts non-page keys too).
    watermark_threshold = len(result_dict) // 2
    watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold}
    margin_note_threshold = len(result_dict) // 2
    margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold}
    # Pass 2: flag every block. BUGFIX: only visit "block_*" entries here,
    # mirroring pass 1 -- the original indexed every entry of the page dict,
    # which raised KeyError when a page carried non-block metadata keys.
    for page_id, blocks in result_dict.items():
        if page_id.startswith("page_"):
            for block_id, block_data in blocks.items():
                if block_id.startswith("block_"):
                    coordinates_text = (block_data["bbox"], block_data["text"])
                    block_data["is_watermark"] = 1 if coordinates_text in watermarks else 0
                    block_data["is_vertical_margin_note"] = 1 if coordinates_text in margin_notes else 0
    return result_dict
"""
1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
"""
import re
def __is_a_word(sentence):
    """Return True for a single Chinese character or a short (<= 2 chars)
    alphanumeric token; False otherwise."""
    # A single CJK character counts as a word.
    if re.fullmatch(r'[\u4e00-\u9fa5]', sentence):
        return True
    # So does a purely alphanumeric run of at most two characters.
    if re.fullmatch(r'[a-zA-Z0-9]+', sentence) and len(sentence) <= 2:
        return True
    return False
def __get_text_color(num):
    """Decode a packed 24-bit 0xRRGGBB integer into an (R, G, B) tuple."""
    red = (num >> 16) & 255
    green = (num >> 8) & 255
    blue = num & 255
    return red, green, blue
def __is_empty_side_box(text_block):
    """Return True when a side block has no visible content, i.e. every span
    is either blank or rendered in pure white ink."""
    for line in text_block['lines']:
        for span in line['spans']:
            rgb = __get_text_color(span['color'])
            # Non-empty text in a non-white color means the block is visible.
            if len(span['text'].strip()) > 0 and rgb != (255, 255, 255):
                return False
    return True
def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
    """Remove vertical, watermark-like and rotated text blocks that sit on
    the page sides.

    Removed blocks are tagged (VERTICAL_TEXT or ROTATE_TEXT) and returned
    alongside the filtered list. Input format follows
    test/assets/papre/pymu_textblocks.json.
    """
    removed_text_block = []
    for block in pymu_text_block:
        lines = block['lines']
        # Only blocks hugging the left/right page edges are candidates.
        if not is_vbox_on_side(block['bbox'], page_width, page_height, 0.2):
            continue
        every_line_one_word = all(__is_a_word(line['spans'][0]["text"]) for line in lines if len(line['spans']) > 0)
        every_line_one_span = all(len(line['spans']) == 1 for line in lines)
        if every_line_one_word and len(lines) > 1 and every_line_one_span:
            x_starts = [int(line['spans'][0]['bbox'][0]) for line in lines if len(line['spans']) > 0]
            # Vertically stacked text: at least two lines all sharing the same x0.
            if len(set(x_starts)) == 1 and len(x_starts) > 1:
                block['tag'] = VERTICAL_TEXT
                removed_text_block.append(block)
                continue
        # Any line whose direction is not horizontal taints the whole block.
        if any(line['dir'] != (1, 0) for line in lines):
            block['tag'] = ROTATE_TEXT
            removed_text_block.append(block)
    for block in removed_text_block:
        pymu_text_block.remove(block)
    return pymu_text_block, removed_text_block
def get_side_boundry(rotate_bbox, page_width, page_height):
    """Derive the left/right boundaries of the body text from the removed
    side blocks.

    A block whose right edge is left of the page midline pushes the left
    boundary right; any other block pulls the right boundary left. A
    1-unit margin is added on both sides.
    """
    mid = page_width / 2
    left_edges = [entry['bbox'][2] for entry in rotate_bbox if entry['bbox'][2] < mid]
    right_edges = [entry['bbox'][0] for entry in rotate_bbox if entry['bbox'][2] >= mid]
    left_x = max([0] + left_edges)
    right_x = min([page_width] + right_edges)
    return left_x + 1, right_x - 1
def remove_side_blank_block(pymu_text_block, page_width, page_height):
    """Remove blank blocks (no visible text) sitting on the page sides.

    Removed blocks are tagged EMPTY_SIDE_BLOCK and returned alongside the
    filtered list. Input format follows
    test/assets/papre/pymu_textblocks.json.
    """
    removed_text_block = []
    for block in pymu_text_block:
        # Only blocks hugging the left/right page edges are candidates.
        if not is_vbox_on_side(block['bbox'], page_width, page_height, 0.2):
            continue
        if __is_empty_side_box(block):
            block['tag'] = EMPTY_SIDE_BLOCK
            removed_text_block.append(block)
    for block in removed_text_block:
        pymu_text_block.remove(block)
    return pymu_text_block, removed_text_block
\ No newline at end of file
"""
从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍
1. 首先去掉出现在图片上的bbox,图片包括表格和图片
2. 然后去掉出现在文字blcok上的图片bbox
"""
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
from magic_pdf.libs.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
                                  text_raw_blocks: list):
    """
    Arbitrate overlaps between image/table/equation/text bboxes.

    text_raw_blocks is the structure taken straight from pymupdf; see
    test/assets/papre/pymu_textblocks.json for a sample.
    Current (deliberately blunt) strategy:
    1. Drop equations that sit on images.
    2. Drop equations that sit on tables.
    2. When an image partially overlaps a text block, drop the image first.
    3. When images overlap each other, their bboxes should be adjusted so
       they no longer overlap (not implemented yet -- for now both images
       are set aside).
    4. Drop text fully contained inside images or tables.
    5. Drop text sitting on tables.
    """
    text_block_removed = []
    images_backup = []
    # Remove text blocks that lie on top of an image.
    for image_box in images:
        for text_block in text_raw_blocks:
            text_bbox = text_block["bbox"]
            if _is_in(text_bbox, image_box):
                text_block['tag'] = ON_IMAGE_TEXT
                text_block_removed.append(text_block)
    # Remove text blocks that lie on top of a table.
    for table_box in tables:
        for text_block in text_raw_blocks:
            text_bbox = text_block["bbox"]
            if _is_in(text_bbox, table_box):
                text_block['tag'] = ON_TABLE_TEXT
                text_block_removed.append(text_block)
    for text_block in text_block_removed:
        if text_block in text_raw_blocks:
            text_raw_blocks.remove(text_block)
    # Step 1: drop equation boxes that appear on images.
    temp = []
    for image_box in images:
        for eq1 in interline_equations:
            if _is_in_or_part_overlap(image_box, eq1[:4]):
                temp.append(eq1)
        for eq2 in inline_equations:
            if _is_in_or_part_overlap(image_box, eq2[:4]):
                temp.append(eq2)
    for eq in temp:
        if eq in interline_equations:
            interline_equations.remove(eq)
        if eq in inline_equations:
            inline_equations.remove(eq)
    # Step 2: drop equation boxes that appear on tables.
    temp = []
    for table_box in tables:
        for eq1 in interline_equations:
            if _is_in_or_part_overlap(table_box, eq1[:4]):
                temp.append(eq1)
        for eq2 in inline_equations:
            if _is_in_or_part_overlap(table_box, eq2[:4]):
                temp.append(eq2)
    for eq in temp:
        if eq in interline_equations:
            interline_equations.remove(eq)
        if eq in inline_equations:
            inline_equations.remove(eq)
    # An image overlapping text loses: the image is dropped (backed up).
    for image_box in images:
        for text_block in text_raw_blocks:
            text_bbox = text_block["bbox"]
            if _is_in_or_part_overlap(image_box, text_bbox):
                images_backup.append(image_box)
                break
    for image_box in images_backup:
        images.remove(image_box)
    # Images overlapping each other: exclude both from layout computation for now.
    images_dup_index = []
    for i in range(len(images)):
        for j in range(i + 1, len(images)):
            if _is_in_or_part_overlap(images[i], images[j]):
                images_dup_index.append(i)
                images_dup_index.append(j)
    dup_idx = set(images_dup_index)
    for img_id in dup_idx:
        images_backup.append(images[img_id])
        images[img_id] = None
    images = [img for img in images if img is not None]
    # If an interline equation overlaps a text block, the text block would be
    # stashed here (keeping the equation's size) so it cannot disturb layout
    # computation, and merged back once layout is done. Currently disabled.
    text_block_removed_2 = []
    # for text_block in text_raw_blocks:
    #     text_bbox = text_block["bbox"]
    #     for eq in interline_equations:
    #         ratio = calculate_overlap_area_2_minbox_area_ratio(text_bbox, eq[:4])
    #         if ratio>0.05:
    #             text_block['tag'] = "belong-to-interline-equation"
    #             text_block_removed_2.append(text_block)
    #             break
    # for tb in text_block_removed_2:
    #     if tb in text_raw_blocks:
    #         text_raw_blocks.remove(tb)
    # text_block_removed = text_block_removed + text_block_removed_2
    return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """
    Check for horizontal overlap between body text blocks.

    When two blocks inside the header/footer-clipped region overlap
    left-to-right, an equation was most likely missed by detection, so the
    caller stops processing this PDF. Returns True on overlap.
    """
    if not text_blocks:
        return False
    page_min_y = 0
    page_max_y = max(blk['bbox'][3] for blk in text_blocks)
    # Lower clip boundary derived from header boxes (max y0); page top when none.
    clip_y0 = max([item[1] for item in header]) if len(header) > 0 else page_min_y
    # Upper clip boundary derived from footer boxes (min y1); page bottom when none.
    clip_y1 = min([item[3] for item in footer]) if len(footer) > 0 else page_max_y
    # Keep only blocks fully inside the clipped body region.
    txt_bboxes = [blk["bbox"] for blk in text_blocks
                  if blk["bbox"][1] >= clip_y0 and blk["bbox"][3] <= clip_y1]
    for i, first in enumerate(txt_bboxes):
        for second in txt_bboxes[i + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                return True
    return False
def check_useful_block_horizontal_overlap(useful_blocks: list) -> tuple:
    """
    Check for horizontal overlap between useful blocks.

    Such an overlap usually means an equation was missed by detection, so
    callers abort further processing of this PDF.

    Args:
        useful_blocks: dicts each carrying a "bbox" [x0, y0, x1, y1].

    Returns:
        (overlap_found, smaller_bbox, larger_bbox); the bbox slots are None
        when no overlap exists. (Annotation fixed: the original declared
        ``-> bool`` while returning a 3-tuple.)
    """
    if len(useful_blocks) == 0:
        # BUGFIX: return the same 3-tuple shape as every other exit path;
        # the original bare `return False` broke callers that unpack
        # three values.
        return False, None, None
    page_min_y = 0
    page_max_y = max(yy['bbox'][3] for yy in useful_blocks)
    useful_bboxes = []
    for text_block in useful_blocks:
        bbox = text_block["bbox"]
        # NOTE(review): with page_min_y == 0 and page_max_y being the global
        # max this filter rejects only boxes with negative y0 -- kept for
        # behavior parity.
        if bbox[1] >= page_min_y and bbox[3] <= page_max_y:
            useful_bboxes.append(bbox)
    for i in range(len(useful_bboxes)):
        for j in range(i + 1, len(useful_bboxes)):
            area_i = (useful_bboxes[i][2] - useful_bboxes[i][0]) * (useful_bboxes[i][3] - useful_bboxes[i][1])
            area_j = (useful_bboxes[j][2] - useful_bboxes[j][0]) * (useful_bboxes[j][3] - useful_bboxes[j][1])
            if _is_left_overlap(useful_bboxes[i], useful_bboxes[j]) or _is_left_overlap(useful_bboxes[j], useful_bboxes[i]):
                # Report the smaller box first, the larger one second.
                if area_i > area_j:
                    return True, useful_bboxes[j], useful_bboxes[i]
                else:
                    return True, useful_bboxes[i], useful_bboxes[j]
    return False, None, None
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:
    """Fix overly large horizontal gaps between inline text fragments.

    Whenever a line occupies the same vertical band (identical int(y0) and
    int(y1)) as the previous line of its block, the two were split from one
    visual row, so a space is prefixed to the later line's first span.
    Operates on the 'preproc_blocks' of every page and returns the mutated
    dict.
    """
    for i in range(len(pdf_info_dict)):
        for block in pdf_info_dict[f'page_{i}']['preproc_blocks']:
            prev_bbox = (0, 0, 0, 0)
            for line in block['lines']:
                _, y0, _, y1 = line['bbox']
                # Same row as the previous line -> glue with a leading space.
                if int(y0) == int(prev_bbox[1]) and int(y1) == int(prev_bbox[3]):
                    head = line['spans'][0]
                    head['text'] = ' ' + head['text']
                prev_bbox = line['bbox']
    return pdf_info_dict
"""
统计处需要跨页、全局性的数据
- 统计出字号从大到小
- 正文区域占比最高的前5
- 正文平均行间距
- 正文平均字间距
- 正文平均字符宽度
- 正文平均字符高度
"""
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models
max_seq_len: 1024
length_aware: False
load_pretrained: True
pretrained: ./models/pytorch_model.bin
tokenizer_config:
path: ./models
datasets:
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
run:
runner: runner_iter
task: unimernet_train
batch_size_train: 64
batch_size_eval: 64
num_workers: 1
iters_per_inner_epoch: 2000
max_iters: 60000
seed: 42
output_dir: "../output/demo"
evaluate: True
test_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when train llm
generate_cfg:
temperature: 0.0
AUG:
DETR: true
CACHE_DIR: ~/cache/huggingface
CUDNN_BENCHMARK: false
DATALOADER:
ASPECT_RATIO_GROUPING: true
FILTER_EMPTY_ANNOTATIONS: false
NUM_WORKERS: 4
REPEAT_THRESHOLD: 0.0
SAMPLER_TRAIN: TrainingSampler
DATASETS:
PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000
PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000
PROPOSAL_FILES_TEST: []
PROPOSAL_FILES_TRAIN: []
TEST:
- scihub_train
TRAIN:
- scihub_train
GLOBAL:
HACK: 1.0
ICDAR_DATA_DIR_TEST: ''
ICDAR_DATA_DIR_TRAIN: ''
INPUT:
CROP:
ENABLED: true
SIZE:
- 384
- 600
TYPE: absolute_range
FORMAT: RGB
MASK_FORMAT: polygon
MAX_SIZE_TEST: 1333
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MIN_SIZE_TRAIN:
- 480
- 512
- 544
- 576
- 608
- 640
- 672
- 704
- 736
- 768
- 800
MIN_SIZE_TRAIN_SAMPLING: choice
RANDOM_FLIP: horizontal
MODEL:
ANCHOR_GENERATOR:
ANGLES:
- - -90
- 0
- 90
ASPECT_RATIOS:
- - 0.5
- 1.0
- 2.0
NAME: DefaultAnchorGenerator
OFFSET: 0.0
SIZES:
- - 32
- - 64
- - 128
- - 256
- - 512
BACKBONE:
FREEZE_AT: 2
NAME: build_vit_fpn_backbone
CONFIG_PATH: ''
DEVICE: cuda
FPN:
FUSE_TYPE: sum
IN_FEATURES:
- layer3
- layer5
- layer7
- layer11
NORM: ''
OUT_CHANNELS: 256
IMAGE_ONLY: true
KEYPOINT_ON: false
LOAD_PROPOSALS: false
MASK_ON: true
META_ARCHITECTURE: VLGeneralizedRCNN
PANOPTIC_FPN:
COMBINE:
ENABLED: true
INSTANCES_CONFIDENCE_THRESH: 0.5
OVERLAP_THRESH: 0.5
STUFF_AREA_LIMIT: 4096
INSTANCE_LOSS_WEIGHT: 1.0
PIXEL_MEAN:
- 127.5
- 127.5
- 127.5
PIXEL_STD:
- 127.5
- 127.5
- 127.5
PROPOSAL_GENERATOR:
MIN_SIZE: 0
NAME: RPN
RESNETS:
DEFORM_MODULATED: false
DEFORM_NUM_GROUPS: 1
DEFORM_ON_PER_STAGE:
- false
- false
- false
- false
DEPTH: 50
NORM: FrozenBN
NUM_GROUPS: 1
OUT_FEATURES:
- res4
RES2_OUT_CHANNELS: 256
RES5_DILATION: 1
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: true
WIDTH_PER_GROUP: 64
RETINANET:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
FOCAL_LOSS_ALPHA: 0.25
FOCAL_LOSS_GAMMA: 2.0
IN_FEATURES:
- p3
- p4
- p5
- p6
- p7
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.4
- 0.5
NMS_THRESH_TEST: 0.5
NORM: ''
NUM_CLASSES: 10
NUM_CONVS: 4
PRIOR_PROB: 0.01
SCORE_THRESH_TEST: 0.05
SMOOTH_L1_LOSS_BETA: 0.1
TOPK_CANDIDATES_TEST: 1000
ROI_BOX_CASCADE_HEAD:
BBOX_REG_WEIGHTS:
- - 10.0
- 10.0
- 5.0
- 5.0
- - 20.0
- 20.0
- 10.0
- 10.0
- - 30.0
- 30.0
- 15.0
- 15.0
IOUS:
- 0.5
- 0.6
- 0.7
ROI_BOX_HEAD:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 10.0
- 10.0
- 5.0
- 5.0
CLS_AGNOSTIC_BBOX_REG: true
CONV_DIM: 256
FC_DIM: 1024
NAME: FastRCNNConvFCHead
NORM: ''
NUM_CONV: 0
NUM_FC: 2
POOLER_RESOLUTION: 7
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
SMOOTH_L1_BETA: 0.0
TRAIN_ON_PRED_BOXES: false
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 512
IN_FEATURES:
- p2
- p3
- p4
- p5
IOU_LABELS:
- 0
- 1
IOU_THRESHOLDS:
- 0.5
NAME: CascadeROIHeads
NMS_THRESH_TEST: 0.5
NUM_CLASSES: 10
POSITIVE_FRACTION: 0.25
PROPOSAL_APPEND_GT: true
SCORE_THRESH_TEST: 0.05
ROI_KEYPOINT_HEAD:
CONV_DIMS:
- 512
- 512
- 512
- 512
- 512
- 512
- 512
- 512
LOSS_WEIGHT: 1.0
MIN_KEYPOINTS_PER_IMAGE: 1
NAME: KRCNNConvDeconvUpsampleHead
NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true
NUM_KEYPOINTS: 17
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
ROI_MASK_HEAD:
CLS_AGNOSTIC_MASK: false
CONV_DIM: 256
NAME: MaskRCNNConvUpsampleHead
NORM: ''
NUM_CONV: 4
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
RPN:
BATCH_SIZE_PER_IMAGE: 256
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
BOUNDARY_THRESH: -1
CONV_DIMS:
- -1
HEAD_NAME: StandardRPNHead
IN_FEATURES:
- p2
- p3
- p4
- p5
- p6
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.3
- 0.7
LOSS_WEIGHT: 1.0
NMS_THRESH: 0.7
POSITIVE_FRACTION: 0.5
POST_NMS_TOPK_TEST: 1000
POST_NMS_TOPK_TRAIN: 2000
PRE_NMS_TOPK_TEST: 1000
PRE_NMS_TOPK_TRAIN: 2000
SMOOTH_L1_BETA: 0.0
SEM_SEG_HEAD:
COMMON_STRIDE: 4
CONVS_DIM: 128
IGNORE_VALUE: 255
IN_FEATURES:
- p2
- p3
- p4
- p5
LOSS_WEIGHT: 1.0
NAME: SemSegFPNHead
NORM: GN
NUM_CLASSES: 10
VIT:
DROP_PATH: 0.1
IMG_SIZE:
- 224
- 224
NAME: layoutlmv3_base
OUT_FEATURES:
- layer3
- layer5
- layer7
- layer11
POS_TYPE: abs
WEIGHTS:
OUTPUT_DIR:
SCIHUB_DATA_DIR_TRAIN: ~/publaynet/layout_scihub/train
SEED: 42
SOLVER:
AMP:
ENABLED: true
BACKBONE_MULTIPLIER: 1.0
BASE_LR: 0.0002
BIAS_LR_FACTOR: 1.0
CHECKPOINT_PERIOD: 2000
CLIP_GRADIENTS:
CLIP_TYPE: full_model
CLIP_VALUE: 1.0
ENABLED: true
NORM_TYPE: 2.0
GAMMA: 0.1
GRADIENT_ACCUMULATION_STEPS: 1
IMS_PER_BATCH: 32
LR_SCHEDULER_NAME: WarmupCosineLR
MAX_ITER: 20000
MOMENTUM: 0.9
NESTEROV: false
OPTIMIZER: ADAMW
REFERENCE_WORLD_SIZE: 0
STEPS:
- 10000
WARMUP_FACTOR: 0.01
WARMUP_ITERS: 333
WARMUP_METHOD: linear
WEIGHT_DECAY: 0.05
WEIGHT_DECAY_BIAS: null
WEIGHT_DECAY_NORM: 0.0
TEST:
AUG:
ENABLED: false
FLIP: true
MAX_SIZE: 4000
MIN_SIZES:
- 400
- 500
- 600
- 700
- 800
- 900
- 1000
- 1100
- 1200
DETECTIONS_PER_IMAGE: 100
EVAL_PERIOD: 1000
EXPECTED_RESULTS: []
KEYPOINT_OKS_SIGMAS: []
PRECISE_BN:
ENABLED: false
NUM_ITER: 200
VERSION: 2
VIS_PERIOD: 0
config:
device: cpu
layout: True
formula: False
table_config:
model: TableMaster
is_table_recog_enable: False
max_time: 400
weights:
layout: Layout/model_final.pth
mfd: MFD/weights.pt
mfr: MFR/UniMERNet
struct_eqtable: TabRec/StructEqTable
TableMaster: TabRec/TableMaster
from abc import ABC, abstractmethod
class AbsReaderWriter(ABC):
    """Abstract interface for reading and writing documents from a backing store.

    Concrete implementations (disk, S3, ...) must support both a text mode and
    a binary mode, selected via the ``MODE_TXT`` / ``MODE_BIN`` constants.
    """

    # Mode constants accepted by read()/write(); values are part of the public API.
    MODE_TXT = "text"
    MODE_BIN = "binary"

    @abstractmethod
    def read(self, path: str, mode=MODE_TXT):
        """Return the full content at *path*, as str (text) or bytes (binary)."""
        raise NotImplementedError

    @abstractmethod
    def write(self, content: str, path: str, mode=MODE_TXT):
        """Store *content* at *path* using the given mode."""
        raise NotImplementedError

    @abstractmethod
    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Return up to *limit* bytes starting at *offset* (to the end when None)."""
        raise NotImplementedError
import os
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from loguru import logger
class DiskReaderWriter(AbsReaderWriter):
    """Reader/writer backed by the local filesystem.

    Relative paths are resolved against ``parent_path``; absolute paths are
    used as-is.
    """

    def __init__(self, parent_path, encoding="utf-8"):
        # Base directory against which relative paths are resolved.
        self.path = parent_path
        # Text encoding used for MODE_TXT reads and writes.
        self.encoding = encoding

    def _abs_path(self, path):
        """Resolve *path* against the configured parent directory."""
        return path if os.path.isabs(path) else os.path.join(self.path, path)

    def read(self, path, mode=AbsReaderWriter.MODE_TXT):
        """Read a file and return str (MODE_TXT) or bytes (MODE_BIN).

        Raises:
            Exception: if the file does not exist.
            ValueError: if *mode* is not one of the supported modes.
        """
        abspath = self._abs_path(path)
        if not os.path.exists(abspath):
            # Fixed message wording ("no exists" -> "does not exist").
            logger.error(f"file {abspath} does not exist")
            raise Exception(f"file {abspath} does not exist")
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "r", encoding=self.encoding) as f:
                return f.read()
        if mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "rb") as f:
                return f.read()
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, path, mode=AbsReaderWriter.MODE_TXT):
        """Write *content* to a file, creating parent directories as needed."""
        abspath = self._abs_path(path)
        directory_path = os.path.dirname(abspath)
        if directory_path:
            # exist_ok=True avoids the check-then-create race of the previous
            # os.path.exists() guard when several writers run concurrently.
            os.makedirs(directory_path, exist_ok=True)
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
                f.write(content)
        elif mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "wb") as f:
                f.write(content)
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def read_offset(self, path: str, offset=0, limit=None):
        """Read up to *limit* bytes starting at *offset* (all remaining if None)."""
        abspath = self._abs_path(path)
        with open(abspath, "rb") as f:
            f.seek(offset)
            # file.read(None) reads to EOF, matching the limit=None contract.
            return f.read(limit)
if __name__ == "__main__":
    # Manual smoke tests; flip the if 0 / if 1 switches to pick a scenario.
    if 0:
        # Round-trip a small payload under a Windows-style parent directory.
        sample_path = "io/test/example.txt"
        writer = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
        # Write content to the file.
        writer.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
        # Read the content back.
        payload = writer.read(path=sample_path)
        if payload:
            logger.info(f"从 {sample_path} 读取的内容: {payload}")
    if 1:
        # Partial reads against a fixture file prepared out-of-band.
        reader = DiskReaderWriter("/opt/data/pdf/resources/test/io/")
        chunk = reader.read_offset("1.txt")
        assert chunk == b"ABCD!"
        chunk = reader.read_offset("1.txt", offset=1, limit=2)
        assert chunk == b"BC"
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
import boto3
from loguru import logger
from botocore.config import Config
class S3ReaderWriter(AbsReaderWriter):
    """Reader/writer backed by an S3-compatible object store."""

    def __init__(
        self,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = "auto",
        parent_path: str = "",
    ):
        self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
        # Prefix joined to relative keys (full s3:// URIs bypass it).
        self.path = parent_path

    def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
        """Build a boto3 S3 client with retries and the requested addressing style."""
        return boto3.client(
            service_name="s3",
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={"addressing_style": addressing_style},
                retries={"max_attempts": 5, "mode": "standard"},
            ),
        )

    def _full_path(self, rel_path):
        """Return *rel_path* unchanged when already an s3:// URI, else join with the parent path."""
        if rel_path.startswith("s3://"):
            return rel_path
        return join_path(self.path, rel_path)

    def read(self, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Fetch an object; return text (MODE_TXT, decoded with *encoding*) or raw bytes (MODE_BIN)."""
        bucket_name, key = parse_bucket_key(self._full_path(s3_relative_path))
        body = self.client.get_object(Bucket=bucket_name, Key=key)["Body"].read()
        if mode == AbsReaderWriter.MODE_TXT:
            return body.decode(encoding)
        if mode == AbsReaderWriter.MODE_BIN:
            return body
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Upload *content*; text is encoded with *encoding*, binary is sent as-is."""
        s3_path = self._full_path(s3_relative_path)
        if mode == AbsReaderWriter.MODE_TXT:
            payload = content.encode(encoding)
        elif mode == AbsReaderWriter.MODE_BIN:
            payload = content
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, key = parse_bucket_key(s3_path)
        self.client.put_object(Body=payload, Bucket=bucket_name, Key=key)
        logger.info(f"内容已写入 {s3_path} ")

    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Range-read bytes [offset, offset+limit); reads to end-of-object when *limit* is falsy."""
        bucket_name, key = parse_bucket_key(self._full_path(path))
        if limit:
            range_header = f"bytes={offset}-{offset+limit-1}"
        else:
            range_header = f"bytes={offset}-"
        res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
        return res["Body"].read()
if __name__ == "__main__":
    # Manual smoke tests; flip the if 0 / if 1 switches to pick a scenario.
    if 0:
        # Config the connection info
        ak = ""
        sk = ""
        endpoint_url = ""
        addressing_style = "auto"
        bucket_name = ""
        # Create an S3ReaderWriter object
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, addressing_style, "s3://bucket_name/"
        )
        # Write text data to S3
        text_data = "This is some text data"
        s3_reader_writer.write(
            text_data,
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_TXT,
        )
        # Read text data from S3
        text_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_TXT,
        )
        logger.info(f"Read text data from S3: {text_data_read}")
        # Write binary data to S3
        binary_data = b"This is some binary data"
        s3_reader_writer.write(
            binary_data,  # bug fix: previously uploaded text_data in the binary stanza
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_BIN,
        )
        # Read binary data from S3
        binary_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_BIN,
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")
        # Range Read text data from S3
        binary_data_read = s3_reader_writer.read_offset(
            path=f"s3://{bucket_name}/ebook/test/test.json", offset=0, limit=10
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")
    if 1:
        import os
        import json

        # Connection details come from the environment for this scenario.
        ak = os.getenv("AK", "")
        sk = os.getenv("SK", "")
        endpoint_url = os.getenv("ENDPOINT", "")
        bucket = os.getenv("S3_BUCKET", "")
        prefix = os.getenv("S3_PREFIX", "")
        key_basename = os.getenv("S3_KEY_BASENAME", "")
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, "auto", f"s3://{bucket}/{prefix}"
        )
        content_bin = s3_reader_writer.read_offset(key_basename)
        assert content_bin[:10] == b'{"track_id'
        assert content_bin[-10:] == b'r":null}}\n'
        content_bin = s3_reader_writer.read_offset(key_basename, offset=424, limit=426)
        # NOTE(review): json.dumps on a str merely re-quotes it; json.loads may
        # have been intended here -- confirm before changing, since the sliced
        # range is not guaranteed to be a complete JSON document.
        jso = json.dumps(content_bin.decode("utf-8"))
        print(jso)
from loguru import logger
from magic_pdf.libs.drop_reason import DropReason
def get_data_source(jso: dict):
    """Return the record's data source, falling back to the legacy "file_source" key."""
    source = jso.get("data_source")
    return jso.get("file_source") if source is None else source
def get_data_type(jso: dict):
    """Return the record's data type, falling back to the legacy "file_type" key."""
    dtype = jso.get("data_type")
    return jso.get("file_type") if dtype is None else dtype
def get_bookid(jso: dict):
    """Return the book id, falling back to the legacy "original_file_id" key."""
    bid = jso.get("bookid")
    return jso.get("original_file_id") if bid is None else bid
def exception_handler(jso: dict, e):
    """Log exception *e* and mark *jso* as dropped; the annotated *jso* is returned."""
    logger.exception(e)
    # Flag the record for downstream filtering and record why it was dropped.
    jso["_need_drop"] = True
    jso["_drop_reason"] = DropReason.Exception
    jso["_exception"] = f"ERROR: {e}"
    return jso
def get_bookname(jso: dict):
    """Build the "<data_source>/<file_id>" identifier for a record."""
    return f"{get_data_source(jso)}/{jso.get('file_id')}"
def spark_json_extractor(jso: dict) -> dict:
    """Extract the fields needed downstream from a spark JSON record.

    Returns a dict holding the pdf type and the layout model output.
    Raises KeyError if either required key is missing.
    """
    field_map = (("_pdf_type", "_pdf_type"), ("model_list", "doc_layout_result"))
    return {dst: jso[src] for dst, src in field_map}
import os
from pathlib import Path
import click
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.libs.version import __version__
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.tools.common import do_parse, parse_pdf_methods
@click.command()
@click.version_option(__version__,
                      '--version',
                      '-v',
                      help='display the version and exit')
@click.option(
    '-p',
    '--path',
    'path',
    type=click.Path(exists=True),
    required=True,
    help='local pdf filepath or directory',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='output local directory',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
    default='auto',
)
@click.option(
    '-d',
    '--debug',
    'debug_able',
    type=bool,
    help='Enables detailed debugging information during the execution of the CLI commands.',
    default=False,
)
@click.option(
    '-s',
    '--start',
    'start_page_id',
    type=int,
    help='The starting page for PDF parsing, beginning from 0.',
    default=0,
)
@click.option(
    '-e',
    '--end',
    'end_page_id',
    type=int,
    help='The ending page for PDF parsing, beginning from 0.',
    default=None,
)
def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
    """Parse one PDF file, or every PDF under a directory, into *output_dir*."""
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = 'full'
    os.makedirs(output_dir, exist_ok=True)

    def read_fn(path):
        # Load the whole PDF as raw bytes via the disk reader/writer.
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    def parse_doc(doc_path: str):
        # Parse one document; a failure is logged but does not abort a batch run.
        try:
            file_name = str(Path(doc_path).stem)
            pdf_data = read_fn(doc_path)
            do_parse(
                output_dir,
                file_name,
                pdf_data,
                [],
                method,
                debug_able,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
            )
        except Exception as e:
            logger.exception(e)

    if os.path.isdir(path):
        # Walk the whole tree and parse every PDF found.
        for root, dirs, files in os.walk(path):
            for file in files:
                # Case-insensitive suffix match so files like "A.PDF" are not
                # silently skipped (the previous check matched '.pdf' only).
                if file.lower().endswith('.pdf'):
                    doc_path = os.path.join(root, file)
                    logger.info(f'正在解析:{doc_path}')
                    parse_doc(doc_path)
    else:
        parse_doc(path)
# Script entry point: dispatch to the Click command.
if __name__ == '__main__':
    cli()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment