Commit 2df265c8 authored by zhougaofeng's avatar zhougaofeng
Browse files

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py,...

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/pdf_parse_union_core_v2.py, magic_pdf/config/__init__.py, magic_pdf/config/enums.py, magic_pdf/config/exceptions.py, magic_pdf/data/__init__.py, magic_pdf/data/schemas.py, magic_pdf/data/dataset.py, magic_pdf/data/utils.py, magic_pdf/data/read_api.py, magic_pdf/data/data_reader_writer/__init__.py, magic_pdf/data/data_reader_writer/base.py, magic_pdf/data/data_reader_writer/filebase.py, magic_pdf/data/data_reader_writer/s3.py, magic_pdf/data/data_reader_writer/multi_bucket_s3.py, magic_pdf/data/io/__init__.py, magic_pdf/data/io/base.py, magic_pdf/data/io/s3.py, magic_pdf/data/io/http.py, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/ocr_vllm_client.py, magic_pdf/dict2md/ocr_vllm_server.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/rag/type.py, magic_pdf/layout/__init__.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/bbox_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/clean_memory.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/language.py, 
magic_pdf/libs/markdown_utils.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/local_math.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/vis_utils.py, magic_pdf/libs/textbase.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/version.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/commons.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, 
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/commons.py, magic_pdf/para/draw.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/raw_processor.py, magic_pdf/para/title_processor.py, magic_pdf/para/para_split.py, magic_pdf/para/denoise.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/para_split_v3.py, magic_pdf/para/stats.py, magic_pdf/para/exceptions.py, magic_pdf/parse/__init__.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/ofd_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_spaces_html.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, 
magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/draw_ofd.py, magic_pdf/rw/ofdtemplate.py, magic_pdf/rw/pdf_parse.py, magic_pdf/rw/draw_pdf.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/file_deal.py, magic_pdf/tools/img_deal.py, magic_pdf/tools/find_seal_img.py, magic_pdf/tools/font_tools.py, magic_pdf/tools/file_parser.py, magic_pdf/tools/parameter_parser.py, magic_pdf/tools/ofd.py, magic_pdf/tools/pdf_server.py, magic_pdf/tools/ofd_parser.py, magic_pdf/utils/__init__.py, magic_pdf/utils/annotations.py files
parent 826086d2
"""
从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍
1. 首先去掉出现在图片上的bbox,图片包括表格和图片
2. 然后去掉出现在文字block上的图片bbox
"""
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
from magic_pdf.libs.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
                                  text_raw_blocks: list):
    """Resolve overlaps between bboxes extracted from the pdf.

    ``text_raw_blocks`` comes straight from pymupdf; see
    test/assets/papre/pymu_textblocks.json for a sample of the structure.

    Current (coarse) strategy:
    1. Drop equations that sit on an image.
    2. Drop equations that sit on a table.
    3. If an image partly overlaps a text block, discard the image.
    4. If two images overlap, both are temporarily excluded from layout
       computation (instead of shrinking their bboxes).
    5. Drop text blocks that lie completely inside an image.
    6. Drop text blocks that lie completely inside a table.

    Returns the (mutated) input lists plus the removed text blocks and the
    backed-up images so callers can restore them after layout.
    """
    text_block_removed = []
    images_backup = []
    removed_text_ids = set()  # identities already queued, to avoid duplicates

    def _queue_text_removal(block, tag):
        # The tag is always (re)assigned so that a later table match
        # overwrites an earlier image tag (original behaviour), but the
        # block itself is appended only once (bug fix: the returned list
        # used to contain duplicates when a block sat on several boxes).
        block['tag'] = tag
        if id(block) not in removed_text_ids:
            removed_text_ids.add(id(block))
            text_block_removed.append(block)

    # Drop text blocks that lie fully inside an image.
    for image_box in images:
        for text_block in text_raw_blocks:
            if _is_in(text_block["bbox"], image_box):
                _queue_text_removal(text_block, ON_IMAGE_TEXT)
    # Drop text blocks that lie fully inside a table.
    for table_box in tables:
        for text_block in text_raw_blocks:
            if _is_in(text_block["bbox"], table_box):
                _queue_text_removal(text_block, ON_TABLE_TEXT)
    for text_block in text_block_removed:
        if text_block in text_raw_blocks:
            text_raw_blocks.remove(text_block)

    def _drop_equations_on(boxes):
        # Collect every equation overlapping any of *boxes* (each equation
        # at most once), then remove them from both equation lists.
        doomed = []
        doomed_ids = set()
        for box in boxes:
            for eq in interline_equations + inline_equations:
                if id(eq) not in doomed_ids and _is_in_or_part_overlap(box, eq[:4]):
                    doomed_ids.add(id(eq))
                    doomed.append(eq)
        for eq in doomed:
            if eq in interline_equations:
                interline_equations.remove(eq)
            if eq in inline_equations:
                inline_equations.remove(eq)

    # Step 1: drop equation boxes that appear on images.
    _drop_equations_on(images)
    # Step 2: drop equation boxes that appear on tables.
    _drop_equations_on(tables)

    # An image that partly overlaps any text block is discarded.
    for image_box in images:
        for text_block in text_raw_blocks:
            if _is_in_or_part_overlap(image_box, text_block["bbox"]):
                images_backup.append(image_box)
                break
    for image_box in images_backup:
        images.remove(image_box)

    # Overlapping images: back both up and exclude them from layout.
    dup_idx = set()
    for i in range(len(images)):
        for j in range(i + 1, len(images)):
            if _is_in_or_part_overlap(images[i], images[j]):
                dup_idx.add(i)
                dup_idx.add(j)
    for img_id in dup_idx:
        images_backup.append(images[img_id])
        images[img_id] = None
    images = [img for img in images if img is not None]

    # Text blocks overlapping interline equations used to be merged/removed
    # here via an IOU check (see git history); kept as an empty list purely
    # for interface compatibility with callers unpacking eight values.
    text_block_removed_2 = []
    return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """Return True if any two text blocks in the main body overlap horizontally.

    Such an overlap usually means an equation was not detected, so the pdf
    should not be processed further. ``header``/``footer`` are lists of
    bbox-like sequences used to clip the vertical band that is examined.
    """
    if not text_blocks:
        return False
    page_top = 0
    page_bottom = max(blk['bbox'][3] for blk in text_blocks)
    # Clip band: below the lowest header edge, above the highest footer edge.
    clip_top = max((item[1] for item in header), default=page_top)
    clip_bottom = min((item[3] for item in footer), default=page_bottom)
    band_bboxes = [
        blk["bbox"]
        for blk in text_blocks
        if blk["bbox"][1] >= clip_top and blk["bbox"][3] <= clip_bottom
    ]
    for idx, box_a in enumerate(band_bboxes):
        for box_b in band_bboxes[idx + 1:]:
            if _is_left_overlap(box_a, box_b) or _is_left_overlap(box_b, box_a):
                return True
    return False
def check_useful_block_horizontal_overlap(useful_blocks: list) -> tuple:
    """Check whether any two useful blocks overlap horizontally.

    Such an overlap usually means an equation was not detected, so the pdf
    should not be processed further.

    Returns:
        (True, smaller_bbox, larger_bbox) for the first overlapping pair
        found (ordered smaller area first), or (False, None, None).
    """
    # Bug fix: the empty-input path used to return a bare ``False`` while
    # every other path returns a 3-tuple, crashing callers that unpack.
    if len(useful_blocks) == 0:
        return False, None, None
    page_min_y = 0
    page_max_y = max(yy['bbox'][3] for yy in useful_blocks)
    useful_bboxes = []
    for text_block in useful_blocks:
        bbox = text_block["bbox"]
        if bbox[1] >= page_min_y and bbox[3] <= page_max_y:
            useful_bboxes.append(bbox)
    for i in range(len(useful_bboxes)):
        bbox_i = useful_bboxes[i]
        for j in range(i + 1, len(useful_bboxes)):
            bbox_j = useful_bboxes[j]
            if _is_left_overlap(bbox_i, bbox_j) or _is_left_overlap(bbox_j, bbox_i):
                # Areas only matter once an overlap is found (hoisted out of
                # the unconditional per-pair computation).
                area_i = (bbox_i[2] - bbox_i[0]) * (bbox_i[3] - bbox_i[1])
                area_j = (bbox_j[2] - bbox_j[0]) * (bbox_j[3] - bbox_j[1])
                if area_i > area_j:
                    return True, bbox_j, bbox_i
                return True, bbox_i, bbox_j
    return False, None, None
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:  # text_block -> preproc_block in the json
    """Fix overly large horizontal gaps inside inline text.

    When two consecutive lines of a block share the same vertical band
    (identical truncated y0 and y1), they are fragments of one visual line,
    so a space is prepended to the first span of the later fragment.

    Args:
        pdf_info_dict: dict keyed 'page_0'..'page_{n-1}'; each page holds
            'preproc_blocks', whose 'lines' have 'bbox' and 'spans'.

    Returns:
        The same dict, mutated in place.
    """
    for page_no in range(len(pdf_info_dict)):
        text_blocks = pdf_info_dict[f'page_{page_no}']['preproc_blocks']
        for block in text_blocks:
            y_pre_1, y_pre_2 = 0, 0
            for line in block['lines']:
                _, y_cur_1, _, y_cur_2 = line['bbox']
                # Guard against an empty span list (used to raise IndexError).
                if line['spans'] and int(y_cur_1) == int(y_pre_1) and int(y_cur_2) == int(y_pre_2):
                    line['spans'][0]['text'] = ' ' + line['spans'][0]['text']
                y_pre_1, y_pre_2 = y_cur_1, y_cur_2
    return pdf_info_dict
"""
统计出需要跨页、全局性的数据
- 统计出字号从大到小
- 正文区域占比最高的前5
- 正文平均行间距
- 正文平均字间距
- 正文平均字符宽度
- 正文平均字符高度
"""
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models/unimernet_base
max_seq_len: 1536
load_pretrained: True
pretrained: './models/unimernet_base/pytorch_model.pth'
tokenizer_config:
path: ./models/unimernet_base
datasets:
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
run:
runner: runner_iter
task: unimernet_train
batch_size_train: 64
batch_size_eval: 64
num_workers: 1
iters_per_inner_epoch: 2000
max_iters: 60000
seed: 42
output_dir: "../output/demo"
evaluate: True
test_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when train llm
generate_cfg:
temperature: 0.0
\ No newline at end of file
AUG:
DETR: true
CACHE_DIR: ~/cache/huggingface
CUDNN_BENCHMARK: false
DATALOADER:
ASPECT_RATIO_GROUPING: true
FILTER_EMPTY_ANNOTATIONS: false
NUM_WORKERS: 4
REPEAT_THRESHOLD: 0.0
SAMPLER_TRAIN: TrainingSampler
DATASETS:
PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000
PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000
PROPOSAL_FILES_TEST: []
PROPOSAL_FILES_TRAIN: []
TEST:
- scihub_train
TRAIN:
- scihub_train
GLOBAL:
HACK: 1.0
ICDAR_DATA_DIR_TEST: ''
ICDAR_DATA_DIR_TRAIN: ''
INPUT:
CROP:
ENABLED: true
SIZE:
- 384
- 600
TYPE: absolute_range
FORMAT: RGB
MASK_FORMAT: polygon
MAX_SIZE_TEST: 1333
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MIN_SIZE_TRAIN:
- 480
- 512
- 544
- 576
- 608
- 640
- 672
- 704
- 736
- 768
- 800
MIN_SIZE_TRAIN_SAMPLING: choice
RANDOM_FLIP: horizontal
MODEL:
ANCHOR_GENERATOR:
ANGLES:
- - -90
- 0
- 90
ASPECT_RATIOS:
- - 0.5
- 1.0
- 2.0
NAME: DefaultAnchorGenerator
OFFSET: 0.0
SIZES:
- - 32
- - 64
- - 128
- - 256
- - 512
BACKBONE:
FREEZE_AT: 2
NAME: build_vit_fpn_backbone
CONFIG_PATH: ''
DEVICE: cuda
FPN:
FUSE_TYPE: sum
IN_FEATURES:
- layer3
- layer5
- layer7
- layer11
NORM: ''
OUT_CHANNELS: 256
IMAGE_ONLY: true
KEYPOINT_ON: false
LOAD_PROPOSALS: false
MASK_ON: true
META_ARCHITECTURE: VLGeneralizedRCNN
PANOPTIC_FPN:
COMBINE:
ENABLED: true
INSTANCES_CONFIDENCE_THRESH: 0.5
OVERLAP_THRESH: 0.5
STUFF_AREA_LIMIT: 4096
INSTANCE_LOSS_WEIGHT: 1.0
PIXEL_MEAN:
- 127.5
- 127.5
- 127.5
PIXEL_STD:
- 127.5
- 127.5
- 127.5
PROPOSAL_GENERATOR:
MIN_SIZE: 0
NAME: RPN
RESNETS:
DEFORM_MODULATED: false
DEFORM_NUM_GROUPS: 1
DEFORM_ON_PER_STAGE:
- false
- false
- false
- false
DEPTH: 50
NORM: FrozenBN
NUM_GROUPS: 1
OUT_FEATURES:
- res4
RES2_OUT_CHANNELS: 256
RES5_DILATION: 1
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: true
WIDTH_PER_GROUP: 64
RETINANET:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
FOCAL_LOSS_ALPHA: 0.25
FOCAL_LOSS_GAMMA: 2.0
IN_FEATURES:
- p3
- p4
- p5
- p6
- p7
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.4
- 0.5
NMS_THRESH_TEST: 0.5
NORM: ''
NUM_CLASSES: 10
NUM_CONVS: 4
PRIOR_PROB: 0.01
SCORE_THRESH_TEST: 0.05
SMOOTH_L1_LOSS_BETA: 0.1
TOPK_CANDIDATES_TEST: 1000
ROI_BOX_CASCADE_HEAD:
BBOX_REG_WEIGHTS:
- - 10.0
- 10.0
- 5.0
- 5.0
- - 20.0
- 20.0
- 10.0
- 10.0
- - 30.0
- 30.0
- 15.0
- 15.0
IOUS:
- 0.5
- 0.6
- 0.7
ROI_BOX_HEAD:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 10.0
- 10.0
- 5.0
- 5.0
CLS_AGNOSTIC_BBOX_REG: true
CONV_DIM: 256
FC_DIM: 1024
NAME: FastRCNNConvFCHead
NORM: ''
NUM_CONV: 0
NUM_FC: 2
POOLER_RESOLUTION: 7
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
SMOOTH_L1_BETA: 0.0
TRAIN_ON_PRED_BOXES: false
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 512
IN_FEATURES:
- p2
- p3
- p4
- p5
IOU_LABELS:
- 0
- 1
IOU_THRESHOLDS:
- 0.5
NAME: CascadeROIHeads
NMS_THRESH_TEST: 0.5
NUM_CLASSES: 10
POSITIVE_FRACTION: 0.25
PROPOSAL_APPEND_GT: true
SCORE_THRESH_TEST: 0.05
ROI_KEYPOINT_HEAD:
CONV_DIMS:
- 512
- 512
- 512
- 512
- 512
- 512
- 512
- 512
LOSS_WEIGHT: 1.0
MIN_KEYPOINTS_PER_IMAGE: 1
NAME: KRCNNConvDeconvUpsampleHead
NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true
NUM_KEYPOINTS: 17
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
ROI_MASK_HEAD:
CLS_AGNOSTIC_MASK: false
CONV_DIM: 256
NAME: MaskRCNNConvUpsampleHead
NORM: ''
NUM_CONV: 4
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
RPN:
BATCH_SIZE_PER_IMAGE: 256
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
BOUNDARY_THRESH: -1
CONV_DIMS:
- -1
HEAD_NAME: StandardRPNHead
IN_FEATURES:
- p2
- p3
- p4
- p5
- p6
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.3
- 0.7
LOSS_WEIGHT: 1.0
NMS_THRESH: 0.7
POSITIVE_FRACTION: 0.5
POST_NMS_TOPK_TEST: 1000
POST_NMS_TOPK_TRAIN: 2000
PRE_NMS_TOPK_TEST: 1000
PRE_NMS_TOPK_TRAIN: 2000
SMOOTH_L1_BETA: 0.0
SEM_SEG_HEAD:
COMMON_STRIDE: 4
CONVS_DIM: 128
IGNORE_VALUE: 255
IN_FEATURES:
- p2
- p3
- p4
- p5
LOSS_WEIGHT: 1.0
NAME: SemSegFPNHead
NORM: GN
NUM_CLASSES: 10
VIT:
DROP_PATH: 0.1
IMG_SIZE:
- 224
- 224
NAME: layoutlmv3_base
OUT_FEATURES:
- layer3
- layer5
- layer7
- layer11
POS_TYPE: abs
WEIGHTS:
OUTPUT_DIR:
SCIHUB_DATA_DIR_TRAIN: ~/publaynet/layout_scihub/train
SEED: 42
SOLVER:
AMP:
ENABLED: true
BACKBONE_MULTIPLIER: 1.0
BASE_LR: 0.0002
BIAS_LR_FACTOR: 1.0
CHECKPOINT_PERIOD: 2000
CLIP_GRADIENTS:
CLIP_TYPE: full_model
CLIP_VALUE: 1.0
ENABLED: true
NORM_TYPE: 2.0
GAMMA: 0.1
GRADIENT_ACCUMULATION_STEPS: 1
IMS_PER_BATCH: 32
LR_SCHEDULER_NAME: WarmupCosineLR
MAX_ITER: 20000
MOMENTUM: 0.9
NESTEROV: false
OPTIMIZER: ADAMW
REFERENCE_WORLD_SIZE: 0
STEPS:
- 10000
WARMUP_FACTOR: 0.01
WARMUP_ITERS: 333
WARMUP_METHOD: linear
WEIGHT_DECAY: 0.05
WEIGHT_DECAY_BIAS: null
WEIGHT_DECAY_NORM: 0.0
TEST:
AUG:
ENABLED: false
FLIP: true
MAX_SIZE: 4000
MIN_SIZES:
- 400
- 500
- 600
- 700
- 800
- 900
- 1000
- 1100
- 1200
DETECTIONS_PER_IMAGE: 100
EVAL_PERIOD: 1000
EXPECTED_RESULTS: []
KEYPOINT_OKS_SIGMAS: []
PRECISE_BN:
ENABLED: false
NUM_ITER: 200
VERSION: 2
VIS_PERIOD: 0
weights:
layoutlmv3: Layout/LayoutLMv3/model_final.pth
doclayout_yolo: Layout/YOLO/doclayout_yolo_ft.pt
yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
unimernet_small: MFR/unimernet_small
struct_eqtable: TabRec/StructEqTable
tablemaster: TabRec/TableMaster
\ No newline at end of file
from abc import ABC, abstractmethod
class AbsReaderWriter(ABC):
    """Abstract interface for storage reader/writer backends.

    Concrete implementations (disk, S3, ...) must support both text and
    binary access, identified by the MODE_TXT / MODE_BIN constants.
    """

    # Mode constants shared by every implementation.
    MODE_TXT = "text"
    MODE_BIN = "binary"

    @abstractmethod
    def read(self, path: str, mode=MODE_TXT):
        """Return the content stored at *path* in the given mode."""
        raise NotImplementedError

    @abstractmethod
    def write(self, content: str, path: str, mode=MODE_TXT):
        """Store *content* at *path* in the given mode."""
        raise NotImplementedError

    @abstractmethod
    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Return up to *limit* bytes of *path* starting at *offset*."""
        raise NotImplementedError
import os
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from loguru import logger
class DiskReaderWriter(AbsReaderWriter):
    """AbsReaderWriter implementation backed by the local filesystem.

    Relative paths are resolved against *parent_path*; absolute paths are
    used as-is.
    """

    def __init__(self, parent_path, encoding="utf-8"):
        self.path = parent_path  # base directory for relative paths
        self.encoding = encoding  # encoding used in text mode

    def _abspath(self, path):
        """Return *path* unchanged if absolute, else joined onto the base dir."""
        return path if os.path.isabs(path) else os.path.join(self.path, path)

    def read(self, path, mode=AbsReaderWriter.MODE_TXT):
        """Read a file in text or binary mode.

        Raises:
            Exception: if the file does not exist.
            ValueError: if *mode* is neither 'text' nor 'binary'.
        """
        abspath = self._abspath(path)
        if not os.path.exists(abspath):
            logger.error(f"file {abspath} not exists")
            # Bug fix: message used to read "no exists".
            raise Exception(f"file {abspath} not exists")
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "r", encoding=self.encoding) as f:
                return f.read()
        if mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "rb") as f:
                return f.read()
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, path, mode=AbsReaderWriter.MODE_TXT):
        """Write *content* to a file, creating parent directories as needed."""
        abspath = self._abspath(path)
        directory_path = os.path.dirname(abspath)
        if directory_path:
            # exist_ok avoids the check-then-create race of the previous
            # os.path.exists() + os.makedirs() sequence; the guard above
            # avoids makedirs('') when the path has no directory part.
            os.makedirs(directory_path, exist_ok=True)
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
                f.write(content)
        elif mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "wb") as f:
                f.write(content)
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def read_offset(self, path: str, offset=0, limit=None):
        """Read up to *limit* bytes starting at *offset* (all remaining if None)."""
        abspath = self._abspath(path)
        with open(abspath, "rb") as f:
            f.seek(offset)
            return f.read(limit)
if __name__ == "__main__":
if 0:
file_path = "io/test/example.txt"
drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
# 写入内容到文件
drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
# 从文件读取内容
content = drw.read(path=file_path)
if content:
logger.info(f"从 {file_path} 读取的内容: {content}")
if 1:
drw = DiskReaderWriter("/opt/data/pdf/resources/test/io/")
content_bin = drw.read_offset("1.txt")
assert content_bin == b"ABCD!"
content_bin = drw.read_offset("1.txt", offset=1, limit=2)
assert content_bin == b"BC"
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
import boto3
from loguru import logger
from botocore.config import Config
class S3ReaderWriter(AbsReaderWriter):
    """AbsReaderWriter implementation backed by an S3-compatible object store."""

    def __init__(
        self,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = "auto",
        parent_path: str = "",
    ):
        self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
        self.path = parent_path  # base s3:// prefix for relative paths

    def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
        """Build a boto3 S3 client with standard retry configuration."""
        client_config = Config(
            s3={"addressing_style": addressing_style},
            retries={"max_attempts": 5, "mode": "standard"},
        )
        return boto3.client(
            service_name="s3",
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=client_config,
        )

    def _full_s3_path(self, relative_path):
        """Treat paths starting with s3:// as absolute; otherwise join onto the prefix."""
        if relative_path.startswith("s3://"):
            return relative_path
        return join_path(self.path, relative_path)

    def read(self, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Fetch an object and return it as text (decoded) or raw bytes."""
        bucket_name, key = parse_bucket_key(self._full_s3_path(s3_relative_path))
        body = self.client.get_object(Bucket=bucket_name, Key=key)["Body"].read()
        if mode == AbsReaderWriter.MODE_TXT:
            return body.decode(encoding)
        if mode == AbsReaderWriter.MODE_BIN:
            return body
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Upload *content*, encoding it first when in text mode."""
        s3_path = self._full_s3_path(s3_relative_path)
        if mode == AbsReaderWriter.MODE_TXT:
            body = content.encode(encoding)
        elif mode == AbsReaderWriter.MODE_BIN:
            body = content
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, key = parse_bucket_key(s3_path)
        self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
        logger.info(f"内容已写入 {s3_path} ")

    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Range-read up to *limit* bytes starting at *offset* (to EOF if None)."""
        bucket_name, key = parse_bucket_key(self._full_s3_path(path))
        if limit:
            range_header = f"bytes={offset}-{offset + limit - 1}"
        else:
            range_header = f"bytes={offset}-"
        res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
        return res["Body"].read()
if __name__ == "__main__":
if 0:
# Config the connection info
ak = ""
sk = ""
endpoint_url = ""
addressing_style = "auto"
bucket_name = ""
# Create an S3ReaderWriter object
s3_reader_writer = S3ReaderWriter(
ak, sk, endpoint_url, addressing_style, "s3://bucket_name/"
)
# Write text data to S3
text_data = "This is some text data"
s3_reader_writer.write(
text_data,
s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
mode=AbsReaderWriter.MODE_TXT,
)
# Read text data from S3
text_data_read = s3_reader_writer.read(
s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_TXT
)
logger.info(f"Read text data from S3: {text_data_read}")
# Write binary data to S3
binary_data = b"This is some binary data"
s3_reader_writer.write(
text_data,
s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
mode=AbsReaderWriter.MODE_BIN,
)
# Read binary data from S3
binary_data_read = s3_reader_writer.read(
s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_BIN
)
logger.info(f"Read binary data from S3: {binary_data_read}")
# Range Read text data from S3
binary_data_read = s3_reader_writer.read_offset(
path=f"s3://{bucket_name}/ebook/test/test.json", offset=0, limit=10
)
logger.info(f"Read binary data from S3: {binary_data_read}")
if 1:
import os
import json
ak = os.getenv("AK", "")
sk = os.getenv("SK", "")
endpoint_url = os.getenv("ENDPOINT", "")
bucket = os.getenv("S3_BUCKET", "")
prefix = os.getenv("S3_PREFIX", "")
key_basename = os.getenv("S3_KEY_BASENAME", "")
s3_reader_writer = S3ReaderWriter(
ak, sk, endpoint_url, "auto", f"s3://{bucket}/{prefix}"
)
content_bin = s3_reader_writer.read_offset(key_basename)
assert content_bin[:10] == b'{"track_id'
assert content_bin[-10:] == b'r":null}}\n'
content_bin = s3_reader_writer.read_offset(key_basename, offset=424, limit=426)
jso = json.dumps(content_bin.decode("utf-8"))
print(jso)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# PROJECT_NAME: F:\code\easyofd\easyofd\draw
# CREATE_TIME: 2023-10-26
# E_MAIL: renoyuan@foxmail.com
# AUTHOR: reno
# note: 写入 xml 目录并打包成ofd 文件
from datetime import datetime
from io import BytesIO
from typing import Optional
from PIL import Image
from loguru import logger
from magic_pdf.rw.ofdtemplate import CurId, OFDTemplate, DocumentTemplate, DocumentResTemplate, PublicResTemplate, ContentTemplate, \
OFDStructure
from magic_pdf.rw.pdf_parse import DPFParser
class OFDWrite(object):
"""
写入ofd 工具类
"""
def __init__(self):
    # 200 / 25.4 presumably converts 200-dpi pixels to millimetres
    # (25.4 mm per inch) — confirm against generated OFD output.
    self.OP = 200 / 25.4
def build_ofd_entrance(self, id_obj: Optional[CurId] = None):
    """Create the top-level OFD entry template stamped with the current time."""
    creation_date = str(datetime.now())
    return OFDTemplate(CreationDate=creation_date, id_obj=id_obj)
def build_document(self, img_len, id_obj: Optional[CurId] = None, PhysicalBox: Optional[str] = "0 0 140 90"):
    """Build the Document template with one page entry per image page."""
    pages = [
        {
            "@ID": f"{page_idx + 1}",
            "@BaseLoc": f"Pages/Page_{page_idx}/Content.xml",
        }
        for page_idx in range(img_len)
    ]
    return DocumentTemplate(Page=pages, id_obj=id_obj, PhysicalBox=PhysicalBox)
def build_document_res(self, img_len: int = 0, id_obj: Optional[CurId] = None,
                       pfd_res_uuid_map: Optional[dict] = None):
    """Build the DocumentRes template with one MultiMedia entry per image."""
    MultiMedia = []
    DrawParams = []  # TODO: add DrawParams entries later
    if img_len and not pfd_res_uuid_map:
        # No resource map: synthesize sequentially numbered image entries.
        MultiMedia = [
            {
                "@ID": 0,
                "@Type": "Image",
                "ofd:MediaFile": f"Image_{num}.jpg",
                "res_uuid": f"{num}",
            }
            for num in range(img_len)
        ]
    else:
        img_map = pfd_res_uuid_map.get("img") if pfd_res_uuid_map else None
        if img_map:
            # Resource map present: one entry per pdf image resource uuid.
            MultiMedia = [
                {
                    "@ID": 0,
                    "@Type": "Image",
                    "ofd:MediaFile": f"Image_{res_uuid}.jpg",
                    "res_uuid": res_uuid,
                }
                for res_uuid in img_map
            ]
    return DocumentResTemplate(MultiMedia=MultiMedia, id_obj=id_obj)
def build_public_res(self, id_obj: CurId = None, pfd_res_uuid_map: dict = None):
    """Build the PublicRes template with one Font entry per pdf font resource."""
    fonts = []
    font_map = pfd_res_uuid_map.get("font") if pfd_res_uuid_map else None
    if font_map:
        for res_uuid, font_name in font_map.items():
            fonts.append({
                "@ID": 0,
                "@FontName": font_name,
                "@FamilyName": font_name,  # substitute family used for font matching
                "res_uuid": res_uuid,
                "@FixedWidth": "false",
                "@Serif": "false",
                "@Bold": "false",
                "@Charset": "prc",
            })
    return PublicResTemplate(Font=fonts, id_obj=id_obj)
def build_content_res(self, pil_img_list=None, pdf_info_list=None, id_obj: CurId = None,
                      pfd_res_uuid_map: dict = None):
    """
    Build one ContentTemplate per page.

    pil_img_list -> one image per page; each entry is presumably a tuple of
    (image_bytes, width, height) already in OFD units — TODO confirm with
    the caller in __call__.
    pdf_info_list -> parsed pdf block info written as text/image objects.
    """
    PhysicalBox = None
    content_res_list = []
    if pil_img_list:
        for idx, pil_img in enumerate(pil_img_list):
            # Full-page image: the page box and the image boundary/CTM all
            # use the same width/height.
            PhysicalBox = f"0 0 {pil_img[1]} {pil_img[2]}"
            ImageObject = [{
                "@ID": 0,
                "@CTM": f"{pil_img[1]} 0 0 {pil_img[2]} 0 0",
                "@Boundary": f"0 0 {pil_img[1]} {pil_img[2]}",
                "res_uuid": f"{idx}",  # resource identifier
                "@ResourceID": f""
            }]
            conten = ContentTemplate(PhysicalBox=PhysicalBox, ImageObject=ImageObject,
                                     CGTransform=[], PathObject=[], TextObject=[], id_obj=id_obj)
            content_res_list.append(conten)
    elif pdf_info_list:  # write parsed pdf results
        # TODO: image ids need to be pre-defined or aligned back some other way.
        for idx, content in enumerate(pdf_info_list):
            ImageObject = []
            TextObject = []
            # NOTE(review): assumes pfd_res_uuid_map["other"]["page_size"][idx]
            # exists for every page (fall back to the Document box otherwise) —
            # the fallback is not implemented; confirm with callers.
            PhysicalBox = pfd_res_uuid_map["other"]["page_size"][idx]
            PhysicalBox = f"0 0 {PhysicalBox[0]} {PhysicalBox[1]}"
            for block in content:
                bbox = block['bbox']
                # Convert pdf coordinates to OFD units via self.OP.
                x0, y0, length, height = bbox[0] / self.OP, bbox[1] / self.OP, (bbox[2] - bbox[0]) / self.OP, (
                        bbox[3] - bbox[1]) / self.OP
                if block["type"] == "text":
                    count = len(block.get("text"))
                    # NOTE(review): count == 0 would divide by zero in
                    # @DeltaX below — assumes text blocks are non-empty;
                    # confirm upstream.
                    TextObject.append({
                        "@ID": 0,
                        "res_uuid": block.get("res_uuid"),  # resource identifier
                        "@Font": "",
                        "ofd:FillColor": {"Value": "156 82 35"},
                        "ofd:TextCode": {
                            "#text": block.get("text"),
                            "@X": "0",
                            "@Y": f"{block.get('size') / self.OP}",
                            "@DeltaX": f"g {count - 1} {length / count}"
                        },
                        "@size": block.get("size") / self.OP,
                        "@Boundary": f"{x0} {y0} {length} {height}",
                    })
                elif block["type"] == "img":
                    ImageObject.append({
                        "@ID": 0,
                        "res_uuid": block.get("res_uuid"),  # resource identifier
                        "@Boundary": f"{x0} {y0} {length} {height}",
                        "@ResourceID": f""  # must be linked to the public res entry
                    })
            conten = ContentTemplate(PhysicalBox=PhysicalBox, ImageObject=ImageObject,
                                     CGTransform=[], PathObject=[], TextObject=TextObject, id_obj=id_obj)
            content_res_list.append(conten)
    else:
        pass
    return content_res_list
def pil_2_bytes(self, image):
    """Serialize a PIL image to PNG-encoded bytes."""
    buffer = BytesIO()
    # PNG is used here even though resources are named Image_*.jpg elsewhere.
    image.save(buffer, format='PNG')
    data = buffer.getvalue()
    buffer.close()
    return data
    def __call__(self, pdf_bytes=None, pil_img_list=None, optional_text=False):
        """Convert a PDF (``pdf_bytes``) or a list of PIL images into OFD bytes.

        :param pdf_bytes: raw PDF file content; used when ``pil_img_list`` is empty
        :param pil_img_list: PIL images, one per page; takes priority over pdf_bytes
        :param optional_text: for PDF input, True extracts text/images to build an
            editable OFD; False rasterises every page to a full-page image
        :return: the generated .ofd archive as bytes

        Pipeline: 0) parse the input  1) build the required OFD template
        objects  2) pack them into an OFD archive.
        """
        pdf_obj = DPFParser()
        page_pil_img_list = None
        # Image-based path: each entry becomes (png_bytes, width/OP, height/OP).
        if pil_img_list:  # input is already a list of PIL images
            page_pil_img_list = [(self.pil_2_bytes(_img), _img.size[0] / self.OP, _img.size[1] / self.OP) for _img in
                                 pil_img_list]
        else:  # input is a PDF
            if optional_text:  # editable OFD: keep text spans and embedded images
                pdf_info_list, pfd_res_uuid_map = pdf_obj.extract_text_with_details(pdf_bytes)  # parse the PDF
                logger.debug(f"pdf_info_list: {pdf_info_list} \n pfd_res_uuid_map {pfd_res_uuid_map}")
            else:  # flat OFD: render every PDF page to an image
                img_list = pdf_obj.to_img(pdf_bytes)
                page_pil_img_list = [(self.pil_2_bytes(Image.frombytes("RGB", [_img.width, _img.height],
                                                                       _img.samples)), _img.width / self.OP,
                                      _img.height / self.OP) for _img in img_list]
        id_obj = CurId()  # single ID allocator shared by every template object
        if page_pil_img_list:  # image content -> OFD
            res_static = {}  # static image resources written under Doc_0/Res
            # NOTE(review): in this branch "img" values are raw bytes, whereas the
            # PDF branch below stores BytesIO objects — the .getvalue() loop in
            # the else-branch must never run here. Confirm the indentation intent.
            pfd_res_uuid_map = {"img": {}}
            PhysicalBox = f"0 0 {page_pil_img_list[0][1]} {page_pil_img_list[0][2]}"
            for idx, pil_img_tuple in enumerate(page_pil_img_list):
                pfd_res_uuid_map["img"][f"{idx}"] = pil_img_tuple[0]
                res_static[f"Image_{idx}.jpg"] = pil_img_tuple[0]
            ofd_entrance = self.build_ofd_entrance(id_obj=id_obj)
            document = self.build_document(len(page_pil_img_list), id_obj=id_obj, PhysicalBox=PhysicalBox)
            public_res = self.build_public_res(id_obj=id_obj)
            document_res = self.build_document_res(len(page_pil_img_list), id_obj=id_obj,
                                                   pfd_res_uuid_map=pfd_res_uuid_map)
            content_res_list = self.build_content_res(page_pil_img_list, id_obj=id_obj,
                                                      pfd_res_uuid_map=pfd_res_uuid_map)
        else:
            # Editable path: template objects consume the parsed PDF details and
            # the shared id allocator.
            ofd_entrance = self.build_ofd_entrance(id_obj=id_obj)
            document = self.build_document(len(pdf_info_list), id_obj=id_obj)
            public_res = self.build_public_res(id_obj=id_obj, pfd_res_uuid_map=pfd_res_uuid_map)
            document_res = self.build_document_res(len(pdf_info_list), id_obj=id_obj, pfd_res_uuid_map=pfd_res_uuid_map)
            content_res_list = self.build_content_res(pdf_info_list=pdf_info_list, id_obj=id_obj,
                                                      pfd_res_uuid_map=pfd_res_uuid_map)
            res_static = {}  # image resources extracted from the PDF (BytesIO values)
            print("pfd_res_uuid_map", pfd_res_uuid_map)
            img_dict = pfd_res_uuid_map.get("img")
            if img_dict:
                for key, v_io in img_dict.items():
                    res_static[f"Image_{key}.jpg"] = v_io.getvalue()
        # Assemble and zip everything into the final OFD byte stream.
        ofd_byte = OFDStructure("123", ofd=ofd_entrance, document=document, public_res=public_res,
                                document_res=document_res, content_res=content_res_list, res_static=res_static)(
            test=True)
        return ofd_byte
if __name__ == "__main__":
    # Ad-hoc manual test: read a PDF and write the converted OFD to ./ofd.ofd.
    pdf_p = r"D:\renodoc\技术栈\GBT_33190-2016_电子文件存储与交换格式版式文档.pdf"
    # NOTE(review): this second assignment overrides the PDF path with a
    # directory, so open(..., "rb") below would fail — looks like a debug
    # leftover; confirm which path is intended.
    pdf_p = r"F:\code\easyofd\test"
    with open(pdf_p, "rb") as f:
        content = f.read()
    ofd_content = OFDWrite()(content)
    with open("ofd.ofd", "wb") as f:
        f.write(ofd_content)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# PROJECT_NAME: E:\code\easyofd\easyofd\draw
# CREATE_TIME: 2023-08-10
# E_MAIL: renoyuan@foxmail.com
# AUTHOR: reno
# NOTE: 绘制pdf
import base64
import os
import re
import traceback
from io import BytesIO
from PIL import Image as PILImage
from loguru import logger
from reportlab.lib.pagesizes import A4
from reportlab.lib.utils import ImageReader
from reportlab.pdfgen import canvas
from magic_pdf.tools.font_tools import FontTool
from magic_pdf.tools.find_seal_img import SealExtract
# print(reportlab_fonts)
class DrawPDF():
    """Render an OFD parse result into a PDF (in memory).

    OP is the OFD-unit -> PDF-point conversion factor (200 dpi / 25.4 mm).
    """
    def __init__(self, data, *args, **kwargs):
        # data: list of per-document dicts produced by the OFD parser.
        assert data, "未输入ofd解析结果"
        self.data = data
        self.author = "sugon"
        self.OP = 200 / 25.4  # OFD millimetre units -> PDF points at 200 dpi
        # self.OP = 1
        self.pdf_uuid_name = self.data[0]["pdf_name"]
        self.pdf_io = BytesIO()  # the PDF is rendered into this in-memory buffer
        self.SupportImgType = ("JPG", "JPEG", "PNG")  # image suffixes we can place
        self.init_font = "宋体"  # fallback font when a font cannot be resolved
        self.font_tool = FontTool()
    def draw_lines(my_canvas):
        """Draw a stack of horizontal rules (debug helper).

        NOTE(review): defined without ``self`` — calling it as an instance
        method would pass the DrawPDF object as ``my_canvas``; it appears to be
        unused scratch code. Confirm before relying on it.
        """
        my_canvas.setLineWidth(.3)
        start_y = 710
        my_canvas.line(30, start_y, 580, start_y)
        for x in range(10):
            start_y -= 10
            my_canvas.line(30, start_y, 580, start_y)
    def gen_empty_pdf(self):
        """Write a single-page fallback PDF saying the OFD could not be parsed."""
        c = canvas.Canvas(self.pdf_io)
        c.setPageSize(A4)
        c.setFont(self.init_font, 20)
        c.drawString(0, 210, "ofd 格式错误,不支持解析", mode=1)
        c.save()
    # per-character offset computation
    def cmp_offset(self, pos, offset, DeltaRule, text, CTM_info, dire="X") -> list:
        """Compute the absolute X or Y position of every character in a run.

        pos       text-box X|Y origin
        offset    X|Y of the first character
        DeltaRule OFD delta rule string ("g <n> <v>" repeats offset v n times)
        CTM_info  parsed CTM transform (scale/rotate/translate) or {}
        dire      "X" or "Y" — which axis of CTM_info to apply
        Returns the list of character positions along the requested axis.
        """
        if CTM_info and dire == "X":
            resize = CTM_info.get("resizeX")
            rotate = CTM_info.get("rotateX")
            move= CTM_info.get("moveX")
        elif CTM_info and dire == "Y":
            resize = CTM_info.get("resizeY")
            rotate = CTM_info.get("rotateY")
            move = CTM_info.get("moveY")
        else:
            # no CTM: identity transform
            resize = 1
            rotate = 0
            move = 0
        char_pos = float(pos if pos else 0) + (float(offset if offset else 0) + move) * resize
        pos_list = []
        pos_list.append(char_pos)  # first character position
        offsets = [i for i in DeltaRule.split(" ")]
        if "g" in DeltaRule:  # "g" groups repeated offsets: g <count> <value>
            g_no = None
            for _no, offset_i in enumerate(offsets):
                if offset_i == "g":
                    g_no = _no
                    for j in range(int(offsets[(g_no + 1)])):
                        char_pos += float(offsets[(g_no + 2)])
                        pos_list.append(char_pos)
                elif offset_i != "g":
                    if g_no == None:
                        char_pos += float(offset_i) * resize
                        pos_list.append(char_pos)
                    elif (int(_no) > int(g_no + 2)) and g_no != None:
                        char_pos += float(offset_i) * resize
                        pos_list.append(char_pos)
        elif not DeltaRule:  # no per-char offsets — usually a single character
            pos_list = []
            for i in range(len(text)):
                pos_list.append(char_pos)
        else:  # plain per-character offsets
            for i in offsets:
                if not i:
                    char_pos += 0
                else:
                    char_pos += float(i) * resize
                pos_list.append(char_pos)
        return pos_list
    def draw_chars(self, canvas, text_list, fonts, page_size):
        """Write every text run in *text_list* onto the canvas."""
        c = canvas
        for line_dict in text_list:
            # TODO serialise the final text content once before writing so the
            # values that actually get written are easy to inspect
            text = line_dict.get("text")
            font_info = fonts.get(line_dict.get("font"), {})
            if font_info:
                font_name = font_info.get("FontName", "")
            else:
                font_name = self.init_font
            # TODO check against known fonts; otherwise fall back to a close match
            if font_name not in self.font_tool.FONTS:
                font_name = self.font_tool.FONTS[0]
            font = font_name
            # if font not in FONT: # KeyError: 'SWDRSO+KaiTi-KaiTi-0'
            c.setFont(font, line_dict["size"] * self.OP)
            # reportlab's origin is the bottom-left corner of the page
            color = line_dict.get("color", [0, 0, 0])
            if len(color) < 3:
                color = [0, 0, 0]
            c.setFillColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)
            c.setStrokeColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)
            DeltaX = line_dict.get("DeltaX", "")
            DeltaY = line_dict.get("DeltaY", "")
            # print("DeltaX",DeltaX)
            X = line_dict.get("X", "")
            Y = line_dict.get("Y", "")
            CTM = line_dict.get("CTM", "")  # OFD may attach a character transform here
            resizeX = 1
            resizeY = 1
            # CTM =None # some documents do not use the CTM
            if CTM and (CTMS:=CTM.split(" ")) and len(CTMS) == 6:
                CTM_info = {
                    "resizeX": float(CTMS[0]),
                    "rotateX": float(CTMS[1]),
                    "rotateY": float(CTMS[2]),
                    "resizeY": float(CTMS[3]),
                    "moveX": float(CTMS[4]),
                    "moveY": float(CTMS[5]),
                }
            else:
                CTM_info ={}
            x_list = self.cmp_offset(line_dict.get("pos")[0], X, DeltaX, text, CTM_info, dire="X")
            y_list = self.cmp_offset(line_dict.get("pos")[1], Y, DeltaY, text, CTM_info, dire="Y")
            # print("x_list",x_list)
            # print("y_list",y_list)
            # print("Y",page_size[3])
            # print("x",page_size[2])
            # if line_dict.get("Glyphs_d") and FontFilePath.get(line_dict["font"]) and font_f not in FONTS:
            if False:  # glyph-outline rendering for custom fonts is too slow; disabled
                Glyphs = [int(i) for i in line_dict.get("Glyphs_d").get("Glyphs").split(" ")]
                for idx, Glyph_id in enumerate(Glyphs):
                    _cahr_x = float(x_list[idx]) * self.OP
                    _cahr_y = (float(page_size[3]) - (float(y_list[idx]))) * self.OP
                    imageFile = draw_Glyph(FontFilePath.get(line_dict["font"]), Glyph_id, text[idx])
                    # font_img_info.append((FontFilePath.get(line_dict["font"]), Glyph_id,text[idx],_cahr_x,_cahr_y,-line_dict["size"]*Op*2,line_dict["size"]*Op*2))
                    c.drawImage(imageFile, _cahr_x, _cahr_y, -line_dict["size"] * self.OP * 2,
                                line_dict["size"] * self.OP * 2)
            else:
                if len(text) > len(x_list) or len(text) > len(y_list):
                    # more characters than computed positions: keep CJK chars only
                    text = re.sub("[^\u4e00-\u9fa5]", "", text)
                try:
                    # Line-write when the last character would land outside the
                    # page on either axis; otherwise write character by character.
                    if y_list[-1] * self.OP > page_size[3] * self.OP or x_list[-1] * self.OP > page_size[2] * self.OP or \
                            x_list[-1] < 0 or y_list[-1] < 0:
                        # print("line wtite")
                        x_p = abs(float(X)) * self.OP
                        y_p = abs(float(page_size[3]) - (float(Y))) * self.OP
                        c.drawString(x_p, y_p, text, mode=0)  # mode=3 invisible text, 0 visible
                        # text_write.append((x_p, y_p, text))
                    # character-by-character write
                    else:
                        for cahr_id, _cahr_ in enumerate(text):
                            # print("char wtite")
                            c.setFont(font, line_dict["size"] * self.OP * resizeX)
                            _cahr_x = float(x_list[cahr_id]) * self.OP
                            _cahr_y = (float(page_size[3]) - (float(y_list[cahr_id]))) * self.OP
                            # print(_cahr_x, _cahr_y, _cahr_)
                            c.drawString(_cahr_x, _cahr_y, _cahr_, mode=0)  # mode=3 invisible text, 0 visible
                            # text_write.append((_cahr_x, _cahr_y, _cahr_))
                except Exception as e:
                    logger.error(f"{e}")
                    traceback.print_exc()
    def draw_img(self, canvas, img_list, images, page_size):
        """Place embedded images onto the canvas.

        *images* maps ResourceID -> {suffix, imgb64, fileName, wrap_pos, ...};
        unsupported or empty images are skipped.
        """
        c = canvas
        for img_d in img_list:
            image = images.get(img_d["ResourceID"])
            if not image or image.get("suffix").upper() not in self.SupportImgType:
                continue
            imgbyte = base64.b64decode(image.get('imgb64'))
            if not imgbyte:
                logger.error(f"{image['fileName']} is null")
                continue
            img = PILImage.open(BytesIO(imgbyte))
            imgReade = ImageReader(img)
            CTM = img_d.get('CTM')
            x_offset = 0
            y_offset = 0
            wrap_pos = image.get("wrap_pos")
            x = (img_d.get('pos')[0] + x_offset) * self.OP
            y = (page_size[3] - (img_d.get('pos')[1] + y_offset)) * self.OP
            if wrap_pos:
                x = x + (wrap_pos[0] * self.OP)
                y = y - (wrap_pos[1] * self.OP)
            w = img_d.get('pos')[2] * self.OP
            h = -img_d.get('pos')[3] * self.OP  # negative height flips to OFD's top-left origin
            c.drawImage(imgReade, x, y, w, h, 'auto')
    def draw_signature(self, canvas, signatures_page_list, page_size):
        """Draw seal/signature images extracted from the OFD signature blocks.

        Each entry looks like::

            {
                "sing_page_no": sing_page_no,
                "PageRef": PageRef,
                "Boundary": Boundary,
                "SignedValue": self.file_tree(SignedValue),
            }
        """
        c = canvas
        try:
            if signatures_page_list:
                # print("signatures_page_list",signatures_page_list)
                for signature_info in signatures_page_list:
                    image = SealExtract()(b64=signature_info.get("SignedValue"))
                    if not image:
                        logger.info(f"提取不到签章图片")
                        continue
                    else:
                        image_pil = image[0]
                    pos = [float(i) for i in signature_info.get("Boundary").split(" ")]
                    imgReade = ImageReader(image_pil)
                    x = pos[0] * self.OP
                    y = (page_size[3] - pos[1]) * self.OP
                    w = pos[2] * self.OP
                    h = -pos[3] * self.OP
                    c.drawImage(imgReade, x, y, w, h, 'auto')
                    print(f"签章写入成功")
            else:
                # no signatures on this page
                pass
        except Exception as e:
            print(f"签章写入失败 {e}")
            traceback.print_exc()
    def draw_line(self, canvas, line_list, page_size):
        """Draw path objects (straight lines / bezier curves) from the OFD page."""
        # print("绘制",line_list)
        def match_mode(Abbr: list):
            """Parse an AbbreviatedData token list into drawing commands.

            S  start point x, y
            M  move to x, y
            L  line from the current point to x, y
            Q  x1 y1 x2 y2        quadratic bezier
            B  x1 y1 x2 y2 x3 y3  cubic bezier
            A  arc to x, y (rx/ry axes, angle, large-arc flag, sweep flag)
            C  close the current SubPath
            """
            relu_list = []
            mode = ""
            modes = ["S", "M", "L", "Q", "B", "A", "C"]
            mode_dict = {}
            for idx, i in enumerate(Abbr):
                if i in modes:
                    mode = i
                    if mode_dict:
                        relu_list.append(mode_dict)
                    mode_dict = {"mode": i, "points": []}
                else:
                    mode_dict["points"].append(i)
                if idx + 1 == len(Abbr):
                    relu_list.append(mode_dict)
            return relu_list
        def assemble(relu_list: list):
            # Pair every draw command (L/Q/B) with the most recent M start point.
            start_point = {}
            acticon = []
            for i in relu_list:
                if i.get("mode") == "M":
                    start_point = i
                elif i.get("mode") in ['B', "Q", 'L']:
                    acticon.append({"start_point": start_point,
                                    "end_point": i
                                    })
            return acticon
        def convert_coord(p_list, direction, page_size, pos):
            # Convert OFD coordinates (top-left origin) to PDF points.
            new_p_l = []
            for p in p_list:
                if direction == "x":
                    new_p = (float(pos[0]) + float(p)) * self.OP
                else:
                    new_p = (float(page_size[3]) - float(pos[1]) - float(p)) * self.OP
                new_p_l.append(new_p)
            return new_p_l
        for line in line_list:
            Abbr = line.get("AbbreviatedData").split(" ")  # AbbreviatedData
            color = line.get("FillColor", [0, 0, 0])
            relu_list = match_mode(Abbr)
            # TODO combine relu_list: 1) M L straight line 2) M B*n cubic bezier 3) M Q*n quadratic bezier
            # print(relu_list)
            acticons = assemble(relu_list)
            pos = line.get("pos")
            # print(color)
            if len(color) < 3:
                color = [0, 0, 0]
            canvas.setStrokeColorRGB(*(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255))  # stroke colour
            # stroke width (defensively parsed; defaults to 0.25 OFD units)
            try:
                LineWidth = (float(line.get("LineWidth", "0.25").replace(" ", "")) if \
                                 line.get("LineWidth", "0.25").replace(" ", "") else 0.25) * self.OP
            except Exception as e:
                logger.error(f"{e}")
                LineWidth = 0.25 * self.OP
            canvas.setLineWidth(LineWidth)  # unit: points
            for acticon in acticons:
                if acticon.get("end_point").get("mode") == 'L':  # straight line
                    x1, y1, x2, y2 = *acticon.get("start_point").get("points"), *acticon.get("end_point").get("points")
                    x1, x2 = convert_coord([x1, x2], "x", page_size, pos)
                    y1, y2 = convert_coord([y1, y2], "y", page_size, pos)
                    # draw one line x1 y1 x2 y2
                    canvas.line(x1, y1, x2, y2)
                elif acticon.get("end_point").get("mode") == 'B':  # cubic bezier
                    # NOTE(review): the `continue` below makes the bezier code
                    # unreachable — presumably disabled on purpose; confirm.
                    continue
                    x1, y1, x2, y2, x3, y3, x4, y4 = *acticon.get("start_point").get("points"), *acticon.get(
                        "end_point").get("points")
                    x1, x2, x3, x4 = convert_coord([x1, x2, x3, x4], "x", page_size, pos)
                    y1, y2, y3, y4 = convert_coord([y1, y2, y3, y4], "y", page_size, pos)
                    # print(x1, y1, x2, y2, x3, y3, x4, y4)
                    # draw the cubic bezier
                    canvas.bezier(x1, y1, x2, y2, x3, y3, x4, y4)
                elif acticon.get("end_point").get("mode") == 'Q':  # quadratic bezier: not implemented
                    pass
                else:
                    continue
    def draw_pdf(self):
        """Render every document/page in self.data onto the in-memory canvas."""
        c = canvas.Canvas(self.pdf_io)
        c.setAuthor(self.author)
        for doc_id, doc in enumerate(self.data, start=0):
            # print(1)
            fonts = doc.get("fonts")
            images = doc.get("images")
            default_page_size = doc.get("default_page_size")
            page_size_details = doc.get("page_size")
            print("page_size_details", page_size_details)
            signatures_page_id = doc.get("signatures_page_id")  # signature info
            # register embedded fonts with the font tool
            for font_id, font_v in fonts.items():
                file_name = font_v.get("FontFile")
                font_b64 = font_v.get("font_b64")
                if font_b64:
                    self.font_tool.register_font(os.path.split(file_name)[1], font_v.get("@FontName"), font_b64)
            # text_write = []
            # print("doc.get(page_info)", len(doc.get("page_info")))
            for page_id, page in doc.get("page_info").items():
                # prefer the per-page size; fall back to the document default
                if page_size_details[page_id]:
                    page_size = page_size_details[page_id]
                else:
                    page_size = default_page_size
                # logger.info(f"page_id {page_id} page_size {page_size}")
                text_list = page.get("text_list")
                img_list = page.get("img_list")
                line_list = page.get("line_list")
                # print("img_list",img_list)
                c.setPageSize((page_size[2] * self.OP, page_size[3] * self.OP))
                # images first so text/lines draw on top
                if img_list:
                    self.draw_img(c, img_list, images, page_size)
                # text
                if text_list:
                    self.draw_chars(c, text_list, fonts, page_size)
                # line/path objects
                if line_list:
                    self.draw_line(c, line_list, page_size)
                # signatures / seals
                if signatures_page_id:
                    self.draw_signature(c, signatures_page_id.get(page_id), page_size)
                # print("去写入")
                # print(doc_id,len(self.data))
                # page-break logic: skip the break after the very last page
                if page_id != len(doc.get("page_info")) - 1 and doc_id != len(self.data):
                    # print("写入")
                    c.showPage()
        # json.dump(text_write,open("text_write.json","w",encoding="utf-8"),ensure_ascii=False)
        c.save()
    def __call__(self):
        """Return the rendered PDF as bytes; on any failure return a fallback PDF."""
        try:
            self.draw_pdf()
            pdfbytes = self.pdf_io.getvalue()
        except Exception as e:
            logger.error(f"{e}")
            logger.error(f"ofd解析失败")
            traceback.print_exc()
            self.gen_empty_pdf()
            pdfbytes = self.pdf_io.getvalue()
        return pdfbytes
#!/usr/bin/env python
#-*- coding: utf-8 -*-
#PROJECT_NAME: F:\code\easyofd\easyofd\draw
#CREATE_TIME: 2023-10-30
#E_MAIL: renoyuan@foxmail.com
#AUTHOR: reno
#note: ofd 基础结构模板
# Standard library
import abc
import copy
import io
import os
import tempfile
import zipfile

# Third-party
import xmltodict
from loguru import logger
__all__ = ["CurId", "OFDTemplate", "DocumentTemplate", "DocumentResTemplate",
"PublicResTemplate", "ContentTemplate", "OFDStructure"]
"""
OFD目录结构
│ OFD.xml
└─Doc_0
│ Document.xml
│ DocumentRes.xml
│ PublicRes.xml
├─Annots
│ │ Annotations.xml
│ │
│ └─Page_0
│ Annotation.xml
├─Attachs
│ Attachments.xml
│ original_invoice.xml
├─Pages
│ └─Page_0
│ Content.xml
├─Res
│ image_80.jb2
├─Signs
│ │ Signatures.xml
│ │
│ └─Sign_0
│ Signature.xml
│ SignedValue.dat
├─Tags
│ CustomTag.xml
│ CustomTags.xml
└─Tpls
└─Tpl_0
Content.xml
"""
class CurId(object):
    """Allocator for document-wide OFD object IDs.

    Resource templates register their generated IDs in ``uuid_map`` so that
    page construction can later resolve a ResourceID from a res_uuid.
    """
    def __init__(self):
        self.id = 1          # most recently issued ID
        self.used = False    # whether the initial ID has been handed out yet
        self.uuid_map = {}   # res_uuid -> allocated "@ID" string
    def add_uuid_map(self, k, v):
        """Record the ID *v* allocated for resource key *k*."""
        logger.debug(f"uuid_map add {k}: {v}")
        self.uuid_map[k] = v
    def add(self):
        """Advance the counter by one."""
        self.id += 1
    def get_id(self):
        """Return the next unique ID (the very first call yields 1)."""
        if not self.used:
            self.used = True
            return self.id
        self.add()
        return self.id
    def get_max_id(self):
        """Return MaxUnitID: one past the highest ID issued so far."""
        return self.id + 1
class TemplateBase(object):
    """Base class for the OFD XML templates.

    Subclasses provide ``ofdjson`` (the dict form of the XML), ``key_map``
    (constructor kwarg -> XML tag name) and ``id_keys`` (tags that receive an
    auto-allocated ``@ID``).
    """
    key_map = {}       # kwarg name -> tag name in the XML dict, e.g. DocID -> ofd:DocID
    id_keys = []       # tags that get an "@ID" attribute injected
    template_name = ""
    def __init__(self, *args, **kwargs):
        # the shared CurId allocator is threaded through every template
        self.id_obj: CurId = kwargs.get("id_obj")
        self.assemble(*args, **kwargs)
    def assemble(self, *args, **kwargs):
        """Deep-copy the template dict, substitute kwargs, then allocate IDs."""
        self.final_json = copy.deepcopy(self.ofdjson)
        # substitute caller-provided values into the template
        for key, value in kwargs.items():
            mapped = self.key_map.get(key)
            if mapped is not None:
                self.modify(self.final_json, mapped, value)
        # inject IDs
        for id_key in self.id_keys:
            print(f"开始gen_id >> {self.template_name}>>{id_key}")
            self.gen_id(self.final_json, id_key)
    def gen_id(self, ofdjson, id_key):
        """Recursively stamp an "@ID" onto every *id_key* node in *ofdjson*."""
        for key, value in ofdjson.items():
            if key == id_key:
                target = ofdjson[key]
                if isinstance(target, dict):
                    target["@ID"] = f"{self.id_obj.get_id()}"
                elif isinstance(target, list):
                    for element in target:
                        element["@ID"] = f"{self.id_obj.get_id()}"
            elif isinstance(value, dict):
                self.gen_id(value, id_key)
            elif isinstance(value, list):
                for element in value:
                    if isinstance(element, dict):
                        self.gen_id(element, id_key)
    def modify(self, ofdjson, key, value):
        """Recursively set every occurrence of *key* in *ofdjson* to *value*."""
        for current, nested in ofdjson.items():
            if current == key:
                ofdjson[current] = value
            elif isinstance(nested, dict):
                self.modify(nested, key, value)
            elif isinstance(nested, list):
                for element in nested:
                    if isinstance(element, dict):
                        self.modify(element, key, value)
    def save(self, path):
        """Serialise final_json to pretty-printed XML and write it to *path*."""
        with open(path, "w", encoding="utf-8") as f:
            f.write(xmltodict.unparse(self.final_json, pretty=True))
class OFDTemplate(TemplateBase):
    """OFD.xml template — the archive entry point; globally unique per file."""
    template_name = "OFD"
    # constructor kwarg -> XML tag that it overwrites
    key_map = {"Author": "ofd:Author", "DocID": "ofd:DocID" ,"CreationDate": "ofd:CreationDate"
               }
    # Template data; key order is significant because it drives the emitted
    # XML element order.
    ofdjson = {
        "ofd:OFD": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "@Version": "1.1",
            "@DocType": "OFD",
            "ofd:DocBody": [{
                "ofd:DocInfo": {
                    "ofd:DocID": "0C1D4F7159954EEEDE517F7285E84DC4",
                    "ofd:Creator": "easyofd",
                    "ofd:author": "renoyuan",
                    "ofd:authoremail": "renoyuan@foxmail.com",
                    "ofd:CreatorVersion": "1.0",
                    "ofd:CreationDate": "2023-10-27"
                },
                "ofd:DocRoot": "Doc_0/Document.xml"
            }]
        }
    }
class DocumentTemplate(TemplateBase):
    """Document.xml template — unique per Doc; describes the Doc's structure."""
    template_name = "Document"
    key_map = {"Page": "ofd:Page","PhysicalBox":"ofd:PhysicalBox"}
    id_keys = ["ofd:Page"]
    ofdjson ={
        "ofd:Document": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "ofd:CommonData": {
                "ofd:MaxUnitID": 0,
                "ofd:PageArea": {
                    "ofd:PhysicalBox": "0 0 140 90"
                },
                "ofd:PublicRes": "PublicRes.xml",
                "ofd:DocumentRes": "DocumentRes.xml"
            },
            "ofd:Pages":
                {
                    "ofd:Page": [{
                        "@ID": 0,
                        "@BaseLoc": "Pages/Page_0/Content.xml"
                    }]
                }
        }
    }
    def update_max_unit_id(self, final_json=None):
        """Set ofd:MaxUnitID to one past the allocator's highest issued ID.

        Call this after all templates have been built (all IDs handed out)
        and before save(); recurses until the MaxUnitID node is found.
        """
        if not final_json:
            final_json = self.final_json
        for k, v in final_json.items():
            if k == "ofd:MaxUnitID":
                final_json["ofd:MaxUnitID"]=self.id_obj.get_max_id()
                return
            elif isinstance(v, dict):
                self.update_max_unit_id(v)
            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        self.update_max_unit_id(v_cell)
    def update_page(self,page_num):
        # placeholder — page lists are currently supplied via the Page kwarg
        pass
class DocumentResTemplate(TemplateBase):
    """DocumentRes.xml template — unique per Doc; multimedia resources (e.g. images)."""
    template_name = "DocumentRes"
    key_map = {"MultiMedia": "ofd:MultiMedia"}
    id_keys = ["ofd:DrawParam", "ofd:MultiMedia"]
    ofdjson = {
        "ofd:Res": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "@BaseLoc": "Res",
            "ofd:MultiMedias": {
                "ofd:MultiMedia": [
                    {
                        "@ID": 0,
                        "@Type": "Image",
                        "ofd:MediaFile": "Image_2.jpg"
                    }
                ]
            }
        }
    }
    def gen_id(self,ofdjson, id_key):
        """Like TemplateBase.gen_id, but also records each node's ``res_uuid``
        marker in the shared allocator's uuid_map so that pages can later
        resolve "@ResourceID" references to the allocated ID."""
        # print("id_key ", id_key, "ofdjson ", ofdjson)
        for k, v in ofdjson.items():
            if k == id_key:
                # stamp the ID (and register res_uuid -> ID when present)
                if isinstance(ofdjson[k], dict):
                    ofdjson[k]["@ID"] = f"{self.id_obj.get_id()}"
                    if res_uuid := ofdjson[k].get("res_uuid"):
                        self.id_obj.add_uuid_map(res_uuid, ofdjson[k]["@ID"])
                    # logger.info(f"添加id -> {ofdjson[k]}")
                elif isinstance(ofdjson[k], list):
                    for i in ofdjson[k]:
                        i["@ID"] = f"{self.id_obj.get_id()}"
                        if res_uuid := i.get("res_uuid"):
                            self.id_obj.add_uuid_map(res_uuid, i["@ID"])
                        # logger.info(f"添加id ->i {i}")
            elif isinstance(v, dict):
                # logger.debug(f"dict_v{v}")
                self.gen_id(v, id_key)
            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        # logger.debug(f"dict_v{v}")
                        self.gen_id(v_cell, id_key)
class PublicResTemplate(TemplateBase):
    """PublicRes.xml template — unique per Doc; shared resources (fonts, colour spaces)."""
    # NOTE(review): "PulicRes" looks like a typo for "PublicRes"; it is only
    # used for logging, but confirm before renaming.
    template_name = "PulicRes"
    key_map = {"Font": "ofd:Font"}
    id_keys = ["ofd:ColorSpace", "ofd:Font"]
    ofdjson = {
        "ofd:Res": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "@BaseLoc": "Res",
            "ofd:ColorSpaces": {
                "ofd:ColorSpace": {
                    "@ID": 0,
                    "@Type": "RGB",
                    "@BitsPerComponent": "8",
                    "#text":""
                }
            },
            "ofd:Fonts": {
                "ofd:Font": [
                    {
                        "@ID": 0,
                        "@FontName": "宋体",
                        "@FamilyName": "宋体",
                    }
                ]
            }
        }
    }
    def gen_id(self,ofdjson, id_key):
        """Like TemplateBase.gen_id, but also records res_uuid -> "@ID" pairs
        in the shared allocator so pages can resolve "@Font" references.
        (Duplicated from DocumentResTemplate.gen_id — candidate for sharing.)"""
        # print("id_key ", id_key, "ofdjson ", ofdjson)
        for k, v in ofdjson.items():
            if k == id_key:
                # stamp the ID (and register res_uuid -> ID when present)
                if isinstance(ofdjson[k], dict):
                    ofdjson[k]["@ID"] = f"{self.id_obj.get_id()}"
                    if res_uuid := ofdjson[k].get("res_uuid"):
                        self.id_obj.add_uuid_map(res_uuid, ofdjson[k]["@ID"])
                    # logger.info(f"添加id -> {ofdjson[k]}")
                elif isinstance(ofdjson[k], list):
                    for i in ofdjson[k]:
                        i["@ID"] = f"{self.id_obj.get_id()}"
                        if res_uuid := i.get("res_uuid"):
                            self.id_obj.add_uuid_map(res_uuid, i["@ID"])
                        # logger.info(f"添加id ->i {i}")
            elif isinstance(v, dict):
                # logger.debug(f"dict_v{v}")
                self.gen_id(v, id_key)
            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        # logger.debug(f"dict_v{v}")
                        self.gen_id(v_cell, id_key)
'''
"ofd:Font": [
{
"@ID": 0,
"@FontName": "STSong",
"@FamilyName": "SimSun",
"@Serif": "true",
"@FixedWidth": "true",
"@Charset": "prc"
}
"ofd:Area": {
"ofd:PhysicalBox": "0 0 210 140"
},
'''
class ContentTemplate(TemplateBase):
    """Template for a page's Content.xml (the page body).

    After the base class allocates IDs, ``correlate_res_uuid`` rewrites every
    object that carries a ``res_uuid`` marker so that it points at the "@ID"
    previously allocated for that resource (font or image).
    """
    # "@Type": "Body",
    template_name = "Content"
    key_map = {"ImageObject": "ofd:ImageObject",
               "PathObject": "ofd:PathObject",
               "TextObject": "ofd:TextObject",
               "CGTransform": "ofd:CGTransform",
               "PhysicalBox": "ofd:PhysicalBox",
               }
    id_keys = ["ofd:Layer", "ofd:TextObject", "ofd:PathObject", "ofd:Clips", "ofd:ImageObject"]
    # object tag -> attribute that must reference a resource ID
    correlate_map = {"ofd:TextObject": "@Font",
                     "ofd:ImageObject": "@ResourceID"
                     }
    ofdjson = {
        "ofd:Page": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "ofd:Content": {
                "ofd:PageArea": {
                    "ofd:PhysicalBox": "0 0 210 140"
                },
                "ofd:Layer": {
                    "@ID": 0,
                    "@Type": "Foreground",
                    "ofd:TextObject": [{
                        "@ID": 0,
                        "@CTM": "7.054 0 0 7.054 0 134.026",
                        "@Boundary": "69 7 72 7.6749",
                        "@Font": "69",
                        "@Size": "6.7028",
                        "ofd:FillColor": {
                            "@ColorSpace": "4",
                            "@Value": "156 82 35"
                        },
                        "ofd:CGTransform": {
                            "@CodePosition": "0",
                            "@CodeCount": "10",
                            "@GlyphCount": "10",
                            "ofd:Glyphs": "18 10 11 42 60 53 24 11 42 61"
                        },
                        "ofd:TextCode": {
                            "@X": "13.925",
                            "@Y": "10",
                            "@DeltaX": "7 7 7 7 7 7 7 7 7",
                            "#text": "电⼦发票(普通发票)"
                        }
                    }],
                    "ofd:ImageObject": []
                }
            }}}
    def __init__(self,*args,**kwargs):
        super().__init__(*args, **kwargs)
        # resolve res_uuid markers into the IDs recorded by the resource templates
        for key, targe_key in self.correlate_map.items():
            self.correlate_res_uuid(self.final_json,key,targe_key)
    def correlate_res_uuid(self, ofdjson,key,targe_key):
        """Recursively replace ``res_uuid`` markers under *key* nodes with the
        "@ID" stored in ``self.id_obj.uuid_map``, writing it to the
        *targe_key* attribute (e.g. "@Font" / "@ResourceID")."""
        print("========uuid_map", self.id_obj.uuid_map)
        for k, v in ofdjson.items():
            if k == key:
                # BUGFIX: this branch previously popped "res_uuid" from the
                # undefined name `v_cell`, raising NameError whenever the
                # matched node was a single dict rather than a list.
                if isinstance(v, dict) and (res_uuid := v.pop("res_uuid", None)):
                    v[targe_key] = self.id_obj.uuid_map[res_uuid]
                    logger.debug(f'{targe_key} >>> {v[targe_key]} -- {res_uuid}')
                elif isinstance(v, list):
                    for v_cell in v:
                        if isinstance(v_cell, dict) and (res_uuid := v_cell.pop("res_uuid", None)):
                            v_cell[targe_key] = self.id_obj.uuid_map[res_uuid]
                            logger.debug(f'{targe_key} >>> {v_cell[targe_key]} -- {res_uuid}')
                        else:
                            print(f"v_cell {v_cell}")
            elif isinstance(v, dict):
                self.correlate_res_uuid(v, key, targe_key)
            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        self.correlate_res_uuid(v_cell, key, targe_key)
'''
"ofd:PathObject": [{
"@ID": 0,
"@CTM": "0.3527 0 0 -0.3527 0.35 141.43001",
"@Boundary": "-0.35 -0.35 212.33 141.78999",
"@LineWidth": "1",
"@MiterLimit": "10",
"@Stroke": "false",
"@Fill": "true",
"ofd:FillColor": {
"@ColorSpace": "4",
"@Value": "255 255 255"
},
"ofd:StrokeColor": {
"@ColorSpace": "4",
"@Value": "0 0 0"
},
"ofd:Clips": {
"ofd:Clip": {
"ofd:Area": {
"ofd:Path": {
"@ID": 0,
"@Boundary": "0.00766 -0.00763 600 400.00003",
"@Stroke": "false",
"@Fill": "true",
"ofd:AbbreviatedData": "M 0 0 L 600 0 L 600 400.00003 L 0 400.00003 C"
}
}
}
},
"ofd:AbbreviatedData": "M -1 401 L 601 401 L 601 -1 L -1 -1 C"
},],
"ofd:ImageObject": [{
"@ID": 0,
"@CTM": "19.7512 0 0 19.7512 0 0",
"@Boundary": "7.23035 7.40671 19.7512 19.7512",
"@ResourceID": "104"
}],
'''
class OFDStructure(object):
    """Assemble the template objects into an .ofd (zip) archive in memory.

    :param name: logical document name (currently unused in the output)
    :param ofd / document / document_res / public_res: template objects; any
        omitted one falls back to a default template sharing a single CurId
    :param content_res: list of per-page ContentTemplate objects
    :param res_static: static resource file name -> raw bytes, written under
        Doc_0/Res

    Fixes over the previous version: mutable default arguments replaced by
    None sentinels; the archive is built in memory instead of via a
    "test.ofd" file in the CWD (which leaked on error and raced concurrent
    calls); the builtin ``zip`` is no longer shadowed; directory creation is
    idempotent.
    """
    def __init__(self, name, ofd=None, document=None,
                 document_res=None, public_res=None,
                 content_res: list = None, res_static: dict = None):
        self.name = name
        # Build the shared ID allocator lazily — it is only needed when a
        # default template has to be constructed.
        id_obj = None
        def _shared_id_obj():
            nonlocal id_obj
            if id_obj is None:
                id_obj = CurId()
            return id_obj
        self.ofd = ofd if ofd else OFDTemplate(id_obj=_shared_id_obj())
        self.document = document if document else DocumentTemplate(id_obj=_shared_id_obj())
        self.document_res = document_res if document_res else DocumentResTemplate(id_obj=_shared_id_obj())
        self.public_res = public_res if public_res else PublicResTemplate(id_obj=_shared_id_obj())
        self.content_res = content_res if content_res else [ContentTemplate(id_obj=_shared_id_obj())]
        self.res_static = res_static if res_static else {}
    def __call__(self, test=False):
        """Write all parts into a temp directory, zip it, and return the bytes."""
        with tempfile.TemporaryDirectory() as t_dir:
            if test:
                # Debug mode: keep the intermediate tree in ./test for inspection.
                temp_dir = r"./test"
                os.makedirs(temp_dir, exist_ok=True)  # don't crash on a leftover dir
            else:
                temp_dir = t_dir
            # intermediate directory layout
            temp_dir_doc_0 = os.path.join(temp_dir, 'Doc_0')
            temp_dir_pages = os.path.join(temp_dir, 'Doc_0', "Pages")
            temp_dir_res = os.path.join(temp_dir, 'Doc_0', "Res")  # static resources
            for directory in (temp_dir_doc_0, temp_dir_pages, temp_dir_res):
                os.makedirs(directory, exist_ok=True)
            # OFD.xml (archive entry point)
            self.ofd.save(os.path.join(temp_dir, 'OFD.xml'))
            # refresh MaxUnitID, then write Document.xml
            self.document.update_max_unit_id()
            self.document.save(os.path.join(temp_dir_doc_0, 'Document.xml'))
            # DocumentRes.xml / PublicRes.xml
            self.document_res.save(os.path.join(temp_dir_doc_0, 'DocumentRes.xml'))
            self.public_res.save(os.path.join(temp_dir_doc_0, 'PublicRes.xml'))
            # one Content.xml per page
            for idx, page in enumerate(self.content_res):
                temp_dir_pages_idx = os.path.join(temp_dir_pages, f"Page_{idx}")
                os.makedirs(temp_dir_pages_idx, exist_ok=True)
                page.save(os.path.join(temp_dir_pages_idx, 'Content.xml'))
            # static resources (images, ...)
            for file_name, payload in self.res_static.items():
                with open(os.path.join(temp_dir_res, file_name), "wb") as f:
                    f.write(payload)
            # zip the tree into an in-memory OFD archive
            buffer = io.BytesIO()
            with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
                for path, dirnames, filenames in os.walk(temp_dir):
                    # strip the temp root so archive paths are relative
                    fpath = path.replace(temp_dir, '')
                    for filename in filenames:
                        archive.write(os.path.join(path, filename), os.path.join(fpath, filename))
            return buffer.getvalue()
if __name__ == "__main__":
    # Manual smoke test: build a minimal OFD with one image and two text objects.
    print("---------")
    # static resources (image bytes deliberately left empty here)
    img_path = r"F:\code\easyofd\test\test_img0.jpg"
    # with open(img_path, "rb") as f:
    #     content = f.read()
    content = b""
    res_static = {"Image_0.jpg": content}
    # build the template payloads
    font = [
        {
            "@FontName": "宋体",
            "@FamilyName": "宋体",
        }
    ]
    MultiMedia = [
        {
            "@Type": "Image",
            "ofd:MediaFile": "Image_0.jpg"
        }
    ]
    ImageObject = [{
        "@CTM": "200 0 0 140 0 0",
        "@Boundary": "0 0 200 140",
        "@ResourceID": "55"
    }]
    TextObject = [
        {
            "@Boundary": "50 5 100 20",
            "@Font": "2",
            "@Size": "5",
            "ofd:FillColor": {
                "@Value": "156 82 35",
                "@ColorSpace" : "1"
            },
            "ofd:TextCode": {
                "@X": "5",
                "@Y": "5",
                "@DeltaX": "7 7 7 7 7 7 7 7 7",
                "#text": "电⼦发票(普通发票)"
            }
        }, {
            "@Boundary": "0 0 100 100",
            "@Font": "2",
            "@Size": "10",
            "ofd:FillColor": {
                "@Value": "156 82 35"
            },
            "ofd:TextCode": {
                "@X": "0",
                "@Y": "0",
                "@DeltaX": "0",
                "#text": "电"
            }
        }
    ]
    # instantiate the templates — they all share one CurId allocator
    id_obj = CurId()
    print("id_obj实例化", id_obj)
    ofd = OFDTemplate(id_obj=id_obj)
    document = DocumentTemplate(id_obj=id_obj)
    public_res = PublicResTemplate(Font=font, id_obj=id_obj)
    document_res = DocumentResTemplate(MultiMedia=MultiMedia, id_obj=id_obj)
    # ImageObject=ImageObject
    content_res = ContentTemplate(CGTransform=[], PathObject=[], TextObject=TextObject, ImageObject=[], id_obj=id_obj)
    ofd_byte = OFDStructure("123",ofd=ofd, document=document,public_res=public_res,
                            document_res=document_res, content_res=[content_res], res_static=res_static)(test=True)
    with open("test.ofd", "wb") as f:
        content = f.write(ofd_byte)
import os
import re
import io
import json
import time
import copy
import string
import random
from uuid import uuid1
from decimal import Decimal
from collections import OrderedDict
# 第三方包
import fitz
from PIL import Image
# import pdfplumber
# NOTE(review): `__ALL__` is a misspelling of `__all__`, so this list has no
# effect on `from module import *`; rename once it is confirmed that
# `pdf_ocr` is actually defined in this module.
__ALL__ = ['pdf_ocr',"DPFParser"]
class MyEncoder(json.JSONEncoder):
    """JSON encoder that also handles ``bytes`` (via str()) and ``Decimal``."""
    def default(self, obj):
        # bytes -> their str() form, e.g. b'x' -> "b'x'"
        if isinstance(obj, bytes):
            return str(obj)
        # Decimal -> plain float
        if isinstance(obj, Decimal):
            return float(obj)
        # anything else: defer to the base class (raises TypeError)
        return super().default(obj)
class DPFParser(object):
def __init__(self, ):
pass
def extract_text_with_details(self, pdf_bytes):
"""
提取PDF每页的文本及其位置、字体信息。
:param pdf_path: PDF文件路径
:return: 包含每页文本及其详细信息的列表
[[
]]
"""
details_list = []
pdf_stream = io.BytesIO(pdf_bytes)
# 使用fitz.open直接打开BytesIO对象
with fitz.open(stream=pdf_stream, filetype="pdf") as doc:
res_uuid_map = {
"img": {},
"font": {},
"other": {}
} # 全局资源标识
for page_num in range(len(doc)):
page_details_list = [] # 页面内信息
page = doc.load_page(page_num)
rect = page.rect
width = rect.width
height = rect.height
if res_uuid_map["other"].get("page_size"):
res_uuid_map["other"]["page_size"][page_num] = [width,height]
else :
res_uuid_map["other"]["page_size"] = {page_num: [width, height]}
blocks = page.get_text("dict").get("blocks") # 获取文本块信息
image_list = page.get_images(full=True) # 获取页面上所有图片的详细信息
# print(blocks)
# 获取页面内文本信息
for block in blocks:
block_text = block.get("text", "")
block_rect = block["bbox"] # 文本块的边界框,格式为[x0, y0, x1, y1]
# 遍历块中的每一行
for line in block.get("lines", []):
line_text = line.get("spans", [{}])[0].get("text", "") # 单行文本
line_rect = line["bbox"] # 行的边界框
# 遍历行中的每一个跨度(span),获取字体信息
for span in line.get("spans", []):
span_text = span.get("text", "")
font_size = span.get("size") # 字体大小
font_name = span.get("font") # 字体名称
res_uuid = None
if font_name not in res_uuid_map["font"].values():
res_uuid = str(uuid1())
res_uuid_map["font"][res_uuid] = font_name
else:
keys = list(res_uuid_map["font"].keys())
vs = list(res_uuid_map["font"].values())
idx = vs.index(font_name)
res_uuid =keys[idx]
font_color = span.get("color") # 字体颜色,默认可能没有
span_rect = (
line_rect[0], line_rect[1], line_rect[2], line_rect[3]) # 使用行的边界框作为参考,具体到单个字符或词可能需要更复杂的处理
# 打印或存储信息
print(
f"Page: {page_num }, Text: '{span_text}', Font: {font_name}, Size: {font_size}, "
f"Color: {font_color}, Rect: {span_rect} ,res_uuid {res_uuid}")
# 存储信息到details_list中(根据需要调整存储格式)
page_details_list.append({
"page": page_num,
"text": span_text,
"font": font_name,
"res_uuid": res_uuid,
"size": font_size,
"color": font_color,
"bbox": list(span_rect),
"type": "text"
})
for image_index, img_info in enumerate(image_list):
# 解析图片信息
xref = img_info[0]
base_image = doc.extract_image(xref)
image_data = base_image["image"] # 图片数据
res_uuid = str(uuid1())
img_io = io.BytesIO(image_data)
res_uuid_map["img"][res_uuid] = img_io
image_type = base_image["ext"] # 图片类型
smask = base_image["smask"] # 图片类型
xres = base_image["xres"] # 图片类型
yres = base_image["yres"] # 图片类型
width = base_image["width"] # 图片宽度
height = base_image["height"] # 图片高度
# 计算坐标(左下角和右上角)
x0, y0, x1, y1 = xres, yres,xres+width,yres+height
print(
f"Page: {page_num}, image_type: '{image_type}',x0{x0}, y0{y0}, x1{x1}, y1{y1} ")
page_details_list.append({
"page": page_num,
"index": image_index,
"x0": x0,
"y0": y0,
"x1": x1,
"y1": y1,
"bbox": [x0,y0,width,height],
"width": width,
"height": height,
"res_uuid": res_uuid,
"image_type": image_type,
"type": "img"
})
details_list.append(page_details_list)
# print("details_list",details_list)
return details_list, res_uuid_map
def to_img(self, buffer_pdf):
    """Render every page of an in-memory PDF to a fitz pixmap.

    :param buffer_pdf: raw pdf bytes
    :return: list of fitz.Pixmap, one per page
    """
    doc = fitz.open(stream=buffer_pdf)
    # Default rendering is 792x612 @ 96 dpi; a 1.33333333 zoom per axis
    # yields roughly 1056x816 (2 would give 1584x1224).
    zoom = 1.33333333
    matrix = fitz.Matrix(zoom, zoom).prerotate(0)
    return [
        doc[page_index].get_pixmap(matrix=matrix, alpha=False)
        for page_index in range(doc.page_count)
    ]
def get_size(self):
    # Placeholder: size reporting is not implemented yet.
    pass
def coast_time(func):
    '''
    Decorator: measure and print the wall-clock time spent in *func*.
    The wrapped function's return value is passed through unchanged.
    '''
    from functools import wraps  # local import keeps this edit self-contained

    @wraps(func)  # preserve __name__/__doc__ of the wrapped function
    def fun(*args, **kwargs):
        t = time.perf_counter()
        result = func(*args, **kwargs)
        print(f'function {func.__name__} coast time: {time.perf_counter() - t:.8f} s')
        return result
    return fun
class BaseInit:
    '''
    Base bookkeeping needed to parse one pdf: paths, output folders and
    short-id generation.
    '''
    def __init__(self, pdf_path, output_path):
        self.file_path = pdf_path
        self.output_path = output_path
        # File name / stem / extension. splitext also handles names without
        # an extension (the old slice-based stem returned '' in that case).
        self.file_name = os.path.basename(self.file_path)
        self.file_no_suffix, self.fileType = os.path.splitext(self.file_name)
        # 62-char alphabet used by genShortId (letters + digits).
        self.uuidChars = tuple(string.ascii_letters + string.digits)
        # Table cell delimiter / empty-cell placeholder.
        self.divide = ':'
        self.solid = ''
        # Minimum IoU ratio when matching lines against table regions.
        self.iou_rate = 0.001
        # Create the intermediate directories used throughout the run.
        self.init_file()

    def init_file(self):
        """Create the intermediate output folders and the result json path."""
        self.image_folder_path = os.path.join(self.output_path, 'pdf_img_save')
        self.json_folder_path = os.path.join(self.output_path, 'json')
        self.ocr_result_path = os.path.join(self.json_folder_path, self.file_no_suffix + '.json')
        # exist_ok avoids the check-then-create race of the old code
        for path in (self.image_folder_path, self.json_folder_path):
            os.makedirs(path, exist_ok=True)

    def genShortId(self, length=12):
        """Return a random alphanumeric id of *length* characters (length >= 8).

        The first 8 chars are derived from a uuid1; the remaining
        length - 8 chars are sampled from the uuid's hex digits.
        """
        hex32 = str(uuid1()).replace('-', '')
        head = ''
        for i in range(8):
            chunk = int(hex32[i * 4: i * 4 + 4], 16)
            head += str(self.uuidChars[chunk % 0x3E])  # 0x3E == 62 == len(alphabet)
        return head + ''.join(random.sample(hex32, length - 8))
class PageInfo(BaseInit):
    '''
    Class-level registry of the images and tables found on each page.
    NOTE: these dicts are class attributes, shared by all instances.
    '''
    __page_image = {}
    __page_table = {}

    @classmethod
    def add_image(cls, page_num, image):
        # setdefault replaces the get-then-assign dance (one lookup).
        cls.__page_image.setdefault(page_num, []).append(image)

    @classmethod
    def add_table(cls, page_num, table):
        cls.__page_table.setdefault(page_num, []).append(table)

    @classmethod
    def get_image(cls, page_num):
        return cls.__page_image.get(page_num, [])

    @classmethod
    def get_table(cls, page_num):
        return cls.__page_table.get(page_num, [])

    @classmethod
    def save_image(cls, output_path, file):
        '''
        Save all registered images to <output_path>/page_img_save/ as .jpg.
        :param output_path: base output directory
        :param file: source file name (extension stripped for the prefix)
        '''
        if not cls.__page_image:
            return  # nothing registered: keep the old behavior of creating no dir
        file = file.split('.')[0]
        img_dir = os.path.join(output_path, 'page_img_save')
        os.makedirs(img_dir, exist_ok=True)  # hoisted out of the loop; race-safe
        for images in cls.__page_image.values():
            for image in images:
                image_content = image['objContent']  # raw image bytes
                img_path = os.path.join(img_dir, file + '_' + image['name'] + '.jpg')
                with open(img_path, 'wb') as fp:
                    fp.write(image_content)
class ParseFile(PageInfo):
    """End-to-end parser: fitz lines + pdfplumber tables -> ocr-style json."""
    def __init__(self, pdf_path, output_path, table_type='v2', is_save=True):
        super().__init__(pdf_path, output_path)
        print('初始化 pdf 对象:{}'.format(self.file_path))
        # Whether to persist the parse result to disk.
        self.is_save = is_save
        # 'v2' merges table rows into the line list; anything else keeps them separate.
        self.table_type = table_type
        # v1 result list: lines and tables kept apart
        self.page_result_list = []
        # v2 result list: tables merged into the line list
        self.combine_page_result_list = []
    @coast_time
    def get_result(self):
        """Load the pdf, parse every page and return the per-page result list."""
        self.load_pdf()
        result = self.parse_pdf()
        # Keep the result on the instance for callers that hold the object.
        self.ocr_result = result
        print(f'解析完成:共 {len(result)} 页 表格类型: {self.table_type}')
        return result
    def load_pdf(self):
        """Open the pdf with fitz (PyMuPDF)."""
        self.fitz_doc = fitz.open(self.file_path, filetype='pdf')
        # NOTE(review): parse_pdf() reads self.pdfplum_doc_pages, but the
        # pdfplumber load below is commented out — parse_pdf raises
        # AttributeError as-is. Confirm whether this is intentional.
        # self.pdfplum_doc_pages = pdfplumber.open(self.file_path).pages
        # assert len(self.fitz_doc) == len(self.pdfplum_doc_pages)
    def parse_pdf(self):
        """Parse every page (lines, tables, images) and return the result list."""
        for page_no, fitz_doc in enumerate(self.fitz_doc):
            # debug helper: restrict to a single page
            # if page_no != 25:
            #     continue
            self.height = fitz_doc.get_text('dict')['height']
            self.width = fitz_doc.get_text('dict')['width']
            # Aggregate chars / lines / blocks parsed by fitz for this page
            line_list = self.group_block(page_no, fitz_doc)
            # Page tables. NOTE(review): self.pdfplum_doc_pages is only set by a
            # commented-out line in load_pdf — this raises AttributeError as-is.
            table_list = self.extract_table(page_no, self.pdfplum_doc_pages[page_no])
            # Compute row/column merge (span) info for each table
            table_list = list(CalcTableRL(table_list).run())
            # Page images from the class-level registry
            image_list = self.get_image(page_no)
            # Build the per-page result structure
            page_result = self.construct_final_result(line_list, page_no, image_list, table_list)
            if self.table_type == 'v2':
                # Merge tables into the line list (ocr-compatible format)
                combine_page_result_list = self.combine_table_v2(page_result)
                page_result = self.construct_final_result(combine_page_result_list, page_no, image_list, table_list)
            self.page_result_list.append(page_result)
            if page_no and page_no % 10 == 0:
                print(f'解析前 {page_no} 页完成')
        final_result_list = copy.deepcopy(self.page_result_list)
        # Convert to the ocr parsing format
        if self.table_type == 'v2':
            final_result_list = self.reform_ocr_result(final_result_list)
        # 2023/09/26: add contIndex for the downstream extraction model
        for page_num, page in enumerate(final_result_list):
            if not page.get('lineList'):
                # NOTE(review): `break` abandons all remaining pages on the first
                # page without lines — `continue` may have been intended.
                break
            contIndex = {}
            for line in page['lineList']:
                line_bak = dict(copy.copy(line))
                line_bak["objType_postpreprocess"] = f"{line_bak.get('objType','textLine')}_postpreprocess"
                contIndex[line_bak["lineId"]] = line_bak
            page["contIndex"] = contIndex
            for line in page['lineList']:
                print(page_num, line['objType'], line['objContent'])
        # Persist locally
        if self.is_save:
            self.save_result(final_result_list)
        for page_num, page in enumerate(final_result_list):
            for line in page['lineList']:
                print(page_num, line['objType'], line['objContent'])
        return final_result_list
def combine_table_v2(self, page_result):
lineList = page_result['lineList']
table_list = page_result['table_list']
# 先进行表格行、非表格行划分 减少后续操作的时间杂度
__notable_lines, __all_table_lines = self.filter_table_line(lineList, table_list)
notable_lines, all_table_lines = copy.deepcopy(__notable_lines), copy.deepcopy(__all_table_lines)
del __notable_lines, __all_table_lines, lineList
# 整合
combine_page_result_list = self.combine_table_with_line(notable_lines, all_table_lines, table_list)
return combine_page_result_list
    def filter_table_line(self, lineList, table_list):
        '''
        Split lineList into table / non-table lines. A 'table' placeholder
        string is inserted into __notable_lines where each table's rows
        belong so they can be spliced back in later.
        __notable_lines: lines outside any table (plus 'table' placeholders)
        __all_table_lines: one list of in-table lines per table
        '''
        __notable_lines = []
        __all_table_lines = []
        for table_info in table_list:
            table_bbox = table_info['objPos']
            # Lines that belong to the current table
            __sub_table_lines = []
            is_iter_table = False
            while lineList:
                line = lineList.pop(0)
                line_bbox = line['objPos']
                # Empty-table false positive: once a line's top y passes the
                # table's bottom y, no later line can belong to this table.
                table_y, line_y = table_bbox[3], line_bbox[1]
                if line_y >= table_y:
                    lineList.insert(0, line)
                    break
                iou = self.count_iou(table_bbox, line_bbox)
                # Line overlaps the table region
                if iou > 0:
                    __sub_table_lines.append(line)
                    # First line matched to this table
                    if not is_iter_table:
                        is_iter_table = True
                        # Insert the placeholder marker
                        __notable_lines.append('table')
                elif iou <= 0 and not is_iter_table:
                    __notable_lines.append(line)
                # No overlap after matches began: the table may have ended
                elif iou <= 0 and is_iter_table:
                    lineList.insert(0, line)
                    line_index, flag = self.more_judge(table_bbox, lineList)
                    if flag:
                        # Skip ahead to line_index and keep scanning this table
                        __notable_lines.extend(lineList[:line_index])
                        lineList = lineList[line_index:]
                    else:
                        break
            __all_table_lines.append(__sub_table_lines)
        # All tables processed: whatever remains is non-table content
        if lineList:
            __notable_lines.extend(lineList)
        return __notable_lines, __all_table_lines
def more_judge(self, table_bbox, lineList, max_judge=6):
'''
判断后续行列表是否还存在属于当前表格的行
对于表格、行界限不明显的额外判断 如: 页面分栏、表格不全
:return 是否存在 True | False
'''
# 往后多判断 max_judge 行
if len(lineList) < max_judge:
judge_lines = lineList
else:
judge_lines = lineList[:max_judge]
for index, line in enumerate(judge_lines):
line_bbox = line['objPos']
iou = self.count_iou(table_bbox, line_bbox)
if iou > 0:
return index, True
return index, False
    def combine_table_with_line(self, notable_lines, all_table_lines, table_list):
        '''
        Merge lines and their chars into the matching table rows/cells, then
        splice the merged rows back where the 'table' placeholder was inserted.
        '''
        for table_id, table in enumerate(table_list):
            new_table_lines = []
            for table_line in table['lineList']:
                is_iter_table = False
                table_line_bbox = table_line['objPos']
                # Match every candidate line of this table against the row
                for __line in all_table_lines[table_id]:
                    line = copy.deepcopy(__line)
                    line_bbox = line['objPos']
                    iou = self.count_iou(table_line_bbox, line_bbox)
                    # First match: replace the text line's content/bbox with the
                    # table row's; the line's other fields are kept.
                    if iou > self.iou_rate and not is_iter_table:
                        is_iter_table = True
                        line['objContent'] = table_line['objContent']
                        line['objPos'] = table_line['objPos']
                        line['objType'] = 'table'
                        line['tableId'] = table_id
                        self.combine_cell_with_span(table_line, line)
                        line['cells'] = table_line['cells']
                        new_table_lines.append(line)
                    elif iou > self.iou_rate and is_iter_table:
                        # Further matching lines only contribute their chars
                        self.combine_cell_with_span(table_line, line)
                    else:
                        pass
            if 'table' not in notable_lines or not new_table_lines:
                # FIX ERROR: 'table' is not in list
                # Handles a small table detected inside a larger one.
                # Possible bug: nested large tables can desync the number of
                # placeholders vs. row groups.
                continue
            # Replace the 'table' placeholder with the merged rows, flattened
            table_index = notable_lines.index('table')
            new_notable_lines = notable_lines[:table_index]
            new_notable_lines.extend(new_table_lines)
            notable_lines = new_notable_lines + notable_lines[table_index+1:]
        return notable_lines
def combine_cell_with_span(self,table_line , text_line):
'''
将表格的cell内加上对应span的chars信息:解决表格合并时cell有多行导致chars顺序错乱的问题
'''
del_list = []
for index, cell in enumerate(table_line['cells']):
if not cell.get('chars'):
cell['chars'] = []
cell_bbox = cell['objPos']
if cell_bbox is None:
del_list.append(index)
continue
for span in text_line['span']:
span_bbox = span['bbox']
iou = self.count_iou(cell_bbox, span_bbox)
if iou < self.iou_rate:
continue
# 为了解决一些 span 和 cell 长度不一致问题 将循环细分到每个字符chars
for char in span['chars']:
char_bbox = char['bbox']
iou = self.count_iou(cell_bbox, char_bbox)
if iou > self.iou_rate:
cell['chars'].append(char)
else:
pass
# 清除无效的span
if len(del_list):
for index, index_del in enumerate(del_list):
index_del -= index
del table_line['cells'][index_del]
    def group_block(self, page_num, fitz_doc):
        """
        Combine the two fitz extractions so every span carries its chars.
        Reference: https://pymupdf.readthedocs.io/en/latest/textpage.html#textpagedict
        :param page_num: page index (used for ids and the image registry)
        :param fitz_doc: a fitz page object
        :return: total_info — list of per-line dicts (see construct_line_info)
        """
        line_count = 0
        total_line_list = []
        # char_blocks: finest granularity is the individual character
        char_blocks = fitz_doc.get_text('rawdict')['blocks']
        # block_blocks: finest granularity is the span within a line
        block_blocks = fitz_doc.get_text('dict')['blocks']
        # Sort both block lists top-to-bottom, then left-to-right
        char_blocks.sort(key=lambda x: [int(x['bbox'][1]), int(x['bbox'][0])])
        block_blocks.sort(key=lambda x: [int(x['bbox'][1]), int(x['bbox'][0])])
        # Pair the two extractions up. NOTE(review): assumes both produce the
        # same number of blocks in the same order — confirm for edge cases.
        group_blocks = zip(block_blocks, char_blocks)
        for span_blocks, char_block in group_blocks:
            if span_blocks['type'] == 1:
                # Image block: register the embedded image and skip text handling
                img_attrs = self.deal_image(page_num, line_count, span_blocks)
                self.add_image(page_num, img_attrs)
                continue
            for line_index, line in enumerate(span_blocks['lines']):
                line['text'] = ''
                line['chars'] = []
                line['span'] = []
                # Merge each line here (and attach per-char info) to keep the
                # time complexity down
                for span_index, span in enumerate(line['spans']):
                    span['text'] = span['text'].replace(' ', '').strip()
                    if not span['text']:
                        continue
                    # Attach the rawdict chars to the matching dict-mode span
                    span_chars = char_block['lines'][line_index]['spans'][span_index]['chars']
                    span_chars = [char for char in span_chars if char['c'].strip()]
                    line['text'] += span['text']
                    line['chars'].extend(span_chars)
                    line['span'].append({'bbox': span['bbox'], 'chars': span_chars,'text': span['text']})
                if not line['text']:
                    continue
                # Build the per-line structure
                line_info = self.construct_line_info(line['text'], line['bbox'], line['span'], line['chars'],
                                                     line_count, page_num)
                total_line_list.append(line_info)
                line_count += 1
        return total_line_list
def extract_table(self, page_no, plum_page):
'''
提取页面所有表格
:param page_no:
:param plum_page:
:return:
'''
table_list = []
for table in plum_page.find_tables():
# 获取当前表格的边界定位
table_line_list = self.merge_table_row(table)
if not table_line_list:
continue
table_info = self.deal_table(page_no, table.bbox, table_line_list)
table_list.append(table_info)
# 将表格信息加入全局变量 | 此处有点有点冗余
self.add_table(page_no, table_info)
return table_list
def merge_table_row(self, table):
'''
表格cell 按行合并
:param table:
:return: [({line_text}, {line_bbox}), ...]
'''
table_line_list = []
for item, row in zip(table.extract(), table.rows):
# 表格每行预处理
table_line = self.divide.join([self.clear_text(txt) for txt in item])
# 判断当前行是否为空
__line = self.clear_text(table_line).replace(' ', '')
if not __line:
continue
table_line_list.append((table_line, row.bbox, zip(item, row.cells)))
return table_line_list
def clear_text(self, txt, retrans=False):
if retrans:
txt = txt.replace(self.solid, '').replace(self.divide, '')
else:
# 空列替换为占位符
txt = txt if txt else self.solid
return str(txt).replace('\n', '').replace(' ', '')
def deal_table(self, page_no, table_bbox, table_line_list):
'''
对表格做结构转换
:param page_no:
:param table_bbox:
:param table_line_list:
:return:
'''
table_first_line = self.clear_text(table_line_list[0][0], retrans=True)
table_id = '{0}_{1}_'.format(page_no, table_first_line) + self.genShortId()
lineList = [{
'objContent': line[0],
'objPos': line[1],
'cells': self.deal_table_cell(line[2])
} for line in table_line_list]
table_info = {
'tableId': table_id,
'name': table_id,
'objPos': table_bbox,
'lineList': lineList,
}
return table_info
def deal_table_cell(self, cells):
return [{"objContent": self.clear_text(text), "objPos": box} for text, box in cells]
def deal_image(self, page_num, name, img_attrs):
'''
对image做结构转换
:param page_num:
:param name:
:param img_attrs:
:return:
'''
image_id = '{0}_{1}_'.format(page_num, name) + self.genShortId()
img_info = {
'imageId': image_id,
'name': image_id, # 暂时以图片所在页面的行数命名
'objPos': img_attrs['bbox'],
'ext': img_attrs['ext'],
'objContent': img_attrs['image'],
'size': img_attrs['size']
}
return img_info
def deal_chars(self, line_num, lineId, chars):
'''
对chars做结构转换
:param line_num:
:param lineId:
:param chars:
:return:
'''
num_count = 0
char_list = []
for char in chars:
if not char['c'].strip():
continue
char_dict = {
'lineId': lineId,
'charId': 'char_' + str(line_num) + '_' + str(num_count) + '_' + self.genShortId(),
'objContent': char['c'],
'objPos': char['bbox']
}
char_list.append(char_dict)
num_count += 1
return char_list
def construct_line_info(self, text, rect, span, chars, count, pageNo, objType='textLine'):
'''
对每行做结构转换
# x, y, h, w = rect[0], rect[1], rect[3] - rect[1], rect[2] - rect[0]
'''
lineId = 'line_' + str(pageNo) + '_' + str(count) + '_' + self.genShortId()
chars = self.deal_chars(count, lineId, chars)
return OrderedDict({
'lineNo': count,
'lineId': lineId,
'objType': objType,
'objContent': re.sub(r'\s', '', text),
'chars': chars,
'objPos': rect,
'span': span
})
@staticmethod
def rect_format(bbox):
'''
数据坐标转换 x1, y1, x2, y2 >> y1, x1 h, w
:param rect: [x1, y1, x2, y2]
:return: [y, x, h, w]
'''
y, x, h, w = bbox[1], bbox[0], bbox[3] - bbox[1], bbox[2] - bbox[0]
return [y, x, h, w]
def count_iou(self, RecA, RecB):
'''
计算边框交并比
左上边界坐标为Ax0, Ay0, Bx0, By0
右下边界坐标为Ax1, Ay1, Bx1, By1
交集面积计算为:
M = min(Ax1, Bx1) - max(Ax0, Bx0)
H = min(Ay1, By1) - max(Ay0, By0)
# 当前表格的边界信息
left_x, top_y, right_x, botm_y: table_box_info[0], table_box_info[1], table_box_info[2], table_box_info[3]
'''
M = min(RecB[2], RecA[2]) - max(RecB[0], RecA[0])
H = min(RecB[3], RecA[3]) - max(RecB[1], RecA[1])
# 计算交集部分面积
interArea = max(0, M) * max(0, H)
# 计算两个边框的面积
RecA_Area = (RecA[2] - RecA[0]) * (RecA[3] - RecA[1])
RecB_Area = (RecB[2] - RecB[0]) * (RecB[3] - RecB[1])
# 计算IOU
iou = interArea / float(RecA_Area + RecB_Area - interArea)
return iou
def construct_final_result(self, line_list, pageNo, image_list=[], table_list=[]):
'''
每页转换为最终数据结构
:param line_list: ocr每行结果
:param pageNo: 页码
:param image_list:
:param table_list:
:return: type: Dict
'''
document_id = 'v1' + '_' + self.file_no_suffix + '_' + self.genShortId()
return OrderedDict({
'pageNo': pageNo,
'docID': document_id,
'page_info':{'size': [self.width, self.height]},
'lineList': line_list,
'image_list': image_list if image_list else [],
'table_list': table_list if table_list else []
})
def save_result(self, final_result_list):
'''
保存结果数据至本地
'''
if self.table_type == 'v2':
with open(self.ocr_result_path, 'w', encoding='utf-8') as f:
json.dump(final_result_list, f, indent=4, ensure_ascii=False)
else:
with open(self.ocr_result_path, 'w', encoding='utf-8') as f:
json.dump(self.page_result_list, f, cls=MyEncoder, indent=4, ensure_ascii=False)
    def reform_ocr_result(self, final_result_list):
        """
        Final post-processing into ocr format: renumber the lines, convert
        all bboxes to [y, x, h, w] and compute per-char offsets.
        :param final_result_list: merged local-parse / ocr results
        """
        for result_list in final_result_list:
            del result_list['image_list']
            del result_list['table_list']
            lineList = result_list['lineList']
            for num, line in enumerate(lineList):
                # Rewrite line number and line id with the new position
                line['lineNo'] = str(num)
                line_split = line['lineId'].split('_')
                line_split[-2] = str(num)
                line['lineId'] = '_'.join(line_split)
                # Convert coordinate formats
                obj_type = line['objType']
                # Per-char x/y offsets relative to the line's own origin
                offset_x_list, offset_y_list = self.coord_offset(line, obj_type)
                line['objPos'] = self.rect_format(line['objPos'])
                # NOTE(review): appends the x-offset list as a 5th objPos
                # element — presumably what the consumer expects; confirm.
                line['objPos'].append(offset_x_list)
                line['chars_offset'] = [offset_x_list, offset_y_list]
                if line.get('chars'):
                    del line['chars']
                if obj_type == 'table' and line.get('span'):
                    del line['span']
        return final_result_list
    def coord_offset(self, line, obj_type='textLine'):
        '''
        Compute each char's top-left offset relative to the line's top-left,
        converting span/cell/char bboxes to ocr format along the way.
        @obj_type: textLine | table
        :return: (offset_x_list, offset_y_list)
        '''
        offset_x_list = []
        offset_y_list = []
        line_x, line_y = line['objPos'][0], line['objPos'][1]
        if obj_type == 'textLine':
            for span in line['span']:
                self.all_rect_format(span)
                for char in span['chars']:
                    # Offsets use the char bbox before all_rect_format mutates it
                    char_x, char_y = char['bbox'][0], char['bbox'][1]
                    offset_x_list.append(char_x - line_x)
                    offset_y_list.append(char_y - line_y)
                    self.all_rect_format(char)
        else:
            # Table line: work on deep copies of the cells so the shared
            # table structures are not mutated in place.
            __cells = []
            for num, _cell in enumerate(line['cells']):
                cell = copy.deepcopy(_cell)
                self.all_rect_format(cell)
                for char in cell['chars']:
                    char_x, char_y = char['bbox'][0], char['bbox'][1]
                    offset_x_list.append(char_x - line_x)
                    offset_y_list.append(char_y - line_y)
                    self.all_rect_format(char)
                __cells.append(cell)
            line['cells'] = __cells
        return offset_x_list, offset_y_list
    def all_rect_format(self, obj):
        '''
        Convert one span / cell / char dict to the ocr structure in place:
        text -> objContent, bbox/objPos -> [y, x, h, w].
        '''
        if 'chars' in obj:
            # span or cell (carries a 'chars' list)
            if obj.get('text'):
                obj['objContent'] = obj['text']
                del obj['text']
            if obj.get('objPos'):
                obj['objPos'] = self.rect_format(obj['objPos'])
            elif obj.get('bbox'):
                obj['objPos'] = self.rect_format(obj['bbox'])
                del obj['bbox']
        else:
            # single char from fitz rawdict: {'c': ..., 'bbox': ...}
            obj['objContent'] = obj['c']
            obj['objPos'] = self.rect_format(obj['bbox'])
            del obj['c']
            del obj['bbox']
class CalcTableRL:
    '''
    Recover the table's implicit grid and compute row/column span info.
    Input: a single table dict, or a list of them, where every cell carries
    an 'objPos' bbox. Adds cell['col_start_end'] and cell['row_start_end'].
    '''
    def __init__(self, table_info):
        # Either one table dict or a list of table dicts.
        self.table_info = table_info

    def run(self):
        """Yield each table with row/column span info added."""
        # Normalize to a list so both input shapes share one code path
        # (the original duplicated the yield in two branches).
        tables = self.table_info if isinstance(self.table_info, list) else [self.table_info]
        for table_info in tables:
            yield self.add_table_property(table_info)

    def add_table_property(self, table_info):
        '''
        Annotate every cell with its grid position:
        cell['col_start_end'] = (col_start, col_end)
        cell['row_start_end'] = (row_start, row_end)
        '''
        # Deduplicated x / y coordinates of all cell corners
        set_x, set_y = self.collect_table_coord(table_info)
        # Sorted unique coordinates define the finest-grained virtual grid
        list_x, list_y = sorted(set_x), sorted(set_y)
        for line in table_info['lineList']:
            for cell in line['cells']:
                if cell['objPos'] is None:  # bugfix: was `== None`
                    continue
                x1, y1, x2, y2 = cell['objPos']
                # Grid indices of the cell's corner coordinates
                cell['col_start_end'] = (list_x.index(x1), list_x.index(x2))
                cell['row_start_end'] = (list_y.index(y1), list_y.index(y2))
        return table_info

    def collect_table_coord(self, table_info):
        '''
        Collect the deduplicated x and y coordinates of every cell bbox.
        :param table_info: single table dict
        :return: set(x), set(y)
        '''
        set_x = set()
        set_y = set()
        for line in table_info['lineList']:
            for cell in line['cells']:
                if cell['objPos'] is None:
                    continue
                x1, y1, x2, y2 = cell['objPos']
                set_x.update((x1, x2))
                set_y.update((y1, y2))
        return set_x, set_y
def pdf_ocr(pdf_path, output_path, table_type='v2', is_save=True):
    '''
    Thin wrapper around ParseFile for easy calling / multi-processing.
    Returns the ParseFile instance after parsing.
    '''
    parser = ParseFile(pdf_path, output_path, table_type, is_save)
    parser.get_result()
    return parser
# --------------------------- Test cases below -----------------------------------
@coast_time
def test_dir():
    """Parse every matching pdf under a fixed input directory."""
    input_root = r'E:\workplace\cjhx_test\创金和信\pdf2json\input\all_test'
    output_dir = r'E:\workplace\cjhx_test\创金和信\pdf2json\file_data\all_test'
    for dir_path, _subdirs, files in os.walk(input_root):
        for file in files:
            if 'test.pdf' not in file:
                continue
            pdf_ocr(os.path.join(dir_path, file), output_dir)
@coast_time
def test_single():
    """Parse one hard-coded pdf in v2 table mode."""
    file_path = r'/home/yhocr/extractor/3f195fba-0916-4d74-b956-bf3bcadc77f2/20220913-浙江省贰号职业年金计划银华资产组合2022年二季度管理费用支付指令.pdf'
    output_dir = r'/home/yhocr/extractor/3f195fba-0916-4d74-b956-bf3bcadc77f2/电子解析'
    pdf_ocr(file_path, output_dir, table_type='v2')
@coast_time
def test_thread():
    """Parse a directory of pdfs in parallel with a process pool (8 workers)."""
    from concurrent.futures import ProcessPoolExecutor
    pool = ProcessPoolExecutor(max_workers=8)
    output_dir = r'E:\workplace\daily_work\pdf2json\output\签字模板二'
    for dir_path, _subdirs, files in os.walk(r'E:\workplace\daily_work\pdf2json\input\签字模板二'):
        for file in files:
            future = pool.submit(pdf_ocr, os.path.join(dir_path, file), output_dir, table_type='v2')
            future.add_done_callback(print_callback)
    pool.shutdown()
def print_callback(ret):
    """Completion callback for pool futures; result printing is disabled."""
    # print('ret:', ret.result())
    pass
if __name__ == '__main__':
    # test_dir()
    # test_thread()
    # test_single()
    # Smoke test: render a local pdf to images via DPFParser (defined above).
    pdf_obj = DPFParser()
    with open(r"F:\code\easyofd\test\test.pdf","rb") as f:
        pdf_bytes = f.read()
    img_list = pdf_obj.to_img(pdf_bytes)
    pil_img_list = []  # NOTE(review): never appended to — appears unused
    for _img in img_list:
        print(_img.width,_img.height)
        img = Image.frombytes("RGB", [_img.width, _img.height], _img.samples)
        print(type(img))
        # NOTE(review): same filename every iteration — only the last page survives.
        img.save('output_image.png')
from loguru import logger
from magic_pdf.libs.drop_reason import DropReason
def get_data_source(jso: dict):
    """Return 'data_source', falling back to the legacy 'file_source' key."""
    source = jso.get("data_source")
    if source is not None:
        return source
    return jso.get("file_source")
def get_data_type(jso: dict):
    """Return 'data_type', falling back to the legacy 'file_type' key."""
    dtype = jso.get("data_type")
    if dtype is not None:
        return dtype
    return jso.get("file_type")
def get_bookid(jso: dict):
    """Return 'bookid', falling back to the legacy 'original_file_id' key."""
    identifier = jso.get("bookid")
    if identifier is not None:
        return identifier
    return jso.get("original_file_id")
def exception_handler(jso: dict, e):
    """Log *e*, mark *jso* as dropped with reason Exception, and return it."""
    logger.exception(e)
    jso.update({
        "_need_drop": True,
        "_drop_reason": DropReason.Exception,
        "_exception": f"ERROR: {e}",
    })
    return jso
def get_bookname(jso: dict):
    """Compose '<data_source>/<file_id>' as the book name."""
    data_source = get_data_source(jso)
    file_id = jso.get("file_id")
    return "{}/{}".format(data_source, file_id)
def spark_json_extractor(jso: dict) -> dict:
    """Extract the pdf type and layout model list from a spark json record."""
    extracted = {}
    extracted["_pdf_type"] = jso["_pdf_type"]
    extracted["model_list"] = jso["doc_layout_result"]
    return extracted
import html
def decode_html_entities(text):
    """Convert HTML entities (e.g. &amp;, &nbsp;, &#33;) to literal characters."""
    return html.unescape(text)
# Sample text containing HTML entities
text = "这是一个&ast;示例&nbsp;文本,包含&nbsp;HTML&nbsp;转义字符。&#33;"
# Decode the HTML entities in the sample and print the result
decoded_text = decode_html_entities(text)
print(decoded_text)
import os
from pathlib import Path
import click
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.libs.version import __version__
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.tools.common import do_parse, parse_pdf_methods
@click.command()
@click.version_option(__version__,
                      '--version',
                      '-v',
                      help='display the version and exit')
@click.option(
    '-p',
    '--path',
    'path',
    type=click.Path(exists=True),
    required=True,
    help='local pdf filepath or directory',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='output local directory',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
    default='auto',
)
@click.option(
    '-l',
    '--lang',
    'lang',
    type=str,
    help="""
    Input the languages in the pdf (if known) to improve OCR accuracy.  Optional.
    You should input "Abbreviation" with language form url:
    https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
    """,
    default=None,
)
@click.option(
    '-d',
    '--debug',
    'debug_able',
    type=bool,
    help='Enables detailed debugging information during the execution of the CLI commands.',
    default=False,
)
@click.option(
    '-s',
    '--start',
    'start_page_id',
    type=int,
    help='The starting page for PDF parsing, beginning from 0.',
    default=0,
)
@click.option(
    '-e',
    '--end',
    'end_page_id',
    type=int,
    help='The ending page for PDF parsing, beginning from 0.',
    default=None,
)
def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
    """CLI entry point: parse one pdf file or every *.pdf in a directory."""
    # Enable the bundled model in full mode before parsing.
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = 'full'
    os.makedirs(output_dir, exist_ok=True)

    def read_fn(path):
        # Read the pdf bytes from disk via the project reader abstraction.
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    def parse_doc(doc_path: str):
        # Parse one document; errors are logged, never raised to the shell.
        try:
            file_name = str(Path(doc_path).stem)
            pdf_data = read_fn(doc_path)
            do_parse(
                output_dir,
                file_name,
                pdf_data,
                [],
                method,
                debug_able,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                lang=lang
            )
        except Exception as e:
            logger.exception(e)

    if os.path.isdir(path):
        # Directory input: parse every top-level *.pdf
        for doc_path in Path(path).glob('*.pdf'):
            parse_doc(doc_path)
    else:
        parse_doc(path)
# Allow running this module directly as the CLI entry point.
if __name__ == '__main__':
    cli()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment