Commit d5dbed73 authored by 赵小蒙
Browse files

目录重构

parent 7c7910e4
......@@ -2,9 +2,9 @@ import os
import re
import numpy as np
from pdf_tools.libs.nlp_utils import NLPModels
from magic_pdf.libs.nlp_utils import NLPModels
from pdf_tools.para.commons import *
from magic_pdf.para.commons import *
if sys.version_info[0] >= 3:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
......
......@@ -2,28 +2,28 @@ import time
# from anyio import Path
from pdf_tools.libs.commons import fitz, get_delta_time, get_img_s3_client
from magic_pdf.libs.commons import fitz, get_delta_time, get_img_s3_client
import json
import os
import math
from loguru import logger
from pdf_tools.layout.bbox_sort import (
from magic_pdf.layout.bbox_sort import (
prepare_bboxes_for_layout_split,
)
from pdf_tools.layout.layout_sort import LAYOUT_UNPROC, get_bboxes_layout, get_columns_cnt_of_layout, sort_text_block
from pdf_tools.libs.drop_reason import DropReason
from pdf_tools.libs.markdown_utils import escape_special_markdown_char
from pdf_tools.libs.safe_filename import sanitize_filename
from pdf_tools.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
from pdf_tools.pre_proc.detect_images import parse_images
from pdf_tools.pre_proc.detect_tables import parse_tables # 获取tables的bbox
from pdf_tools.pre_proc.detect_equation import parse_equations # 获取equations的bbox
from pdf_tools.pre_proc.detect_header import parse_headers # 获取headers的bbox
from pdf_tools.pre_proc.detect_page_number import parse_pageNos # 获取pageNos的bbox
from pdf_tools.pre_proc.detect_footnote import parse_footnotes_by_model, parse_footnotes_by_rule # 获取footnotes的bbox
from pdf_tools.pre_proc.detect_footer_by_model import parse_footers # 获取footers的bbox
from pdf_tools.post_proc.detect_para import (
from magic_pdf.layout.layout_sort import LAYOUT_UNPROC, get_bboxes_layout, get_columns_cnt_of_layout, sort_text_block
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.markdown_utils import escape_special_markdown_char
from magic_pdf.libs.safe_filename import sanitize_filename
from magic_pdf.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
from magic_pdf.pre_proc.detect_images import parse_images
from magic_pdf.pre_proc.detect_tables import parse_tables # 获取tables的bbox
from magic_pdf.pre_proc.detect_equation import parse_equations # 获取equations的bbox
from magic_pdf.pre_proc.detect_header import parse_headers # 获取headers的bbox
from magic_pdf.pre_proc.detect_page_number import parse_pageNos # 获取pageNos的bbox
from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model, parse_footnotes_by_rule # 获取footnotes的bbox
from magic_pdf.pre_proc.detect_footer_by_model import parse_footers # 获取footers的bbox
from magic_pdf.post_proc.detect_para import (
ParaProcessPipeline,
TitleDetectionException,
TitleLevelException,
......@@ -31,9 +31,9 @@ from pdf_tools.post_proc.detect_para import (
ParaMergeException,
DenseSingleLineBlockException,
)
from pdf_tools.pre_proc.main_text_font import get_main_text_font
from pdf_tools.pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock
from pdf_tools.pre_proc.remove_footer_header import remove_headder_footer_one_page
from magic_pdf.pre_proc.main_text_font import get_main_text_font
from magic_pdf.pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock
from magic_pdf.pre_proc.remove_footer_header import remove_headder_footer_one_page
'''
from para.para_pipeline import ParaProcessPipeline
......@@ -46,19 +46,19 @@ from para.exceptions import (
)
'''
from pdf_tools.libs.commons import read_file, join_path
from pdf_tools.libs.pdf_image_tools import save_images_by_bboxes
from pdf_tools.post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks
from pdf_tools.pre_proc.citationmarker_remove import remove_citation_marker
from pdf_tools.pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
from pdf_tools.pre_proc.pdf_pre_filter import pdf_filter
from pdf_tools.pre_proc.detect_footer_header_by_statistics import drop_footer_header
from pdf_tools.pre_proc.construct_paras import construct_page_component
from pdf_tools.pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
from pdf_tools.post_proc.pdf_post_filter import pdf_post_filter
from pdf_tools.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
from pdf_tools.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
from pdf_tools.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
from magic_pdf.libs.commons import read_file, join_path
from magic_pdf.libs.pdf_image_tools import save_images_by_bboxes
from magic_pdf.post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from magic_pdf.pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
from magic_pdf.pre_proc.pdf_pre_filter import pdf_filter
from magic_pdf.pre_proc.detect_footer_header_by_statistics import drop_footer_header
from magic_pdf.pre_proc.construct_paras import construct_page_component
from magic_pdf.pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter
from magic_pdf.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
from magic_pdf.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
from magic_pdf.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
titleDetectionException_msg = TitleDetectionException().message
......@@ -108,7 +108,7 @@ def parse_pdf_by_model(
debug_mode=False,
):
pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
save_tmp_path = os.path.join(os.path.dirname(__file__), "../../..", "tmp", "unittest")
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
md_bookname_save_path = ""
book_name = sanitize_filename(book_name)
if debug_mode:
......
......@@ -11,8 +11,8 @@ import numpy as np
from termcolor import cprint
from pdf_tools.libs.commons import fitz
from pdf_tools.libs.nlp_utils import NLPModels
from magic_pdf.libs.commons import fitz
from magic_pdf.libs.nlp_utils import NLPModels
if sys.version_info[0] >= 3:
......
from loguru import logger
from pdf_tools.layout.layout_sort import get_columns_cnt_of_layout
from pdf_tools.libs.drop_reason import DropReason
from magic_pdf.layout.layout_sort import get_columns_cnt_of_layout
from magic_pdf.libs.drop_reason import DropReason
def __is_pseudo_single_column(page_info) -> bool:
......
from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
import collections # 统计库
......
......@@ -3,7 +3,7 @@
https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d
"""
import re
from pdf_tools.libs.nlp_utils import NLPModels
from magic_pdf.libs.nlp_utils import NLPModels
__NLP_MODEL = NLPModels()
......
from pdf_tools.libs.boxbase import _is_in # 正则
from pdf_tools.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.boxbase import _is_in # 正则
from magic_pdf.libs.commons import fitz # pyMuPDF库
def __solve_contain_bboxs(all_bbox_list: list):
......
from pdf_tools.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.commons import fitz # pyMuPDF库
def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
......
from collections import defaultdict
from pdf_tools.libs.boxbase import calculate_iou
from magic_pdf.libs.boxbase import calculate_iou
def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
......
from collections import Counter
from pdf_tools.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.commons import fitz # pyMuPDF库
def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path, debug_mode=False):
......
from pdf_tools.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.commons import fitz # pyMuPDF库
def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
......
import collections # 统计库
import re
from pdf_tools.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.commons import fitz # pyMuPDF库
#--------------------------------------- Tool Functions --------------------------------------#
......
from pdf_tools.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.commons import fitz # pyMuPDF库
def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
......
from pdf_tools.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.commons import fitz # pyMuPDF库
def parse_tables(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
......
"""
对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果
"""
from pdf_tools.libs.commons import fitz
from magic_pdf.libs.commons import fitz
import json
import os
from pathlib import Path
......
......@@ -2,9 +2,9 @@
import re
from pdf_tools.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox
from pdf_tools.libs.textbase import get_text_block_base_info
from magic_pdf.libs.textbase import get_text_block_base_info
def fix_image_vertical(image_bboxes:list, text_blocks:list):
"""
......
from pdf_tools.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.commons import fitz # pyMuPDF库
import re
from pdf_tools.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox # json
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox # json
## version 2
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment