Commit f99149b8 authored by 赵小蒙
Browse files

重构目录结构

parent 59bc15e0
import os from pdf_tools.libs.commons import fitz # pyMuPDF库
import collections # 统计库
import re # 正则
from libs.commons import fitz # pyMuPDF库
import json # json
def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
......
from collections import defaultdict from collections import defaultdict
from loguru import logger from pdf_tools.libs.boxbase import calculate_iou
from libs.boxbase import _is_in, calculate_iou
def compare_bbox_with_list(bbox, bbox_list, tolerance=1): def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
......
import os
from collections import Counter from collections import Counter
import re # 正则 from pdf_tools.libs.commons import fitz # pyMuPDF库
from libs.commons import fitz # pyMuPDF库
import json # json
def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path, debug_mode=False): def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path, debug_mode=False):
......
import os from pdf_tools.libs.commons import fitz # pyMuPDF库
import collections # 统计库
import re # 正则
from libs.commons import fitz # pyMuPDF库
import json # json
def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
......
import os
import collections # 统计库 import collections # 统计库
import re import re
from libs.boxbase import _is_in_or_part_overlap # 正则 from pdf_tools.libs.commons import fitz # pyMuPDF库
from libs.commons import fitz # pyMuPDF库
import json # json
#--------------------------------------- Tool Functions --------------------------------------# #--------------------------------------- Tool Functions --------------------------------------#
......
import os from pdf_tools.libs.commons import fitz # pyMuPDF库
import collections # 统计库
import re # 正则
from libs.commons import fitz # pyMuPDF库
import json # json
def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
......
import os from pdf_tools.libs.commons import fitz # pyMuPDF库
import collections # 统计库
import re # 正则
from libs.commons import fitz # pyMuPDF库
import json # json
def parse_tables(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): def parse_tables(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
......
""" """
对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果 对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果
""" """
from libs.commons import fitz from pdf_tools.libs.commons import fitz
import json import json
import os import os
from pathlib import Path from pathlib import Path
......
...@@ -2,10 +2,9 @@ ...@@ -2,10 +2,9 @@
import re import re
from libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, _is_in, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox from pdf_tools.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox
from loguru import logger
from libs.textbase import get_text_block_base_info from pdf_tools.libs.textbase import get_text_block_base_info
def fix_image_vertical(image_bboxes:list, text_blocks:list): def fix_image_vertical(image_bboxes:list, text_blocks:list):
""" """
......
import os from pdf_tools.libs.commons import fitz # pyMuPDF库
import collections # 统计库
import re # 正则
from libs.commons import fitz # pyMuPDF库
import json
import re import re
from libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox # json from pdf_tools.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox # json
## version 2 ## version 2
......
from libs.commons import fitz from pdf_tools.libs.commons import fitz
from libs.boxbase import _is_in, _is_in_or_part_overlap from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap
from libs.drop_reason import DropReason from pdf_tools.libs.drop_reason import DropReason
def __area(box): def __area(box):
......
from libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
from loguru import logger from loguru import logger
from libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK from pdf_tools.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
def __area(box): def __area(box):
......
import re import re
from libs.boxbase import _is_in_or_part_overlap from pdf_tools.libs.boxbase import _is_in_or_part_overlap
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs, def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
......
import json
import math import math
from libs.boxbase import is_vbox_on_side from pdf_tools.libs.boxbase import is_vbox_on_side
def detect_non_horizontal_texts(result_dict): def detect_non_horizontal_texts(result_dict):
...@@ -84,7 +82,7 @@ def detect_non_horizontal_texts(result_dict): ...@@ -84,7 +82,7 @@ def detect_non_horizontal_texts(result_dict):
1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉 1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉 2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
""" """
import string, re import re
def __is_a_word(sentence): def __is_a_word(sentence):
# 如果输入是中文并且长度为1,则返回True # 如果输入是中文并且长度为1,则返回True
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
2. 然后去掉出现在文字block上的图片bbox 2. 然后去掉出现在文字block上的图片bbox
""" """
from libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap, calculate_iou, calculate_overlap_area_2_minbox_area_ratio from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:list, inline_equations:list, text_raw_blocks:list): def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:list, inline_equations:list, text_raw_blocks:list):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment