Commit f99149b8 authored by 赵小蒙
Browse files

重构目录结构

parent 59bc15e0
import sys from pdf_tools.para.commons import *
from libs.commons import fitz
from termcolor import cprint
from para.commons import *
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
......
import sys import sys
from libs.commons import fitz from pdf_tools.libs.commons import fitz
from termcolor import cprint from termcolor import cprint
......
import sys
import math import math
from collections import defaultdict from collections import defaultdict
from para.commons import * from pdf_tools.para.commons import *
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore sys.stdout.reconfigure(encoding="utf-8") # type: ignore
......
import sys from pdf_tools.libs.commons import fitz
from libs.commons import fitz from pdf_tools.para.commons import *
from para.commons import *
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
......
import sys
import math import math
from para.commons import * from pdf_tools.para.commons import *
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
......
import os import os
import sys
import json import json
from para.commons import * from pdf_tools.para.commons import *
from para.raw_processor import RawBlockProcessor from pdf_tools.para.raw_processor import RawBlockProcessor
from para.layout_match_processor import LayoutFilterProcessor from pdf_tools.para.layout_match_processor import LayoutFilterProcessor
from para.stats import BlockStatisticsCalculator from pdf_tools.para.stats import BlockStatisticsCalculator
from para.stats import DocStatisticsCalculator from pdf_tools.para.stats import DocStatisticsCalculator
from para.title_processor import TitleProcessor from pdf_tools.para.title_processor import TitleProcessor
from para.block_termination_processor import BlockTerminationProcessor from pdf_tools.para.block_termination_processor import BlockTerminationProcessor
from para.block_continuation_processor import BlockContinuationProcessor from pdf_tools.para.block_continuation_processor import BlockContinuationProcessor
from para.draw import DrawAnnos from pdf_tools.para.draw import DrawAnnos
from para.exceptions import ( from pdf_tools.para.exceptions import (
DenseSingleLineBlockException, DenseSingleLineBlockException,
TitleDetectionException, TitleDetectionException,
TitleLevelException, TitleLevelException,
......
from para.commons import *
class RawBlockProcessor: class RawBlockProcessor:
def __init__(self) -> None: def __init__(self) -> None:
self.y_tolerance = 2 self.y_tolerance = 2
...@@ -186,7 +184,7 @@ class RawBlockProcessor: ...@@ -186,7 +184,7 @@ class RawBlockProcessor:
The instance of the class. The instance of the class.
---------- ----------
blocks : list blocks : list
Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/test/preproc_2_parasplit_example.json. Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/tests/preproc_2_parasplit_example.json.
Returns Returns
------- -------
......
import sys
from collections import Counter from collections import Counter
import numpy as np import numpy as np
from para.commons import * from pdf_tools.para.commons import *
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
...@@ -149,7 +148,7 @@ class BlockStatisticsCalculator: ...@@ -149,7 +148,7 @@ class BlockStatisticsCalculator:
The instance of the class. The instance of the class.
---------- ----------
blocks : list blocks : list
Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/test/preproc_2_parasplit_example.json Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/tests/preproc_2_parasplit_example.json
Returns Returns
------- -------
......
import os import os
import sys
import re import re
import numpy as np import numpy as np
from libs.nlp_utils import NLPModels from pdf_tools.libs.nlp_utils import NLPModels
from para.commons import * from pdf_tools.para.commons import *
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
sys.stdout.reconfigure(encoding="utf-8") # type: ignore sys.stdout.reconfigure(encoding="utf-8") # type: ignore
......
...@@ -2,28 +2,28 @@ import time ...@@ -2,28 +2,28 @@ import time
# from anyio import Path # from anyio import Path
from libs.commons import fitz, get_delta_time, get_img_s3_client from pdf_tools.libs.commons import fitz, get_delta_time, get_img_s3_client
import json import json
import os import os
import math import math
from loguru import logger from loguru import logger
from layout.bbox_sort import ( from pdf_tools.layout.bbox_sort import (
prepare_bboxes_for_layout_split, prepare_bboxes_for_layout_split,
) )
from layout.layout_sort import LAYOUT_UNPROC, get_bboxes_layout, get_columns_cnt_of_layout, sort_text_block from pdf_tools.layout.layout_sort import LAYOUT_UNPROC, get_bboxes_layout, get_columns_cnt_of_layout, sort_text_block
from libs.drop_reason import DropReason from pdf_tools.libs.drop_reason import DropReason
from libs.markdown_utils import escape_special_markdown_char from pdf_tools.libs.markdown_utils import escape_special_markdown_char
from libs.safe_filename import sanitize_filename from pdf_tools.libs.safe_filename import sanitize_filename
from libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page from pdf_tools.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
from pre_proc.detect_images import parse_images from pdf_tools.pre_proc.detect_images import parse_images
from pre_proc.detect_tables import parse_tables # 获取tables的bbox from pdf_tools.pre_proc.detect_tables import parse_tables # 获取tables的bbox
from pre_proc.detect_equation import parse_equations # 获取equations的bbox from pdf_tools.pre_proc.detect_equation import parse_equations # 获取equations的bbox
from pre_proc.detect_header import parse_headers # 获取headers的bbox from pdf_tools.pre_proc.detect_header import parse_headers # 获取headers的bbox
from pre_proc.detect_page_number import parse_pageNos # 获取pageNos的bbox from pdf_tools.pre_proc.detect_page_number import parse_pageNos # 获取pageNos的bbox
from pre_proc.detect_footnote import parse_footnotes_by_model, parse_footnotes_by_rule # 获取footnotes的bbox from pdf_tools.pre_proc.detect_footnote import parse_footnotes_by_model, parse_footnotes_by_rule # 获取footnotes的bbox
from pre_proc.detect_footer_by_model import parse_footers # 获取footers的bbox from pdf_tools.pre_proc.detect_footer_by_model import parse_footers # 获取footers的bbox
from post_proc.detect_para import ( from pdf_tools.post_proc.detect_para import (
ParaProcessPipeline, ParaProcessPipeline,
TitleDetectionException, TitleDetectionException,
TitleLevelException, TitleLevelException,
...@@ -31,9 +31,9 @@ from post_proc.detect_para import ( ...@@ -31,9 +31,9 @@ from post_proc.detect_para import (
ParaMergeException, ParaMergeException,
DenseSingleLineBlockException, DenseSingleLineBlockException,
) )
from pre_proc.main_text_font import get_main_text_font from pdf_tools.pre_proc.main_text_font import get_main_text_font
from pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock from pdf_tools.pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock
from pre_proc.remove_footer_header import remove_headder_footer_one_page from pdf_tools.pre_proc.remove_footer_header import remove_headder_footer_one_page
''' '''
from para.para_pipeline import ParaProcessPipeline from para.para_pipeline import ParaProcessPipeline
...@@ -46,19 +46,19 @@ from para.exceptions import ( ...@@ -46,19 +46,19 @@ from para.exceptions import (
) )
''' '''
from libs.commons import read_file, join_path from pdf_tools.libs.commons import read_file, join_path
from libs.pdf_image_tools import save_images_by_bboxes from pdf_tools.libs.pdf_image_tools import save_images_by_bboxes
from post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks from pdf_tools.post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks
from pre_proc.citationmarker_remove import remove_citation_marker from pdf_tools.pre_proc.citationmarker_remove import remove_citation_marker
from pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock from pdf_tools.pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
from pre_proc.pdf_pre_filter import pdf_filter from pdf_tools.pre_proc.pdf_pre_filter import pdf_filter
from pre_proc.detect_footer_header_by_statistics import drop_footer_header from pdf_tools.pre_proc.detect_footer_header_by_statistics import drop_footer_header
from pre_proc.construct_paras import construct_page_component from pdf_tools.pre_proc.construct_paras import construct_page_component
from pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title from pdf_tools.pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
from post_proc.pdf_post_filter import pdf_post_filter from pdf_tools.post_proc.pdf_post_filter import pdf_post_filter
from pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block from pdf_tools.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
from pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict from pdf_tools.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
from pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title from pdf_tools.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
denseSingleLineBlockException_msg = DenseSingleLineBlockException().message denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
titleDetectionException_msg = TitleDetectionException().message titleDetectionException_msg = TitleDetectionException().message
...@@ -108,7 +108,7 @@ def parse_pdf_by_model( ...@@ -108,7 +108,7 @@ def parse_pdf_by_model(
debug_mode=False, debug_mode=False,
): ):
pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile) pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest") save_tmp_path = os.path.join(os.path.dirname(__file__), "../../..", "tmp", "unittest")
md_bookname_save_path = "" md_bookname_save_path = ""
book_name = sanitize_filename(book_name) book_name = sanitize_filename(book_name)
if debug_mode: if debug_mode:
......
...@@ -11,8 +11,8 @@ import numpy as np ...@@ -11,8 +11,8 @@ import numpy as np
from termcolor import cprint from termcolor import cprint
from libs.commons import fitz from pdf_tools.libs.commons import fitz
from libs.nlp_utils import NLPModels from pdf_tools.libs.nlp_utils import NLPModels
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
...@@ -3404,7 +3404,7 @@ Params: ...@@ -3404,7 +3404,7 @@ Params:
if __name__ == "__main__": if __name__ == "__main__":
DEFAULT_PDF_PATH = ( DEFAULT_PDF_PATH = (
"app/pdf_toolbox/test/assets/paper/paper.pdf" if os.name != "nt" else "app\\pdf_toolbox\\test\\assets\\paper\\paper.pdf" "app/pdf_toolbox/tests/assets/paper/paper.pdf" if os.name != "nt" else "app\\pdf_toolbox\\tests\\assets\\paper\\paper.pdf"
) )
input_pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH input_pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
output_pdf_path = sys.argv[2] if len(sys.argv) > 2 else input_pdf_path.split(".")[0] + "_recogPara.pdf" output_pdf_path = sys.argv[2] if len(sys.argv) > 2 else input_pdf_path.split(".")[0] + "_recogPara.pdf"
......
from loguru import logger from loguru import logger
from layout.layout_sort import get_columns_cnt_of_layout from pdf_tools.layout.layout_sort import get_columns_cnt_of_layout
from libs.drop_reason import DropReason from pdf_tools.libs.drop_reason import DropReason
def __is_pseudo_single_column(page_info) -> bool: def __is_pseudo_single_column(page_info) -> bool:
......
from libs.boxbase import _is_in, _is_in_or_part_overlap from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap
import collections # 统计库 import collections # 统计库
......
...@@ -3,8 +3,7 @@ ...@@ -3,8 +3,7 @@
https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d
""" """
import re import re
from loguru import logger from pdf_tools.libs.nlp_utils import NLPModels
from libs.nlp_utils import NLPModels
__NLP_MODEL = NLPModels() __NLP_MODEL = NLPModels()
......
import os from pdf_tools.libs.boxbase import _is_in # 正则
import collections # 统计库 from pdf_tools.libs.commons import fitz # pyMuPDF库
import re
from libs.boxbase import _is_in # 正则
from libs.commons import fitz # pyMuPDF库
import json # json
from pathlib import Path
def __solve_contain_bboxs(all_bbox_list: list): def __solve_contain_bboxs(all_bbox_list: list):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment