Commit d5dbed73 authored by 赵小蒙
Browse files

目录重构

parent 7c7910e4
from pdf_tools.libs.commons import fitz from magic_pdf.libs.commons import fitz
from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
from pdf_tools.libs.drop_reason import DropReason from magic_pdf.libs.drop_reason import DropReason
def __area(box): def __area(box):
......
from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
from loguru import logger from loguru import logger
from pdf_tools.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK from magic_pdf.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
def __area(box): def __area(box):
......
import re import re
from pdf_tools.libs.boxbase import _is_in_or_part_overlap from magic_pdf.libs.boxbase import _is_in_or_part_overlap
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs, def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
......
import math import math
from pdf_tools.libs.boxbase import is_vbox_on_side from magic_pdf.libs.boxbase import is_vbox_on_side
def detect_non_horizontal_texts(result_dict): def detect_non_horizontal_texts(result_dict):
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
2. 然后去掉出现在文字block上的图片bbox 2. 然后去掉出现在文字block上的图片bbox
""" """
from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:list, inline_equations:list, text_raw_blocks:list): def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:list, inline_equations:list, text_raw_blocks:list):
......
# 最终版:把那种text_block有重叠,且inline_formula位置在重叠部分的,认定整个页面都有问题,所有的inline_formula都改成no_check # 最终版:把那种text_block有重叠,且inline_formula位置在重叠部分的,认定整个页面都有问题,所有的inline_formula都改成no_check
from pdf_tools.libs import fitz from magic_pdf.libs import fitz
def check_inline_formula(page, inline_formula_boxes): def check_inline_formula(page, inline_formula_boxes):
......
...@@ -3,7 +3,7 @@ from typing import Tuple ...@@ -3,7 +3,7 @@ from typing import Tuple
import os import os
import boto3, json import boto3, json
from botocore.config import Config from botocore.config import Config
from pdf_tools.libs import fitz from magic_pdf.libs import fitz
from loguru import logger from loguru import logger
from pathlib import Path from pathlib import Path
from tqdm import tqdm from tqdm import tqdm
...@@ -22,13 +22,13 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va ...@@ -22,13 +22,13 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va
# from pdf2text_recogPara import parse_blocks_per_page # from pdf2text_recogPara import parse_blocks_per_page
# from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX # from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from pdf_tools.layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX from magic_pdf.layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from pdf_tools.pre_proc import parse_images # 获取figures的bbox from magic_pdf.pre_proc import parse_images # 获取figures的bbox
from pdf_tools.pre_proc.detect_tables import parse_tables # 获取tables的bbox from magic_pdf.pre_proc.detect_tables import parse_tables # 获取tables的bbox
from pdf_tools.pre_proc import parse_equations # 获取equations的bbox from magic_pdf.pre_proc import parse_equations # 获取equations的bbox
# from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox # from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox
from pdf_tools.post_proc.detect_para import process_blocks_per_page from magic_pdf.post_proc.detect_para import process_blocks_per_page
from pdf_tools.libs import parse_aws_param, parse_bucket_key, read_file, join_path from magic_pdf.libs import parse_aws_param, parse_bucket_key, read_file, join_path
def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_profile: str): def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_profile: str):
......
from pdf_tools.libs import fitz # pyMuPDF库 from magic_pdf.libs import fitz # pyMuPDF库
def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float): def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
......
from pdf_tools.libs import fitz from magic_pdf.libs import fitz
from typing import List from typing import List
......
import re import re
from pdf_tools.libs import _is_in_or_part_overlap from magic_pdf.libs import _is_in_or_part_overlap
from pdf_tools.libs import fitz from magic_pdf.libs import fitz
import collections import collections
......
...@@ -11,8 +11,8 @@ import numpy as np ...@@ -11,8 +11,8 @@ import numpy as np
from termcolor import cprint from termcolor import cprint
from pdf_tools.libs import fitz from magic_pdf.libs import fitz
from pdf_tools.libs import NLPModels from magic_pdf.libs import NLPModels
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
......
from pdf_tools.libs.commons import fitz # pyMuPDF库 from magic_pdf.libs.commons import fitz # pyMuPDF库
def parse_titles(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, exclude_bboxes): def parse_titles(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, exclude_bboxes):
......
...@@ -2,7 +2,7 @@ import numpy as np ...@@ -2,7 +2,7 @@ import numpy as np
import tqdm import tqdm
import json import json
from validation import cal_edit_distance, format_gt_bbox from validation import cal_edit_distance, format_gt_bbox
from pdf_tools.layout.layout_sort import sort_with_layout from magic_pdf.layout.layout_sort import sort_with_layout
with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json', 'r') as f: with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json', 'r') as f:
samples = json.load(f) samples = json.load(f)
......
import io import io
import json import json
import os import os
from pdf_tools.libs import fitz from magic_pdf.libs import fitz
from app.common.s3 import get_s3_config, get_s3_client from app.common.s3 import get_s3_config, get_s3_client
from pdf_tools.libs import join_path, json_dump_path, read_file, parse_bucket_key from magic_pdf.libs import join_path, json_dump_path, read_file, parse_bucket_key
from loguru import logger from loguru import logger
test_pdf_dir_path = "s3://llm-pdf-text/unittest/pdf/" test_pdf_dir_path = "s3://llm-pdf-text/unittest/pdf/"
......
...@@ -2,9 +2,9 @@ import os ...@@ -2,9 +2,9 @@ import os
import pytest import pytest
from pdf_tools.filter import classify_by_area, classify_by_text_len, classify_by_avg_words, \ from magic_pdf.filter import classify_by_area, classify_by_text_len, classify_by_avg_words, \
classify_by_img_num, classify_by_text_layout, classify_by_img_narrow_strips classify_by_img_num, classify_by_text_layout, classify_by_img_narrow_strips
from pdf_tools.filter.pdf_meta_scan import get_pdf_page_size_pts, get_pdf_textlen_per_page, get_imgs_per_page from magic_pdf.filter.pdf_meta_scan import get_pdf_page_size_pts, get_pdf_textlen_per_page, get_imgs_per_page
from test.test_commons import get_docs_from_test_pdf, get_test_json_data from test.test_commons import get_docs_from_test_pdf, get_test_json_data
# 获取当前目录 # 获取当前目录
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment