Commit d5dbed73 authored by 赵小蒙
Browse files

目录重构

parent 7c7910e4
from pdf_tools.libs.commons import fitz
from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap
from pdf_tools.libs.drop_reason import DropReason
from magic_pdf.libs.commons import fitz
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
from magic_pdf.libs.drop_reason import DropReason
def __area(box):
......
from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
from loguru import logger
from pdf_tools.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
from magic_pdf.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
def __area(box):
......
import re
from pdf_tools.libs.boxbase import _is_in_or_part_overlap
from magic_pdf.libs.boxbase import _is_in_or_part_overlap
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
......
import math
from pdf_tools.libs.boxbase import is_vbox_on_side
from magic_pdf.libs.boxbase import is_vbox_on_side
def detect_non_horizontal_texts(result_dict):
......
......@@ -5,7 +5,7 @@
2. 然后去掉出现在文字block上的图片bbox
"""
from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
def resolve_bbox_overlap_conflict(images:list, tables:list, interline_equations:list, inline_equations:list, text_raw_blocks:list):
......
# 最终版:把那种text_block有重叠,且inline_formula位置在重叠部分的,认定整个页面都有问题,所有的inline_formula都改成no_check
from pdf_tools.libs import fitz
from magic_pdf.libs import fitz
def check_inline_formula(page, inline_formula_boxes):
......
......@@ -3,7 +3,7 @@ from typing import Tuple
import os
import boto3, json
from botocore.config import Config
from pdf_tools.libs import fitz
from magic_pdf.libs import fitz
from loguru import logger
from pathlib import Path
from tqdm import tqdm
......@@ -22,13 +22,13 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va
# from pdf2text_recogPara import parse_blocks_per_page
# from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from pdf_tools.layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from pdf_tools.pre_proc import parse_images # 获取figures的bbox
from pdf_tools.pre_proc.detect_tables import parse_tables # 获取tables的bbox
from pdf_tools.pre_proc import parse_equations # 获取equations的bbox
from magic_pdf.layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from magic_pdf.pre_proc import parse_images # 获取figures的bbox
from magic_pdf.pre_proc.detect_tables import parse_tables # 获取tables的bbox
from magic_pdf.pre_proc import parse_equations # 获取equations的bbox
# from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox
from pdf_tools.post_proc.detect_para import process_blocks_per_page
from pdf_tools.libs import parse_aws_param, parse_bucket_key, read_file, join_path
from magic_pdf.post_proc.detect_para import process_blocks_per_page
from magic_pdf.libs import parse_aws_param, parse_bucket_key, read_file, join_path
def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_profile: str):
......
from pdf_tools.libs import fitz # pyMuPDF库
from magic_pdf.libs import fitz # pyMuPDF库
def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
......
from pdf_tools.libs import fitz
from magic_pdf.libs import fitz
from typing import List
......
import re
from pdf_tools.libs import _is_in_or_part_overlap
from pdf_tools.libs import fitz
from magic_pdf.libs import _is_in_or_part_overlap
from magic_pdf.libs import fitz
import collections
......
......@@ -11,8 +11,8 @@ import numpy as np
from termcolor import cprint
from pdf_tools.libs import fitz
from pdf_tools.libs import NLPModels
from magic_pdf.libs import fitz
from magic_pdf.libs import NLPModels
if sys.version_info[0] >= 3:
......
from pdf_tools.libs.commons import fitz # pyMuPDF库
from magic_pdf.libs.commons import fitz # pyMuPDF库
def parse_titles(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, exclude_bboxes):
......
......@@ -2,7 +2,7 @@ import numpy as np
import tqdm
import json
from validation import cal_edit_distance, format_gt_bbox
from pdf_tools.layout.layout_sort import sort_with_layout
from magic_pdf.layout.layout_sort import sort_with_layout
with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json', 'r') as f:
samples = json.load(f)
......
from setuptools import setup, find_packages
setup(
name="Magic-PDF", # 项目名
version="0.1.0", # 版本号
name="Magic-PDF", # 项目名
version="0.1.0", # 版本号
packages=find_packages(), # 包含所有的包
install_requires=['PyMuPDF>=1.23.25',
'boto3>=1.34.52',
......
import io
import json
import os
from pdf_tools.libs import fitz
from magic_pdf.libs import fitz
from app.common.s3 import get_s3_config, get_s3_client
from pdf_tools.libs import join_path, json_dump_path, read_file, parse_bucket_key
from magic_pdf.libs import join_path, json_dump_path, read_file, parse_bucket_key
from loguru import logger
test_pdf_dir_path = "s3://llm-pdf-text/unittest/pdf/"
......
......@@ -2,9 +2,9 @@ import os
import pytest
from pdf_tools.filter import classify_by_area, classify_by_text_len, classify_by_avg_words, \
from magic_pdf.filter import classify_by_area, classify_by_text_len, classify_by_avg_words, \
classify_by_img_num, classify_by_text_layout, classify_by_img_narrow_strips
from pdf_tools.filter.pdf_meta_scan import get_pdf_page_size_pts, get_pdf_textlen_per_page, get_imgs_per_page
from magic_pdf.filter.pdf_meta_scan import get_pdf_page_size_pts, get_pdf_textlen_per_page, get_imgs_per_page
from test.test_commons import get_docs_from_test_pdf, get_test_json_data
# 获取当前目录
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment