Commit f99149b8 authored by 赵小蒙
Browse files

重构目录结构

parent 59bc15e0
......@@ -31,6 +31,6 @@ tmp/
tmp
.vscode
.vscode/
/test/
/tests/
/app/pdf_toolbox/test/test_bookname.txt
......@@ -2,7 +2,7 @@ import json
import os
from tqdm import tqdm
from libs.commons import join_path
from pdf_tools.libs import join_path
with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json', 'r') as f:
samples = json.load(f)
......
from libs.commons import fitz # PyMuPDF
from pdf_tools.libs import fitz # PyMuPDF
# PDF文件路径
pdf_path = "D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018.pdf"
......
......@@ -5,9 +5,9 @@ from pathlib import Path
import click
from loguru import logger
from libs.commons import join_path
from dict2md.mkcontent import mk_mm_markdown
from pipeline.pdf_parse_by_model import parse_pdf_by_model
from pdf_tools.libs import join_path
from pdf_tools.dict2md.mkcontent import mk_mm_markdown
from pdf_tools.pipeline import parse_pdf_by_model
......
# 最终版:把那种text_block有重叠,且inline_formula位置在重叠部分的,认定整个页面都有问题,所有的inline_formula都改成no_check
from libs.commons import fitz
from pdf_tools.libs import fitz
def check_inline_formula(page, inline_formula_boxes):
......
......@@ -3,7 +3,7 @@ from typing import Tuple
import os
import boto3, json
from botocore.config import Config
from libs.commons import fitz
from pdf_tools.libs import fitz
from loguru import logger
from pathlib import Path
from tqdm import tqdm
......@@ -22,13 +22,13 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va
# from pdf2text_recogPara import parse_blocks_per_page
# from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from pre_proc.detect_images import parse_images # 获取figures的bbox
from pre_proc.detect_tables import parse_tables # 获取tables的bbox
from pre_proc.detect_equation import parse_equations # 获取equations的bbox
from pdf_tools.layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from pdf_tools.pre_proc import parse_images # 获取figures的bbox
from pdf_tools.pre_proc.detect_tables import parse_tables # 获取tables的bbox
from pdf_tools.pre_proc import parse_equations # 获取equations的bbox
# from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox
from post_proc.detect_para import process_blocks_per_page
from libs.commons import parse_aws_param, parse_bucket_key, read_file, join_path
from pdf_tools.post_proc.detect_para import process_blocks_per_page
from pdf_tools.libs import parse_aws_param, parse_bucket_key, read_file, join_path
def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_profile: str):
......
import os
import collections # 统计库
import re # 正则
from libs.commons import fitz # pyMuPDF库
import json # json
from pdf_tools.libs import fitz # pyMuPDF库
def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
......
from libs.commons import fitz
from pdf_tools.libs import fitz
from typing import List
......
import io
import re
import os
import json
from libs.boxbase import _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
from libs.commons import fitz
from fitz import Point
from pprint import pprint
import pickle
from pdf_tools.libs import _is_in_or_part_overlap
from pdf_tools.libs import fitz
import collections
from typing import List
def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
......
......@@ -11,8 +11,8 @@ import numpy as np
from termcolor import cprint
from libs.commons import fitz
from libs.nlp_utils import NLPModels
from pdf_tools.libs import fitz
from pdf_tools.libs import NLPModels
if sys.version_info[0] >= 3:
......@@ -3478,7 +3478,7 @@ Params:
if __name__ == "__main__":
DEFAULT_PDF_PATH = (
"app/pdf_toolbox/test/assets/paper/paper.pdf" if os.name != "nt" else "app\\pdf_toolbox\\test\\assets\\paper\\paper.pdf"
"app/pdf_toolbox/tests/assets/paper/paper.pdf" if os.name != "nt" else "app\\pdf_toolbox\\tests\\assets\\paper\\paper.pdf"
)
input_pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
output_pdf_path = sys.argv[2] if len(sys.argv) > 2 else input_pdf_path.split(".")[0] + "_recogPara.pdf"
......
import os
import collections # 统计库
import re # 正则
from libs.commons import fitz # pyMuPDF库
import json # json
from pdf_tools.libs.commons import fitz # pyMuPDF库
def parse_titles(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, exclude_bboxes):
......
import numpy as np
import tqdm
import json
from validation import cal_edit_distance, format_gt_bbox, label_match, detect_val
from layout.layout_sort import sort_with_layout
from validation import cal_edit_distance, format_gt_bbox
from pdf_tools.layout.layout_sort import sort_with_layout
with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json', 'r') as f:
samples = json.load(f)
......
import re
import math
from loguru import logger
from libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
from pdf_tools.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
def mk_nlp_markdown(para_dict: dict):
......
......@@ -16,8 +16,8 @@ from collections import Counter
import click
import numpy as np
from libs.commons import mymax, get_top_percent_list
from filter.pdf_meta_scan import scan_max_page, junk_limit_min
from pdf_tools.libs.commons import mymax, get_top_percent_list
from pdf_tools.filter.pdf_meta_scan import scan_max_page, junk_limit_min
TEXT_LEN_THRESHOLD = 100
AVG_TEXT_LEN_THRESHOLD = 200
......
......@@ -2,18 +2,16 @@
输入: s3路径,每行一个
输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置
"""
import math
import sys
import click
from libs.commons import read_file, mymax, get_top_percent_list
import json
from libs.commons import fitz
from pdf_tools.libs.commons import read_file, mymax, get_top_percent_list
from pdf_tools.libs.commons import fitz
from loguru import logger
from collections import Counter
from libs.drop_reason import DropReason
from libs.language import detect_lang
from pdf_tools.libs.drop_reason import DropReason
from pdf_tools.libs.language import detect_lang
scan_max_page = 50
junk_limit_min = 10
......
......@@ -3,9 +3,9 @@
from layout.layout_spiler_recog import get_spilter_of_page
from libs.boxbase import _is_bottom_full_overlap, _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap
from libs.commons import mymax
from pdf_tools.layout.layout_spiler_recog import get_spilter_of_page
from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap
from pdf_tools.libs.commons import mymax
X0_IDX = 0
Y0_IDX = 1
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment