Commit d5dbed73 authored by 赵小蒙
Browse files

目录重构

parent 7c7910e4
...@@ -2,7 +2,7 @@ import json ...@@ -2,7 +2,7 @@ import json
import os import os
from tqdm import tqdm from tqdm import tqdm
from pdf_tools.libs import join_path from magic_pdf.libs import join_path
with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json', 'r') as f: with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json', 'r') as f:
samples = json.load(f) samples = json.load(f)
......
from pdf_tools.libs import fitz # PyMuPDF from magic_pdf.libs import fitz # PyMuPDF
# PDF文件路径 # PDF文件路径
pdf_path = "D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018.pdf" pdf_path = "D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018.pdf"
......
...@@ -5,9 +5,9 @@ from pathlib import Path ...@@ -5,9 +5,9 @@ from pathlib import Path
import click import click
from loguru import logger from loguru import logger
from pdf_tools.libs import join_path from magic_pdf.libs import join_path
from pdf_tools.dict2md.mkcontent import mk_mm_markdown from magic_pdf.dict2md.mkcontent import mk_mm_markdown
from pdf_tools.pipeline import parse_pdf_by_model from magic_pdf.pipeline import parse_pdf_by_model
......
import math import math
from loguru import logger from loguru import logger
from pdf_tools.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
def mk_nlp_markdown(para_dict: dict): def mk_nlp_markdown(para_dict: dict):
......
...@@ -16,8 +16,8 @@ from collections import Counter ...@@ -16,8 +16,8 @@ from collections import Counter
import click import click
import numpy as np import numpy as np
from pdf_tools.libs.commons import mymax, get_top_percent_list from magic_pdf.libs.commons import mymax, get_top_percent_list
from pdf_tools.filter.pdf_meta_scan import scan_max_page, junk_limit_min from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min
TEXT_LEN_THRESHOLD = 100 TEXT_LEN_THRESHOLD = 100
AVG_TEXT_LEN_THRESHOLD = 200 AVG_TEXT_LEN_THRESHOLD = 200
......
...@@ -5,13 +5,13 @@ ...@@ -5,13 +5,13 @@
import sys import sys
import click import click
from pdf_tools.libs.commons import read_file, mymax, get_top_percent_list from magic_pdf.libs.commons import read_file, mymax, get_top_percent_list
from pdf_tools.libs.commons import fitz from magic_pdf.libs.commons import fitz
from loguru import logger from loguru import logger
from collections import Counter from collections import Counter
from pdf_tools.libs.drop_reason import DropReason from magic_pdf.libs.drop_reason import DropReason
from pdf_tools.libs.language import detect_lang from magic_pdf.libs.language import detect_lang
scan_max_page = 50 scan_max_page = 50
junk_limit_min = 10 junk_limit_min = 10
......
...@@ -3,9 +3,9 @@ ...@@ -3,9 +3,9 @@
from pdf_tools.layout.layout_spiler_recog import get_spilter_of_page from magic_pdf.layout.layout_spiler_recog import get_spilter_of_page
from pdf_tools.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap
from pdf_tools.libs.commons import mymax from magic_pdf.libs.commons import mymax
X0_IDX = 0 X0_IDX = 0
Y0_IDX = 1 Y0_IDX = 1
......
from pdf_tools.layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX from magic_pdf.layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX
from pdf_tools.libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect from magic_pdf.libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect
def find_all_left_bbox_direct(this_bbox, all_bboxes) -> list: def find_all_left_bbox_direct(this_bbox, all_bboxes) -> list:
......
...@@ -3,9 +3,9 @@ ...@@ -3,9 +3,9 @@
""" """
from loguru import logger from loguru import logger
from pdf_tools.layout.bbox_sort import CONTENT_IDX, CONTENT_TYPE_IDX, X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX, paper_bbox_sort from magic_pdf.layout.bbox_sort import CONTENT_IDX, CONTENT_TYPE_IDX, X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX, paper_bbox_sort
from pdf_tools.layout.layout_det_utils import find_all_left_bbox_direct, find_all_right_bbox_direct, find_bottom_bbox_direct_from_left_edge, find_bottom_bbox_direct_from_right_edge, find_top_bbox_direct_from_left_edge, find_top_bbox_direct_from_right_edge, find_all_top_bbox_direct, find_all_bottom_bbox_direct, get_left_edge_bboxes, get_right_edge_bboxes from magic_pdf.layout.layout_det_utils import find_all_left_bbox_direct, find_all_right_bbox_direct, find_bottom_bbox_direct_from_left_edge, find_bottom_bbox_direct_from_right_edge, find_top_bbox_direct_from_left_edge, find_top_bbox_direct_from_right_edge, find_all_top_bbox_direct, find_all_bottom_bbox_direct, get_left_edge_bboxes, get_right_edge_bboxes
from pdf_tools.libs.boxbase import get_bbox_in_boundry from magic_pdf.libs.boxbase import get_bbox_in_boundry
LAYOUT_V = "V" LAYOUT_V = "V"
......
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
""" """
import os import os
from pdf_tools.libs.commons import fitz from magic_pdf.libs.commons import fitz
from pdf_tools.libs.boxbase import _is_in_or_part_overlap from magic_pdf.libs.boxbase import _is_in_or_part_overlap
def __rect_filter_by_width(rect, page_w, page_h): def __rect_filter_by_width(rect, page_w, page_h):
......
...@@ -50,7 +50,7 @@ Usage ...@@ -50,7 +50,7 @@ Usage
---------------------------------------------------------------------------------- ----------------------------------------------------------------------------------
""" """
import sys import sys
from pdf_tools.libs.commons import fitz from magic_pdf.libs.commons import fitz
def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True): def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment