Unverified Commit ea2f8ea0 authored by Xiaomeng Zhao, committed by GitHub

Merge branch 'dev' into dev

parents e4810cee 23c8436e
import re

from magic_pdf.config.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
from magic_pdf.libs.boxbase import _is_in_or_part_overlap


def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
                                   page_no_bboxs, page_w, page_h):
    """Remove headers, footers and page numbers. Deletion happens at the line level; after deleting, check whether the text block has become empty, and if so move it to remove_list."""
    header = []
    footer = []
    if len(header) == 0:
......
import math
import re

from magic_pdf.config.drop_tag import (EMPTY_SIDE_BLOCK, ROTATE_TEXT,
                                       VERTICAL_TEXT)
from magic_pdf.libs.boxbase import is_vbox_on_side


def detect_non_horizontal_texts(result_dict):
    """This function detects watermarks and vertical margin notes in the document.

    Watermarks are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
    If these conditions are met, the blocks are highly likely to be watermarks, as opposed to headers or footers, which can change from page to page.
    If the direction of these blocks is not horizontal, they are definitely considered to be watermarks.

    Vertical margin notes are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
    If these conditions are met, the blocks are highly likely to be vertical margin notes, which typically appear on the left and right sides of the page.
    If the direction of these blocks is vertical, they are definitely considered to be vertical margin notes.
@@ -32,13 +34,16 @@ def detect_non_horizontal_texts(result_dict):
    potential_margin_notes = {}

    for page_id, page_content in result_dict.items():
        if page_id.startswith('page_'):
            for block_id, block_data in page_content.items():
                if block_id.startswith('block_'):
                    if 'dir' in block_data:
                        coordinates_text = (block_data['bbox'], block_data['text'])  # Tuple of coordinates and text

                        angle = math.atan2(block_data['dir'][1], block_data['dir'][0])
                        angle = abs(math.degrees(angle))
                        if angle > 5 and angle < 85:  # Check if direction is watermarks
@@ -49,32 +54,40 @@ def detect_non_horizontal_texts(result_dict):
                        if angle > 85 and angle < 105:  # Check if direction is vertical
                            if coordinates_text in potential_margin_notes:
                                potential_margin_notes[coordinates_text] += 1  # Increment count
                            else:
                                potential_margin_notes[coordinates_text] = 1  # Initialize count
    # Identify watermarks by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
    watermark_threshold = len(result_dict) // 2
    watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold}

    # Identify margin notes by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
    margin_note_threshold = len(result_dict) // 2
    margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold}

    # Add watermark information to the result dictionary
    for page_id, blocks in result_dict.items():
        if page_id.startswith('page_'):
            for block_id, block_data in blocks.items():
                coordinates_text = (block_data['bbox'], block_data['text'])
                if coordinates_text in watermarks:
                    block_data['is_watermark'] = 1
                else:
                    block_data['is_watermark'] = 0

                if coordinates_text in margin_notes:
                    block_data['is_vertical_margin_note'] = 1
                else:
                    block_data['is_vertical_margin_note'] = 0

    return result_dict
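# Reference check for the angle thresholds above (illustrative only, not part of this
# commit): dir=(1, 0) is horizontal text; dir=(1, 1) lands at 45 degrees, inside the
# 5-85 degree watermark band; dir=(0, 1) lands at 90 degrees, inside the 85-105 degree
# vertical band. Note atan2 takes (y, x), i.e. (dir[1], dir[0]).
assert round(abs(math.degrees(math.atan2(0, 1))), 6) == 0.0
assert round(abs(math.degrees(math.atan2(1, 1))), 6) == 45.0
assert round(abs(math.degrees(math.atan2(1, 0))), 6) == 90.0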
@@ -83,21 +96,21 @@ def detect_non_horizontal_texts(result_dict):
    1. If none of the text in a block has dir=(1,0), remove the whole block.
    2. If all of the text in a block has dir=(1,0) but every line contains only one character, remove the whole block as well; such a block must sit on the page margins, otherwise it is kept.
    """
def __is_a_word(sentence):
    # Return True if the input is a single Chinese character
    if re.fullmatch(r'[\u4e00-\u9fa5]', sentence):
        return True
    # Check whether the input is a single English word or character (including ASCII punctuation)
    elif re.fullmatch(r'[a-zA-Z0-9]+', sentence) and len(sentence) <= 2:
        return True
    else:
        return False
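# Illustrative spot-check of __is_a_word (not part of this commit): a single Chinese
# character qualifies, alphanumeric strings qualify only up to two characters.
assert __is_a_word('中') is True
assert __is_a_word('ab') is True
assert __is_a_word('abc') is False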
def __get_text_color(num):
    """Get the RGB value of the font color."""
    blue = num & 255
    green = (num >> 8) & 255
    red = (num >> 16) & 255
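    # Worked example of the bit unpacking above: for num == 0xFF8000 (orange),
    # blue == 0xFF8000 & 255 == 0, green == (0xFF8000 >> 8) & 255 == 128 and
    # red == (0xFF8000 >> 16) & 255 == 255; the return presumably follows below this hunk.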
@@ -105,84 +118,119 @@ def __get_text_color(num):
def __is_empty_side_box(text_block):
    """Whether this is an empty block on the page edge containing no content."""
    for line in text_block['lines']:
        for span in line['spans']:
            font_color = span['color']
            r, g, b = __get_text_color(font_color)
            if len(span['text'].strip()) > 0 and (r, g, b) != (255, 255, 255):
                return False

    return True
def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
    """Remove vertical, watermark and rotated text blocks; the removed blocks are tagged and returned."""
    removed_text_block = []

    for i, block in enumerate(pymu_text_block):  # Format reference: test/assets/papre/pymu_textblocks.json
        lines = block['lines']
        block_bbox = block['bbox']
        if not is_vbox_on_side(block_bbox, page_width, page_height, 0.2):  # These boxes must sit on the two sides of the page
            continue

        if (
            all(__is_a_word(line['spans'][0]['text']) for line in lines if len(line['spans']) > 0)
            and len(lines) > 1
            and all(len(line['spans']) == 1 for line in lines)
        ):
            # Check whether every line has the same x0, i.e. the characters are stacked vertically; there must also be at least two of them
            is_box_valign = (
                len(set(int(line['spans'][0]['bbox'][0]) for line in lines if len(line['spans']) > 0)) == 1
                and len([int(line['spans'][0]['bbox'][0]) for line in lines if len(line['spans']) > 0]) > 1
            )

            if is_box_valign:
                block['tag'] = VERTICAL_TEXT
                removed_text_block.append(block)
                continue

        for line in lines:
            if line['dir'] != (1, 0):
                block['tag'] = ROTATE_TEXT
                removed_text_block.append(block)  # If any line has dir != (1, 0), remove the whole block
                break

    for block in removed_text_block:
        pymu_text_block.remove(block)

    return pymu_text_block, removed_text_block
def get_side_boundry(rotate_bbox, page_width, page_height):
    """Return the left and right body-text boundaries of the page based on rotate_bbox."""
    left_x = 0
    right_x = page_width
    for x in rotate_bbox:
        box = x['bbox']
        if box[2] < page_width / 2:
            left_x = max(left_x, box[2])
        else:
            right_x = min(right_x, box[0])

    return left_x + 1, right_x - 1
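# Hypothetical illustration of get_side_boundry (coordinates invented): on a
# 1000pt-wide page, a rotated block ending at x=40 on the left and one starting at
# x=960 on the right narrow the body region to (41, 959).
assert get_side_boundry([{'bbox': (10, 100, 40, 500)}, {'bbox': (960, 100, 990, 500)}], 1000, 1400) == (41, 959)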
def remove_side_blank_block(pymu_text_block, page_width, page_height):
    """Remove the blank blocks on both sides of the page."""
    removed_text_block = []

    for i, block in enumerate(pymu_text_block):  # Format reference: test/assets/papre/pymu_textblocks.json
        block_bbox = block['bbox']
        if not is_vbox_on_side(block_bbox, page_width, page_height, 0.2):  # These boxes must sit on the two sides of the page
            continue

        if __is_empty_side_box(block):
            block['tag'] = EMPTY_SIDE_BLOCK
            removed_text_block.append(block)
            continue

    for block in removed_text_block:
        pymu_text_block.remove(block)

    return pymu_text_block, removed_text_block
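# A plausible end-to-end use of the helpers above (call order assumed, not shown in
# this diff):
# blocks, rotated = remove_rotate_side_textblock(blocks, page_w, page_h)
# left_x, right_x = get_side_boundry(rotated, page_w, page_h)
# blocks, blanks = remove_side_blank_block(blocks, page_w, page_h)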
@@ -4,8 +4,9 @@
2. Then remove image bboxes that sit on top of text blocks
"""
from magic_pdf.config.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
                                    _is_left_overlap)
def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
@@ -26,14 +27,14 @@ def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equation
    # Remove text blocks that sit on images
    for image_box in images:
        for text_block in text_raw_blocks:
            text_bbox = text_block['bbox']
            if _is_in(text_bbox, image_box):
                text_block['tag'] = ON_IMAGE_TEXT
                text_block_removed.append(text_block)

    # Remove text blocks that sit on tables
    for table_box in tables:
        for text_block in text_raw_blocks:
            text_bbox = text_block['bbox']
            if _is_in(text_bbox, table_box):
                text_block['tag'] = ON_TABLE_TEXT
                text_block_removed.append(text_block)
@@ -77,7 +78,7 @@ def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equation
    # An image overlaps with text: drop the image
    for image_box in images:
        for text_block in text_raw_blocks:
            text_bbox = text_block['bbox']
            if _is_in_or_part_overlap(image_box, text_bbox):
                images_backup.append(image_box)
                break
@@ -122,11 +123,7 @@ def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equation
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """Check for horizontal overlap between text blocks. If this happens, the pdf is not processed any further, because it very likely means a formula went undetected."""
    if len(text_blocks) == 0:
        return False
@@ -148,7 +145,7 @@ def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bo
    txt_bboxes = []
    for text_block in text_blocks:
        bbox = text_block['bbox']
        if bbox[1] >= clip_y0 and bbox[3] <= clip_y1:
            txt_bboxes.append(bbox)
@@ -161,11 +158,7 @@ def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bo
def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
    """Check for horizontal overlap between text blocks. If this happens, the pdf is not processed any further, because it very likely means a formula went undetected."""
    if len(useful_blocks) == 0:
        return False
@@ -174,7 +167,7 @@ def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
    useful_bboxes = []
    for text_block in useful_blocks:
        bbox = text_block['bbox']
        if bbox[1] >= page_min_y and bbox[3] <= page_max_y:
            useful_bboxes.append(bbox)
......
from loguru import logger

from magic_pdf.config.drop_reason import DropReason
def get_data_source(jso: dict):
    data_source = jso.get('data_source')
    if data_source is None:
        data_source = jso.get('file_source')
    return data_source


def get_data_type(jso: dict):
    data_type = jso.get('data_type')
    if data_type is None:
        data_type = jso.get('file_type')
    return data_type


def get_bookid(jso: dict):
    book_id = jso.get('bookid')
    if book_id is None:
        book_id = jso.get('original_file_id')
    return book_id


def exception_handler(jso: dict, e):
    logger.exception(e)
    jso['_need_drop'] = True
    jso['_drop_reason'] = DropReason.Exception
    jso['_exception'] = f'ERROR: {e}'
    return jso


def get_bookname(jso: dict):
    data_source = get_data_source(jso)
    file_id = jso.get('file_id')
    book_name = f'{data_source}/{file_id}'
    return book_name


def spark_json_extractor(jso: dict) -> dict:
    """Extract data from the json and return it as a dict."""
    return {
        '_pdf_type': jso['_pdf_type'],
        'model_list': jso['doc_layout_result'],
    }
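# Shape sketch for spark_json_extractor (field values invented for illustration):
jso_example = {'_pdf_type': 'txt', 'doc_layout_result': [{'layout_dets': [], 'page_info': {'page_no': 0}}]}
assert spark_json_extractor(jso_example) == {
    '_pdf_type': 'txt',
    'model_list': jso_example['doc_layout_result'],
}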
@@ -5,9 +5,8 @@ import click
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import do_parse, parse_pdf_methods
@@ -86,8 +85,8 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
    os.makedirs(output_dir, exist_ok=True)

    def read_fn(path):
        disk_rw = FileBasedDataReader(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path))
    def parse_doc(doc_path: str):
        try:
......
@@ -5,13 +5,11 @@ from pathlib import Path
import click

import magic_pdf.model as model_config
from magic_pdf.data.data_reader_writer import FileBasedDataReader, S3DataReader
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
                                       remove_non_official_s3_args)
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import do_parse, parse_pdf_methods
@@ -19,15 +17,14 @@ def read_s3_path(s3path):
    bucket, key = parse_s3path(s3path)

    s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
    s3_rw = S3DataReader('', bucket, s3_ak, s3_sk, s3_endpoint, 'auto')
    may_range_params = parse_s3_range_params(s3path)
    if may_range_params is None or 2 != len(may_range_params):
        byte_start, byte_end = 0, -1
    else:
        byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
    return s3_rw.read_at(
        remove_non_official_s3_args(s3path),
        byte_start,
        byte_end,
@@ -129,8 +126,8 @@ def pdf(pdf, json_data, output_dir, method):
    os.makedirs(output_dir, exist_ok=True)

    def read_fn(path):
        disk_rw = FileBasedDataReader(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path))

    model_json_list = json_parse.loads(read_fn(json_data).decode('utf-8'))
......
@@ -3,18 +3,18 @@ import json as json_parse
import os

import click
import fitz
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
                                      draw_model_bbox, draw_span_bbox)
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.pipe.UNIPipe import UNIPipe

# from io import BytesIO
# from pypdf import PdfReader, PdfWriter
@@ -54,11 +54,11 @@ def prepare_env(output_dir, pdf_file_name, method):
def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
    document = fitz.open('pdf', pdf_bytes)
    output_document = fitz.open()
    end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(document) - 1
    if end_page_id > len(document) - 1:
        logger.warning('end_page_id is out of range, use pdf_docs length')
        end_page_id = len(document) - 1
    output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
    output_bytes = output_document.tobytes()
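    # Usage sketch (the return of output_bytes presumably follows below this hunk):
    # clipped = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=2)
    # would keep only the first three pages of the input pdf.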
@@ -103,8 +103,8 @@ def do_parse(
    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
                                                parse_method)

    image_writer, md_writer = FileBasedDataWriter(
        local_image_dir), FileBasedDataWriter(local_md_dir)
    image_dir = str(os.path.basename(local_image_dir))

    if parse_method == 'auto':
@@ -148,49 +148,36 @@ def do_parse(
    if f_draw_line_sort_bbox:
        draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)

    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
    if f_dump_md:
        md_writer.write_string(
            f'{pdf_file_name}.md',
            md_content
        )

    if f_dump_middle_json:
        md_writer.write_string(
            f'{pdf_file_name}_middle.json',
            json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
        )

    if f_dump_model_json:
        md_writer.write_string(
            f'{pdf_file_name}_model.json',
            json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4)
        )

    if f_dump_orig_pdf:
        md_writer.write(
            f'{pdf_file_name}_origin.pdf',
            pdf_bytes,
        )

    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
    if f_dump_content_list:
        md_writer.write_string(
            f'{pdf_file_name}_content_list.json',
            json_parse.dumps(content_list, ensure_ascii=False, indent=4)
        )

    logger.info(f'local output dir is {local_md_dir}')
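    # Note on the writer switch above, as I read the FileBasedDataWriter API: write
    # takes bytes while write_string takes str, so the markdown/json dumps moved to
    # write_string and the original pdf bytes stay on write.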
......
""" """用户输入: model数组,每个元素代表一个页面 pdf在s3的路径 截图保存的s3位置.
用户输入:
model数组,每个元素代表一个页面
pdf在s3的路径
截图保存的s3位置
然后: 然后:
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader 1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter 2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!! 其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
""" """
from loguru import logger

from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt

PARSE_TYPE_TXT = 'txt'
PARSE_TYPE_OCR = 'ocr'
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
                  start_page_id=0, end_page_id=None, lang=None,
                  *args, **kwargs):
    """Parse a text-based pdf."""
    pdf_info_dict = parse_pdf_by_txt(
        pdf_bytes,
        pdf_models,
@@ -40,22 +32,20 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
        debug_mode=is_debug,
    )

    pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT
    pdf_info_dict['_version_name'] = __version__
    if lang is not None:
        pdf_info_dict['_lang'] = lang

    return pdf_info_dict
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
                  start_page_id=0, end_page_id=None, lang=None,
                  *args, **kwargs):
    """Parse an ocr-based pdf."""
    pdf_info_dict = parse_pdf_by_ocr(
        pdf_bytes,
        pdf_models,
@@ -65,23 +55,21 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
        debug_mode=is_debug,
    )

    pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
    pdf_info_dict['_version_name'] = __version__
    if lang is not None:
        pdf_info_dict['_lang'] = lang

    return pdf_info_dict
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
                    input_model_is_empty: bool = False,
                    start_page_id=0, end_page_id=None, lang=None,
                    *args, **kwargs):
    """A pdf mixing ocr and text content: parse all of it."""

    def parse_pdf(method):
        try:
@@ -98,12 +86,12 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
            return None

    pdf_info_dict = parse_pdf(parse_pdf_by_txt)
    if pdf_info_dict is None or pdf_info_dict.get('_need_drop', False):
        logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')
        if input_model_is_empty:
            layout_model = kwargs.get('layout_model', None)
            formula_enable = kwargs.get('formula_enable', None)
            table_enable = kwargs.get('table_enable', None)
            pdf_models = doc_analyze(
                pdf_bytes,
                ocr=True,
@@ -116,15 +104,15 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
            )
        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
        if pdf_info_dict is None:
            raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
        else:
            pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
    else:
        pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT

    pdf_info_dict['_version_name'] = __version__

    if lang is not None:
        pdf_info_dict['_lang'] = lang

    return pdf_info_dict
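# The union flow in brief: try the text route first; on a drop or error fall back to
# OCR, re-running doc_analyze only when the caller supplied no model output. A
# hypothetical call:
# info = parse_union_pdf(pdf_bytes, [], imageWriter, input_model_is_empty=True, lang='en')
# assert info['_parse_type'] in (PARSE_TYPE_TXT, PARSE_TYPE_OCR)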
@@ -2,39 +2,37 @@
import base64
import os
import re
import time
import uuid
import zipfile
from pathlib import Path

import gradio as gr
import pymupdf
from gradio_pdf import PDF
from loguru import logger

from magic_pdf.data.data_reader_writer import DataReader
from magic_pdf.libs.hash_utils import compute_sha256
from magic_pdf.tools.common import do_parse, prepare_env


def read_fn(path):
    disk_rw = DataReader(os.path.dirname(path))
    return disk_rw.read(os.path.basename(path))
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
    os.makedirs(output_dir, exist_ok=True)

    try:
        file_name = f'{str(Path(doc_path).stem)}_{time.time()}'
        pdf_data = read_fn(doc_path)
        if is_ocr:
            parse_method = 'ocr'
        else:
            parse_method = 'auto'
        local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
        do_parse(
            output_dir,
@@ -55,8 +53,7 @@ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_en
def compress_directory_to_zip(directory_path, output_zip_path):
    """Compress the given directory into a ZIP file.

    :param directory_path: path of the directory to compress
    :param output_zip_path: path of the output ZIP file
@@ -80,7 +77,7 @@ def compress_directory_to_zip(directory_path, output_zip_path):
def image_to_base64(image_path):
    with open(image_path, 'rb') as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
@@ -93,7 +90,7 @@ def replace_image_with_base64(markdown_text, image_dir_path):
        relative_path = match.group(1)
        full_path = os.path.join(image_dir_path, relative_path)
        base64_image = image_to_base64(full_path)
        return f'![{relative_path}](data:image/jpeg;base64,{base64_image})'

    # Apply the replacement
    return re.sub(pattern, replace, markdown_text)
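    # Illustration (the regex pattern is defined above this hunk; it presumably
    # captures the relative image path of markdown links such as ![](images/foo.jpg)):
    # replace_image_with_base64('![](images/foo.jpg)', local_md_dir)
    #   -> '![images/foo.jpg](data:image/jpeg;base64,...)'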
@@ -103,34 +100,34 @@ def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table
    # Get the recognized md file and the path of the zip archive
    local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
                                        layout_mode, formula_enable, table_enable, language)
    archive_zip_path = os.path.join('./output', compute_sha256(local_md_dir) + '.zip')
    zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
    if zip_archive_success == 0:
        logger.info('Compression succeeded')
    else:
        logger.error('Compression failed')
    md_path = os.path.join(local_md_dir, file_name + '.md')
    with open(md_path, 'r', encoding='utf-8') as f:
        txt_content = f.read()
    md_content = replace_image_with_base64(txt_content, local_md_dir)
    # Return the path of the converted PDF
    new_pdf_path = os.path.join(local_md_dir, file_name + '_layout.pdf')

    return md_content, txt_content, archive_zip_path, new_pdf_path
latex_delimiters = [{"left": "$$", "right": "$$", "display": True}, latex_delimiters = [{'left': '$$', 'right': '$$', 'display': True},
{"left": '$', "right": '$', "display": False}] {'left': '$', 'right': '$', 'display': False}]
def init_model():
    from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
    try:
        model_manager = ModelSingleton()
        txt_model = model_manager.get_model(False, False)  # noqa: F841
        logger.info('txt_model init final')
        ocr_model = model_manager.get_model(True, False)  # noqa: F841
        logger.info('ocr_model init final')
        return 0
    except Exception as e:
        logger.exception(e)
@@ -138,31 +135,31 @@ def init_model():
model_init = init_model()
logger.info(f'model_init: {model_init}')

with open('header.html', 'r') as file:
    header = file.read()

latin_lang = [
    'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',  # noqa: E126
    'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
    'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
    'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german'
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
    'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',  # noqa: E126
    'dar', 'inh', 'che', 'lbe', 'lez', 'tab'
]
devanagari_lang = [
    'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',  # noqa: E126
    'sa', 'bgc'
]
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']

all_lang = ['']
all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
@@ -174,7 +171,7 @@ def to_pdf(file_path):
    pdf_bytes = f.convert_to_pdf()
    # Write the pdf bytes into <uuid>.pdf
    # Generate a unique file name
    unique_filename = f'{uuid.uuid4()}.pdf'

    # Build the full file path
    tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
@@ -186,43 +183,43 @@ def to_pdf(file_path):
    return tmp_file_path
if __name__ == "__main__": if __name__ == '__main__':
with gr.Blocks() as demo: with gr.Blocks() as demo:
gr.HTML(header) gr.HTML(header)
with gr.Row(): with gr.Row():
with gr.Column(variant='panel', scale=5): with gr.Column(variant='panel', scale=5):
file = gr.File(label="Please upload a PDF or image", file_types=[".pdf", ".png", ".jpeg", ".jpg"]) file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages") max_pages = gr.Slider(1, 10, 5, step=1, label='Max convert pages')
with gr.Row(): with gr.Row():
layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="Layout model", value="layoutlmv3") layout_mode = gr.Dropdown(['layoutlmv3', 'doclayout_yolo'], label='Layout model', value='layoutlmv3')
language = gr.Dropdown(all_lang, label="Language", value="") language = gr.Dropdown(all_lang, label='Language', value='')
with gr.Row(): with gr.Row():
formula_enable = gr.Checkbox(label="Enable formula recognition", value=True) formula_enable = gr.Checkbox(label='Enable formula recognition', value=True)
is_ocr = gr.Checkbox(label="Force enable OCR", value=False) is_ocr = gr.Checkbox(label='Force enable OCR', value=False)
table_enable = gr.Checkbox(label="Enable table recognition(test)", value=False) table_enable = gr.Checkbox(label='Enable table recognition(test)', value=False)
with gr.Row(): with gr.Row():
change_bu = gr.Button("Convert") change_bu = gr.Button('Convert')
clear_bu = gr.ClearButton(value="Clear") clear_bu = gr.ClearButton(value='Clear')
pdf_show = PDF(label="PDF preview", interactive=True, height=800) pdf_show = PDF(label='PDF preview', interactive=True, height=800)
with gr.Accordion("Examples:"): with gr.Accordion('Examples:'):
example_root = os.path.join(os.path.dirname(__file__), "examples") example_root = os.path.join(os.path.dirname(__file__), 'examples')
gr.Examples( gr.Examples(
examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
_.endswith("pdf")], _.endswith('pdf')],
inputs=pdf_show inputs=pdf_show
) )
with gr.Column(variant='panel', scale=5): with gr.Column(variant='panel', scale=5):
output_file = gr.File(label="convert result", interactive=False) output_file = gr.File(label='convert result', interactive=False)
with gr.Tabs(): with gr.Tabs():
with gr.Tab("Markdown rendering"): with gr.Tab('Markdown rendering'):
md = gr.Markdown(label="Markdown rendering", height=900, show_copy_button=True, md = gr.Markdown(label='Markdown rendering', height=900, show_copy_button=True,
latex_delimiters=latex_delimiters, line_breaks=True) latex_delimiters=latex_delimiters, line_breaks=True)
with gr.Tab("Markdown text"): with gr.Tab('Markdown text'):
md_text = gr.TextArea(lines=45, show_copy_button=True) md_text = gr.TextArea(lines=45, show_copy_button=True)
file.upload(fn=to_pdf, inputs=file, outputs=pdf_show) file.upload(fn=to_pdf, inputs=file, outputs=pdf_show)
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language], change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
outputs=[md, md_text, output_file, pdf_show]) outputs=[md, md_text, output_file, pdf_show])
clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr, table_enable, language]) clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr, table_enable, language])
demo.launch(server_name="0.0.0.0") demo.launch(server_name='0.0.0.0')
{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"tests/test_data/assets/pdfs/test_02.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}} {"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"tests/unittest/test_data/assets/pdfs/test_02.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
@@ -3,7 +3,7 @@ from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
def test_pymudataset():
    with open('tests/unittest/test_data/assets/pdfs/test_01.pdf', 'rb') as f:
        bits = f.read()
    datasets = PymuDocDataset(bits)
    assert len(datasets) > 0
@@ -11,7 +11,7 @@ def test_pymudataset():
def test_imagedataset():
    with open('tests/unittest/test_data/assets/pngs/test_01.png', 'rb') as f:
        bits = f.read()
    datasets = ImageDataset(bits)
    assert len(datasets) == 1
......
@@ -9,7 +9,7 @@ from magic_pdf.data.schemas import S3Config
def test_read_local_pdfs():
    datasets = read_local_pdfs('tests/unittest/test_data/assets/pdfs')
    assert len(datasets) == 2
    assert len(datasets[0]) > 0
    assert len(datasets[1]) > 0
@@ -19,7 +19,7 @@ def test_read_local_pdfs():
def test_read_local_images():
    datasets = read_local_images('tests/unittest/test_data/assets/pngs', suffixes=['png'])
    assert len(datasets) == 2
    assert len(datasets[0]) == 1
    assert len(datasets[1]) == 1
@@ -69,10 +69,10 @@ def test_read_json():
    assert len(datasets) > 0
    assert len(datasets[0]) == 10

    datasets = read_jsonl('tests/unittest/test_data/assets/jsonl/test_01.jsonl', reader)
    assert len(datasets) == 1
    assert len(datasets[0]) == 10

    datasets = read_jsonl('tests/unittest/test_data/assets/jsonl/test_02.jsonl')
    assert len(datasets) == 1
    assert len(datasets[0]) == 1
@@ -17,7 +17,7 @@ def test_rag_document_reader():
    os.makedirs(temp_output_dir, exist_ok=True)

    # test
    with open('tests/unittest/test_integrations/test_rag/assets/middle.json') as f:
        json_data = json.load(f)
    res = convert_middle_json_to_layout_elements(json_data, temp_output_dir)
@@ -43,7 +43,7 @@ def test_data_reader():
    os.makedirs(temp_output_dir, exist_ok=True)

    # test
    data_reader = DataReader('tests/unittest/test_integrations/test_rag/assets', 'ocr',
                             temp_output_dir)

    assert data_reader.get_documents_count() == 2
......
@@ -16,7 +16,7 @@ def test_convert_middle_json_to_layout_elements():
    os.makedirs(temp_output_dir, exist_ok=True)

    # test
    with open('tests/unittest/test_integrations/test_rag/assets/middle.json') as f:
        json_data = json.load(f)
    res = convert_middle_json_to_layout_elements(json_data, temp_output_dir)
@@ -32,7 +32,7 @@ def test_convert_middle_json_to_layout_elements():
def test_inference():
    asset_dir = 'tests/unittest/test_integrations/test_rag/assets'

    # setup
    unitest_dir = '/tmp/magic_pdf/unittest/integrations/rag'
    os.makedirs(unitest_dir, exist_ok=True)
@@ -48,7 +48,7 @@ def test_inference():
    assert res is not None
    assert len(res) == 1
    assert len(res[0].layout_dets) == 11
    assert res[0].layout_dets[0].anno_id == 0
    assert res[0].layout_dets[0].category_type == CategoryType.text
    assert len(res[0].extra.element_relation) == 3
......
@@ -5,8 +5,8 @@ from magic_pdf.model.magic_model import MagicModel
def test_magic_model_image_v2():
    datasets = read_local_pdfs('tests/unittest/test_model/assets/test_01.pdf')
    with open('tests/unittest/test_model/assets/test_01.model.json') as f:
        model_json = json.load(f)
    magic_model = MagicModel(model_json, datasets[0])
@@ -19,8 +19,8 @@ def test_magic_model_image_v2():
def test_magic_model_table_v2():
    datasets = read_local_pdfs('tests/unittest/test_model/assets/test_02.pdf')
    with open('tests/unittest/test_model/assets/test_02.model.json') as f:
        model_json = json.load(f)
    magic_model = MagicModel(model_json, datasets[0])
......
{"file_location":"tests/test_tools/assets/cli_dev/cli_test_01.pdf","doc_layout_result":[{"layout_dets":[{"category_id":1,"poly":[882.4013061523438,169.93817138671875,1552.350341796875,169.93817138671875,1552.350341796875,625.8263549804688,882.4013061523438,625.8263549804688],"score":0.999992311000824},{"category_id":1,"poly":[882.474853515625,1450.92822265625,1551.4490966796875,1450.92822265625,1551.4490966796875,1877.5712890625,882.474853515625,1877.5712890625],"score":0.9999903440475464},{"category_id":1,"poly":[881.6513061523438,626.2058715820312,1552.1400146484375,626.2058715820312,1552.1400146484375,1450.604736328125,881.6513061523438,1450.604736328125],"score":0.9999856352806091},{"category_id":1,"poly":[149.41075134277344,232.1595001220703,819.0465087890625,232.1595001220703,819.0465087890625,625.8865356445312,149.41075134277344,625.8865356445312],"score":0.99998539686203},{"category_id":1,"poly":[149.3945770263672,1215.5172119140625,817.8850708007812,1215.5172119140625,817.8850708007812,1304.873291015625,149.3945770263672,1304.873291015625],"score":0.9999765157699585},{"category_id":1,"poly":[882.6979370117188,1880.13916015625,1552.15185546875,1880.13916015625,1552.15185546875,2031.339599609375,882.6979370117188,2031.339599609375],"score":0.9999744892120361},{"category_id":1,"poly":[148.96054077148438,743.3055419921875,818.6231689453125,743.3055419921875,818.6231689453125,1074.2369384765625,148.96054077148438,1074.2369384765625],"score":0.9999669790267944},{"category_id":1,"poly":[148.8435516357422,1791.14306640625,818.6885375976562,1791.14306640625,818.6885375976562,2030.794189453125,148.8435516357422,2030.794189453125],"score":0.9999618530273438},{"category_id":0,"poly":[150.7009735107422,684.0087890625,623.5106201171875,684.0087890625,623.5106201171875,717.03662109375,150.7009735107422,717.03662109375],"score":0.9999415278434753},{"category_id":8,"poly":[146.48068237304688,1331.6737060546875,317.2640075683594,1331.6737060546875,317.2640075683594,1400.1722412109375,146.48068237304688,1400.1722412109375],"score":0.9998958110809326},{"category_id":1,"poly":[149.42420959472656,1430.8782958984375,818.9042358398438,1430.8782958984375,818.9042358398438,1672.7386474609375,149.42420959472656,1672.7386474609375],"score":0.9998599290847778},{"category_id":1,"poly":[149.18746948242188,172.10252380371094,818.5662231445312,172.10252380371094,818.5662231445312,230.4594268798828,149.18746948242188,230.4594268798828],"score":0.9997718334197998},{"category_id":0,"poly":[149.0175018310547,1732.1090087890625,702.1005859375,1732.1090087890625,702.1005859375,1763.6046142578125,149.0175018310547,1763.6046142578125],"score":0.9997085928916931},{"category_id":2,"poly":[1519.802490234375,98.59099578857422,1551.985107421875,98.59099578857422,1551.985107421875,119.48420715332031,1519.802490234375,119.48420715332031],"score":0.9995552897453308},{"category_id":8,"poly":[146.9109649658203,1100.156494140625,544.2803344726562,1100.156494140625,544.2803344726562,1184.929443359375,146.9109649658203,1184.929443359375],"score":0.9995207786560059},{"category_id":2,"poly":[148.11611938476562,99.87767791748047,318.926025390625,99.87767791748047,318.926025390625,120.70393371582031,148.11611938476562,120.70393371582031],"score":0.999351441860199},{"category_id":9,"poly":[791.7642211914062,1130.056396484375,818.6940307617188,1130.056396484375,818.6940307617188,1161.1080322265625,791.7642211914062,1161.1080322265625],"score":0.9908884763717651},{"category_id":9,"poly":[788.37060546875,1346.8450927734375,818.5010986328125,1
346.8450927734375,818.5010986328125,1377.370361328125,788.37060546875,1377.370361328125],"score":0.9873985052108765},{"category_id":14,"poly":[146,1103,543,1103,543,1184,146,1184],"score":0.94,"latex":"E\\!\\left(W\\right)\\!=\\!\\frac{E\\!\\left[H^{2}\\right]}{2E\\!\\left[H\\right]}\\!=\\!\\frac{E\\!\\left[H\\right]}{2}\\!\\!\\left(1\\!+\\!\\operatorname{CV}\\!\\left(H\\right)^{2}\\right)"},{"category_id":13,"poly":[1196,354,1278,354,1278,384,1196,384],"score":0.91,"latex":"p(1-q)"},{"category_id":13,"poly":[881,415,1020,415,1020,444,881,444],"score":0.91,"latex":"(1-p)(1-q)"},{"category_id":14,"poly":[147,1333,318,1333,318,1400,147,1400],"score":0.91,"latex":"\\mathbf{CV}\\big(H\\big)\\!=\\!\\frac{\\boldsymbol{\\upsigma}_{H}}{E\\big[H\\big]}"},{"category_id":13,"poly":[1197,657,1263,657,1263,686,1197,686],"score":0.9,"latex":"(1-p)"},{"category_id":13,"poly":[213,1217,263,1217,263,1244,213,1244],"score":0.88,"latex":"E[X]"},{"category_id":13,"poly":[214,1434,245,1434,245,1459,214,1459],"score":0.87,"latex":"\\upsigma_{H}"},{"category_id":13,"poly":[324,2002,373,2002,373,2028,324,2028],"score":0.84,"latex":"30\\%"},{"category_id":13,"poly":[1209,693,1225,693,1225,717,1209,717],"score":0.83,"latex":"p"},{"category_id":13,"poly":[990,449,1007,449,1007,474,990,474],"score":0.81,"latex":"p"},{"category_id":13,"poly":[346,1277,369,1277,369,1301,346,1301],"score":0.81,"latex":"H"},{"category_id":13,"poly":[1137,661,1154,661,1154,686,1137,686],"score":0.81,"latex":"p"},{"category_id":13,"poly":[522,1432,579,1432,579,1459,522,1459],"score":0.81,"latex":"H\\left(4\\right)"},{"category_id":13,"poly":[944,540,962,540,962,565,944,565],"score":0.8,"latex":"p"},{"category_id":13,"poly":[1444,936,1461,936,1461,961,1444,961],"score":0.79,"latex":"p"},{"category_id":13,"poly":[602,1247,624,1247,624,1270,602,1270],"score":0.78,"latex":"H"},{"category_id":13,"poly":[147,1247,167,1247,167,1271,147,1271],"score":0.77,"latex":"X"},{"category_id":13,"poly":[210,1246,282,1246,282,1274,210,1274],"score":0.77,"latex":"\\operatorname{CV}(H)"},{"category_id":13,"poly":[1346,268,1361,268,1361,292,1346,292],"score":0.76,"latex":"q"},{"category_id":13,"poly":[215,957,238,957,238,981,215,981],"score":0.74,"latex":"H"},{"category_id":13,"poly":[149,956,173,956,173,981,149,981],"score":0.63,"latex":"W"},{"category_id":13,"poly":[924,841,1016,841,1016,868,924,868],"score":0.56,"latex":"8{\\cdot}00\\;\\mathrm{a.m}"},{"category_id":13,"poly":[956,871,1032,871,1032,898,956,898],"score":0.43,"latex":"20~\\mathrm{min}"},{"category_id":13,"poly":[1082,781,1112,781,1112,808,1082,808],"score":0.41,"latex":"(l)"},{"category_id":13,"poly":[697,1821,734,1821,734,1847,697,1847],"score":0.3,"latex":"^{1\\mathrm{~h~}}"}],"page_info":{"page_no":0,"height":2200,"width":1700}}]} 
{"file_location":"tests/unittest/test_tools/assets/cli_dev/cli_test_01.pdf","doc_layout_result":[{"layout_dets":[{"category_id":1,"poly":[882.4013061523438,169.93817138671875,1552.350341796875,169.93817138671875,1552.350341796875,625.8263549804688,882.4013061523438,625.8263549804688],"score":0.999992311000824},{"category_id":1,"poly":[882.474853515625,1450.92822265625,1551.4490966796875,1450.92822265625,1551.4490966796875,1877.5712890625,882.474853515625,1877.5712890625],"score":0.9999903440475464},{"category_id":1,"poly":[881.6513061523438,626.2058715820312,1552.1400146484375,626.2058715820312,1552.1400146484375,1450.604736328125,881.6513061523438,1450.604736328125],"score":0.9999856352806091},{"category_id":1,"poly":[149.41075134277344,232.1595001220703,819.0465087890625,232.1595001220703,819.0465087890625,625.8865356445312,149.41075134277344,625.8865356445312],"score":0.99998539686203},{"category_id":1,"poly":[149.3945770263672,1215.5172119140625,817.8850708007812,1215.5172119140625,817.8850708007812,1304.873291015625,149.3945770263672,1304.873291015625],"score":0.9999765157699585},{"category_id":1,"poly":[882.6979370117188,1880.13916015625,1552.15185546875,1880.13916015625,1552.15185546875,2031.339599609375,882.6979370117188,2031.339599609375],"score":0.9999744892120361},{"category_id":1,"poly":[148.96054077148438,743.3055419921875,818.6231689453125,743.3055419921875,818.6231689453125,1074.2369384765625,148.96054077148438,1074.2369384765625],"score":0.9999669790267944},{"category_id":1,"poly":[148.8435516357422,1791.14306640625,818.6885375976562,1791.14306640625,818.6885375976562,2030.794189453125,148.8435516357422,2030.794189453125],"score":0.9999618530273438},{"category_id":0,"poly":[150.7009735107422,684.0087890625,623.5106201171875,684.0087890625,623.5106201171875,717.03662109375,150.7009735107422,717.03662109375],"score":0.9999415278434753},{"category_id":8,"poly":[146.48068237304688,1331.6737060546875,317.2640075683594,1331.6737060546875,317.2640075683594,1400.1722412109375,146.48068237304688,1400.1722412109375],"score":0.9998958110809326},{"category_id":1,"poly":[149.42420959472656,1430.8782958984375,818.9042358398438,1430.8782958984375,818.9042358398438,1672.7386474609375,149.42420959472656,1672.7386474609375],"score":0.9998599290847778},{"category_id":1,"poly":[149.18746948242188,172.10252380371094,818.5662231445312,172.10252380371094,818.5662231445312,230.4594268798828,149.18746948242188,230.4594268798828],"score":0.9997718334197998},{"category_id":0,"poly":[149.0175018310547,1732.1090087890625,702.1005859375,1732.1090087890625,702.1005859375,1763.6046142578125,149.0175018310547,1763.6046142578125],"score":0.9997085928916931},{"category_id":2,"poly":[1519.802490234375,98.59099578857422,1551.985107421875,98.59099578857422,1551.985107421875,119.48420715332031,1519.802490234375,119.48420715332031],"score":0.9995552897453308},{"category_id":8,"poly":[146.9109649658203,1100.156494140625,544.2803344726562,1100.156494140625,544.2803344726562,1184.929443359375,146.9109649658203,1184.929443359375],"score":0.9995207786560059},{"category_id":2,"poly":[148.11611938476562,99.87767791748047,318.926025390625,99.87767791748047,318.926025390625,120.70393371582031,148.11611938476562,120.70393371582031],"score":0.999351441860199},{"category_id":9,"poly":[791.7642211914062,1130.056396484375,818.6940307617188,1130.056396484375,818.6940307617188,1161.1080322265625,791.7642211914062,1161.1080322265625],"score":0.9908884763717651},{"category_id":9,"poly":[788.37060546875,1346.8450927734375,818.501098
6328125,1346.8450927734375,818.5010986328125,1377.370361328125,788.37060546875,1377.370361328125],"score":0.9873985052108765},{"category_id":14,"poly":[146,1103,543,1103,543,1184,146,1184],"score":0.94,"latex":"E\\!\\left(W\\right)\\!=\\!\\frac{E\\!\\left[H^{2}\\right]}{2E\\!\\left[H\\right]}\\!=\\!\\frac{E\\!\\left[H\\right]}{2}\\!\\!\\left(1\\!+\\!\\operatorname{CV}\\!\\left(H\\right)^{2}\\right)"},{"category_id":13,"poly":[1196,354,1278,354,1278,384,1196,384],"score":0.91,"latex":"p(1-q)"},{"category_id":13,"poly":[881,415,1020,415,1020,444,881,444],"score":0.91,"latex":"(1-p)(1-q)"},{"category_id":14,"poly":[147,1333,318,1333,318,1400,147,1400],"score":0.91,"latex":"\\mathbf{CV}\\big(H\\big)\\!=\\!\\frac{\\boldsymbol{\\upsigma}_{H}}{E\\big[H\\big]}"},{"category_id":13,"poly":[1197,657,1263,657,1263,686,1197,686],"score":0.9,"latex":"(1-p)"},{"category_id":13,"poly":[213,1217,263,1217,263,1244,213,1244],"score":0.88,"latex":"E[X]"},{"category_id":13,"poly":[214,1434,245,1434,245,1459,214,1459],"score":0.87,"latex":"\\upsigma_{H}"},{"category_id":13,"poly":[324,2002,373,2002,373,2028,324,2028],"score":0.84,"latex":"30\\%"},{"category_id":13,"poly":[1209,693,1225,693,1225,717,1209,717],"score":0.83,"latex":"p"},{"category_id":13,"poly":[990,449,1007,449,1007,474,990,474],"score":0.81,"latex":"p"},{"category_id":13,"poly":[346,1277,369,1277,369,1301,346,1301],"score":0.81,"latex":"H"},{"category_id":13,"poly":[1137,661,1154,661,1154,686,1137,686],"score":0.81,"latex":"p"},{"category_id":13,"poly":[522,1432,579,1432,579,1459,522,1459],"score":0.81,"latex":"H\\left(4\\right)"},{"category_id":13,"poly":[944,540,962,540,962,565,944,565],"score":0.8,"latex":"p"},{"category_id":13,"poly":[1444,936,1461,936,1461,961,1444,961],"score":0.79,"latex":"p"},{"category_id":13,"poly":[602,1247,624,1247,624,1270,602,1270],"score":0.78,"latex":"H"},{"category_id":13,"poly":[147,1247,167,1247,167,1271,147,1271],"score":0.77,"latex":"X"},{"category_id":13,"poly":[210,1246,282,1246,282,1274,210,1274],"score":0.77,"latex":"\\operatorname{CV}(H)"},{"category_id":13,"poly":[1346,268,1361,268,1361,292,1346,292],"score":0.76,"latex":"q"},{"category_id":13,"poly":[215,957,238,957,238,981,215,981],"score":0.74,"latex":"H"},{"category_id":13,"poly":[149,956,173,956,173,981,149,981],"score":0.63,"latex":"W"},{"category_id":13,"poly":[924,841,1016,841,1016,868,924,868],"score":0.56,"latex":"8{\\cdot}00\\;\\mathrm{a.m}"},{"category_id":13,"poly":[956,871,1032,871,1032,898,956,898],"score":0.43,"latex":"20~\\mathrm{min}"},{"category_id":13,"poly":[1082,781,1112,781,1112,808,1082,808],"score":0.41,"latex":"(l)"},{"category_id":13,"poly":[697,1821,734,1821,734,1847,697,1847],"score":0.3,"latex":"^{1\\mathrm{~h~}}"}],"page_info":{"page_no":0,"height":2200,"width":1700}}]}
\ No newline at end of file
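The fixture change above touches only the file_location prefix (tests/test_tools becomes tests/unittest/test_tools); the layout payload is identical. Each line of the .jsonl fixture is one self-contained JSON record: a file_location plus a doc_layout_result whose layout_dets describe detected regions as a category_id, an eight-number poly quad (x/y pairs for the four corners), a confidence score, and, for the formula categories 13 and 14 seen above, a latex string. A minimal reader sketch under those assumptions; iter_formulas is illustrative and not part of the repository:

import json

# Illustrative sketch (not part of this diff): walk a layout .jsonl
# fixture and yield the recognized formula regions. Keys mirror the
# record shown above; only formula detections carry a 'latex' field.
def iter_formulas(jsonl_path):
    with open(jsonl_path, encoding='utf-8') as f:
        for line in f:
            record = json.loads(line)
            for page in record['doc_layout_result']:
                page_no = page['page_info']['page_no']
                for det in page['layout_dets']:
                    if 'latex' in det:
                        yield page_no, det['latex']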
-import tempfile
 import os
 import shutil
+import tempfile

 from click.testing import CliRunner

 from magic_pdf.tools.cli import cli

@@ -8,19 +9,19 @@ from magic_pdf.tools.cli import cli
 def test_cli_pdf():
     # setup
-    unitest_dir = "/tmp/magic_pdf/unittest/tools"
-    filename = "cli_test_01"
+    unitest_dir = '/tmp/magic_pdf/unittest/tools'
+    filename = 'cli_test_01'
     os.makedirs(unitest_dir, exist_ok=True)
-    temp_output_dir = tempfile.mkdtemp(dir="/tmp/magic_pdf/unittest/tools")
+    temp_output_dir = tempfile.mkdtemp(dir='/tmp/magic_pdf/unittest/tools')

     # run
     runner = CliRunner()
     result = runner.invoke(
         cli,
         [
-            "-p",
-            "tests/test_tools/assets/cli/pdf/cli_test_01.pdf",
-            "-o",
+            '-p',
+            'tests/unittest/test_tools/assets/cli/pdf/cli_test_01.pdf',
+            '-o',
             temp_output_dir,
         ],
     )

@@ -28,29 +29,29 @@ def test_cli_pdf():
     # check
     assert result.exit_code == 0

-    base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")
+    base_output_dir = os.path.join(temp_output_dir, 'cli_test_01/auto')

-    r = os.stat(os.path.join(base_output_dir, f"{filename}.md"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}.md'))
     assert r.st_size > 7000

-    r = os.stat(os.path.join(base_output_dir, "middle.json"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_middle.json'))
     assert r.st_size > 200000

-    r = os.stat(os.path.join(base_output_dir, "model.json"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_model.json'))
     assert r.st_size > 15000

-    r = os.stat(os.path.join(base_output_dir, "origin.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_origin.pdf'))
+    assert r.st_size > 400000

-    r = os.stat(os.path.join(base_output_dir, "layout.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_layout.pdf'))
+    assert r.st_size > 400000

-    r = os.stat(os.path.join(base_output_dir, "spans.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_spans.pdf'))
+    assert r.st_size > 400000

-    assert os.path.exists(os.path.join(base_output_dir, "images")) is True
-    assert os.path.isdir(os.path.join(base_output_dir, "images")) is True
-    assert os.path.exists(os.path.join(base_output_dir, "content_list.json")) is False
+    assert os.path.exists(os.path.join(base_output_dir, 'images')) is True
+    assert os.path.isdir(os.path.join(base_output_dir, 'images')) is True
+    assert os.path.exists(os.path.join(base_output_dir, f'{filename}_content_list.json')) is True

     # teardown
     shutil.rmtree(temp_output_dir)

@@ -58,68 +59,68 @@ def test_cli_pdf():
 def test_cli_path():
     # setup
-    unitest_dir = "/tmp/magic_pdf/unittest/tools"
+    unitest_dir = '/tmp/magic_pdf/unittest/tools'
     os.makedirs(unitest_dir, exist_ok=True)
-    temp_output_dir = tempfile.mkdtemp(dir="/tmp/magic_pdf/unittest/tools")
+    temp_output_dir = tempfile.mkdtemp(dir='/tmp/magic_pdf/unittest/tools')

     # run
     runner = CliRunner()
     result = runner.invoke(
-        cli, ["-p", "tests/test_tools/assets/cli/path", "-o", temp_output_dir]
+        cli, ['-p', 'tests/unittest/test_tools/assets/cli/path', '-o', temp_output_dir]
     )

     # check
     assert result.exit_code == 0

-    filename = "cli_test_01"
-    base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")
+    filename = 'cli_test_01'
+    base_output_dir = os.path.join(temp_output_dir, 'cli_test_01/auto')

-    r = os.stat(os.path.join(base_output_dir, f"{filename}.md"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}.md'))
     assert r.st_size > 7000

-    r = os.stat(os.path.join(base_output_dir, "middle.json"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_middle.json'))
     assert r.st_size > 200000

-    r = os.stat(os.path.join(base_output_dir, "model.json"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_model.json'))
     assert r.st_size > 15000

-    r = os.stat(os.path.join(base_output_dir, "origin.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_origin.pdf'))
+    assert r.st_size > 400000

-    r = os.stat(os.path.join(base_output_dir, "layout.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_layout.pdf'))
+    assert r.st_size > 400000

-    r = os.stat(os.path.join(base_output_dir, "spans.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_spans.pdf'))
+    assert r.st_size > 400000

-    assert os.path.exists(os.path.join(base_output_dir, "images")) is True
-    assert os.path.isdir(os.path.join(base_output_dir, "images")) is True
-    assert os.path.exists(os.path.join(base_output_dir, "content_list.json")) is False
+    assert os.path.exists(os.path.join(base_output_dir, 'images')) is True
+    assert os.path.isdir(os.path.join(base_output_dir, 'images')) is True
+    assert os.path.exists(os.path.join(base_output_dir, f'{filename}_content_list.json')) is True

-    base_output_dir = os.path.join(temp_output_dir, "cli_test_02/auto")
-    filename = "cli_test_02"
+    base_output_dir = os.path.join(temp_output_dir, 'cli_test_02/auto')
+    filename = 'cli_test_02'

-    r = os.stat(os.path.join(base_output_dir, f"{filename}.md"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}.md'))
     assert r.st_size > 5000

-    r = os.stat(os.path.join(base_output_dir, "middle.json"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_middle.json'))
     assert r.st_size > 200000

-    r = os.stat(os.path.join(base_output_dir, "model.json"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_model.json'))
     assert r.st_size > 15000

-    r = os.stat(os.path.join(base_output_dir, "origin.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_origin.pdf'))
+    assert r.st_size > 400000

-    r = os.stat(os.path.join(base_output_dir, "layout.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_layout.pdf'))
+    assert r.st_size > 400000

-    r = os.stat(os.path.join(base_output_dir, "spans.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_spans.pdf'))
+    assert r.st_size > 400000

-    assert os.path.exists(os.path.join(base_output_dir, "images")) is True
-    assert os.path.isdir(os.path.join(base_output_dir, "images")) is True
-    assert os.path.exists(os.path.join(base_output_dir, "content_list.json")) is False
+    assert os.path.exists(os.path.join(base_output_dir, 'images')) is True
+    assert os.path.isdir(os.path.join(base_output_dir, 'images')) is True
+    assert os.path.exists(os.path.join(base_output_dir, f'{filename}_content_list.json')) is True

     # teardown
     shutil.rmtree(temp_output_dir)
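The same check pattern repeats for every artifact: the updated layout prefixes each output with the source stem ({filename}_middle.json, {filename}_model.json, and so on), flips the content_list.json expectation from absent to present, and relaxes the PDF size floor from 500000 to 400000 bytes. A hedged sketch of a helper that would collapse the repetition; assert_output_sizes and its threshold table are hypothetical, not part of the diff:

import os

# Hypothetical helper: one loop over the artifact names and the
# minimum sizes that the updated tests assert line by line.
def assert_output_sizes(base_output_dir, filename, md_floor=7000):
    expected = {
        f'{filename}.md': md_floor,
        f'{filename}_middle.json': 200000,
        f'{filename}_model.json': 15000,
        f'{filename}_origin.pdf': 400000,
        f'{filename}_layout.pdf': 400000,
        f'{filename}_spans.pdf': 400000,
        f'{filename}_content_list.json': 0,  # existence check only
    }
    for name, floor in expected.items():
        size = os.stat(os.path.join(base_output_dir, name)).st_size
        assert size > floor, f'{name}: {size} <= {floor}'

The md_floor parameter covers the one artifact whose threshold differs between cases (7000 for cli_test_01, 5000 for cli_test_02).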
-import tempfile
 import os
 import shutil
+import tempfile

 from click.testing import CliRunner

 from magic_pdf.tools import cli_dev

@@ -8,22 +9,22 @@ from magic_pdf.tools import cli_dev
 def test_cli_pdf():
     # setup
-    unitest_dir = "/tmp/magic_pdf/unittest/tools"
-    filename = "cli_test_01"
+    unitest_dir = '/tmp/magic_pdf/unittest/tools'
+    filename = 'cli_test_01'
     os.makedirs(unitest_dir, exist_ok=True)
-    temp_output_dir = tempfile.mkdtemp(dir="/tmp/magic_pdf/unittest/tools")
+    temp_output_dir = tempfile.mkdtemp(dir='/tmp/magic_pdf/unittest/tools')

     # run
     runner = CliRunner()
     result = runner.invoke(
         cli_dev.cli,
         [
-            "pdf",
-            "-p",
-            "tests/test_tools/assets/cli/pdf/cli_test_01.pdf",
-            "-j",
-            "tests/test_tools/assets/cli_dev/cli_test_01.model.json",
-            "-o",
+            'pdf',
+            '-p',
+            'tests/unittest/test_tools/assets/cli/pdf/cli_test_01.pdf',
+            '-j',
+            'tests/unittest/test_tools/assets/cli_dev/cli_test_01.model.json',
+            '-o',
             temp_output_dir,
         ],
     )

@@ -31,31 +32,30 @@ def test_cli_pdf():
     # check
     assert result.exit_code == 0

-    base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")
+    base_output_dir = os.path.join(temp_output_dir, 'cli_test_01/auto')

-    r = os.stat(os.path.join(base_output_dir, "content_list.json"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_content_list.json'))
     assert r.st_size > 5000

-
-    r = os.stat(os.path.join(base_output_dir, f"{filename}.md"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}.md'))
     assert r.st_size > 7000

-    r = os.stat(os.path.join(base_output_dir, "middle.json"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_middle.json'))
     assert r.st_size > 200000

-    r = os.stat(os.path.join(base_output_dir, "model.json"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_model.json'))
     assert r.st_size > 15000

-    r = os.stat(os.path.join(base_output_dir, "origin.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_origin.pdf'))
+    assert r.st_size > 400000

-    r = os.stat(os.path.join(base_output_dir, "layout.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_layout.pdf'))
+    assert r.st_size > 400000

-    r = os.stat(os.path.join(base_output_dir, "spans.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_spans.pdf'))
+    assert r.st_size > 400000

-    assert os.path.exists(os.path.join(base_output_dir, "images")) is True
-    assert os.path.isdir(os.path.join(base_output_dir, "images")) is True
+    assert os.path.exists(os.path.join(base_output_dir, 'images')) is True
+    assert os.path.isdir(os.path.join(base_output_dir, 'images')) is True

     # teardown
     shutil.rmtree(temp_output_dir)

@@ -63,26 +63,26 @@ def test_cli_pdf():
 def test_cli_jsonl():
     # setup
-    unitest_dir = "/tmp/magic_pdf/unittest/tools"
-    filename = "cli_test_01"
+    unitest_dir = '/tmp/magic_pdf/unittest/tools'
+    filename = 'cli_test_01'
     os.makedirs(unitest_dir, exist_ok=True)
-    temp_output_dir = tempfile.mkdtemp(dir="/tmp/magic_pdf/unittest/tools")
+    temp_output_dir = tempfile.mkdtemp(dir='/tmp/magic_pdf/unittest/tools')

     def mock_read_s3_path(s3path):
-        with open(s3path, "rb") as f:
+        with open(s3path, 'rb') as f:
             return f.read()

     cli_dev.read_s3_path = mock_read_s3_path  # mock

     # run
     runner = CliRunner()
     result = runner.invoke(
         cli_dev.cli,
         [
-            "jsonl",
-            "-j",
-            "tests/test_tools/assets/cli_dev/cli_test_01.jsonl",
-            "-o",
+            'jsonl',
+            '-j',
+            'tests/unittest/test_tools/assets/cli_dev/cli_test_01.jsonl',
+            '-o',
             temp_output_dir,
         ],
     )

@@ -90,31 +90,31 @@ def test_cli_jsonl():
     # check
     assert result.exit_code == 0

-    base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")
+    base_output_dir = os.path.join(temp_output_dir, 'cli_test_01/auto')

-    r = os.stat(os.path.join(base_output_dir, "content_list.json"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_content_list.json'))
     assert r.st_size > 5000

-    r = os.stat(os.path.join(base_output_dir, f"{filename}.md"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}.md'))
     assert r.st_size > 7000

-    r = os.stat(os.path.join(base_output_dir, "middle.json"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_middle.json'))
     assert r.st_size > 200000

-    r = os.stat(os.path.join(base_output_dir, "model.json"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_model.json'))
     assert r.st_size > 15000

-    r = os.stat(os.path.join(base_output_dir, "origin.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_origin.pdf'))
+    assert r.st_size > 400000

-    r = os.stat(os.path.join(base_output_dir, "layout.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_layout.pdf'))
+    assert r.st_size > 400000

-    r = os.stat(os.path.join(base_output_dir, "spans.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_spans.pdf'))
+    assert r.st_size > 400000

-    assert os.path.exists(os.path.join(base_output_dir, "images")) is True
-    assert os.path.isdir(os.path.join(base_output_dir, "images")) is True
+    assert os.path.exists(os.path.join(base_output_dir, 'images')) is True
+    assert os.path.isdir(os.path.join(base_output_dir, 'images')) is True

     # teardown
     shutil.rmtree(temp_output_dir)
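test_cli_jsonl swaps cli_dev.read_s3_path for a plain local-file reader so the fixture's s3-style paths resolve on disk. The direct attribute assignment above persists for the rest of the test session; a sketch of the same substitution using pytest's monkeypatch fixture, which restores the original function after each test (the fixture name local_read_s3 is illustrative only):

import pytest

from magic_pdf.tools import cli_dev


@pytest.fixture
def local_read_s3(monkeypatch):
    # Replace the S3 reader with a local-file read; monkeypatch
    # undoes the patch automatically when the test finishes.
    def fake_read_s3_path(s3path):
        with open(s3path, 'rb') as f:
            return f.read()
    monkeypatch.setattr(cli_dev, 'read_s3_path', fake_read_s3_path)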
-import tempfile
 import os
 import shutil
+import tempfile

 import pytest

 from magic_pdf.tools.common import do_parse

-@pytest.mark.parametrize("method", ["auto", "txt", "ocr"])
+@pytest.mark.parametrize('method', ['auto', 'txt', 'ocr'])
 def test_common_do_parse(method):
+    import magic_pdf.model as model_config
+    model_config.__use_inside_model__ = True
     # setup
-    unitest_dir = "/tmp/magic_pdf/unittest/tools"
-    filename = "fake"
+    unitest_dir = '/tmp/magic_pdf/unittest/tools'
+    filename = 'fake'
     os.makedirs(unitest_dir, exist_ok=True)
-    temp_output_dir = tempfile.mkdtemp(dir="/tmp/magic_pdf/unittest/tools")
+    temp_output_dir = tempfile.mkdtemp(dir='/tmp/magic_pdf/unittest/tools')

     # run
-    with open("tests/test_tools/assets/common/cli_test_01.pdf", "rb") as f:
+    with open('tests/unittest/test_tools/assets/common/cli_test_01.pdf', 'rb') as f:
         bits = f.read()
     do_parse(temp_output_dir,
              filename,

@@ -27,31 +29,31 @@ def test_common_do_parse(method):
              f_dump_content_list=True)

     # check
-    base_output_dir = os.path.join(temp_output_dir, f"fake/{method}")
+    base_output_dir = os.path.join(temp_output_dir, f'fake/{method}')

-    r = os.stat(os.path.join(base_output_dir, "content_list.json"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_content_list.json'))
     assert r.st_size > 5000

-    r = os.stat(os.path.join(base_output_dir, f"{filename}.md"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}.md'))
     assert r.st_size > 7000

-    r = os.stat(os.path.join(base_output_dir, "middle.json"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_middle.json'))
     assert r.st_size > 200000

-    r = os.stat(os.path.join(base_output_dir, "model.json"))
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_model.json'))
     assert r.st_size > 15000

-    r = os.stat(os.path.join(base_output_dir, "origin.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_origin.pdf'))
+    assert r.st_size > 400000

-    r = os.stat(os.path.join(base_output_dir, "layout.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_layout.pdf'))
+    assert r.st_size > 400000

-    r = os.stat(os.path.join(base_output_dir, "spans.pdf"))
-    assert r.st_size > 500000
+    r = os.stat(os.path.join(base_output_dir, f'{filename}_spans.pdf'))
+    assert r.st_size > 400000

-    os.path.exists(os.path.join(base_output_dir, "images"))
-    os.path.isdir(os.path.join(base_output_dir, "images"))
+    os.path.exists(os.path.join(base_output_dir, 'images'))
+    os.path.isdir(os.path.join(base_output_dir, 'images'))

     # teardown
     shutil.rmtree(temp_output_dir)
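The parametrize decorator expands test_common_do_parse into three independent test items, one per parsing mode ('auto', 'txt', 'ocr'), so a failure report names the mode that regressed. A self-contained sketch of the mechanism, with pytest's tmp_path fixture standing in for the mkdtemp/rmtree pair used above (the test body is illustrative only, not the repository's test):

import pytest


@pytest.mark.parametrize('method', ['auto', 'txt', 'ocr'])
def test_one_output_dir_per_method(tmp_path, method):
    # pytest runs this once per method; tmp_path is a fresh,
    # automatically cleaned temporary directory for each run.
    out_dir = tmp_path / 'fake' / method
    out_dir.mkdir(parents=True)
    assert out_dir.is_dir()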