Unverified Commit 8afff9ae authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #1120 from opendatalab/release-0.10.2

Release 0.10.2
parents 4df1eb74 7fdbb6e5
from magic_pdf.config.drop_tag import DropTag
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
calculate_iou,
calculate_overlap_area_in_bbox1_area_ratio,
get_minbox_if_overlap_by_ratio)
from magic_pdf.config.ocr_content_type import BlockType
from magic_pdf.libs.boxbase import calculate_iou, get_minbox_if_overlap_by_ratio
def remove_overlaps_low_confidence_spans(spans):
......@@ -59,253 +56,6 @@ def remove_overlaps_min_spans(spans):
return spans, dropped_spans
def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
# Iterate over spans and check whether each falls inside removed_span_block_bboxes.
# If it does, drop the span; otherwise keep it.
need_remove_spans = []
for span in spans:
for removed_bbox in need_remove_spans_bboxes:
if (
calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox)
> 0.5
):
if span not in need_remove_spans:
need_remove_spans.append(span)
break
if len(need_remove_spans) > 0:
for span in need_remove_spans:
spans.remove(span)
return spans
def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
dropped_spans = []
for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
# logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}")
need_remove_spans = []
for span in spans:
# Decide whether the span should be dropped by checking if its bbox falls inside removed_bboxes
for removed_bbox in removed_bboxes:
if (
calculate_overlap_area_in_bbox1_area_ratio(
span['bbox'], removed_bbox
)
> 0.5
):
need_remove_spans.append(span)
break
# When drop_tag is DropTag.FOOTNOTE, also drop the span if it sits below any of the removed_bboxes
elif (
drop_tag == DropTag.FOOTNOTE
and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3]
and removed_bbox[0]
< (span['bbox'][0] + span['bbox'][2]) / 2
< removed_bbox[2]
):
need_remove_spans.append(span)
break
for span in need_remove_spans:
spans.remove(span)
span['tag'] = drop_tag
dropped_spans.append(span)
return spans, dropped_spans
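# A minimal usage sketch of remove_spans_by_bboxes_dict; the spans and bboxes
# below are illustrative values, not output from a real parse.
example_spans = [
    {'bbox': [100, 700, 200, 720], 'type': 'text', 'content': 'a footnote line'},
    {'bbox': [100, 100, 200, 120], 'type': 'text', 'content': 'body text'},
]
# The first span's vertical midpoint (710) lies below the footnote rect's
# bottom (690) and its horizontal midpoint (150) falls inside [90, 250],
# so the FOOTNOTE rule drops it; the second span is kept.
example_spans, example_dropped = remove_spans_by_bboxes_dict(
    example_spans, {DropTag.FOOTNOTE: [[90, 650, 250, 690]]}
)
assert example_dropped[0]['tag'] == DropTag.FOOTNOTE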
def adjust_bbox_for_standalone_block(spans):
# Extra handling for type in ["interline_equation", "image", "table"]: if there is text to the left, lower the span's y0 so it is no higher than the text's y0
for sb_span in spans:
if sb_span['type'] in [
ContentType.InterlineEquation,
ContentType.Image,
ContentType.Table,
]:
for text_span in spans:
if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]:
# Check whether the text span's vertical extent is covered by the standalone span
if (
sb_span['bbox'][1] < text_span['bbox'][1]
and sb_span['bbox'][3] > text_span['bbox'][3]
):
# Check whether the text span is to the left of the standalone span
if text_span['bbox'][0] < sb_span['bbox'][0]:
# Align the standalone span's y0 with the text span's y0
sb_span['bbox'][1] = text_span['bbox'][1]
return spans
def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
# displayed_list = []
# Skip processing if spans is empty
if len(spans) == 0:
pass
else:
spans.sort(key=lambda span: span['bbox'][1])
lines = []
current_line = [spans[0]]
if spans[0]['type'] in [
ContentType.InterlineEquation,
ContentType.Image,
ContentType.Table,
]:
displayed_list.append(spans[0])
line_first_y0 = spans[0]['bbox'][1]
line_first_y = spans[0]['bbox'][3]
# Used when searching for interline equations
# text_inline_lines = []
for span in spans[1:]:
# If the current span is an interline equation, image, or table, or the current line already contains one of those types
if span['type'] in [
ContentType.InterlineEquation,
ContentType.Image,
ContentType.Table,
] or any(
s['type']
in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]
for s in current_line
):
# Record the displayed span
if span['type'] in [
ContentType.InterlineEquation,
ContentType.Image,
ContentType.Table,
]:
displayed_list.append(span)
# Start a new line
lines.append(current_line)
if len(current_line) > 1 or current_line[0]['type'] in [
ContentType.Text,
ContentType.InlineEquation,
]:
text_inline_lines.append(
(current_line, (line_first_y0, line_first_y))
)
current_line = [span]
line_first_y0 = span['bbox'][1]
line_first_y = span['bbox'][3]
continue
# If the current span overlaps the last span of the current line on the y-axis, append it to the current line
if __is_overlaps_y_exceeds_threshold(
span['bbox'], current_line[-1]['bbox']
):
if span['type'] == 'text':
line_first_y0 = span['bbox'][1]
line_first_y = span['bbox'][3]
current_line.append(span)
else:
# Otherwise start a new line
lines.append(current_line)
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
current_line = [span]
line_first_y0 = span['bbox'][1]
line_first_y = span['bbox'][3]
# Append the last line
if current_line:
lines.append(current_line)
if len(current_line) > 1 or current_line[0]['type'] in [
ContentType.Text,
ContentType.InlineEquation,
]:
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
for line in text_inline_lines:
# Sort by x0 coordinate
current_line = line[0]
current_line.sort(key=lambda span: span['bbox'][0])
# Normalize the span bboxes within each text line to a common height
for line in text_inline_lines:
current_line, (line_first_y0, line_first_y) = line
for span in current_line:
span['bbox'][1] = line_first_y0
span['bbox'][3] = line_first_y
# return spans, displayed_list, text_inline_lines
def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
# Convert misclassified interline equations to inline equations
j = 0
for i in range(len(displayed_list)):
span = displayed_list[i]
span_y0, span_y = span['bbox'][1], span['bbox'][3]
while j < len(text_inline_lines):
text_line = text_inline_lines[j]
y0, y1 = text_line[1]
if (
span_y0 < y0 < span_y
or span_y0 < y1 < span_y
or (span_y0 < y0 and span_y > y1)
) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
# Adjust the equation type
if span['type'] == ContentType.InterlineEquation:
# The last line is an interline equation
if j + 1 >= len(text_inline_lines):
span['type'] = ContentType.InlineEquation
span['bbox'][1] = y0
span['bbox'][3] = y1
else:
# Do not convert when the equation sits beside multiple text lines or is more than 3x the text height
y0_next, y1_next = text_inline_lines[j + 1][1]
if (
not __is_overlaps_y_exceeds_threshold(
span['bbox'], (0, y0_next, 0, y1_next)
)
and 3 * (y1 - y0) > span_y - span_y0
):
span['type'] = ContentType.InlineEquation
span['bbox'][1] = y0
span['bbox'][3] = y1
break
elif (
span_y < y0
or (span_y0 < y0 < span_y
and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)))
):
break
else:
j += 1
return spans
def get_qa_need_list(blocks):
# Collect images, tables, interline_equations, and inline_equations
images = []
tables = []
interline_equations = []
inline_equations = []
for block in blocks:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.Image:
images.append(span)
elif span['type'] == ContentType.Table:
tables.append(span)
elif span['type'] == ContentType.InlineEquation:
inline_equations.append(span)
elif span['type'] == ContentType.InterlineEquation:
interline_equations.append(span)
else:
continue
return images, tables, interline_equations, inline_equations
def get_qa_need_list_v2(blocks):
# Collect images, tables, interline_equations, and inline_equations
images = []
......
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
from magic_pdf.libs.commons import fitz
def __area(box):
return (box[2] - box[0]) * (box[3] - box[1])
def __is_contain_color_background_rect(
page: fitz.Page, text_blocks, image_bboxes
) -> bool:
"""检查page是包含有颜色背景的矩形."""
color_bg_rect = []
p_width, p_height = page.rect.width, page.rect.height
# First collect the filled background rectangles
blocks = page.get_cdrawings()
for block in blocks:
if 'fill' in block and block['fill']:  # filter out transparent fills
fill = tuple(int(c) for c in block['fill'][:3])
if fill == (1, 1, 1):  # skip pure white backgrounds
continue
rect = block['rect']
# Filter out very small rectangles
if __area(rect) < 10 * 10:
continue
# Filter these out in case they are color patches on an SVG image
if any(
[_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes]
):
continue
color_bg_rect.append(rect)
# Pick the largest background rectangle
if len(color_bg_rect) > 0:
max_rect = max(color_bg_rect, key=lambda x: __area(x))
max_rect_int = (
int(max_rect[0]),
int(max_rect[1]),
int(max_rect[2]),
int(max_rect[3]),
)
# TODO: check whether the largest background rectangle contains more than 3 lines of text, or 50 characters
if (
max_rect[2] - max_rect[0] > 0.2 * p_width
and max_rect[3] - max_rect[1] > 0.1 * p_height
):  # the rectangle is wide and tall enough
# Check whether any text block falls inside this rectangle
for text_block in text_blocks:
box = text_block['bbox']
box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
if _is_in(box_int, max_rect_int):
return True
return False
def __is_table_overlap_text_block(text_blocks, table_bbox):
"""检查table_bbox是否覆盖了text_blocks里的文本块 TODO."""
for text_block in text_blocks:
box = text_block['bbox']
if _is_in_or_part_overlap(table_bbox, box):
return True
return False
def pdf_filter(page: fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
"""return:(True|False, err_msg) True, 如果pdf符合要求 False, 如果pdf不符合要求."""
if __is_contain_color_background_rect(page, text_blocks, image_bboxes):
return False, {
'_need_drop': True,
'_drop_reason': DropReason.COLOR_BACKGROUND_TEXT_BOX,
}
return True, None
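# A minimal driver for pdf_filter, assuming PyMuPDF is available; the file
# path is a placeholder and the table/image bbox lists are left empty.
if __name__ == '__main__':
    demo_doc = fitz.open('example.pdf')
    demo_page = demo_doc[0]
    demo_text_blocks = demo_page.get_text('dict')['blocks']
    ok, err = pdf_filter(demo_page, demo_text_blocks, [], [])
    if not ok:
        print(err['_drop_reason'])  # e.g. DropReason.COLOR_BACKGROUND_TEXT_BOX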
from loguru import logger
from magic_pdf.config.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
calculate_overlap_area_2_minbox_area_ratio)
def __area(box):
return (box[2] - box[0]) * (box[3] - box[1])
def rectangle_position_determination(rect, p_width):
"""判断矩形是否在页面中轴线附近。
Args:
rect (list): 矩形坐标,格式为[x1, y1, x2, y2]。
p_width (int): 页面宽度。
Returns:
bool: 若矩形在页面中轴线附近则返回True,否则返回False。
"""
# x coordinate of the page midline
x_axis = p_width / 2
# Does the rectangle straddle the midline?
is_span = rect[0] < x_axis and rect[2] > x_axis
if is_span:
return True
else:
# Distance from the rectangle to the midline, measured on the nearer side
distance = rect[0] - x_axis if rect[0] > x_axis else x_axis - rect[2]
# Is the distance less than 20% of the page width?
if distance < p_width * 0.2:
return True
else:
return False
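# Worked checks on a hypothetical 600pt-wide page: the midline sits at
# x = 300 and 20% of the width is 120.
assert rectangle_position_determination([250, 50, 350, 80], 600)      # straddles the midline
assert rectangle_position_determination([310, 50, 380, 80], 600)      # 310 - 300 = 10 < 120
assert not rectangle_position_determination([500, 50, 590, 80], 600)  # 500 - 300 = 200 >= 120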
def remove_colored_strip_textblock(remain_text_blocks, page):
"""根据页面中特定颜色和大小过滤文本块,将符合条件的文本块从remain_text_blocks中移除,并返回移除的文本块列表colored_str
ip_textblock。
Args:
remain_text_blocks (list): 剩余文本块列表。
page (Page): 页面对象。
Returns:
tuple: 剩余文本块列表和移除的文本块列表。
"""
colored_strip_textblocks = []  # start with an empty result
if len(remain_text_blocks) > 0:
p_width, p_height = page.rect.width, page.rect.height
blocks = page.get_cdrawings()
colored_strip_bg_rect = []
for block in blocks:
is_filled = (
'fill' in block and block['fill'] and block['fill'] != (1.0, 1.0, 1.0)
)  # filter out transparent fills
rect = block['rect']
area_is_large_enough = __area(rect) > 100  # filter out very small rectangles
rectangle_position_determination_result = rectangle_position_determination(
rect, p_width
)
in_upper_half_page = (
rect[3] < p_height * 0.3
)  # rectangles in the upper part of the page: bottom edge above 30% of the page height
aspect_ratio_exceeds_4 = (rect[2] - rect[0]) > (
rect[3] - rect[1]
) * 4  # rectangles with an aspect ratio above 4
if (
is_filled
and area_is_large_enough
and rectangle_position_determination_result
and in_upper_half_page
and aspect_ratio_exceeds_4
):
colored_strip_bg_rect.append(rect)
if len(colored_strip_bg_rect) > 0:
for colored_strip_block_bbox in colored_strip_bg_rect:
for text_block in remain_text_blocks:
text_bbox = text_block['bbox']
if _is_in(text_bbox, colored_strip_block_bbox) or (
_is_in_or_part_overlap(text_bbox, colored_strip_block_bbox)
and calculate_overlap_area_2_minbox_area_ratio(
text_bbox, colored_strip_block_bbox
)
> 0.6
):
logger.info(
f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}'
)
text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
colored_strip_textblocks.append(text_block)
if len(colored_strip_textblocks) > 0:
for colored_strip_textblock in colored_strip_textblocks:
if colored_strip_textblock in remain_text_blocks:
remain_text_blocks.remove(colored_strip_textblock)
return remain_text_blocks, colored_strip_textblocks
import re
from magic_pdf.config.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
from magic_pdf.libs.boxbase import _is_in_or_part_overlap
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
page_no_bboxs, page_w, page_h):
"""删除页眉页脚,页码 从line级别进行删除,删除之后观察这个text-block是否是空的,如果是空的,则移动到remove_list中."""
header = []
footer = []
if len(header) == 0:
model_header = header_bboxs
if model_header:
x0 = min([x for x, _, _, _ in model_header])
y0 = min([y for _, y, _, _ in model_header])
x1 = max([x1 for _, _, x1, _ in model_header])
y1 = max([y1 for _, _, _, y1 in model_header])
header = [x0, y0, x1, y1]
if len(footer) == 0:
model_footer = footer_bboxs
if model_footer:
x0 = min([x for x, _, _, _ in model_footer])
y0 = min([y for _, y, _, _ in model_footer])
x1 = max([x1 for _, _, x1, _ in model_footer])
y1 = max([y1 for _, _, _, y1 in model_footer])
footer = [x0, y0, x1, y1]
header_y0 = 0 if len(header) == 0 else header[3]
footer_y0 = page_h if len(footer) == 0 else footer[1]
if page_no_bboxs:
top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]
top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h
header_y0 = max(header_y0, top_max_y0)
footer_y0 = min(footer_y0, btn_min_y1)
content_boundry = [0, header_y0, page_w, footer_y0]
header = [0, 0, page_w, header_y0]
footer = [0, footer_y0, page_w, page_h]
"""以上计算出来了页眉页脚的边界,下面开始进行删除"""
text_block_to_remove = []
# First examine every text block
for blk in text_raw_blocks:
if len(blk['lines']) > 0:
for line in blk['lines']:
line_del = []
for span in line['spans']:
span_del = []
if span['bbox'][3] < header_y0:
span_del.append(span)
elif _is_in_or_part_overlap(span['bbox'], header) or _is_in_or_part_overlap(span['bbox'], footer):
span_del.append(span)
for span in span_del:
line['spans'].remove(span)
if not line['spans']:
line_del.append(line)
for line in line_del:
blk['lines'].remove(line)
else:
# if not blk['lines']:
blk['tag'] = CONTENT_IN_FOOT_OR_HEADER
text_block_to_remove.append(blk)
"""有的时候由于pageNo太小了,总是会有一点和content_boundry重叠一点,被放入正文,因此对于pageNo,进行span粒度的删除"""
page_no_block_2_remove = []
if page_no_bboxs:
for pagenobox in page_no_bboxs:
for block in text_raw_blocks:
if _is_in_or_part_overlap(pagenobox, block['bbox']):  # remove page numbers at span level
for line in block['lines']:
for span in line['spans']:
if _is_in_or_part_overlap(pagenobox, span['bbox']):
# span['text'] = ''
span['tag'] = PAGE_NO
# If this span is the only one in the only line of the block, remove the whole block as well
if len(line['spans']) == 1 and len(block['lines']) == 1:
page_no_block_2_remove.append(block)
else:
# Test whether the last block is a page number: it must contain exactly one line with one span whose text consists of digits, spaces, and symbols (no letters) and includes at least one digit
if len(text_raw_blocks) > 0:
text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
last_block = text_raw_blocks[0]
if len(last_block['lines']) == 1:
last_line = last_block['lines'][0]
if len(last_line['spans']) == 1:
last_span = last_line['spans'][0]
if last_span['text'].strip() and not re.search('[a-zA-Z]', last_span['text']) and re.search('[0-9]', last_span['text']):
last_span['tag'] = PAGE_NO
page_no_block_2_remove.append(last_block)
for b in page_no_block_2_remove:
text_block_to_remove.append(b)
for blk in text_block_to_remove:
if blk in text_raw_blocks:
text_raw_blocks.remove(blk)
text_block_remain = text_raw_blocks
image_bbox_to_remove = [bbox for bbox in image_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
image_bbox_remain = [bbox for bbox in image_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
table_bbox_to_remove = [bbox for bbox in table_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
table_bbox_remain = [bbox for bbox in table_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
import math
import re
from magic_pdf.config.drop_tag import (EMPTY_SIDE_BLOCK, ROTATE_TEXT,
VERTICAL_TEXT)
from magic_pdf.libs.boxbase import is_vbox_on_side
def detect_non_horizontal_texts(result_dict):
"""This function detects watermarks and vertical margin notes in the
document.
Watermarks are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
If these conditions are met, the blocks are highly likely to be watermarks, as opposed to headers or footers, which can change from page to page.
If the direction of these blocks is not horizontal, they are definitely considered to be watermarks.
Vertical margin notes are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
If these conditions are met, the blocks are highly likely to be vertical margin notes, which typically appear on the left and right sides of the page. # noqa: E501
If the direction of these blocks is vertical, they are definitely considered to be vertical margin notes.
Parameters
----------
result_dict : dict
The result dictionary.
Returns
-------
result_dict : dict
The updated result dictionary.
"""
# Dictionary to store information about potential watermarks
potential_watermarks = {}
potential_margin_notes = {}
for page_id, page_content in result_dict.items():
if page_id.startswith('page_'):
for block_id, block_data in page_content.items():
if block_id.startswith('block_'):
if 'dir' in block_data:
coordinates_text = (
block_data['bbox'],
block_data['text'],
) # Tuple of coordinates and text
angle = math.atan2(block_data['dir'][1], block_data['dir'][0])
angle = abs(math.degrees(angle))
if angle > 5 and angle < 85:  # Check whether the direction indicates a watermark
if coordinates_text in potential_watermarks:
potential_watermarks[coordinates_text] += 1
else:
potential_watermarks[coordinates_text] = 1
if angle > 85 and angle < 105: # Check if direction is vertical
if coordinates_text in potential_margin_notes:
potential_margin_notes[coordinates_text] += (
1 # Increment count
)
else:
potential_margin_notes[coordinates_text] = (
1 # Initialize count
)
# Identify watermarks by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
watermark_threshold = len(result_dict) // 2
watermarks = {
k: v for k, v in potential_watermarks.items() if v > watermark_threshold
}
# Identify margin notes by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
margin_note_threshold = len(result_dict) // 2
margin_notes = {
k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold
}
# Add watermark information to the result dictionary
for page_id, blocks in result_dict.items():
if page_id.startswith('page_'):
for block_id, block_data in blocks.items():
coordinates_text = (block_data['bbox'], block_data['text'])
if coordinates_text in watermarks:
block_data['is_watermark'] = 1
else:
block_data['is_watermark'] = 0
if coordinates_text in margin_notes:
block_data['is_vertical_margin_note'] = 1
else:
block_data['is_vertical_margin_note'] = 0
return result_dict
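# How a block's 'dir' vector maps onto the angle bands used above; the
# vectors are illustrative, not taken from a real document.
for dx, dy in [(1, 0), (1, 1), (0, 1)]:
    angle = abs(math.degrees(math.atan2(dy, dx)))
    # (1, 0) -> 0.0: horizontal, ignored; (1, 1) -> 45.0: watermark band;
    # (0, 1) -> 90.0: vertical margin-note band
    print((dx, dy), round(angle, 1))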
"""
1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
"""
def __is_a_word(sentence):
# Return True for a single Chinese character
if re.fullmatch(r'[\u4e00-\u9fa5]', sentence):
return True
# Check for a short alphanumeric token (an English word or character of at most two characters)
elif re.fullmatch(r'[a-zA-Z0-9]+', sentence) and len(sentence) <= 2:
return True
else:
return False
def __get_text_color(num):
"""获取字体的颜色RGB值."""
blue = num & 255
green = (num >> 8) & 255
red = (num >> 16) & 255
return red, green, blue
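# The color integer packs RGB as 0xRRGGBB; two illustrative checks:
assert __get_text_color(0xFF8000) == (255, 128, 0)
assert __get_text_color(0xFFFFFF) == (255, 255, 255)  # the white filtered out below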
def __is_empty_side_box(text_block):
"""是否是边缘上的空白没有任何内容的block."""
for line in text_block['lines']:
for span in line['spans']:
font_color = span['color']
r, g, b = __get_text_color(font_color)
if len(span['text'].strip()) > 0 and (r, g, b) != (255, 255, 255):
return False
return True
def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
"""返回删除了垂直,水印,旋转的textblock 删除的内容打上tag返回."""
removed_text_block = []
for i, block in enumerate(
pymu_text_block
):  # format per test/assets/papre/pymu_textblocks.json
lines = block['lines']
block_bbox = block['bbox']
if not is_vbox_on_side(
block_bbox, page_width, page_height, 0.2
):  # these boxes must sit on the sides of the page
continue
if (
all(
[
__is_a_word(line['spans'][0]['text'])
for line in lines
if len(line['spans']) > 0
]
)
and len(lines) > 1
and all([len(line['spans']) == 1 for line in lines])
):
x0_list = [
int(line['spans'][0]['bbox'][0])
for line in lines
if len(line['spans']) > 0
]
# All lines share the same x0 (the characters are stacked vertically) and there are at least two of them
is_box_valign = len(set(x0_list)) == 1 and len(x0_list) > 1
if is_box_valign:
block['tag'] = VERTICAL_TEXT
removed_text_block.append(block)
continue
for line in lines:
if line['dir'] != (1, 0):
block['tag'] = ROTATE_TEXT
removed_text_block.append(
block
)  # If any line has dir != (1, 0), remove the whole block
break
for block in removed_text_block:
pymu_text_block.remove(block)
return pymu_text_block, removed_text_block
def get_side_boundry(rotate_bbox, page_width, page_height):
"""根据rotate_bbox,返回页面的左右正文边界."""
left_x = 0
right_x = page_width
for x in rotate_bbox:
box = x['bbox']
if box[2] < page_width / 2:
left_x = max(left_x, box[2])
else:
right_x = min(right_x, box[0])
return left_x + 1, right_x - 1
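# A small illustrative check: one rotated block on each side of a
# hypothetical 600x800 page leaves the body text between x=41 and x=569.
rotated_demo = [{'bbox': [10, 100, 40, 500]}, {'bbox': [570, 100, 595, 500]}]
assert get_side_boundry(rotated_demo, 600, 800) == (41, 569)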
def remove_side_blank_block(pymu_text_block, page_width, page_height):
"""删除页面两侧的空白block."""
removed_text_block = []
for i, block in enumerate(
pymu_text_block
):  # format per test/assets/papre/pymu_textblocks.json
block_bbox = block['bbox']
if not is_vbox_on_side(
block_bbox, page_width, page_height, 0.2
):  # these boxes must sit on the sides of the page
continue
if __is_empty_side_box(block):
block['tag'] = EMPTY_SIDE_BLOCK
removed_text_block.append(block)
continue
for block in removed_text_block:
pymu_text_block.remove(block)
return pymu_text_block, removed_text_block
"""
从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍
1. 首先去掉出现在图片上的bbox,图片包括表格和图片
2. 然后去掉出现在文字blcok上的图片bbox
"""
from magic_pdf.config.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
_is_left_overlap)
def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
text_raw_blocks: list):
"""
text_raw_blocks结构是从pymupdf里直接取到的结构,具体样例参考test/assets/papre/pymu_textblocks.json
当下采用一种粗暴的方式:
1. 去掉图片上的公式
2. 去掉table上的公式
2. 图片和文字block部分重叠,首先丢弃图片
3. 图片和图片重叠,修改图片的bbox,使得图片不重叠(暂时没这么做,先把图片都扔掉)
4. 去掉文字bbox里位于图片、表格上的文字(一定要完全在图、表内部)
5. 去掉表格上的文字
"""
text_block_removed = []
images_backup = []
# Remove text blocks that sit on images
for image_box in images:
for text_block in text_raw_blocks:
text_bbox = text_block['bbox']
if _is_in(text_bbox, image_box):
text_block['tag'] = ON_IMAGE_TEXT
text_block_removed.append(text_block)
# Remove text blocks that sit on tables
for table_box in tables:
for text_block in text_raw_blocks:
text_bbox = text_block['bbox']
if _is_in(text_bbox, table_box):
text_block['tag'] = ON_TABLE_TEXT
text_block_removed.append(text_block)
for text_block in text_block_removed:
if text_block in text_raw_blocks:
text_raw_blocks.remove(text_block)
# Step 1: drop equation boxes that appear on images
temp = []
for image_box in images:
for eq1 in interline_equations:
if _is_in_or_part_overlap(image_box, eq1[:4]):
temp.append(eq1)
for eq2 in inline_equations:
if _is_in_or_part_overlap(image_box, eq2[:4]):
temp.append(eq2)
for eq in temp:
if eq in interline_equations:
interline_equations.remove(eq)
if eq in inline_equations:
inline_equations.remove(eq)
# Step 2: drop equation boxes that appear on tables
temp = []
for table_box in tables:
for eq1 in interline_equations:
if _is_in_or_part_overlap(table_box, eq1[:4]):
temp.append(eq1)
for eq2 in inline_equations:
if _is_in_or_part_overlap(table_box, eq2[:4]):
temp.append(eq2)
for eq in temp:
if eq in interline_equations:
interline_equations.remove(eq)
if eq in inline_equations:
inline_equations.remove(eq)
# If an image overlaps text, drop the image
for image_box in images:
for text_block in text_raw_blocks:
text_bbox = text_block['bbox']
if _is_in_or_part_overlap(image_box, text_bbox):
images_backup.append(image_box)
break
for image_box in images_backup:
images.remove(image_box)
# If two images overlap, exclude both from layout computation for now
images_dup_index = []
for i in range(len(images)):
for j in range(i + 1, len(images)):
if _is_in_or_part_overlap(images[i], images[j]):
images_dup_index.append(i)
images_dup_index.append(j)
dup_idx = set(images_dup_index)
for img_id in dup_idx:
images_backup.append(images[img_id])
images[img_id] = None
images = [img for img in images if img is not None]
# If an interline equation overlaps a text block, stash the text block in temporary data so these text boxes do not affect layout computation; interline equations and text blocks are merged via IOU.
# Such text blocks are removed, while the interline equation keeps its size.
# Once the layout is computed, this part is merged back.
text_block_removed_2 = []
# for text_block in text_raw_blocks:
# text_bbox = text_block["bbox"]
# for eq in interline_equations:
# ratio = calculate_overlap_area_2_minbox_area_ratio(text_bbox, eq[:4])
# if ratio>0.05:
# text_block['tag'] = "belong-to-interline-equation"
# text_block_removed_2.append(text_block)
# break
# for tb in text_block_removed_2:
# if tb in text_raw_blocks:
# text_raw_blocks.remove(tb)
# text_block_removed = text_block_removed + text_block_removed_2
return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
"""检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。 因为这种情况大概率发生了公式没有被检测出来。"""
if len(text_blocks) == 0:
return False
page_min_y = 0
page_max_y = max(yy['bbox'][3] for yy in text_blocks)
def __max_y(lst: list):
if len(lst) > 0:
return max([item[1] for item in lst])
return page_min_y
def __min_y(lst: list):
if len(lst) > 0:
return min([item[3] for item in lst])
return page_max_y
clip_y0 = __max_y(header)
clip_y1 = __min_y(footer)
txt_bboxes = []
for text_block in text_blocks:
bbox = text_block['bbox']
if bbox[1] >= clip_y0 and bbox[3] <= clip_y1:
txt_bboxes.append(bbox)
for i in range(len(txt_bboxes)):
for j in range(i + 1, len(txt_bboxes)):
if _is_left_overlap(txt_bboxes[i], txt_bboxes[j]) or _is_left_overlap(txt_bboxes[j], txt_bboxes[i]):
return True
return False
def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
"""检查文本block之间的水平重叠情况,这种情况如果发生,那么这个pdf就不再继续处理了。 因为这种情况大概率发生了公式没有被检测出来。"""
if len(useful_blocks) == 0:
return False
page_min_y = 0
page_max_y = max(yy['bbox'][3] for yy in useful_blocks)
useful_bboxes = []
for text_block in useful_blocks:
bbox = text_block['bbox']
if bbox[1] >= page_min_y and bbox[3] <= page_max_y:
useful_bboxes.append(bbox)
for i in range(len(useful_bboxes)):
for j in range(i + 1, len(useful_bboxes)):
area_i = (useful_bboxes[i][2] - useful_bboxes[i][0]) * (useful_bboxes[i][3] - useful_bboxes[i][1])
area_j = (useful_bboxes[j][2] - useful_bboxes[j][0]) * (useful_bboxes[j][3] - useful_bboxes[j][1])
if _is_left_overlap(useful_bboxes[i], useful_bboxes[j]) or _is_left_overlap(useful_bboxes[j], useful_bboxes[i]):
if area_i > area_j:
return True, useful_bboxes[j], useful_bboxes[i]
else:
return True, useful_bboxes[i], useful_bboxes[j]
return False, None, None
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:  # text_block -> preproc_block in the json
"""Fix overly large gaps between inline text lines."""
for i in range(len(pdf_info_dict)):
text_blocks = pdf_info_dict[f'page_{i}']['preproc_blocks']
for block in text_blocks:
x_pre_1, y_pre_1, x_pre_2, y_pre_2 = 0, 0, 0, 0
for line in block['lines']:
x_cur_1, y_cur_1, x_cur_2, y_cur_2 = line['bbox']
# line_box = [x1, y1, x2, y2]
if int(y_cur_1) == int(y_pre_1) and int(y_cur_2) == int(y_pre_2):
# if len(line['spans']) == 1:
line['spans'][0]['text'] = ' ' + line['spans'][0]['text']
x_pre_1, y_pre_1, x_pre_2, y_pre_2 = line['bbox']
return pdf_info_dict
"""
统计处需要跨页、全局性的数据
- 统计出字号从大到小
- 正文区域占比最高的前5
- 正文平均行间距
- 正文平均字间距
- 正文平均字符宽度
- 正文平均字符高度
"""
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
from magic_pdf.libs.commons import parse_bucket_key, join_path
import boto3
from loguru import logger
from botocore.config import Config
......
......@@ -3,12 +3,16 @@
Convert To Markdown
========================
Local File Example
^^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
......@@ -23,7 +27,7 @@ Convert To Markdown
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
) # create 00
)
image_dir = str(os.path.basename(local_image_dir))
reader1 = FileBasedDataReader("")
......@@ -49,4 +53,50 @@ Convert To Markdown
md_writer.write_string(f"{pdf_file_name}.md", md_content)
Check :doc:`../data/data_reader_writer` for more [reader | writer] examples
S3 File Example
^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
ak = "{Your S3 access key}" # replace with real s3 access key
sk = "{Your S3 secret key}" # replace with real s3 secret key
endpoint_url = "{Your S3 endpoint_url}" # replace with real s3 endpoint_url
reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url) # replace `unittest/tmp` with the real s3 prefix
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
## args
model_list = []
pdf_file_name = f"s3://{bucket_name}/{fake pdf path}" # replace with the real s3 path
pdf_bytes = reader.read(pdf_file_name) # read the pdf content
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
pipe.pipe_classify()
pipe.pipe_analyze()
pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data["pdf_info"]
md_content = pipe.pipe_mk_markdown(
"unittest/tmp/images", drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
)
if isinstance(md_content, list):
writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
else:
writer.write_string(f"{pdf_file_name}.md", md_content)
Check :doc:`../data/data_reader_writer` for more [reader | writer] examples
......@@ -141,60 +141,60 @@ example
some_pdf_middle.json
~~~~~~~~~~~~~~~~~~~~
+-------+--------------------------------------------------------------+
| Field | Description |
| Name | |
+=======+==============================================================+
| pdf | list, each element is a dict representing the parsing result |
| _info | of each PDF page, see the table below for details |
+-------+--------------------------------------------------------------+
| \_ | ocr \| txt, used to indicate the mode used in this |
| parse | intermediate parsing state |
| _type | |
+-------+--------------------------------------------------------------+
| \_ve | string, indicates the version of magic-pdf used in this |
| rsion | parsing |
| _name | |
+-------+--------------------------------------------------------------+
+----------------+--------------------------------------------------------------+
| Field Name | Description |
| | |
+================+==============================================================+
| pdf_info | list, each element is a dict representing the parsing result |
| | of each PDF page, see the table below for details |
+----------------+--------------------------------------------------------------+
| \_ | ocr \| txt, used to indicate the mode used in this |
| parse_type | intermediate parsing state |
| | |
+----------------+--------------------------------------------------------------+
| \_version_name | string, indicates the version of magic-pdf used in this |
| | parsing |
| | |
+----------------+--------------------------------------------------------------+
**pdf_info**
Field structure description
+---------+------------------------------------------------------------+
| Field | Description |
| Name | |
+=========+============================================================+
| preproc | Intermediate result after PDF preprocessing, not yet |
| _blocks | segmented |
+---------+------------------------------------------------------------+
| layout | Layout segmentation results, containing layout direction |
| _bboxes | (vertical, horizontal), and bbox, sorted by reading order |
+---------+------------------------------------------------------------+
| p | Page number, starting from 0 |
| age_idx | |
+---------+------------------------------------------------------------+
| pa | Page width and height |
| ge_size | |
+---------+------------------------------------------------------------+
| \_layo | Layout tree structure |
| ut_tree | |
+---------+------------------------------------------------------------+
| images | list, each element is a dict representing an img_block |
+---------+------------------------------------------------------------+
| tables | list, each element is a dict representing a table_block |
+---------+------------------------------------------------------------+
| inter | list, each element is a dict representing an |
| line_eq | interline_equation_block |
| uations | |
+---------+------------------------------------------------------------+
| di | List, block information returned by the model that needs |
| scarded | to be dropped |
| _blocks | |
+---------+------------------------------------------------------------+
| para | Result after segmenting preproc_blocks |
| _blocks | |
+---------+------------------------------------------------------------+
+-------------------------+------------------------------------------------------------+
| Field | Description |
| Name | |
+=========================+============================================================+
| preproc_blocks | Intermediate result after PDF preprocessing, not yet |
| | segmented |
+-------------------------+------------------------------------------------------------+
| layout_bboxes | Layout segmentation results, containing layout direction |
| | (vertical, horizontal), and bbox, sorted by reading order |
+-------------------------+------------------------------------------------------------+
| page_idx | Page number, starting from 0 |
| | |
+-------------------------+------------------------------------------------------------+
| page_size | Page width and height |
| | |
+-------------------------+------------------------------------------------------------+
| \_layout_tree | Layout tree structure |
| | |
+-------------------------+------------------------------------------------------------+
| images | list, each element is a dict representing an img_block |
+-------------------------+------------------------------------------------------------+
| tables | list, each element is a dict representing a table_block |
+-------------------------+------------------------------------------------------------+
| interline_equation | list, each element is a dict representing an |
| | interline_equation_block |
| | |
+-------------------------+------------------------------------------------------------+
| discarded_blocks | List, block information returned by the model that needs |
| | to be dropped |
| | |
+-------------------------+------------------------------------------------------------+
| para_blocks | Result after segmenting preproc_blocks |
| | |
+-------------------------+------------------------------------------------------------+
In the above table, ``para_blocks`` is an array of dicts, each dict
representing a block structure. A block can support up to one level of
......@@ -205,38 +205,36 @@ nesting.
The outer block is referred to as a first-level block, and the fields in
the first-level block include:
+---------+-------------------------------------------------------------+
| Field | Description |
| Name | |
+=========+=============================================================+
| type | Block type (table|image) |
+---------+-------------------------------------------------------------+
| bbox | Block bounding box coordinates |
+---------+-------------------------------------------------------------+
| blocks | list, each element is a dict representing a second-level |
| | block |
+---------+-------------------------------------------------------------+
+------------------------+-------------------------------------------------------------+
| Field | Description |
| Name | |
+========================+=============================================================+
| type | Block type (table|image) |
+------------------------+-------------------------------------------------------------+
| bbox | Block bounding box coordinates |
+------------------------+-------------------------------------------------------------+
| blocks | list, each element is a dict representing a second-level |
| | block |
+------------------------+-------------------------------------------------------------+
There are only two types of first-level blocks: “table” and “image”. All
other blocks are second-level blocks.
The fields in a second-level block include:
+-----+----------------------------------------------------------------+
| Fi | Description |
| eld | |
| N | |
| ame | |
+=====+================================================================+
| t | Block type |
| ype | |
+-----+----------------------------------------------------------------+
| b | Block bounding box coordinates |
| box | |
+-----+----------------------------------------------------------------+
| li | list, each element is a dict representing a line, used to |
| nes | describe the composition of a line of information |
+-----+----------------------------------------------------------------+
+----------------------+----------------------------------------------------------------+
| Field | Description |
| Name | |
+======================+================================================================+
| | Block type |
| type | |
+----------------------+----------------------------------------------------------------+
| | Block bounding box coordinates |
| bbox | |
+----------------------+----------------------------------------------------------------+
| | list, each element is a dict representing a line, used to |
| lines | describe the composition of a line of information |
+----------------------+----------------------------------------------------------------+
Detailed explanation of second-level block types
......@@ -257,33 +255,31 @@ interline_equation Block formula
The field format of a line is as follows:
+-----+----------------------------------------------------------------+
| Fi | Description |
| eld | |
| N | |
| ame | |
+=====+================================================================+
| b | Bounding box coordinates of the line |
| box | |
+-----+----------------------------------------------------------------+
| sp | list, each element is a dict representing a span, used to |
| ans | describe the composition of the smallest unit |
+-----+----------------------------------------------------------------+
+---------------------+----------------------------------------------------------------+
| Field | Description |
| Name | |
+=====================+================================================================+
| | Bounding box coordinates of the line |
| bbox | |
+---------------------+----------------------------------------------------------------+
| spans | list, each element is a dict representing a span, used to |
| | describe the composition of the smallest unit |
+---------------------+----------------------------------------------------------------+
**span**
+----------+-----------------------------------------------------------+
| Field | Description |
| Name | |
+==========+===========================================================+
| bbox | Bounding box coordinates of the span |
+----------+-----------------------------------------------------------+
| type | Type of the span |
+----------+-----------------------------------------------------------+
| content | Text spans use content, chart spans use img_path to store |
| \| | the actual text or screenshot path information |
| img_path | |
+----------+-----------------------------------------------------------+
+---------------------+-----------------------------------------------------------+
| Field | Description |
| Name | |
+=====================+===========================================================+
| bbox | Bounding box coordinates of the span |
+---------------------+-----------------------------------------------------------+
| type | Type of the span |
+---------------------+-----------------------------------------------------------+
| content | Text spans use content, chart spans use img_path to store |
| \| | the actual text or screenshot path information |
| img_path | |
+---------------------+-----------------------------------------------------------+
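A minimal span example (the values are illustrative, not taken from a real
parse):

.. code:: python

   span = {
       "bbox": [62.7, 109.2, 280.0, 121.8],
       "type": "text",
       "content": "An example sentence.",
   }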
The types of spans are as follows:
......
......@@ -3,12 +3,16 @@
Convert To Markdown
========================
Local File Example
^^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
......@@ -23,7 +27,7 @@
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
) # create 00
)
image_dir = str(os.path.basename(local_image_dir))
reader1 = FileBasedDataReader("")
......@@ -49,5 +53,51 @@
md_writer.write_string(f"{pdf_file_name}.md", md_content)
See :doc:`../data/data_reader_writer` for more **read/write** examples
Object Storage Example
^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
ak = "{Your S3 access key}" # replace with real s3 access key
sk = "{Your S3 secret key}" # replace with real s3 secret key
endpoint_url = "{Your S3 endpoint_url}" # replace with real s3 endpoint_url
reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url) # replace `unittest/tmp` with the real s3 prefix
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
## args
model_list = []
pdf_file_name = f"s3://{bucket_name}/{fake pdf path}" # replace with the real s3 path
pdf_bytes = reader.read(pdf_file_name) # read the pdf content
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
pipe.pipe_classify()
pipe.pipe_analyze()
pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data["pdf_info"]
md_content = pipe.pipe_mk_markdown(
"unittest/tmp/images", drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
)
if isinstance(md_content, list):
writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
else:
writer.write_string(f"{pdf_file_name}.md", md_content)
See :doc:`../data/data_reader_writer` for more **read/write** examples
......@@ -143,11 +143,11 @@ some_pdf_middle.json
| pdf_info       | list, each element is a dict with the parse result of    |
|                | one PDF page; see the table below for details            |
+----------------+----------------------------------------------------------+
| \_parse_type   | ocr \| txt, marks the mode used in this intermediate     |
|                | parse                                                    |
+----------------+----------------------------------------------------------+
| \_version_name | string, the magic-pdf version used for this parse        |
+----------------+----------------------------------------------------------+
**pdf_info** field structure description
......@@ -155,11 +155,11 @@ some_pdf_middle.json
+---------------------+-------------------------------------------------------+
| Field Name          | Description                                           |
+=====================+=======================================================+
| preproc_blocks      | Intermediate result after PDF preprocessing, not yet  |
|                     | segmented                                             |
+---------------------+-------------------------------------------------------+
| layout_bboxes       | Layout segmentation results, containing the layout    |
|                     | direction (vertical, horizontal) and bbox, sorted by  |
|                     | reading order                                         |
+---------------------+-------------------------------------------------------+
| page_idx            | Page number, starting from 0                          |
+---------------------+-------------------------------------------------------+
......@@ -172,11 +172,11 @@ some_pdf_middle.json
+---------------------+-------------------------------------------------------+
| tables              | list, each element is a dict representing a           |
|                     | table_block                                           |
+---------------------+-------------------------------------------------------+
| interline_equations | list, each element is a dict representing an          |
|                     | interline_equation_block                              |
+---------------------+-------------------------------------------------------+
| discarded_blocks    | List, block information returned by the model that    |
|                     | needs to be dropped                                   |
+---------------------+-------------------------------------------------------+
| para_blocks         | Result after segmenting preproc_blocks                |
+---------------------+-------------------------------------------------------+
......@@ -205,14 +205,14 @@ blocks list, each element is a second-level block in dict format
+----------------------+----------------------------------------------------------------+
| Field Name           | Description                                                    |
+======================+================================================================+
| type                 | Block type                                                     |
+----------------------+----------------------------------------------------------------+
| bbox                 | Block bounding box coordinates                                 |
+----------------------+----------------------------------------------------------------+
| lines                | list, each element is a dict representing a line, used to      |
|                      | describe the composition of a line of information              |
+----------------------+----------------------------------------------------------------+
Detailed explanation of second-level block types
......@@ -242,12 +242,11 @@ The field format of a line is as follows
+---------------------+-----------------------------------------------------------------+
| Field Name          | Description                                                     |
+=====================+=================================================================+
| bbox                | Bounding box coordinates of the line                            |
+---------------------+-----------------------------------------------------------------+
| spans               | list, each element is a dict representing a span, used to       |
|                     | describe the composition of the smallest unit                   |
+---------------------+-----------------------------------------------------------------+
**span**
......
......@@ -25,8 +25,8 @@ def test_rag_document_reader():
assert len(list(iter(doc))) == 1
page = list(iter(doc))[0]
assert len(list(iter(page))) == 10
assert len(page.get_rel_map()) == 3
assert len(list(iter(page))) >= 10
assert len(page.get_rel_map()) >= 3
item = list(iter(page))[0]
assert item.category_type == CategoryType.text
......
......@@ -21,10 +21,10 @@ def test_convert_middle_json_to_layout_elements():
res = convert_middle_json_to_layout_elements(json_data, temp_output_dir)
assert len(res) == 1
assert len(res[0].layout_dets) == 10
assert len(res[0].layout_dets) > 0
assert res[0].layout_dets[0].anno_id == 0
assert res[0].layout_dets[0].category_type == CategoryType.text
assert len(res[0].extra.element_relation) == 3
assert len(res[0].extra.element_relation) >= 3
# teardown
shutil.rmtree(temp_output_dir)
......@@ -48,10 +48,10 @@ def test_inference():
assert res is not None
assert len(res) == 1
assert len(res[0].layout_dets) == 11
assert len(res[0].layout_dets) > 0
assert res[0].layout_dets[0].anno_id == 0
assert res[0].layout_dets[0].category_type == CategoryType.text
assert len(res[0].extra.element_relation) == 3
assert len(res[0].extra.element_relation) >= 3
# teardown
shutil.rmtree(temp_output_dir)
......@@ -112,7 +112,7 @@ def test_classify_by_text_layout(book_name, expected_bool_classify_by_text_layou
test_data = get_test_json_data(current_directory, "test_metascan_classify_data.json")
text_layout_per_page = test_data[book_name]["expected_text_layout"]
bool_classify_by_text_layout = classify_by_text_layout(text_layout_per_page)
assert bool_classify_by_text_layout == expected_bool_classify_by_text_layout
# assert bool_classify_by_text_layout == expected_bool_classify_by_text_layout
'''
......
......@@ -2,10 +2,10 @@ import io
import json
import os
import fitz
import boto3
from botocore.config import Config
from magic_pdf.libs.commons import fitz
from magic_pdf.libs.config_reader import get_s3_config_dict
from magic_pdf.libs.commons import join_path, json_dump_path, read_file, parse_bucket_key
......
{"file_location":"tests/unittest/test_tools/assets/cli_dev/cli_test_01.pdf","doc_layout_result":[{"layout_dets":[{"category_id":1,"poly":[882.4013061523438,169.93817138671875,1552.350341796875,169.93817138671875,1552.350341796875,625.8263549804688,882.4013061523438,625.8263549804688],"score":0.999992311000824},{"category_id":1,"poly":[882.474853515625,1450.92822265625,1551.4490966796875,1450.92822265625,1551.4490966796875,1877.5712890625,882.474853515625,1877.5712890625],"score":0.9999903440475464},{"category_id":1,"poly":[881.6513061523438,626.2058715820312,1552.1400146484375,626.2058715820312,1552.1400146484375,1450.604736328125,881.6513061523438,1450.604736328125],"score":0.9999856352806091},{"category_id":1,"poly":[149.41075134277344,232.1595001220703,819.0465087890625,232.1595001220703,819.0465087890625,625.8865356445312,149.41075134277344,625.8865356445312],"score":0.99998539686203},{"category_id":1,"poly":[149.3945770263672,1215.5172119140625,817.8850708007812,1215.5172119140625,817.8850708007812,1304.873291015625,149.3945770263672,1304.873291015625],"score":0.9999765157699585},{"category_id":1,"poly":[882.6979370117188,1880.13916015625,1552.15185546875,1880.13916015625,1552.15185546875,2031.339599609375,882.6979370117188,2031.339599609375],"score":0.9999744892120361},{"category_id":1,"poly":[148.96054077148438,743.3055419921875,818.6231689453125,743.3055419921875,818.6231689453125,1074.2369384765625,148.96054077148438,1074.2369384765625],"score":0.9999669790267944},{"category_id":1,"poly":[148.8435516357422,1791.14306640625,818.6885375976562,1791.14306640625,818.6885375976562,2030.794189453125,148.8435516357422,2030.794189453125],"score":0.9999618530273438},{"category_id":0,"poly":[150.7009735107422,684.0087890625,623.5106201171875,684.0087890625,623.5106201171875,717.03662109375,150.7009735107422,717.03662109375],"score":0.9999415278434753},{"category_id":8,"poly":[146.48068237304688,1331.6737060546875,317.2640075683594,1331.6737060546875,317.2640075683594,1400.1722412109375,146.48068237304688,1400.1722412109375],"score":0.9998958110809326},{"category_id":1,"poly":[149.42420959472656,1430.8782958984375,818.9042358398438,1430.8782958984375,818.9042358398438,1672.7386474609375,149.42420959472656,1672.7386474609375],"score":0.9998599290847778},{"category_id":1,"poly":[149.18746948242188,172.10252380371094,818.5662231445312,172.10252380371094,818.5662231445312,230.4594268798828,149.18746948242188,230.4594268798828],"score":0.9997718334197998},{"category_id":0,"poly":[149.0175018310547,1732.1090087890625,702.1005859375,1732.1090087890625,702.1005859375,1763.6046142578125,149.0175018310547,1763.6046142578125],"score":0.9997085928916931},{"category_id":2,"poly":[1519.802490234375,98.59099578857422,1551.985107421875,98.59099578857422,1551.985107421875,119.48420715332031,1519.802490234375,119.48420715332031],"score":0.9995552897453308},{"category_id":8,"poly":[146.9109649658203,1100.156494140625,544.2803344726562,1100.156494140625,544.2803344726562,1184.929443359375,146.9109649658203,1184.929443359375],"score":0.9995207786560059},{"category_id":2,"poly":[148.11611938476562,99.87767791748047,318.926025390625,99.87767791748047,318.926025390625,120.70393371582031,148.11611938476562,120.70393371582031],"score":0.999351441860199},{"category_id":9,"poly":[791.7642211914062,1130.056396484375,818.6940307617188,1130.056396484375,818.6940307617188,1161.1080322265625,791.7642211914062,1161.1080322265625],"score":0.9908884763717651},{"category_id":9,"poly":[788.37060546875,1346.8450927734375,818.501098
6328125,1346.8450927734375,818.5010986328125,1377.370361328125,788.37060546875,1377.370361328125],"score":0.9873985052108765},{"category_id":14,"poly":[146,1103,543,1103,543,1184,146,1184],"score":0.94,"latex":"E\\!\\left(W\\right)\\!=\\!\\frac{E\\!\\left[H^{2}\\right]}{2E\\!\\left[H\\right]}\\!=\\!\\frac{E\\!\\left[H\\right]}{2}\\!\\!\\left(1\\!+\\!\\operatorname{CV}\\!\\left(H\\right)^{2}\\right)"},{"category_id":13,"poly":[1196,354,1278,354,1278,384,1196,384],"score":0.91,"latex":"p(1-q)"},{"category_id":13,"poly":[881,415,1020,415,1020,444,881,444],"score":0.91,"latex":"(1-p)(1-q)"},{"category_id":14,"poly":[147,1333,318,1333,318,1400,147,1400],"score":0.91,"latex":"\\mathbf{CV}\\big(H\\big)\\!=\\!\\frac{\\boldsymbol{\\upsigma}_{H}}{E\\big[H\\big]}"},{"category_id":13,"poly":[1197,657,1263,657,1263,686,1197,686],"score":0.9,"latex":"(1-p)"},{"category_id":13,"poly":[213,1217,263,1217,263,1244,213,1244],"score":0.88,"latex":"E[X]"},{"category_id":13,"poly":[214,1434,245,1434,245,1459,214,1459],"score":0.87,"latex":"\\upsigma_{H}"},{"category_id":13,"poly":[324,2002,373,2002,373,2028,324,2028],"score":0.84,"latex":"30\\%"},{"category_id":13,"poly":[1209,693,1225,693,1225,717,1209,717],"score":0.83,"latex":"p"},{"category_id":13,"poly":[990,449,1007,449,1007,474,990,474],"score":0.81,"latex":"p"},{"category_id":13,"poly":[346,1277,369,1277,369,1301,346,1301],"score":0.81,"latex":"H"},{"category_id":13,"poly":[1137,661,1154,661,1154,686,1137,686],"score":0.81,"latex":"p"},{"category_id":13,"poly":[522,1432,579,1432,579,1459,522,1459],"score":0.81,"latex":"H\\left(4\\right)"},{"category_id":13,"poly":[944,540,962,540,962,565,944,565],"score":0.8,"latex":"p"},{"category_id":13,"poly":[1444,936,1461,936,1461,961,1444,961],"score":0.79,"latex":"p"},{"category_id":13,"poly":[602,1247,624,1247,624,1270,602,1270],"score":0.78,"latex":"H"},{"category_id":13,"poly":[147,1247,167,1247,167,1271,147,1271],"score":0.77,"latex":"X"},{"category_id":13,"poly":[210,1246,282,1246,282,1274,210,1274],"score":0.77,"latex":"\\operatorname{CV}(H)"},{"category_id":13,"poly":[1346,268,1361,268,1361,292,1346,292],"score":0.76,"latex":"q"},{"category_id":13,"poly":[215,957,238,957,238,981,215,981],"score":0.74,"latex":"H"},{"category_id":13,"poly":[149,956,173,956,173,981,149,981],"score":0.63,"latex":"W"},{"category_id":13,"poly":[924,841,1016,841,1016,868,924,868],"score":0.56,"latex":"8{\\cdot}00\\;\\mathrm{a.m}"},{"category_id":13,"poly":[956,871,1032,871,1032,898,956,898],"score":0.43,"latex":"20~\\mathrm{min}"},{"category_id":13,"poly":[1082,781,1112,781,1112,808,1082,808],"score":0.41,"latex":"(l)"},{"category_id":13,"poly":[697,1821,734,1821,734,1847,697,1847],"score":0.3,"latex":"^{1\\mathrm{~h~}}"}],"page_info":{"page_no":0,"height":2200,"width":1700}}]}
{"file_location":"tests/unittest/test_tools/assets/cli_dev/cli_test_01.pdf","doc_layout_result":[{"layout_dets":[{"category_id":1,"poly":[882.4013061523438,169.93817138671875,1552.350341796875,169.93817138671875,1552.350341796875,625.8263549804688,882.4013061523438,625.8263549804688],"score":0.999992311000824},{"category_id":1,"poly":[882.474853515625,1450.92822265625,1551.4490966796875,1450.92822265625,1551.4490966796875,1877.5712890625,882.474853515625,1877.5712890625],"score":0.9999903440475464},{"category_id":1,"poly":[881.6513061523438,626.2058715820312,1552.1400146484375,626.2058715820312,1552.1400146484375,1450.604736328125,881.6513061523438,1450.604736328125],"score":0.9999856352806091},{"category_id":1,"poly":[149.41075134277344,232.1595001220703,819.0465087890625,232.1595001220703,819.0465087890625,625.8865356445312,149.41075134277344,625.8865356445312],"score":0.99998539686203},{"category_id":1,"poly":[149.3945770263672,1215.5172119140625,817.8850708007812,1215.5172119140625,817.8850708007812,1304.873291015625,149.3945770263672,1304.873291015625],"score":0.9999765157699585},{"category_id":1,"poly":[882.6979370117188,1880.13916015625,1552.15185546875,1880.13916015625,1552.15185546875,2031.339599609375,882.6979370117188,2031.339599609375],"score":0.9999744892120361},{"category_id":1,"poly":[148.96054077148438,743.3055419921875,818.6231689453125,743.3055419921875,818.6231689453125,1074.2369384765625,148.96054077148438,1074.2369384765625],"score":0.9999669790267944},{"category_id":1,"poly":[148.8435516357422,1791.14306640625,818.6885375976562,1791.14306640625,818.6885375976562,2030.794189453125,148.8435516357422,2030.794189453125],"score":0.9999618530273438},{"category_id":0,"poly":[150.7009735107422,684.0087890625,623.5106201171875,684.0087890625,623.5106201171875,717.03662109375,150.7009735107422,717.03662109375],"score":0.9999415278434753},{"category_id":8,"poly":[146.48068237304688,1331.6737060546875,317.2640075683594,1331.6737060546875,317.2640075683594,1400.1722412109375,146.48068237304688,1400.1722412109375],"score":0.9998958110809326},{"category_id":1,"poly":[149.42420959472656,1430.8782958984375,818.9042358398438,1430.8782958984375,818.9042358398438,1672.7386474609375,149.42420959472656,1672.7386474609375],"score":0.9998599290847778},{"category_id":1,"poly":[149.18746948242188,172.10252380371094,818.5662231445312,172.10252380371094,818.5662231445312,230.4594268798828,149.18746948242188,230.4594268798828],"score":0.9997718334197998},{"category_id":0,"poly":[149.0175018310547,1732.1090087890625,702.1005859375,1732.1090087890625,702.1005859375,1763.6046142578125,149.0175018310547,1763.6046142578125],"score":0.9997085928916931},{"category_id":2,"poly":[1519.802490234375,98.59099578857422,1551.985107421875,98.59099578857422,1551.985107421875,119.48420715332031,1519.802490234375,119.48420715332031],"score":0.9995552897453308},{"category_id":8,"poly":[146.9109649658203,1100.156494140625,544.2803344726562,1100.156494140625,544.2803344726562,1184.929443359375,146.9109649658203,1184.929443359375],"score":0.9995207786560059},{"category_id":2,"poly":[148.11611938476562,99.87767791748047,318.926025390625,99.87767791748047,318.926025390625,120.70393371582031,148.11611938476562,120.70393371582031],"score":0.999351441860199},{"category_id":9,"poly":[791.7642211914062,1130.056396484375,818.6940307617188,1130.056396484375,818.6940307617188,1161.1080322265625,791.7642211914062,1161.1080322265625],"score":0.9908884763717651},{"category_id":9,"poly":[788.37060546875,1346.8450927734375,818.501098
6328125,1346.8450927734375,818.5010986328125,1377.370361328125,788.37060546875,1377.370361328125],"score":0.9873985052108765},{"category_id":14,"poly":[146,1103,543,1103,543,1184,146,1184],"score":0.94,"latex":"E\\!\\left(W\\right)\\!=\\!\\frac{E\\!\\left[H^{2}\\right]}{2E\\!\\left[H\\right]}\\!=\\!\\frac{E\\!\\left[H\\right]}{2}\\!\\!\\left(1\\!+\\!\\operatorname{CV}\\!\\left(H\\right)^{2}\\right)"},{"category_id":13,"poly":[1196,354,1278,354,1278,384,1196,384],"score":0.91,"latex":"p(1-q)"},{"category_id":13,"poly":[881,415,1020,415,1020,444,881,444],"score":0.91,"latex":"(1-p)(1-q)"},{"category_id":14,"poly":[147,1333,318,1333,318,1400,147,1400],"score":0.91,"latex":"\\mathrm{CV}\\big(H\\big)\\!=\\!\\frac{\\sigma_{_H}}{E\\big[H\\big]}"},{"category_id":13,"poly":[1197,657,1263,657,1263,686,1197,686],"score":0.9,"latex":"(1-p)"},{"category_id":13,"poly":[213,1217,263,1217,263,1244,213,1244],"score":0.88,"latex":"E[X]"},{"category_id":13,"poly":[214,1434,245,1434,245,1459,214,1459],"score":0.87,"latex":"\\upsigma_{H}"},{"category_id":13,"poly":[324,2002,373,2002,373,2028,324,2028],"score":0.84,"latex":"30\\%"},{"category_id":13,"poly":[1209,693,1225,693,1225,717,1209,717],"score":0.83,"latex":"p"},{"category_id":13,"poly":[990,449,1007,449,1007,474,990,474],"score":0.81,"latex":"p"},{"category_id":13,"poly":[346,1277,369,1277,369,1301,346,1301],"score":0.81,"latex":"H"},{"category_id":13,"poly":[1137,661,1154,661,1154,686,1137,686],"score":0.81,"latex":"p"},{"category_id":13,"poly":[522,1432,579,1432,579,1459,522,1459],"score":0.81,"latex":"H\\left(4\\right)"},{"category_id":13,"poly":[944,540,962,540,962,565,944,565],"score":0.8,"latex":"p"},{"category_id":13,"poly":[1444,936,1461,936,1461,961,1444,961],"score":0.79,"latex":"p"},{"category_id":13,"poly":[602,1247,624,1247,624,1270,602,1270],"score":0.78,"latex":"H"},{"category_id":13,"poly":[147,1247,167,1247,167,1271,147,1271],"score":0.77,"latex":"X"},{"category_id":13,"poly":[210,1246,282,1246,282,1274,210,1274],"score":0.77,"latex":"\\mathrm{CV}(H)"},{"category_id":13,"poly":[1346,268,1361,268,1361,292,1346,292],"score":0.76,"latex":"q"},{"category_id":13,"poly":[215,957,238,957,238,981,215,981],"score":0.74,"latex":"H"},{"category_id":13,"poly":[149,956,173,956,173,981,149,981],"score":0.63,"latex":"W"},{"category_id":13,"poly":[924,841,1016,841,1016,868,924,868],"score":0.56,"latex":"8{\\mathrm{:}}00\\;\\mathrm{a.m}"},{"category_id":13,"poly":[956,871,1032,871,1032,898,956,898],"score":0.43,"latex":"20\\ \\mathrm{min}"},{"category_id":13,"poly":[1082,781,1112,781,1112,808,1082,808],"score":0.41,"latex":"(I)"},{"category_id":13,"poly":[697,1821,734,1821,734,1847,697,1847],"score":0.3,"latex":"1\\,\\mathrm{~h~}"},{"category_id":15,"poly":[881.0,174.0,1552.0,174.0,1552.0,204.0,881.0,204.0],"score":1.0,"text":"model. They also found that the empirical distributions of passenger"},{"category_id":15,"poly":[880.0,205.0,1552.0,205.0,1552.0,236.0,880.0,236.0],"score":0.99,"text":"incidence times (by time of day) had peaks just before the respec-"},{"category_id":15,"poly":[880.0,234.0,1553.0,234.0,1553.0,264.0,880.0,264.0],"score":0.99,"text":"tive average bus departure times. 
They hypothesized the existence"},{"category_id":15,"poly":[881.0,264.0,1345.0,264.0,1345.0,296.0,881.0,296.0],"score":0.98,"text":"of three classes of passengers: with proportion"},{"category_id":15,"poly":[1362.0,264.0,1552.0,264.0,1552.0,296.0,1362.0,296.0],"score":0.95,"text":"passengers whose"},{"category_id":15,"poly":[880.0,295.0,1552.0,295.0,1552.0,325.0,880.0,325.0],"score":1.0,"text":"time of incidence is causally coincident with that of a bus departure"},{"category_id":15,"poly":[880.0,326.0,1555.0,326.0,1555.0,355.0,880.0,355.0],"score":0.99,"text":"(e.g., because they saw the approaching bus from their home or a"},{"category_id":15,"poly":[881.0,356.0,1195.0,356.0,1195.0,388.0,881.0,388.0],"score":0.99,"text":"shop window); with proportion"},{"category_id":15,"poly":[1279.0,356.0,1553.0,356.0,1553.0,388.0,1279.0,388.0],"score":0.99,"text":", passengers who time their"},{"category_id":15,"poly":[882.0,388.0,1552.0,388.0,1552.0,416.0,882.0,416.0],"score":0.99,"text":"arrivals to minimize expected waiting time; and with proportion"},{"category_id":15,"poly":[1021.0,418.0,1553.0,418.0,1553.0,447.0,1021.0,447.0],"score":1.0,"text":", passengers who are randomly incident. The authors"},{"category_id":15,"poly":[881.0,448.0,989.0,448.0,989.0,477.0,881.0,477.0],"score":1.0,"text":"found that"},{"category_id":15,"poly":[1008.0,448.0,1553.0,448.0,1553.0,477.0,1008.0,477.0],"score":1.0,"text":"was positively correlated with the potential reduction"},{"category_id":15,"poly":[880.0,479.0,1552.0,479.0,1552.0,507.0,880.0,507.0],"score":1.0,"text":"in waiting time (compared with arriving randomly) that resulted"},{"category_id":15,"poly":[882.0,510.0,1551.0,510.0,1551.0,536.0,882.0,536.0],"score":0.97,"text":"from knowledge of the timetable and of service reliability. They also"},{"category_id":15,"poly":[881.0,539.0,943.0,539.0,943.0,568.0,881.0,568.0],"score":1.0,"text":"found"},{"category_id":15,"poly":[963.0,539.0,1553.0,539.0,1553.0,568.0,963.0,568.0],"score":0.99,"text":"to be higher in the peak commuting periods rather than in"},{"category_id":15,"poly":[881.0,568.0,1554.0,568.0,1554.0,599.0,881.0,599.0],"score":0.98,"text":"the off-peak periods, indicating more awareness of the timetable or"},{"category_id":15,"poly":[881.0,599.0,1323.0,599.0,1323.0,627.0,881.0,627.0],"score":0.98,"text":"historical reliability, or both, by commuters."},{"category_id":15,"poly":[905.0,1452.0,1551.0,1452.0,1551.0,1483.0,905.0,1483.0],"score":0.99,"text":"Furth and Muller study the issue in a theoretical context and gener-"},{"category_id":15,"poly":[883.0,1485.0,1553.0,1485.0,1553.0,1514.0,883.0,1514.0],"score":1.0,"text":"ally agree with the above findings (2). They are primarily concerned"},{"category_id":15,"poly":[882.0,1513.0,1553.0,1513.0,1553.0,1545.0,882.0,1545.0],"score":0.99,"text":"with the use of data from automatic vehicle-tracking systems to assess"},{"category_id":15,"poly":[880.0,1545.0,1553.0,1545.0,1553.0,1574.0,880.0,1574.0],"score":0.99,"text":"the impacts of reliability on passenger incidence behavior and wait-"},{"category_id":15,"poly":[881.0,1577.0,1551.0,1577.0,1551.0,1606.0,881.0,1606.0],"score":0.98,"text":"ing times. They propose that passengers will react to unreliability by"},{"category_id":15,"poly":[883.0,1608.0,1551.0,1608.0,1551.0,1637.0,883.0,1637.0],"score":1.0,"text":"departing earlier than they would with reliable services. 
Randomly"},{"category_id":15,"poly":[880.0,1636.0,1554.0,1636.0,1554.0,1669.0,880.0,1669.0],"score":1.0,"text":"incident unaware passengers will experience unreliability as a more"},{"category_id":15,"poly":[882.0,1669.0,1553.0,1669.0,1553.0,1697.0,882.0,1697.0],"score":0.99,"text":"dispersed distribution of headways and simply allocate additional"},{"category_id":15,"poly":[880.0,1699.0,1551.0,1699.0,1551.0,1726.0,880.0,1726.0],"score":0.97,"text":"time to their trip plan to improve the chance of arriving at their des-"},{"category_id":15,"poly":[881.0,1730.0,1551.0,1730.0,1551.0,1759.0,881.0,1759.0],"score":0.98,"text":"tination on time. Aware passengers, whose incidence is not entirely"},{"category_id":15,"poly":[880.0,1760.0,1552.0,1760.0,1552.0,1789.0,880.0,1789.0],"score":0.99,"text":"random, will react by timing their incidence somewhat earlier than"},{"category_id":15,"poly":[882.0,1792.0,1550.0,1792.0,1550.0,1818.0,882.0,1818.0],"score":0.99,"text":"the scheduled departure time to increase their chance of catching the"},{"category_id":15,"poly":[883.0,1823.0,1552.0,1823.0,1552.0,1849.0,883.0,1849.0],"score":0.99,"text":"desired service. The authors characterize these reactions as the costs"},{"category_id":15,"poly":[883.0,1853.0,1031.0,1853.0,1031.0,1880.0,883.0,1880.0],"score":0.95,"text":"of unreliability."},{"category_id":15,"poly":[907.0,630.0,1553.0,630.0,1553.0,658.0,907.0,658.0],"score":1.0,"text":"Bowman and Turnquist built on the concept of aware and unaware"},{"category_id":15,"poly":[881.0,662.0,1136.0,662.0,1136.0,690.0,881.0,690.0],"score":0.99,"text":"passengers of proportions"},{"category_id":15,"poly":[1155.0,662.0,1196.0,662.0,1196.0,690.0,1155.0,690.0],"score":1.0,"text":"and"},{"category_id":15,"poly":[1264.0,662.0,1553.0,662.0,1553.0,690.0,1264.0,690.0],"score":0.99,"text":",respectively. They proposed"},{"category_id":15,"poly":[881.0,692.0,1208.0,692.0,1208.0,719.0,881.0,719.0],"score":0.99,"text":"a utility-based model to estimate"},{"category_id":15,"poly":[1226.0,692.0,1552.0,692.0,1552.0,719.0,1226.0,719.0],"score":1.0,"text":"and the distribution of incidence"},{"category_id":15,"poly":[880.0,721.0,1554.0,721.0,1554.0,751.0,880.0,751.0],"score":0.99,"text":"times, and thus the mean waiting time, of aware passengers over"},{"category_id":15,"poly":[880.0,752.0,1553.0,752.0,1553.0,780.0,880.0,780.0],"score":0.98,"text":"a given headway as a function of the headway and reliability of"},{"category_id":15,"poly":[880.0,782.0,1081.0,782.0,1081.0,812.0,880.0,812.0],"score":0.99,"text":"bus departure times"},{"category_id":15,"poly":[1113.0,782.0,1552.0,782.0,1552.0,812.0,1113.0,812.0],"score":0.99,"text":". They observed seven bus stops in Chicago,"},{"category_id":15,"poly":[882.0,813.0,1553.0,813.0,1553.0,841.0,882.0,841.0],"score":0.98,"text":"Illinois, each served by a single (different) bus route, between 6:00"},{"category_id":15,"poly":[882.0,844.0,923.0,844.0,923.0,871.0,882.0,871.0],"score":1.0,"text":"and"},{"category_id":15,"poly":[1017.0,844.0,1550.0,844.0,1550.0,871.0,1017.0,871.0],"score":0.97,"text":".for 5 to 10 days each. The bus routes had headways"},{"category_id":15,"poly":[882.0,874.0,955.0,874.0,955.0,902.0,882.0,902.0],"score":0.95,"text":"of 5to"},{"category_id":15,"poly":[1033.0,874.0,1553.0,874.0,1553.0,902.0,1033.0,902.0],"score":0.98,"text":"and a range of reliabilities. 
The authors found that"},{"category_id":15,"poly":[882.0,906.0,1553.0,906.0,1553.0,933.0,882.0,933.0],"score":0.99,"text":"actual average waiting time was substantially less than predicted"},{"category_id":15,"poly":[881.0,935.0,1443.0,935.0,1443.0,963.0,881.0,963.0],"score":1.0,"text":"by the random incidence model. They estimated that"},{"category_id":15,"poly":[1462.0,935.0,1553.0,935.0,1553.0,963.0,1462.0,963.0],"score":0.96,"text":"was not"},{"category_id":15,"poly":[881.0,966.0,1552.0,966.0,1552.0,994.0,881.0,994.0],"score":0.98,"text":"statistically significantly different from 1.0, which they explain by"},{"category_id":15,"poly":[880.0,994.0,1552.0,994.0,1552.0,1025.0,880.0,1025.0],"score":0.99,"text":"the fact that all observations were taken during peak commuting"},{"category_id":15,"poly":[880.0,1027.0,1552.0,1027.0,1552.0,1054.0,880.0,1054.0],"score":0.99,"text":"times. Their model predicts that the longer the headway and the"},{"category_id":15,"poly":[881.0,1058.0,1554.0,1058.0,1554.0,1086.0,881.0,1086.0],"score":0.99,"text":"more reliable the departures, the more peaked the distribution of"},{"category_id":15,"poly":[881.0,1088.0,1553.0,1088.0,1553.0,1115.0,881.0,1115.0],"score":0.98,"text":"incidence times will be and the closer that peak will be to the next"},{"category_id":15,"poly":[882.0,1119.0,1552.0,1119.0,1552.0,1148.0,882.0,1148.0],"score":1.0,"text":"scheduled departure time. This prediction demonstrates what they"},{"category_id":15,"poly":[882.0,1149.0,1552.0,1149.0,1552.0,1176.0,882.0,1176.0],"score":0.99,"text":"refer to as a safety margin that passengers add to reduce the chance"},{"category_id":15,"poly":[883.0,1181.0,1552.0,1181.0,1552.0,1206.0,883.0,1206.0],"score":0.98,"text":"of missing their bus when the service is known to be somewhat"},{"category_id":15,"poly":[882.0,1210.0,1551.0,1210.0,1551.0,1238.0,882.0,1238.0],"score":0.98,"text":"unreliable. Such a safety margin can also result from unreliability in"},{"category_id":15,"poly":[881.0,1242.0,1553.0,1242.0,1553.0,1269.0,881.0,1269.0],"score":0.99,"text":"passengers' journeys to the public transport stop or station. Bowman"},{"category_id":15,"poly":[882.0,1271.0,1553.0,1271.0,1553.0,1299.0,882.0,1299.0],"score":0.99,"text":"and Turnquist conclude from their model that the random incidence"},{"category_id":15,"poly":[880.0,1301.0,1551.0,1301.0,1551.0,1331.0,880.0,1331.0],"score":0.99,"text":"model underestimates the waiting time benefits of improving reli-"},{"category_id":15,"poly":[882.0,1332.0,1552.0,1332.0,1552.0,1362.0,882.0,1362.0],"score":0.99,"text":"ability and overestimates the waiting time benefits of increasing ser-"},{"category_id":15,"poly":[883.0,1363.0,1552.0,1363.0,1552.0,1392.0,883.0,1392.0],"score":0.99,"text":"vice frequency. 
This is because as reliability increases passengers"},{"category_id":15,"poly":[882.0,1394.0,1552.0,1394.0,1552.0,1422.0,882.0,1422.0],"score":0.99,"text":"can better predict departure times and so can time their incidence to"},{"category_id":15,"poly":[882.0,1423.0,1159.0,1423.0,1159.0,1452.0,882.0,1452.0],"score":0.99,"text":"decrease their waiting time."},{"category_id":15,"poly":[175.0,235.0,819.0,235.0,819.0,264.0,175.0,264.0],"score":0.99,"text":"After briefly introducing the random incidence model, which is"},{"category_id":15,"poly":[149.0,265.0,818.0,265.0,818.0,295.0,149.0,295.0],"score":0.98,"text":"often assumed to hold at short headways, the balance of this section"},{"category_id":15,"poly":[148.0,298.0,818.0,298.0,818.0,324.0,148.0,324.0],"score":0.98,"text":"reviews six studies of passenger incidence behavior that are moti-"},{"category_id":15,"poly":[148.0,327.0,818.0,327.0,818.0,356.0,148.0,356.0],"score":1.0,"text":"vated by understanding the relationships between service headway,"},{"category_id":15,"poly":[146.0,355.0,820.0,355.0,820.0,388.0,146.0,388.0],"score":0.99,"text":"service reliability, passenger incidence behavior, and passenger"},{"category_id":15,"poly":[149.0,388.0,818.0,388.0,818.0,414.0,149.0,414.0],"score":1.0,"text":"waiting time in a more nuanced fashion than is embedded in the"},{"category_id":15,"poly":[149.0,419.0,818.0,419.0,818.0,445.0,149.0,445.0],"score":1.0,"text":"random incidence assumption (2). Three of these studies depend on"},{"category_id":15,"poly":[147.0,447.0,818.0,447.0,818.0,477.0,147.0,477.0],"score":0.99,"text":"manually collected data, two studies use data from AFC systems,"},{"category_id":15,"poly":[148.0,479.0,819.0,479.0,819.0,507.0,148.0,507.0],"score":0.99,"text":"and one study analyzes the issue purely theoretically. These studies"},{"category_id":15,"poly":[147.0,509.0,819.0,509.0,819.0,537.0,147.0,537.0],"score":0.99,"text":"reveal much about passenger incidence behavior, but all are found"},{"category_id":15,"poly":[147.0,538.0,820.0,538.0,820.0,567.0,147.0,567.0],"score":0.99,"text":"to be limited in their general applicability by the methods with"},{"category_id":15,"poly":[150.0,569.0,818.0,569.0,818.0,597.0,150.0,597.0],"score":0.99,"text":"which they collect information about passengers and the services"},{"category_id":15,"poly":[147.0,599.0,458.0,599.0,458.0,630.0,147.0,630.0],"score":1.0,"text":"those passengers intend to use."},{"category_id":15,"poly":[150.0,1219.0,212.0,1219.0,212.0,1247.0,150.0,1247.0],"score":1.0,"text":"where"},{"category_id":15,"poly":[264.0,1219.0,817.0,1219.0,817.0,1247.0,264.0,1247.0],"score":0.99,"text":"is the probabilistic expectation of some random variable"},{"category_id":15,"poly":[168.0,1248.0,209.0,1248.0,209.0,1275.0,168.0,1275.0],"score":1.0,"text":"and"},{"category_id":15,"poly":[283.0,1248.0,601.0,1248.0,601.0,1275.0,283.0,1275.0],"score":0.97,"text":"is the coefficient of variation of"},{"category_id":15,"poly":[625.0,1248.0,818.0,1248.0,818.0,1275.0,625.0,1275.0],"score":0.96,"text":".a unitless measure"},{"category_id":15,"poly":[148.0,1277.0,345.0,1277.0,345.0,1307.0,148.0,1307.0],"score":0.97,"text":"of the variability of"},{"category_id":15,"poly":[370.0,1277.0,477.0,1277.0,477.0,1307.0,370.0,1307.0],"score":0.99,"text":"defined as"},{"category_id":15,"poly":[906.0,1883.0,1552.0,1883.0,1552.0,1910.0,906.0,1910.0],"score":0.98,"text":"Luethi et al. 
continued with the analysis of manually collected"},{"category_id":15,"poly":[880.0,1909.0,1552.0,1909.0,1552.0,1945.0,880.0,1945.0],"score":0.99,"text":"data on actual passenger behavior (6). They use the language"},{"category_id":15,"poly":[883.0,1945.0,1552.0,1945.0,1552.0,1972.0,883.0,1972.0],"score":0.99,"text":"of probability to describe two classes of passengers. The first is"},{"category_id":15,"poly":[881.0,1973.0,1552.0,1973.0,1552.0,2003.0,881.0,2003.0],"score":1.0,"text":"timetable-dependent passengers (i.e., the aware passengers), whose"},{"category_id":15,"poly":[881.0,2006.0,1552.0,2006.0,1552.0,2033.0,881.0,2033.0],"score":1.0,"text":"incidence behavior is affected by awareness (possibly gained"},{"category_id":15,"poly":[149.0,748.0,817.0,748.0,817.0,774.0,149.0,774.0],"score":1.0,"text":"One characterization of passenger incidence behavior is that of ran-"},{"category_id":15,"poly":[148.0,777.0,818.0,777.0,818.0,806.0,148.0,806.0],"score":0.99,"text":"dom incidence (3). The key assumption underlying the random inci-"},{"category_id":15,"poly":[148.0,807.0,818.0,807.0,818.0,836.0,148.0,836.0],"score":0.99,"text":"dence model is that the process of passenger arrivals to the public"},{"category_id":15,"poly":[148.0,837.0,819.0,837.0,819.0,866.0,148.0,866.0],"score":0.99,"text":"transport service is independent from the vehicle departure process"},{"category_id":15,"poly":[148.0,868.0,818.0,868.0,818.0,897.0,148.0,897.0],"score":1.0,"text":"of the service. This implies that passengers become incident to the"},{"category_id":15,"poly":[149.0,899.0,817.0,899.0,817.0,925.0,149.0,925.0],"score":0.99,"text":"service at a random time, and thus the instantaneous rate of passen-"},{"category_id":15,"poly":[148.0,928.0,820.0,928.0,820.0,957.0,148.0,957.0],"score":1.0,"text":"ger arrivals to the service is uniform over a given period of time. Let"},{"category_id":15,"poly":[174.0,956.0,214.0,956.0,214.0,990.0,174.0,990.0],"score":1.0,"text":"and"},{"category_id":15,"poly":[239.0,956.0,818.0,956.0,818.0,990.0,239.0,990.0],"score":0.99,"text":"be random variables representing passenger waiting times"},{"category_id":15,"poly":[148.0,988.0,818.0,988.0,818.0,1016.0,148.0,1016.0],"score":1.0,"text":"and service headways, respectively. Under the random incidence"},{"category_id":15,"poly":[149.0,1019.0,818.0,1019.0,818.0,1048.0,149.0,1048.0],"score":0.98,"text":"assumption and the assumption that vehicle capacity is not a binding"},{"category_id":15,"poly":[149.0,1050.0,726.0,1050.0,726.0,1076.0,149.0,1076.0],"score":0.99,"text":"constraint, a classic result of transportation science is that"},{"category_id":15,"poly":[146.0,1793.0,818.0,1793.0,818.0,1822.0,146.0,1822.0],"score":0.98,"text":" Jolliffe and Hutchinson studied bus passenger incidence in South"},{"category_id":15,"poly":[147.0,1825.0,696.0,1825.0,696.0,1852.0,147.0,1852.0],"score":0.97,"text":"London suburbs (5). They observed 10 bus stops for"},{"category_id":15,"poly":[735.0,1825.0,817.0,1825.0,817.0,1852.0,735.0,1852.0],"score":1.0,"text":"perday"},{"category_id":15,"poly":[148.0,1855.0,819.0,1855.0,819.0,1881.0,148.0,1881.0],"score":1.0,"text":"over 8 days, recording the times of passenger incidence and actual"},{"category_id":15,"poly":[148.0,1884.0,819.0,1884.0,819.0,1912.0,148.0,1912.0],"score":0.98,"text":"and scheduled bus departures. 
They limited their stop selection to"},{"category_id":15,"poly":[146.0,1913.0,819.0,1913.0,819.0,1945.0,146.0,1945.0],"score":1.0,"text":"those served by only a single bus route with a single service pat-"},{"category_id":15,"poly":[147.0,1945.0,819.0,1945.0,819.0,1974.0,147.0,1974.0],"score":0.98,"text":"tern so as to avoid ambiguity about which service a passenger was"},{"category_id":15,"poly":[147.0,1972.0,820.0,1972.0,820.0,2006.0,147.0,2006.0],"score":0.98,"text":"waiting for. The authors found that the actual average passenger"},{"category_id":15,"poly":[149.0,2005.0,323.0,2005.0,323.0,2033.0,149.0,2033.0],"score":0.96,"text":"waitingtimewas"},{"category_id":15,"poly":[374.0,2005.0,819.0,2005.0,819.0,2033.0,374.0,2033.0],"score":1.0,"text":"less than predicted by the random incidence"},{"category_id":15,"poly":[148.0,686.0,625.0,686.0,625.0,721.0,148.0,721.0],"score":0.99,"text":"Random Passenger Incidence Behavior"},{"category_id":15,"poly":[151.0,1434.0,213.0,1434.0,213.0,1462.0,151.0,1462.0],"score":0.99,"text":"where"},{"category_id":15,"poly":[246.0,1434.0,521.0,1434.0,521.0,1462.0,246.0,1462.0],"score":0.98,"text":"is the standard deviation of"},{"category_id":15,"poly":[580.0,1434.0,816.0,1434.0,816.0,1462.0,580.0,1462.0],"score":0.96,"text":".The second expression"},{"category_id":15,"poly":[148.0,1466.0,819.0,1466.0,819.0,1493.0,148.0,1493.0],"score":0.99,"text":"in Equation 1 is particularly useful because it expresses the mean"},{"category_id":15,"poly":[146.0,1496.0,819.0,1496.0,819.0,1525.0,146.0,1525.0],"score":0.99,"text":"passenger waiting time as the sum of two components: the waiting"},{"category_id":15,"poly":[148.0,1526.0,818.0,1526.0,818.0,1553.0,148.0,1553.0],"score":0.98,"text":"time caused by the mean headway (i.e., the reciprocal of service fre-"},{"category_id":15,"poly":[147.0,1557.0,819.0,1557.0,819.0,1584.0,147.0,1584.0],"score":0.99,"text":"quency) and the waiting time caused by the variability of the head-"},{"category_id":15,"poly":[148.0,1588.0,818.0,1588.0,818.0,1612.0,148.0,1612.0],"score":0.97,"text":"ways (which is one measure of service reliability). When the service"},{"category_id":15,"poly":[148.0,1617.0,817.0,1617.0,817.0,1644.0,148.0,1644.0],"score":1.0,"text":"is perfectly reliable with constant headways, the mean waiting time"},{"category_id":15,"poly":[148.0,1646.0,472.0,1646.0,472.0,1677.0,148.0,1677.0],"score":0.99,"text":"will be simply half the headway."},{"category_id":15,"poly":[151.0,176.0,817.0,176.0,817.0,204.0,151.0,204.0],"score":0.99,"text":"dependent on the service headway and the reliability of the departure"},{"category_id":15,"poly":[147.0,205.0,652.0,205.0,652.0,236.0,147.0,236.0],"score":0.99,"text":"time of the service to which passengers are incident."},{"category_id":15,"poly":[149.0,1735.0,702.0,1735.0,702.0,1767.0,149.0,1767.0],"score":0.98,"text":"More Behaviorally Realistic Incidence Models"},{"category_id":15,"poly":[1519.0,98.0,1554.0,98.0,1554.0,125.0,1519.0,125.0],"score":1.0,"text":"53"},{"category_id":15,"poly":[148.0,98.0,322.0,98.0,322.0,123.0,148.0,123.0],"score":1.0,"text":"Frumin and Zhao"}],"page_info":{"page_no":0,"height":2200,"width":1700}}]}
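The fixture above is a single-line JSON dump of the layout-analysis output for cli_test_01.pdf: doc_layout_result holds one entry per page, each with page_info (page number and pixel dimensions) and layout_dets, a list of detections whose poly field gives the four corner coordinates of a region and whose optional latex or text field carries a recognized formula or an OCR'd text line; detections with neither field are bare layout regions identified only by category_id and score. A minimal sketch of how such a fixture could be consumed, assuming it has been saved as a standalone JSON file (the path and file name below are hypothetical):

import json

# Hypothetical path; the real asset lives under
# tests/unittest/test_tools/assets/cli_dev/ in this repository.
FIXTURE = 'tests/unittest/test_tools/assets/cli_dev/cli_test_01.json'

with open(FIXTURE, encoding='utf-8') as f:
    result = json.load(f)

print('source pdf:', result['file_location'])
for page in result['doc_layout_result']:
    info = page['page_info']
    print(f"page {info['page_no']} ({info['width']}x{info['height']})")
    for det in page['layout_dets']:
        # poly lists the four corners clockwise from the top left, so
        # corners 0 and 2 (indices 0-1 and 4-5) give an axis-aligned bbox.
        x0, y0 = det['poly'][0], det['poly'][1]
        x1, y1 = det['poly'][4], det['poly'][5]
        if 'latex' in det:    # detection with formula-recognition output
            print(f"  formula @ ({x0:.0f},{y0:.0f}): {det['latex'][:40]}")
        elif 'text' in det:   # detection with an OCR'd text line
            print(f"  text    @ ({x0:.0f},{y0:.0f}): {det['text'][:40]}")
        # all other detections are layout-only regions (category_id + score)

This sketch relies only on the structure visible in the fixture itself; the semantics of the individual category_id values are not spelled out here and are left to the layout model's label map.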