Commit 826086d2 authored by zhougaofeng

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc,...

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/__init__.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, magic_pdf/libs/vis_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, 
magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pipe/__init__.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/detect_para.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/rw/__init__.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py files
parent 57aaa1cf
import os
from pathlib import Path
from loguru import logger
from magic_pdf.integrations.rag.type import (ElementRelation, LayoutElements,
Node)
from magic_pdf.integrations.rag.utils import inference
class RagPageReader:
def __init__(self, pagedata: LayoutElements):
self.o = [
Node(
category_type=v.category_type,
text=v.text,
image_path=v.image_path,
anno_id=v.anno_id,
latex=v.latex,
html=v.html,
) for v in pagedata.layout_dets
]
self.pagedata = pagedata
def __iter__(self):
return iter(self.o)
def get_rel_map(self) -> list[ElementRelation]:
return self.pagedata.extra.element_relation
class RagDocumentReader:
def __init__(self, ragdata: list[LayoutElements]):
self.o = [RagPageReader(v) for v in ragdata]
def __iter__(self):
return iter(self.o)
class DataReader:
def __init__(self, path_or_directory: str, method: str, output_dir: str):
self.path_or_directory = path_or_directory
self.method = method
self.output_dir = output_dir
self.pdfs = []
if os.path.isdir(path_or_directory):
for doc_path in Path(path_or_directory).glob('*.pdf'):
self.pdfs.append(doc_path)
else:
assert path_or_directory.endswith('.pdf')
self.pdfs.append(Path(path_or_directory))
def get_documents_count(self) -> int:
"""Returns the number of documents in the directory."""
return len(self.pdfs)
def get_document_result(self, idx: int) -> RagDocumentReader | None:
"""
Args:
idx (int): the index of documents under the
directory path_or_directory
Returns:
RagDocumentReader | None: RagDocumentReader is an iterable object,
more details @RagDocumentReader
"""
if idx >= self.get_documents_count() or idx < 0:
logger.error(f'invalid idx: {idx}')
return None
res = inference(str(self.pdfs[idx]), self.output_dir, self.method)
if res is None:
logger.warning(f'failed to inference pdf {self.pdfs[idx]}')
return None
return RagDocumentReader(res)
def get_document_filename(self, idx: int) -> Path:
"""get the filename of the document."""
return self.pdfs[idx]
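A minimal usage sketch of the reader API above (illustrative only; the input path, method and output directory below are assumed placeholder values, not taken from the repository):

# Hypothetical example, not part of the module.
reader = DataReader('/path/to/pdfs', 'ocr', '/tmp/output')
for idx in range(reader.get_documents_count()):
    doc = reader.get_document_result(idx)
    if doc is None:  # invalid index or failed inference
        continue
    for page in doc:  # each item is a RagPageReader
        for node in page:  # each item is a Node
            print(node.category_type, node.anno_id, node.text)
        print(page.get_rel_map())  # sibling relations between elements on the page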
from enum import Enum
from pydantic import BaseModel, Field
# rag
class CategoryType(Enum):  # py310 does not support StrEnum
text = 'text'
title = 'title'
interline_equation = 'interline_equation'
image = 'image'
image_body = 'image_body'
image_caption = 'image_caption'
table = 'table'
table_body = 'table_body'
table_caption = 'table_caption'
table_footnote = 'table_footnote'
class ElementRelType(Enum):
sibling = 'sibling'
class PageInfo(BaseModel):
page_no: int = Field(description='the index of page, start from zero',
ge=0)
height: int = Field(description='the height of page', gt=0)
width: int = Field(description='the width of page', ge=0)
image_path: str | None = Field(description='the image of this page',
default=None)
class ContentObject(BaseModel):
category_type: CategoryType = Field(description='category')
poly: list[float] = Field(
description=('Coordinates, need to convert back to PDF coordinates,'
' order is top-left, top-right, bottom-right, bottom-left'
' x,y coordinates'))
ignore: bool = Field(description='whether ignore this object',
default=False)
text: str | None = Field(description='text content of the object',
default=None)
image_path: str | None = Field(description='path of embedded image',
default=None)
order: int = Field(description='the order of this object within a page',
default=-1)
anno_id: int = Field(description='unique id', default=-1)
latex: str | None = Field(description='latex result', default=None)
html: str | None = Field(description='html result', default=None)
class ElementRelation(BaseModel):
source_anno_id: int = Field(description='unique id of the source object',
default=-1)
target_anno_id: int = Field(description='unique id of the target object',
default=-1)
relation: ElementRelType = Field(
description='the relation between source and target element')
class LayoutElementsExtra(BaseModel):
element_relation: list[ElementRelation] = Field(
description='the relation between source and target element')
class LayoutElements(BaseModel):
layout_dets: list[ContentObject] = Field(
description='layout element details')
page_info: PageInfo = Field(description='page info')
extra: LayoutElementsExtra = Field(description='extra information')
# iter data format
class Node(BaseModel):
category_type: CategoryType = Field(description='category')
text: str | None = Field(description='text content of the object',
default=None)
image_path: str | None = Field(description='path of embedded image',
default=None)
anno_id: int = Field(description='unique id', default=-1)
latex: str | None = Field(description='latex result', default=None)
html: str | None = Field(description='html result', default=None)
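A hand-built instance of the models above, using made-up placeholder values, just to show how the types nest (illustrative only, not part of the module):

# Hypothetical example values.
_page = PageInfo(page_no=0, height=842, width=595)
_obj = ContentObject(
    category_type=CategoryType.text,
    text='hello',
    order=0,
    anno_id=0,
    poly=[0, 0, 100, 0, 100, 20, 0, 20],  # top-left, top-right, bottom-right, bottom-left x,y pairs
)
_rel = ElementRelation(source_anno_id=0, target_anno_id=1,
                       relation=ElementRelType.sibling)
_elements = LayoutElements(
    layout_dets=[_obj],
    page_info=_page,
    extra=LayoutElementsExtra(element_relation=[_rel]),
)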
import json
import os
from pathlib import Path
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
from magic_pdf.integrations.rag.type import (CategoryType, ContentObject,
ElementRelation, ElementRelType,
LayoutElements,
LayoutElementsExtra, PageInfo)
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.tools.common import do_parse, prepare_env
def convert_middle_json_to_layout_elements(
json_data: dict,
output_dir: str,
) -> list[LayoutElements]:
uniq_anno_id = 0
res: list[LayoutElements] = []
for page_no, page_data in enumerate(json_data['pdf_info']):
order_id = 0
page_info = PageInfo(
height=int(page_data['page_size'][1]),
width=int(page_data['page_size'][0]),
page_no=page_no,
)
layout_dets: list[ContentObject] = []
extra_element_relation: list[ElementRelation] = []
for para_block in page_data['para_blocks']:
para_text = ''
para_type = para_block['type']
if para_type == BlockType.Text:
para_text = merge_para_with_text(para_block)
x0, y0, x1, y1 = para_block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.text,
text=para_text,
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
uniq_anno_id += 1
order_id += 1
layout_dets.append(content)
elif para_type == BlockType.Title:
para_text = merge_para_with_text(para_block)
x0, y0, x1, y1 = para_block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.title,
text=para_text,
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
uniq_anno_id += 1
order_id += 1
layout_dets.append(content)
elif para_type == BlockType.InterlineEquation:
para_text = merge_para_with_text(para_block)
x0, y0, x1, y1 = para_block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.interline_equation,
text=para_text,
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
uniq_anno_id += 1
order_id += 1
layout_dets.append(content)
elif para_type == BlockType.Image:
body_anno_id = -1
caption_anno_id = -1
for block in para_block['blocks']:
if block['type'] == BlockType.ImageBody:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.Image:
x0, y0, x1, y1 = block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.image_body,
image_path=os.path.join(
output_dir, span['image_path']),
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
body_anno_id = uniq_anno_id
uniq_anno_id += 1
order_id += 1
layout_dets.append(content)
for block in para_block['blocks']:
if block['type'] == BlockType.ImageCaption:
para_text += merge_para_with_text(block)
x0, y0, x1, y1 = block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.image_caption,
text=para_text,
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
caption_anno_id = uniq_anno_id
uniq_anno_id += 1
order_id += 1
layout_dets.append(content)
# anno ids start at 0, so compare against the -1 sentinel (same as the table branch below)
if body_anno_id != -1 and caption_anno_id != -1:
element_relation = ElementRelation(
relation=ElementRelType.sibling,
source_anno_id=body_anno_id,
target_anno_id=caption_anno_id,
)
extra_element_relation.append(element_relation)
elif para_type == BlockType.Table:
body_anno_id, caption_anno_id, footnote_anno_id = -1, -1, -1
for block in para_block['blocks']:
if block['type'] == BlockType.TableCaption:
para_text += merge_para_with_text(block)
x0, y0, x1, y1 = block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.table_caption,
text=para_text,
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
caption_anno_id = uniq_anno_id
uniq_anno_id += 1
order_id += 1
layout_dets.append(content)
for block in para_block['blocks']:
if block['type'] == BlockType.TableBody:
for line in block['lines']:
for span in line['spans']:
if span['type'] == ContentType.Table:
x0, y0, x1, y1 = para_block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.table_body,
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
body_anno_id = uniq_anno_id
uniq_anno_id += 1
order_id += 1
# if processed by table model
if span.get('latex', ''):
content.latex = span['latex']
else:
content.image_path = os.path.join(
output_dir, span['image_path'])
layout_dets.append(content)
for block in para_block['blocks']:
if block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(block)
x0, y0, x1, y1 = block['bbox']
content = ContentObject(
anno_id=uniq_anno_id,
category_type=CategoryType.table_footnote,
text=para_text,
order=order_id,
poly=[x0, y0, x1, y0, x1, y1, x0, y1],
)
footnote_anno_id = uniq_anno_id
uniq_anno_id += 1
order_id += 1
layout_dets.append(content)
if caption_anno_id != -1 and body_anno_id != -1:
element_relation = ElementRelation(
relation=ElementRelType.sibling,
source_anno_id=body_anno_id,
target_anno_id=caption_anno_id,
)
extra_element_relation.append(element_relation)
if footnote_anno_id != -1 and body_anno_id != -1:
element_relation = ElementRelation(
relation=ElementRelType.sibling,
source_anno_id=body_anno_id,
target_anno_id=footnote_anno_id,
)
extra_element_relation.append(element_relation)
res.append(
LayoutElements(
page_info=page_info,
layout_dets=layout_dets,
extra=LayoutElementsExtra(
element_relation=extra_element_relation),
))
return res
def inference(path, output_dir, method):
model_config.__use_inside_model__ = True
model_config.__model_mode__ = 'full'
if output_dir == '':
if os.path.isdir(path):
output_dir = os.path.join(path, 'output')
else:
output_dir = os.path.join(os.path.dirname(path), 'output')
local_image_dir, local_md_dir = prepare_env(output_dir,
str(Path(path).stem), method)
def read_fn(path):
disk_rw = DiskReaderWriter(os.path.dirname(path))
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
def parse_doc(doc_path: str):
try:
file_name = str(Path(doc_path).stem)
pdf_data = read_fn(doc_path)
do_parse(
output_dir,
file_name,
pdf_data,
[],
method,
False,
f_draw_span_bbox=False,
f_draw_layout_bbox=False,
f_dump_md=False,
f_dump_middle_json=True,
f_dump_model_json=False,
f_dump_orig_pdf=False,
f_dump_content_list=False,
f_draw_model_bbox=False,
)
middle_json_fn = os.path.join(local_md_dir,
f'{file_name}_middle.json')
with open(middle_json_fn) as fd:
jso = json.load(fd)
os.remove(middle_json_fn)
return convert_middle_json_to_layout_elements(jso, local_image_dir)
except Exception as e:
logger.exception(e)
return parse_doc(path)
if __name__ == '__main__':
import pprint
base_dir = '/opt/data/pdf/resources/samples/'
if 0:
with open(base_dir + 'json_outputs/middle.json') as f:
d = json.load(f)
result = convert_middle_json_to_layout_elements(d, '/tmp')
pprint.pp(result)
if 0:
with open(base_dir + 'json_outputs/middle.3.json') as f:
d = json.load(f)
result = convert_middle_json_to_layout_elements(d, '/tmp')
pprint.pp(result)
if 1:
res = inference(
base_dir + 'samples/pdf/one_page_with_table_image.pdf',
'/tmp/output',
'ocr',
)
pprint.pp(res)
from magic_pdf.layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX
from magic_pdf.libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect
def find_all_left_bbox_direct(this_bbox, all_bboxes) -> list:
"""
Find all bboxes in all_bboxes that lie to the left of this_bbox and overlap it vertically, without using extension lines.
Also consider the case where two boxes intersect horizontally: if they intersect, the box on the right is not treated as the leftmost.
"""
left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]
and any([
box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]]) or _left_intersect(box[:4], this_bbox[:4])]
# Then filter again: keep the one horizontally closest to this_bbox, i.e. the one with the largest x1
if len(left_boxes) > 0:
left_boxes.sort(key=lambda x: x[X1_EXT_IDX] if x[X1_EXT_IDX] else x[X1_IDX], reverse=True)
left_boxes = left_boxes[0]
else:
left_boxes = None
return left_boxes
def find_all_right_bbox_direct(this_bbox, all_bboxes) -> list:
"""
Find the bbox that is to the right of this_bbox and closest to it; it must be one that directly blocks this_bbox.
"""
right_bboxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX]
and any([
this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]]) or _right_intersect(this_bbox[:4], box[:4])]
if len(right_bboxes)>0:
right_bboxes.sort(key=lambda x: x[X0_EXT_IDX] if x[X0_EXT_IDX] else x[X0_IDX])
right_bboxes = right_bboxes[0]
else:
right_bboxes = None
return right_bboxes
def find_all_top_bbox_direct(this_bbox, all_bboxes) -> list:
"""
Find the bbox that is above this_bbox and closest to it; it must be one that directly blocks this_bbox.
"""
top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
if len(top_bboxes)>0:
top_bboxes.sort(key=lambda x: x[Y1_EXT_IDX] if x[Y1_EXT_IDX] else x[Y1_IDX], reverse=True)
top_bboxes = top_bboxes[0]
else:
top_bboxes = None
return top_bboxes
def find_all_bottom_bbox_direct(this_bbox, all_bboxes) -> list:
"""
Find the bbox that is below this_bbox and closest to it; it must be one that directly blocks this_bbox.
"""
bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
if len(bottom_bboxes)>0:
bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
bottom_bboxes = bottom_bboxes[0]
else:
bottom_bboxes = None
return bottom_bboxes
# ===================================================================================================================
def find_bottom_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
"""
Find the bbox that is below this_bbox and closest to it; it must be one that directly blocks this_bbox.
"""
bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
if len(bottom_bboxes)>0:
# The one with the smallest y0 and the largest x1, i.e. the box whose top edge is closest to this_bbox and that sits furthest to the right
bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
bottom_bboxes = [box for box in bottom_bboxes if box[Y0_IDX]==bottom_bboxes[0][Y0_IDX]]
# Then, among boxes with the same y0, find the one with the largest x1
bottom_bboxes.sort(key=lambda x: x[X1_IDX], reverse=True)
bottom_bboxes = bottom_bboxes[0]
else:
bottom_bboxes = None
return bottom_bboxes
def find_bottom_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
"""
Find the bbox that is below this_bbox and closest to it; it must be one that directly blocks this_bbox.
"""
bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
if len(bottom_bboxes)>0:
# The one with the smallest y0 and the smallest x0
bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
bottom_bboxes = [box for box in bottom_bboxes if box[Y0_IDX]==bottom_bboxes[0][Y0_IDX]]
# Then, among boxes with the same y0, find the one with the smallest x0
bottom_bboxes.sort(key=lambda x: x[X0_IDX])
bottom_bboxes = bottom_bboxes[0]
else:
bottom_bboxes = None
return bottom_bboxes
def find_top_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
"""
Find the bbox that is above this_bbox and closest to it; it must be one that directly blocks this_bbox.
"""
top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
if len(top_bboxes)>0:
# The one with the largest y1 and the smallest x0
top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
top_bboxes = [box for box in top_bboxes if box[Y1_IDX]==top_bboxes[0][Y1_IDX]]
# Then, among boxes with the same y1, find the one with the smallest x0
top_bboxes.sort(key=lambda x: x[X0_IDX])
top_bboxes = top_bboxes[0]
else:
top_bboxes = None
return top_bboxes
def find_top_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
"""
Find the bbox that is above this_bbox and closest to it; it must be one that directly blocks this_bbox.
"""
top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
if len(top_bboxes)>0:
# The one with the largest y1 and the largest x1
top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
top_bboxes = [box for box in top_bboxes if box[Y1_IDX]==top_bboxes[0][Y1_IDX]]
# Then, among boxes with the same y1, find the one with the largest x1
top_bboxes.sort(key=lambda x: x[X1_IDX], reverse=True)
top_bboxes = top_bboxes[0]
else:
top_bboxes = None
return top_bboxes
# ===================================================================================================================
def get_left_edge_bboxes(all_bboxes) -> list:
"""
Return the bboxes on the left edge (those with no bbox directly to their left).
"""
left_bboxes = [box for box in all_bboxes if find_all_left_bbox_direct(box, all_bboxes) is None]
return left_bboxes
def get_right_edge_bboxes(all_bboxes) -> list:
"""
Return the bboxes on the right edge (those with no bbox directly to their right).
"""
right_bboxes = [box for box in all_bboxes if find_all_right_bbox_direct(box, all_bboxes) is None]
return right_bboxes
def fix_vertical_bbox_pos(bboxes:list):
"""
Check whether these bboxes slightly overlap in the vertical direction; if they do, move the overlapping bbox down a little.
In the x direction one box must contain the other, be contained by it, or fully overlap it; partial overlap is not allowed.
"""
bboxes.sort(key=lambda x: x[Y0_IDX])  # sort from top to bottom
for i in range(0, len(bboxes)):
for j in range(i+1, len(bboxes)):
if _is_bottom_full_overlap(bboxes[i][:4], bboxes[j][:4]):
# If the two bboxes partially overlap, move the lower one down a little
bboxes[j][Y0_IDX] = bboxes[i][Y1_IDX] + 2  # 2 is an empirical value
break
return bboxes
"""
Find the horizontal lines and colored blocks that can split the layout.
"""
import os
from magic_pdf.libs.commons import fitz
from magic_pdf.libs.boxbase import _is_in_or_part_overlap
def __rect_filter_by_width(rect, page_w, page_h):
mid_x = page_w/2
if rect[0]< mid_x < rect[2]:
return True
return False
def __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
"""
Must not overlap the position of a table or an image.
"""
for box in image_bboxes:
if _is_in_or_part_overlap(rect, box):
return False
for box in table_bboxes:
if _is_in_or_part_overlap(rect, box):
return False
return True
def __debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
save_path = "./tmp/debug.pdf"
if os.path.exists(save_path):
# remove the file if it already exists
os.remove(save_path)
# create a new blank PDF document
doc = fitz.open('')
width = page.rect.width
height = page.rect.height
new_page = doc.new_page(width=width, height=height)
shape = new_page.new_shape()
for bbox in bboxes1:
# draw the original box
rect = fitz.Rect(*bbox[0:4])
shape = new_page.new_shape()
shape.draw_rect(rect)
shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
shape.finish()
shape.commit()
for bbox in bboxes2:
# draw the original box
rect = fitz.Rect(*bbox[0:4])
shape = new_page.new_shape()
shape.draw_rect(rect)
shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
shape.finish()
shape.commit()
for bbox in bboxes3:
# draw the original box
rect = fitz.Rect(*bbox[0:4])
shape = new_page.new_shape()
shape.draw_rect(rect)
shape.finish(color=fitz.pdfcolor['red'], fill=None)
shape.finish()
shape.commit()
parent_dir = os.path.dirname(save_path)
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
doc.save(save_path)
doc.close()
def get_spilter_of_page(page, image_bboxes, table_bboxes):
"""
Collect the colored blocks and horizontal lines on the page.
"""
cdrawings = page.get_cdrawings()
spilter_bbox = []
for block in cdrawings:
fill = block.get('fill')
if fill and fill != (1.0, 1.0, 1.0):
rect = block['rect']
if __rect_filter_by_width(rect, page.rect.width, page.rect.height) and __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
spilter_bbox.append(list(rect))
"""过滤、修正一下这些box。因为有时候会有一些矩形,高度为0或者为负数,造成layout计算无限循环。如果是负高度或者0高度,统一修正为高度为1"""
for box in spilter_bbox:
if box[3]-box[1] <= 0:
box[3] = box[1] + 1
#__debug_show_page(page, spilter_bbox, [], [])
return spilter_bbox
"""
This is an advanced PyMuPDF utility for detecting multi-column pages.
It can be used in a shell script, or its main function can be imported and
invoked as described below.
Features
---------
- Identify text belonging to (a variable number of) columns on the page.
- Text with different background color is handled separately, allowing for
easier treatment of side remarks, comment boxes, etc.
- Uses text block detection capability to identify text blocks and
uses the block bboxes as primary structuring principle.
- Supports ignoring footers via a footer margin parameter.
- Returns re-created text boundary boxes (integer coordinates), sorted ascending
by the top, then by the left coordinates.
Restrictions
-------------
- Only supporting horizontal, left-to-right text
- Returns a list of text boundary boxes - not the text itself. The caller is
expected to extract text from within the returned boxes.
- Text written above images is ignored altogether (option).
- This utility works as expected in most cases. The following situation cannot
be handled correctly:
* overlapping (non-disjoint) text blocks
* image captions are not recognized and are handled like normal text
Usage
------
- As a CLI shell command use
python multi_column.py input.pdf footer_margin
Where footer margin is the height of the bottom stripe to ignore on each page.
This code is intended to be modified according to your need.
- Use in a Python script as follows:
----------------------------------------------------------------------------------
from multi_column import column_boxes
# for each page execute
bboxes = column_boxes(page, footer_margin=50, no_image_text=True)
# bboxes is a list of fitz.IRect objects, sorted ascending by their y0,
# then x0 coordinates. Their text content can be extracted by all PyMuPDF
# get_text() variants, like for instance the following:
for rect in bboxes:
print(page.get_text(clip=rect, sort=True))
----------------------------------------------------------------------------------
"""
import sys
from magic_pdf.libs.commons import fitz
def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
"""Determine bboxes which wrap a column."""
paths = page.get_drawings()
bboxes = []
# path rectangles
path_rects = []
# image bboxes
img_bboxes = []
# bboxes of non-horizontal text
# avoid when expanding horizontal text boxes
vert_bboxes = []
# compute relevant page area
clip = +page.rect
clip.y1 -= footer_margin # Remove footer area
clip.y0 += header_margin # Remove header area
def can_extend(temp, bb, bboxlist):
"""Determines whether rectangle 'temp' can be extended by 'bb'
without intersecting any of the rectangles contained in 'bboxlist'.
Items of bboxlist may be None if they have been removed.
Returns:
True if 'temp' has no intersections with items of 'bboxlist'.
"""
for b in bboxlist:
if not intersects_bboxes(temp, vert_bboxes) and (
b == None or b == bb or (temp & b).is_empty
):
continue
return False
return True
def in_bbox(bb, bboxes):
"""Return 1-based number if a bbox contains bb, else return 0."""
for i, bbox in enumerate(bboxes):
if bb in bbox:
return i + 1
return 0
def intersects_bboxes(bb, bboxes):
"""Return True if a bbox intersects bb, else return False."""
for bbox in bboxes:
if not (bb & bbox).is_empty:
return True
return False
def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
"""Extend a bbox to the right page border.
Whenever there is no text to the right of a bbox, enlarge it up
to the right page border.
Args:
bboxes: (list[IRect]) bboxes to check
width: (int) page width
path_bboxes: (list[IRect]) bboxes with a background color
vert_bboxes: (list[IRect]) bboxes with vertical text
img_bboxes: (list[IRect]) bboxes of images
Returns:
Potentially modified bboxes.
"""
for i, bb in enumerate(bboxes):
# do not extend text with background color
if in_bbox(bb, path_bboxes):
continue
# do not extend text in images
if in_bbox(bb, img_bboxes):
continue
# temp extends bb to the right page border
temp = +bb
temp.x1 = width
# do not cut through colored background or images
if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
continue
# also, do not intersect other text bboxes
check = can_extend(temp, bb, bboxes)
if check:
bboxes[i] = temp # replace with enlarged bbox
return [b for b in bboxes if b != None]
def clean_nblocks(nblocks):
"""Do some elementary cleaning."""
# 1. remove any duplicate blocks.
blen = len(nblocks)
if blen < 2:
return nblocks
start = blen - 1
for i in range(start, -1, -1):
bb1 = nblocks[i]
bb0 = nblocks[i - 1]
if bb0 == bb1:
del nblocks[i]
# 2. repair sequence in special cases:
# consecutive bboxes with almost same bottom value are sorted ascending
# by x-coordinate.
y1 = nblocks[0].y1 # first bottom coordinate
i0 = 0 # its index
i1 = -1 # index of last bbox with same bottom
# Iterate over bboxes, identifying segments with approx. same bottom value.
# Replace every segment by its sorted version.
for i in range(1, len(nblocks)):
b1 = nblocks[i]
if abs(b1.y1 - y1) > 10: # different bottom
if i1 > i0: # segment length > 1? Sort it!
nblocks[i0 : i1 + 1] = sorted(
nblocks[i0 : i1 + 1], key=lambda b: b.x0
)
y1 = b1.y1 # store new bottom value
i0 = i # store its start index
i1 = i # store current index
if i1 > i0: # segment waiting to be sorted
nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
return nblocks
# extract vector graphics
for p in paths:
path_rects.append(p["rect"].irect)
path_bboxes = path_rects
# sort path bboxes by ascending top, then left coordinates
path_bboxes.sort(key=lambda b: (b.y0, b.x0))
# bboxes of images on page, no need to sort them
for item in page.get_images():
img_bboxes.extend(page.get_image_rects(item[0]))
# blocks of text on page
blocks = page.get_text(
"dict",
flags=fitz.TEXTFLAGS_TEXT,
clip=clip,
)["blocks"]
# Make block rectangles, ignoring non-horizontal text
for b in blocks:
bbox = fitz.IRect(b["bbox"]) # bbox of the block
# ignore text written upon images
if no_image_text and in_bbox(bbox, img_bboxes):
continue
# confirm first line to be horizontal
line0 = b["lines"][0] # get first line
if line0["dir"] != (1, 0): # only accept horizontal text
vert_bboxes.append(bbox)
continue
srect = fitz.EMPTY_IRECT()
for line in b["lines"]:
lbbox = fitz.IRect(line["bbox"])
text = "".join([s["text"].strip() for s in line["spans"]])
if len(text) > 1:
srect |= lbbox
bbox = +srect
if not bbox.is_empty:
bboxes.append(bbox)
# Sort text bboxes by ascending background, top, then left coordinates
bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))
# Extend bboxes to the right where possible
bboxes = extend_right(
bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
)
# immediately return if no text found
if bboxes == []:
return []
# --------------------------------------------------------------------
# Join bboxes to establish some column structure
# --------------------------------------------------------------------
# the final block bboxes on page
nblocks = [bboxes[0]] # pre-fill with first bbox
bboxes = bboxes[1:] # remaining old bboxes
for i, bb in enumerate(bboxes): # iterate old bboxes
check = False # indicates unwanted joins
# check if bb can extend one of the new blocks
for j in range(len(nblocks)):
nbb = nblocks[j] # a new block
# never join across columns
if bb == None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
continue
# never join across different background colors
if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
continue
temp = bb | nbb # temporary extension of new block
check = can_extend(temp, nbb, nblocks)
if check == True:
break
if not check: # bb cannot be used to extend any of the new bboxes
nblocks.append(bb) # so add it to the list
j = len(nblocks) - 1 # index of it
temp = nblocks[j] # new bbox added
# check if some remaining bbox is contained in temp
check = can_extend(temp, bb, bboxes)
if check == False:
nblocks.append(bb)
else:
nblocks[j] = temp
bboxes[i] = None
# do some elementary cleaning
nblocks = clean_nblocks(nblocks)
# return identified text bboxes
return nblocks
if __name__ == "__main__":
"""Only for debugging purposes, currently.
Draw red borders around the returned text bboxes and insert
the bbox number.
Then save the file under the name "input-blocks.pdf".
"""
# get the file name
filename = sys.argv[1]
# check if footer margin is given
if len(sys.argv) > 2:
footer_margin = int(sys.argv[2])
else: # use default value
footer_margin = 50
# check if header margin is given
if len(sys.argv) > 3:
header_margin = int(sys.argv[3])
else: # use default value
header_margin = 50
# open document
doc = fitz.open(filename)
# iterate over the pages
for page in doc:
# remove any geometry issues
page.wrap_contents()
# get the text bboxes
bboxes = column_boxes(page, footer_margin=footer_margin, header_margin=header_margin)
# prepare a canvas to draw rectangles and text
shape = page.new_shape()
# iterate over the bboxes
for i, rect in enumerate(bboxes):
shape.draw_rect(rect) # draw a border
# write sequence number
shape.insert_text(rect.tl + (5, 15), str(i), color=fitz.pdfcolor["red"])
# finish drawing / text with color red
shape.finish(color=fitz.pdfcolor["red"])
shape.commit() # store to the page
# save document with text bboxes
doc.ez_save(filename.replace(".pdf", "-blocks.pdf"))
"""
Custom fields at the span level
"""
# whether the span was merged across pages
CROSS_PAGE = "cross_page"
"""
Custom fields at the block level
"""
# whether lines in the block have been deleted
LINES_DELETED = "lines_deleted"
# struct eqtable
STRUCT_EQTABLE = "struct_eqtable"
# table recognition max time default value
TABLE_MAX_TIME_VALUE = 400
# pp_table_result_max_length
TABLE_MAX_LEN = 480
# pp table structure algorithm
TABLE_MASTER = "TableMaster"
# table master structure dict
TABLE_MASTER_DICT = "table_master_structure_dict.txt"
# table master dir
TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
# pp detect model dir
DETECT_MODEL_DIR = "ch_PP-OCRv3_det_infer"
# pp rec model dir
REC_MODEL_DIR = "ch_PP-OCRv3_rec_infer"
# pp rec char dict path
REC_CHAR_DICT = "ppocr_keys_v1.txt"
class MakeMode:
MM_MD = "mm_markdown"
NLP_MD = "nlp_markdown"
STANDARD_FORMAT = "standard_format"
class DropMode:
WHOLE_PDF = "whole_pdf"
SINGLE_PAGE = "single_page"
NONE = "none"
from enum import Enum
class ModelBlockTypeEnum(Enum):
TITLE = 0
PLAIN_TEXT = 1
ABANDON = 2
ISOLATE_FORMULA = 8
EMBEDDING = 13
ISOLATED = 14
"""
Return the S3 AK, SK, endpoint triple corresponding to the given bucket name.
"""
import json
import os
from loguru import logger
from magic_pdf.libs.commons import parse_bucket_key
# config file name constant
CONFIG_FILE_NAME = "magic-pdf.json"
def read_config():
home_dir = os.path.expanduser("~")
config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
if not os.path.exists(config_file):
raise FileNotFoundError(f"{config_file} not found")
with open(config_file, "r", encoding="utf-8") as f:
config = json.load(f)
return config
def get_s3_config(bucket_name: str):
"""
Read the config from ~/magic-pdf.json.
"""
config = read_config()
bucket_info = config.get("bucket_info")
if bucket_name not in bucket_info:
access_key, secret_key, storage_endpoint = bucket_info["[default]"]
else:
access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
if access_key is None or secret_key is None or storage_endpoint is None:
raise Exception(f"ak, sk or endpoint not found in {CONFIG_FILE_NAME}")
# logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
return access_key, secret_key, storage_endpoint
def get_s3_config_dict(path: str):
access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path))
return {"ak": access_key, "sk": secret_key, "endpoint": storage_endpoint}
def get_bucket_name(path):
bucket, key = parse_bucket_key(path)
return bucket
def get_local_models_dir():
config = read_config()
models_dir = config.get("models-dir")
if models_dir is None:
logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
return "/tmp/models"
else:
return models_dir
def get_device():
config = read_config()
device = config.get("device-mode")
if device is None:
logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
return "cpu"
else:
return device
def get_table_recog_config():
config = read_config()
table_config = config.get("table-config")
if table_config is None:
logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default")
return json.loads('{"is_table_recog_enable": false, "max_time": 400}')
else:
return table_config
if __name__ == "__main__":
ak, sk, endpoint = get_s3_config("llm-raw")
def dict_to_list(input_dict):
items_list = []
for _, item in input_dict.items():
items_list.append(item)
return items_list
def get_scale_ratio(model_page_info, page):
pix = page.get_pixmap(dpi=72)
pymu_width = int(pix.w)
pymu_height = int(pix.h)
width_from_json = model_page_info['page_info']['width']
height_from_json = model_page_info['page_info']['height']
horizontal_scale_ratio = width_from_json / pymu_width
vertical_scale_ratio = height_from_json / pymu_height
return horizontal_scale_ratio, vertical_scale_ratio
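get_scale_ratio() divides the model page size by the 72-dpi PyMuPDF page size, so mapping a model-space bbox back to page coordinates means dividing by those ratios. A small sketch, with a hypothetical helper name:

def model_bbox_to_page_bbox(bbox, model_page_info, page):
    # bbox is [x0, y0, x1, y1] in model coordinates; the result is in
    # PyMuPDF page coordinates. Helper name is illustrative only.
    h_ratio, v_ratio = get_scale_ratio(model_page_info, page)
    x0, y0, x1, y1 = bbox
    return [x0 / h_ratio, y0 / v_ratio, x1 / h_ratio, y1 / v_ratio]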
from collections import Counter
from magic_pdf.libs.language import detect_lang
def get_language_from_model(model_list: list):
language_lst = []
for ocr_page_info in model_list:
page_text = ""
layout_dets = ocr_page_info["layout_dets"]
for layout_det in layout_dets:
category_id = layout_det["category_id"]
allow_category_id_list = [15]
if category_id in allow_category_id_list:
page_text += layout_det["text"]
page_language = detect_lang(page_text)
language_lst.append(page_language)
# count how many times each language appears in language_lst
count_dict = Counter(language_lst)
# return the language that appears most often in language_lst
language = max(count_dict, key=count_dict.get)
return language
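get_language_from_model() detects a language per page from the OCR text spans (category_id 15) and returns the majority vote across pages. A tiny illustrative call, with made-up input:

# Hypothetical input; real model_list entries contain many more fields.
_sample_model_list = [
    {"layout_dets": [{"category_id": 15, "text": "Hello world"}]},
    {"layout_dets": [{"category_id": 15, "text": "The quick brown fox"}]},
]
# get_language_from_model(_sample_model_list) would likely return 'en'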