Commit f911a102 authored by myhloli
Browse files

feat(tools): add character bounding box drawing functionality

- Add `draw_char_bbox` function to `draw_bbox.py` for drawing character bounding boxes
- Integrate `draw_char_bbox` into `common.py` for use in PDF processing pipeline
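- Add an `f_draw_char_bbox` option to `do_parse` for drawing character bounding boxes (off by default; the line that would enable it automatically in debug mode is left commented out)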
parent 9951a170
...@@ -394,17 +394,13 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -394,17 +394,13 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
pdf_docs.save(f'{out_path}/{filename}') pdf_docs.save(f'{out_path}/{filename}')
def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename): def draw_char_bbox(pdf_bytes, out_path, filename):
layout_bbox_list = []
for page in pdf_info:
page_block_list = []
for block in page['para_blocks']:
bbox = block['bbox']
page_block_list.append(bbox)
layout_bbox_list.append(page_block_list)
pdf_docs = fitz.open('pdf', pdf_bytes) pdf_docs = fitz.open('pdf', pdf_bytes)
for i, page in enumerate(pdf_docs): for i, page in enumerate(pdf_docs):
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False) for block in page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']:
for line in block['lines']:
pdf_docs.save(f'{out_path}/{filename}_layout_sort.pdf') for span in line['spans']:
for char in span['chars']:
char_bbox = char['bbox']
page.draw_rect(char_bbox, color=[1, 0, 0], fill=None, fill_opacity=1, width=0.3, overlay=True,)
pdf_docs.save(f'{out_path}/{filename}')
...@@ -9,6 +9,7 @@ from magic_pdf.config.enums import SupportedPdfParseMethod ...@@ -9,6 +9,7 @@ from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import FileBasedDataWriter from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.libs.draw_bbox import draw_char_bbox
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.operators.models import InferenceResult from magic_pdf.operators.models import InferenceResult
...@@ -83,6 +84,7 @@ def do_parse( ...@@ -83,6 +84,7 @@ def do_parse(
f_make_md_mode=MakeMode.MM_MD, f_make_md_mode=MakeMode.MM_MD,
f_draw_model_bbox=False, f_draw_model_bbox=False,
f_draw_line_sort_bbox=False, f_draw_line_sort_bbox=False,
f_draw_char_bbox=False,
start_page_id=0, start_page_id=0,
end_page_id=None, end_page_id=None,
lang=None, lang=None,
...@@ -94,6 +96,7 @@ def do_parse( ...@@ -94,6 +96,7 @@ def do_parse(
logger.warning('debug mode is on') logger.warning('debug mode is on')
f_draw_model_bbox = True f_draw_model_bbox = True
f_draw_line_sort_bbox = True f_draw_line_sort_bbox = True
# f_draw_char_bbox = True
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf( pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
pdf_bytes, start_page_id, end_page_id pdf_bytes, start_page_id, end_page_id
...@@ -205,6 +208,9 @@ def do_parse( ...@@ -205,6 +208,9 @@ def do_parse(
os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf') os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
) )
if f_draw_char_bbox:
draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf')
if f_dump_md: if f_dump_md:
pipe_result.dump_md( pipe_result.dump_md(
md_writer, md_writer,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment