draw_bbox.py 1.69 KB
Newer Older
liukaiwen's avatar
lkw  
liukaiwen committed
1
2
3
from pathlib import Path

from magic_pdf.libs.commons import fitz, join_path  # PyMuPDF
liukaiwen's avatar
lkw  
liukaiwen committed
4
5
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
import json
liukaiwen's avatar
lkw  
liukaiwen committed
6
import os
liukaiwen's avatar
lkw  
liukaiwen committed
7
8
9
10
11
12
13
14
15




def read_json_file(file_path):
    """Load a JSON file and return its parsed content.

    The file is opened explicitly as UTF-8 so that the result does not
    depend on the platform's locale encoding (on Windows the default is a
    legacy code page, which breaks on non-ASCII JSON text).

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

赵小蒙's avatar
赵小蒙 committed
16
17

# Path to the PDF file
liukaiwen's avatar
lkw  
liukaiwen committed
18
pdf_path = "D:\\projects\\Magic-PDF\\ocr_demo\\ocr_0_org.pdf"
赵小蒙's avatar
赵小蒙 committed
19
20
21
22

doc = fitz.open(pdf_path)  # Open the PDF
# Your data
# NOTE(review): `data` is not referenced anywhere in the visible part of this script — confirm before removing
data = [[[-2, 0, 603, 80, 24]], [[-3, 0, 602, 80, 24]]]
liukaiwen's avatar
lkw  
liukaiwen committed
23
24
# JSON file whose parsed content is handed to parse_pdf_by_ocr further down;
# its file name is also used as the book name.
ocr_json_file_path = r"D:\projects\Magic-PDF\ocr_demo\ocr_0.json"
ocr_pdf_info = read_json_file(ocr_json_file_path)
liukaiwen's avatar
lkw  
liukaiwen committed
25
26
27
28
29
30
31
32
33
34
35
36
37

# The book name is the JSON file's name, extension included.
pth = Path(ocr_json_file_path)
book_name = pth.name

# Output location: the "md" entry under tmp/unittest, resolved relative to
# two directory levels above this script.
save_tmp_path = os.path.join(
    os.path.dirname(__file__), "../..", "tmp", "unittest"
)
save_path = join_path(save_tmp_path, "md")

# Parse the PDF from the pre-computed OCR data with debug mode enabled.
# The second positional argument is deliberately passed as None.
pdf_info_dict = parse_pdf_by_ocr(
    pdf_path, None, ocr_pdf_info, save_path, book_name, debug_mode=True
)
liukaiwen's avatar
lkw  
liukaiwen committed
38
39
40
41
42
43
44
45
46
47
48
# One list per parsed page, holding the bbox of every span reached through
# preproc_blocks -> lines -> spans, in traversal order.
data_list = [
    [
        span["bbox"]
        for block in page.get("preproc_blocks")
        for line in block.get("lines")
        for span in line.get("spans")
    ]
    for page in pdf_info_dict.values()
]
赵小蒙's avatar
赵小蒙 committed
49
50
51
# Process every page of the PDF
for i, page in enumerate(doc):
    # Bounding boxes collected for the current page (data_list is indexed
    # in the same order as the pages of the document)
    page_data = data_list[i]
    for bbox in page_data:
        # Each bbox must be exactly four coordinates: x0, y0, x1, y1
        x0, y0, x1, y1 = bbox
        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
        # Outline only (no fill), stroke colour (1, 0, 0), 1.5 wide, drawn as an overlay
        page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=1.5, overlay=True)

# Save the PDF
liukaiwen's avatar
lkw  
liukaiwen committed
59
# The output name differs from pdf_path, so the source PDF is not overwritten
doc.save("D:\\projects\\Magic-PDF\\ocr_demo\\ocr_0_new1.pdf")