"examples/vscode:/vscode.git/clone" did not exist on "95eada24fcff616aebdffe9aa0b174eee2264fc5"
Commit 23bacc60 authored by Shuimo's avatar Shuimo
Browse files

add an option to freely output 'badcase.json'

parents d1457937 4191fa96
......@@ -40,15 +40,20 @@ jobs:
pip install -r requirements.txt
fi
- name: benchmark
- name: config-net-reset
run: |
export http_proxy=""
export https_proxy=""
- name: get-benchmark-result
run: |
echo "start test"
cd tools && python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip badcase.json overall.json base_data.json
cd tools && python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_badcase text_overall base_data_text.json --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_badcase ocr_overall base_data_ocr.json --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
needs: [pdf-test]
runs-on: [pdf]
runs-on: pdf
steps:
- name: notify
run: |
......
......@@ -22,15 +22,15 @@ git clone https://github.com/magicpdf/Magic-PDF.git
2.Install the requirements
```sh
cd Magic-PDF
pip install -r requirements.txt
```
3.Run the main script
3.Run the command line
```sh
use demo/text_demo.py
or
use demo/ocr_demo.py
export PYTHONPATH=.
python magic_pdf/cli/magicpdf.py --help
```
### 版权说明
......
......@@ -15,7 +15,7 @@ from loguru import logger
from magic_pdf.libs.config_reader import get_s3_config_dict
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
from magic_pdf.spark.base import get_data_source
from magic_pdf.spark.spark_api import get_data_source
def demo_parse_pdf(book_name=None, start_page_id=0, debug_mode=True):
......@@ -67,9 +67,7 @@ def demo_classify_by_type(book_name=None, debug_mode=True):
img_num_list = pdf_meta["imgs_per_page"]
text_len_list = pdf_meta["text_len_per_page"]
text_layout_list = pdf_meta["text_layout_per_page"]
pdf_path = json_object.get("file_location")
is_text_pdf, results = classify(
pdf_path,
total_page,
page_width,
page_height,
......@@ -89,7 +87,7 @@ def demo_meta_scan(book_name=None, debug_mode=True):
s3_pdf_path = json_object.get("file_location")
s3_config = get_s3_config_dict(s3_pdf_path)
pdf_bytes = read_file(s3_pdf_path, s3_config)
res = pdf_meta_scan(s3_pdf_path, pdf_bytes)
res = pdf_meta_scan(pdf_bytes)
logger.info(json.dumps(res, ensure_ascii=False))
write_json_to_local(res, book_name)
......
......@@ -21,28 +21,175 @@ python magicpdf.py --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf
"""
import os
import json as json_parse
import click
from loguru import logger
from pathlib import Path
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (
parse_s3path,
parse_s3_range_params,
remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import get_local_dir
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
def prepare_env(pdf_file_name):
    """Create the local output directories for one PDF.

    Layout: {local_dir}/magic-pdf/{pdf_file_name}/ holds the markdown/json
    output, with an images/ subdirectory for extracted pictures.

    :return: (local_image_dir, local_md_dir)
    """
    md_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name)
    image_dir = os.path.join(md_dir, "images")
    for directory in (image_dir, md_dir):
        os.makedirs(directory, exist_ok=True)
    return image_dir, md_dir
def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir):
    """Run the parsing pipeline for one PDF and write its outputs.

    :param pdf_file_name: basename (no extension) used for the output files
    :param pdf_bytes: raw PDF content
    :param model_list: doc-layout model results fed to the pipe
    :param parse_method: "auto" | "txt" | "ocr" — selects the pipe class
    :param image_writer: writer for extracted images
    :param md_writer: writer for the .md and .json outputs
    :param image_dir: relative image directory recorded in the markdown
    :raises SystemExit: when parse_method is not one of the supported values
    """
    if parse_method == "auto":
        pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
    elif parse_method == "txt":
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
    elif parse_method == "ocr":
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
    else:
        # Fix: os.exit() does not exist (AttributeError at runtime);
        # abort with a non-zero exit status instead.
        print(f"unknown parse method: {parse_method}")
        raise SystemExit(1)

    pipe.pipe_classify()
    pipe.pipe_parse()
    md_content = pipe.pipe_mk_markdown()
    md_writer.write(
        content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
    )
    md_writer.write(
        content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
        path=f"{pdf_file_name}.json",
        mode=AbsReaderWriter.MODE_TXT,
    )
import click
@click.group()
def cli():
pass
@cli.command()
@click.option('--json', type=str, help='输入一个S3路径')
def json_command(json):
# 这里处理json相关的逻辑
print(f'处理JSON: {json}')
@click.option("--json", type=str, help="输入一个S3路径")
@click.option(
"--method",
type=parse_pdf_methods,
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
default="auto",
)
def json_command(json, method):
if not json.startswith("s3://"):
print("usage: python magipdf.py --json s3://some_bucket/some_path")
os.exit(1)
def read_s3_path(s3path):
bucket, key = parse_s3path(s3path)
s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
s3_rw = S3ReaderWriter(
s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
)
may_range_params = parse_s3_range_params(s3path)
if may_range_params is None or 2 != len(may_range_params):
byte_start, byte_end = 0, None
else:
byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
byte_end += byte_start - 1
return s3_rw.read_jsonl(
remove_non_official_s3_args(s3path),
byte_start,
byte_end,
AbsReaderWriter.MODE_BIN,
)
jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
s3_file_path = jso["file_location"]
pdf_file_name = Path(s3_file_path).stem
pdf_data = read_s3_path(s3_file_path)
local_image_dir, local_md_dir = prepare_env(pdf_file_name)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
_do_parse(
pdf_file_name,
pdf_data,
jso["doc_layout_result"],
method,
local_image_rw,
local_md_rw,
os.path.basename(local_image_dir),
)
@cli.command()
@click.option('--pdf', type=click.Path(exists=True), required=True, help='PDF文件的路径')
@click.option('--model', type=click.Path(exists=True), help='模型的路径')
def pdf_command(pdf, model):
@click.option(
"--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
)
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
"--method",
type=parse_pdf_methods,
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
default="auto",
)
def pdf_command(pdf, model, method):
# 这里处理pdf和模型相关的逻辑
print(f'处理PDF: {pdf}')
print(f'加载模型: {model}')
if model is None:
model = pdf.replace(".pdf", ".json")
if not os.path.exists(model):
print(f"make sure json file existed and place under {os.dirname(pdf)}")
os.exit(1)
def read_fn(path):
disk_rw = DiskReaderWriter(os.path.dirname(path))
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
pdf_data = read_fn(pdf)
jso = json_parse.loads(read_fn(model).decode("utf-8"))
pdf_file_name = Path(pdf).stem
local_image_dir, local_md_dir = prepare_env(pdf_file_name)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
_do_parse(
pdf_file_name,
pdf_data,
jso,
method,
local_image_rw,
local_md_rw,
os.path.basename(local_image_dir),
)
if __name__ == '__main__':
if __name__ == "__main__":
"""
python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
"""
cli()
......@@ -2,6 +2,7 @@ import math
from loguru import logger
from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
TYPE_INLINE_EQUATION = ContentType.InlineEquation
......@@ -227,12 +228,12 @@ def __insert_before_para(text, type, element, content_list):
logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
def mk_universal_format(para_dict: dict):
def mk_universal_format(pdf_info_list: list, img_buket_path):
"""
构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
"""
content_lst = []
for _, page_info in para_dict.items():
for page_info in pdf_info_list:
page_lst = [] # 一个page内的段落列表
para_blocks = page_info.get("para_blocks")
pymu_raw_blocks = page_info.get("preproc_blocks")
......@@ -249,7 +250,7 @@ def mk_universal_format(para_dict: dict):
for img in all_page_images:
content_node = {
"type": "image",
"img_path": img['image_path'],
"img_path": join_path(img_buket_path, img['image_path']),
"img_alt":"",
"img_title":"",
"img_caption":""
......@@ -258,7 +259,7 @@ def mk_universal_format(para_dict: dict):
for table in all_page_tables:
content_node = {
"type": "table",
"img_path": table['image_path'],
"img_path": join_path(img_buket_path, table['image_path']),
"table_latex": table.get("text"),
"table_title": "",
"table_caption": "",
......
from loguru import logger
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
import wordninja
import re
......@@ -16,90 +19,41 @@ def split_long_words(text):
return ' '.join(segments)
def ocr_mk_nlp_markdown(pdf_info_dict: dict):
    """Render OCR page data as text-only markdown (no image links).

    Walks each page's preproc_blocks -> lines -> spans, escaping markdown
    special characters and wrapping equations in $ / $$ delimiters.

    :param pdf_info_dict: mapping of page key -> page_info dict
    :return: one markdown string, one rendered line per source line
    """
    markdown = []
    for _, page_info in pdf_info_dict.items():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                line_text = ''
                for span in line['spans']:
                    # Spans without textual content (e.g. pure images) are skipped here.
                    if not span.get('content'):
                        continue
                    content = ocr_escape_special_markdown_char(span['content'])  # escape markdown special characters
                    if span['type'] == ContentType.InlineEquation:
                        content = f"${content}$"
                    elif span['type'] == ContentType.InterlineEquation:
                        content = f"$$\n{content}\n$$"
                    line_text += content + ' '
                # Trailing space(s) force a markdown line break at end of line.
                markdown.append(line_text.strip() + ' ')
    return '\n'.join(markdown)
def ocr_mk_mm_markdown(pdf_info_dict: dict):
    """Render OCR page data as multimodal markdown (text plus image links).

    Same traversal as ocr_mk_nlp_markdown, but spans that carry an
    image_path instead of text content become ![]() image links.

    :param pdf_info_dict: mapping of page key -> page_info dict
    :return: one markdown string, one rendered line per source line
    """
    markdown = []
    for _, page_info in pdf_info_dict.items():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                line_text = ''
                for span in line['spans']:
                    if not span.get('content'):
                        # No text content: either an image span or nothing usable.
                        if not span.get('image_path'):
                            continue
                        else:
                            content = f"![]({span['image_path']})"
                    else:
                        content = ocr_escape_special_markdown_char(span['content'])  # escape markdown special characters
                        if span['type'] == ContentType.InlineEquation:
                            content = f"${content}$"
                        elif span['type'] == ContentType.InterlineEquation:
                            content = f"$$\n{content}\n$$"
                    line_text += content + ' '
                # Trailing space(s) force a markdown line break at end of line.
                markdown.append(line_text.strip() + ' ')
    return '\n'.join(markdown)
def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
markdown = []
for _, page_info in pdf_info_dict.items():
for page_info in pdf_info_list:
paras_of_layout = page_info.get("para_blocks")
page_markdown = ocr_mk_markdown_with_para_core(paras_of_layout, "mm")
page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
markdown.extend(page_markdown)
return '\n\n'.join(markdown)
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: dict):
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
markdown = []
for _, page_info in pdf_info_dict.items():
for page_info in pdf_info_dict:
paras_of_layout = page_info.get("para_blocks")
page_markdown = ocr_mk_markdown_with_para_core(paras_of_layout, "nlp")
page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
markdown.extend(page_markdown)
return '\n\n'.join(markdown)
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_path):
markdown_with_para_and_pagination = []
for page_no, page_info in pdf_info_dict.items():
page_no = 0
for page_info in pdf_info_dict:
paras_of_layout = page_info.get("para_blocks")
if not paras_of_layout:
continue
page_markdown = ocr_mk_markdown_with_para_core(paras_of_layout, "mm")
page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
markdown_with_para_and_pagination.append({
'page_no': page_no,
'md_content': '\n\n'.join(page_markdown)
})
page_no += 1
return markdown_with_para_and_pagination
def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
page_markdown = []
for paras in paras_of_layout:
for para in paras:
......@@ -122,7 +76,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
content = f"\n$$\n{span['content']}\n$$\n"
elif span_type in [ContentType.Image, ContentType.Table]:
if mode == 'mm':
content = f"\n![]({span['image_path']})\n"
content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
elif mode == 'nlp':
pass
if content != '':
......@@ -137,10 +91,86 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
return page_markdown
def para_to_standard_format(para):
def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
    """Render a page's paragraph blocks into markdown paragraph strings.

    :param paras_of_layout: list of para blocks; each has a 'type' (BlockType)
        and either 'lines' (text-like blocks) or nested 'blocks' (image/table)
    :param mode: "mm" renders images/tables as ![]() links; "nlp" skips them
    :param img_buket_path: prefix joined onto each span's image_path
    :return: list of markdown paragraph strings (empty paragraphs dropped)
    """
    page_markdown = []
    for para_block in paras_of_layout:
        para_text = ''
        para_type = para_block.get('type')
        if para_type == BlockType.Text:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Title:
            # Titles are rendered as a level-1 heading.
            para_text = f"# {merge_para_with_text(para_block)}"
        elif para_type == BlockType.InterlineEquation:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Image:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                img_blocks = para_block.get('blocks')
                # First pass: the image body becomes a markdown image link.
                for img_block in img_blocks:
                    if img_block.get('type') == BlockType.ImageBody:
                        for line in img_block.get('lines'):
                            for span in line['spans']:
                                if span.get('type') == ContentType.Image:
                                    para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                # Second pass: captions are appended after the image link.
                for img_block in img_blocks:
                    if img_block.get('type') == BlockType.ImageCaption:
                        para_text += merge_para_with_text(img_block)
        elif para_type == BlockType.Table:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                table_blocks = para_block.get('blocks')
                # First pass: the table body is rendered as an image link.
                for table_block in table_blocks:
                    if table_block.get('type') == BlockType.TableBody:
                        for line in table_block.get('lines'):
                            for span in line['spans']:
                                if span.get('type') == ContentType.Table:
                                    para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                # Second pass: caption and footnote text follow the table image.
                for table_block in table_blocks:
                    if table_block.get('type') == BlockType.TableCaption:
                        para_text += merge_para_with_text(table_block)
                    elif table_block.get('type') == BlockType.TableFootnote:
                        para_text += merge_para_with_text(table_block)
        if para_text.strip() == '':
            continue
        else:
            # Trailing space forces a markdown line break after the paragraph.
            page_markdown.append(para_text.strip() + ' ')
    return page_markdown
def merge_para_with_text(para):
    """Concatenate all spans of one paragraph block into a markdown string.

    Text spans are markdown-escaped (and long English words split); inline
    equations are wrapped in $...$, interline equations in $$...$$ blocks.
    English content is joined with spaces, other content without separators.

    :param para: block dict with 'lines' -> 'spans'
    :return: the merged paragraph text
    """
    para_text = ''
    for line in para['lines']:
        for span in line['spans']:
            span_type = span.get('type')
            content = ''
            language = ''
            if span_type == ContentType.Text:
                content = span['content']
                language = detect_lang(content)
                if language == 'en':  # only split long English words; word-splitting Chinese would lose text
                    content = ocr_escape_special_markdown_char(split_long_words(content))
                else:
                    content = ocr_escape_special_markdown_char(content)
            elif span_type == ContentType.InlineEquation:
                content = f"${span['content']}$"
            elif span_type == ContentType.InterlineEquation:
                content = f"\n$$\n{span['content']}\n$$\n"
            if content != '':
                if language == 'en':  # in an English context, contents are space-separated
                    para_text += content + ' '
                else:  # in a Chinese context, no separator between contents
                    para_text += content
    return para_text
def para_to_standard_format(para, img_buket_path):
para_content = {}
if len(para) == 1:
para_content = line_to_standard_format(para[0])
para_content = line_to_standard_format(para[0], img_buket_path)
elif len(para) > 1:
para_text = ''
inline_equation_num = 0
......@@ -148,6 +178,7 @@ def para_to_standard_format(para):
for span in line['spans']:
language = ''
span_type = span.get('type')
content = ""
if span_type == ContentType.Text:
content = span['content']
language = detect_lang(content)
......@@ -170,20 +201,21 @@ def para_to_standard_format(para):
}
return para_content
def make_standard_format_with_para(pdf_info_dict: dict):
def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
content_list = []
for _, page_info in pdf_info_dict.items():
for page_info in pdf_info_dict:
paras_of_layout = page_info.get("para_blocks")
if not paras_of_layout:
continue
for paras in paras_of_layout:
for para in paras:
para_content = para_to_standard_format(para)
para_content = para_to_standard_format(para, img_buket_path)
content_list.append(para_content)
return content_list
def line_to_standard_format(line):
def line_to_standard_format(line, img_buket_path):
line_text = ""
inline_equation_num = 0
for span in line['spans']:
......@@ -194,13 +226,13 @@ def line_to_standard_format(line):
if span['type'] == ContentType.Image:
content = {
'type': 'image',
'img_path': span['image_path']
'img_path': join_path(img_buket_path, span['image_path'])
}
return content
elif span['type'] == ContentType.Table:
content = {
'type': 'table',
'img_path': span['image_path']
'img_path': join_path(img_buket_path, span['image_path'])
}
return content
else:
......@@ -226,7 +258,7 @@ def line_to_standard_format(line):
return content
def ocr_mk_mm_standard_format(pdf_info_dict: dict):
def ocr_mk_mm_standard_format(pdf_info_dict: list):
"""
content_list
type string image/text/table/equation(行间的单独拿出来,行内的和text合并)
......@@ -236,7 +268,7 @@ def ocr_mk_mm_standard_format(pdf_info_dict: dict):
img_path string s3://full/path/to/img.jpg
"""
content_list = []
for _, page_info in pdf_info_dict.items():
for page_info in pdf_info_dict:
blocks = page_info.get("preproc_blocks")
if not blocks:
continue
......
......@@ -15,6 +15,7 @@ from collections import Counter
import click
import numpy as np
from loguru import logger
from magic_pdf.libs.commons import mymax, get_top_percent_list
from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min
......@@ -298,7 +299,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
return narrow_strip_pages_ratio < 0.5
def classify(pdf_path, total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
"""
这里的图片和页面长度单位是pts
:param total_page:
......@@ -323,7 +324,7 @@ def classify(pdf_path, total_page: int, page_width, page_height, img_sz_list: li
elif not any(results.values()):
return False, results
else:
print(f"WARNING: {pdf_path} is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
logger.warning(f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
return False, results
......@@ -350,7 +351,7 @@ def main(json_file):
is_needs_password = o['is_needs_password']
if is_encrypted or total_page == 0 or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理
continue
tag = classify(pdf_path, total_page, page_width, page_height, img_sz_list, text_len_list, text_layout_list)
tag = classify(total_page, page_width, page_height, img_sz_list, text_len_list, text_layout_list)
o['is_text_pdf'] = tag
print(json.dumps(o, ensure_ascii=False))
except Exception as e:
......
......@@ -287,7 +287,7 @@ def get_language(doc: fitz.Document):
return language
def pdf_meta_scan(s3_pdf_path: str, pdf_bytes: bytes):
def pdf_meta_scan(pdf_bytes: bytes):
"""
:param s3_pdf_path:
:param pdf_bytes: pdf文件的二进制数据
......@@ -298,8 +298,8 @@ def pdf_meta_scan(s3_pdf_path: str, pdf_bytes: bytes):
is_encrypted = doc.is_encrypted
total_page = len(doc)
if total_page == 0:
logger.warning(f"drop this pdf: {s3_pdf_path}, drop_reason: {DropReason.EMPTY_PDF}")
result = {"need_drop": True, "drop_reason": DropReason.EMPTY_PDF}
logger.warning(f"drop this pdf, drop_reason: {DropReason.EMPTY_PDF}")
result = {"_need_drop": True, "_drop_reason": DropReason.EMPTY_PDF}
return result
else:
page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
......@@ -322,7 +322,6 @@ def pdf_meta_scan(s3_pdf_path: str, pdf_bytes: bytes):
# 最后输出一条json
res = {
"pdf_path": s3_pdf_path,
"is_needs_password": is_needs_password,
"is_encrypted": is_encrypted,
"total_page": total_page,
......@@ -350,7 +349,7 @@ def main(s3_pdf_path: str, s3_profile: str):
"""
try:
file_content = read_file(s3_pdf_path, s3_profile)
pdf_meta_scan(s3_pdf_path, file_content)
pdf_meta_scan(file_content)
except Exception as e:
print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
logger.exception(e)
......
from enum import Enum
class ModelBlockTypeEnum(Enum):
    """Integer category codes for block types reported by the layout model.

    NOTE(review): the specific numeric values presumably mirror the upstream
    model's category-id label map — confirm against that model's definition.
    """
    TITLE = 0
    PLAIN_TEXT = 1
    ABANDON = 2
    ISOLATE_FORMULA = 8
    EMBEDDING = 13
    ISOLATED = 14
\ No newline at end of file
from loguru import logger
import math
def _is_in_or_part_overlap(box1, box2) -> bool:
"""
......@@ -332,3 +332,42 @@ def find_right_nearest_text_bbox(pymu_blocks, obj_bbox):
return right_boxes[0]
else:
return None
def bbox_relative_pos(bbox1, bbox2):
    """Describe where bbox2 lies relative to bbox1 along each axis.

    Boxes are (x0, y0, x1, y1). Returns (left, right, bottom, top) booleans:
    left  — bbox2 ends before bbox1 starts on x
    right — bbox2 starts after bbox1 ends on x
    bottom — bbox2 ends before bbox1 starts on y
    top    — bbox2 starts after bbox1 ends on y
    All four are False when the boxes overlap on both axes.
    """
    a_x0, a_y0, a_x1, a_y1 = bbox1
    b_x0, b_y0, b_x1, b_y1 = bbox2
    is_left = b_x1 < a_x0
    is_right = a_x1 < b_x0
    is_bottom = b_y1 < a_y0
    is_top = a_y1 < b_y0
    return is_left, is_right, is_bottom, is_top
def bbox_distance(bbox1, bbox2):
    """Shortest distance between two axis-aligned boxes (x0, y0, x1, y1).

    Diagonal separation uses the corner-to-corner Euclidean distance;
    pure horizontal/vertical separation uses the gap along that axis.

    :return: 0 when the boxes intersect, otherwise the gap distance
    """
    x1, y1, x1b, y1b = bbox1
    x2, y2, x2b, y2b = bbox2
    left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
    # math.hypot replaces the hand-rolled sqrt-of-squares: same result,
    # idiomatic and numerically robust against overflow/underflow.
    if top and left:
        return math.hypot(x1 - x2b, y1b - y2)
    elif left and bottom:
        return math.hypot(x1 - x2b, y1 - y2b)
    elif bottom and right:
        return math.hypot(x1b - x2, y1 - y2b)
    elif right and top:
        return math.hypot(x1b - x2, y1b - y2)
    elif left:
        return x1 - x2b
    elif right:
        return x2 - x1b
    elif bottom:
        return y1 - y2b
    elif top:
        return y2 - y1b
    else:  # rectangles intersect
        return 0
\ No newline at end of file
......@@ -2,6 +2,7 @@
根据bucket的名字返回对应的s3 AK, SK,endpoint三元组
"""
import json
import os
......@@ -10,20 +11,24 @@ from loguru import logger
from magic_pdf.libs.commons import parse_bucket_key
def get_s3_config(bucket_name: str):
"""
~/magic-pdf.json 读出来
"""
def read_config():
    """Load the user-level magic-pdf configuration.

    Reads ~/magic-pdf.json and returns it as a dict.

    :raises Exception: when the config file does not exist
    """
    config_file = os.path.join(os.path.expanduser("~"), "magic-pdf.json")
    if not os.path.exists(config_file):
        raise Exception(f"{config_file} not found")
    with open(config_file, "r") as f:
        return json.load(f)
def get_s3_config(bucket_name: str):
"""
~/magic-pdf.json 读出来
"""
config = read_config()
bucket_info = config.get("bucket_info")
if bucket_name not in bucket_info:
......@@ -49,5 +54,10 @@ def get_bucket_name(path):
return bucket
if __name__ == '__main__':
def get_local_dir():
    """Return the configured temp output directory, defaulting to /tmp."""
    return read_config().get("temp-output-dir", "/tmp")
if __name__ == "__main__":
ak, sk, endpoint = get_s3_config("llm-raw")
def dict_to_list(input_dict):
    """Return the dict's values as a list (insertion order preserved).

    Replaces the manual append loop with the idiomatic dict.values() call.
    """
    return list(input_dict.values())
def get_scale_ratio(model_page_info, page):
    """Scale factors between the model's page size and PyMuPDF's 72-dpi render.

    The block contained duplicated old/new diff lines (two `def` lines, two
    pairs of width/height lookups); resolved here to the `model_page_info`
    version of the function.

    :param model_page_info: dict exposing model_page_info['page_info']['width'/'height']
    :param page: a fitz.Page; rendered at dpi=72 to obtain the reference pixel size
    :return: (horizontal_scale_ratio, vertical_scale_ratio)
    """
    pix = page.get_pixmap(dpi=72)
    pymu_width = int(pix.w)
    pymu_height = int(pix.h)
    page_info = model_page_info['page_info']
    horizontal_scale_ratio = page_info['width'] / pymu_width
    vertical_scale_ratio = page_info['height'] / pymu_height
    return horizontal_scale_ratio, vertical_scale_ratio
from collections import Counter
from magic_pdf.libs.language import detect_lang
def get_language_from_model(model_list: list):
    """Determine the document language by majority vote over pages.

    For each page, concatenates the text of layout detections whose
    category_id is in the allowed set, detects that page's language, then
    returns the language that occurs on the most pages.
    """
    allowed_category_ids = [15]
    per_page_languages = []
    for page in model_list:
        fragments = [
            det["text"]
            for det in page["layout_dets"]
            if det["category_id"] in allowed_category_ids
        ]
        per_page_languages.append(detect_lang("".join(fragments)))
    # Count occurrences per language and return the most frequent one.
    counts = Counter(per_page_languages)
    return max(counts, key=counts.get)
......@@ -8,7 +8,7 @@ class DropReason:
HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs" # 特殊的SVG图,计算量太大,从而丢弃
HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages" # 计算量超过负荷,当前方法下计算量消耗过大
MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result" # 版面分析失败
Exception = "exception" # 解析中发生异常
Exception = "_exception" # 解析中发生异常
ENCRYPTED = "encrypted" # PDF是加密的
EMPTY_PDF = "total_page=0" # PDF页面总数为0
NOT_IS_TEXT_PDF = "not_is_text_pdf" # 不是文字版PDF,无法直接解析
......
......@@ -16,3 +16,4 @@ class DropTag:
FOOTNOTE = "footnote"
NOT_IN_LAYOUT = "not_in_layout"
SPAN_OVERLAP = "span_overlap"
BLOCK_OVERLAP = "block_overlap"
def float_gt(a, b):
    """True iff a exceeds b by more than the 1e-4 tolerance."""
    return a - b > 0.0001
def float_equal(a, b):
    """True iff a and b differ by at most the 1e-4 tolerance."""
    return abs(a - b) <= 0.0001
\ No newline at end of file
......@@ -4,4 +4,17 @@ class ContentType:
Text = "text"
InlineEquation = "inline_equation"
InterlineEquation = "interline_equation"
class BlockType:
    """String constants naming paragraph-level block types.

    Image/Table are composite blocks whose parts (body, caption, footnote)
    carry their own sub-type constants.
    """
    Image = "image"
    ImageBody = "image_body"
    ImageCaption = "image_caption"
    Table = "table"
    TableBody = "table_body"
    TableCaption = "table_caption"
    TableFootnote = "table_footnote"
    Text = "text"
    Title = "title"
    InterlineEquation = "interline_equation"
    Footnote = "footnote"
from s3pathlib import S3Path
def remove_non_official_s3_args(s3path):
    """Strip any non-official query suffix from an s3 path.

    example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
    """
    head, _, _ = s3path.partition("?")
    return head
def parse_s3path(s3path: str):
    """Split an s3 URI into (bucket, key), stripping any '?...' suffix first."""
    p = S3Path(remove_non_official_s3_args(s3path))
    return p.bucket, p.key
def parse_s3_range_params(s3path: str):
    """Extract the byte-range arguments from an s3 path, if present.

    example: s3://abc/xxxx.json?bytes=0,81350 ==> ['0', '81350']
    Returns None when no '?bytes=' suffix is present. Note the values are
    returned as strings; callers convert them to int.
    """
    parts = s3path.split("?bytes=")
    return None if len(parts) == 1 else parts[1].split(",")
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import fitz
from loguru import logger
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.hash_utils import compute_sha256
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter):
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: AbsReaderWriter):
"""
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
......@@ -28,49 +28,6 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
imageWriter.write(data=byte_data, path=img_hash256_path, mode="binary")
imageWriter.write(byte_data, img_hash256_path, AbsReaderWriter.MODE_BIN)
return img_hash256_path
def save_images_by_bboxes(page_num: int, page: fitz.Page, pdf_bytes_md5: str,
                          image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
                          equation_inline_bboxes: list,
                          equation_interline_bboxes: list, imageWriter) -> tuple:
    """Cut and save the images/tables of one page, keyed by their bboxes.

    Save path layout:
    {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{x0}_{y0}_{x1}_{y1}.jpg

    Fixes the return annotation: the function returns a 5-tuple, not a dict.
    The three copy-pasted cut loops are deduplicated into one helper.

    :return: (image_info, image_backup_info, table_info, inline_eq_info,
              interline_eq_info) — lists of {"bbox": ..., "image_path": ...}.
              NOTE(review): the two equation lists are always returned empty;
              equation_inline_bboxes / equation_interline_bboxes are accepted
              but never processed here — kept for interface compatibility.
    """
    inline_eq_info = []
    interline_eq_info = []

    def return_path(type):
        # Images are grouped under {pdf_md5}/{images|tables|equations}.
        return join_path(pdf_bytes_md5, type)

    def _cut_valid_bboxes(bboxes, subdir, tag):
        # Skip degenerate boxes (x0 >= x1 or y0 >= y1) with a warning.
        infos = []
        for bbox in bboxes:
            if bbox[0] >= bbox[2] or bbox[1] >= bbox[3]:
                logger.warning(f"{tag}: 错误的box, {bbox}")
                continue
            image_path = cut_image(bbox, page_num, page, return_path(subdir), imageWriter)
            infos.append({"bbox": bbox, "image_path": image_path})
        return infos

    image_info = _cut_valid_bboxes(image_bboxes, "images", "image_bboxes")
    image_backup_info = _cut_valid_bboxes(images_overlap_backup, "images", "images_overlap_backup")
    table_info = _cut_valid_bboxes(table_bboxes, "tables", "table_bboxes")
    return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment