Commit 23bacc60 authored by Shuimo's avatar Shuimo
Browse files

add an option to freely output 'badcase.json'

parents d1457937 4191fa96
...@@ -40,15 +40,20 @@ jobs: ...@@ -40,15 +40,20 @@ jobs:
pip install -r requirements.txt pip install -r requirements.txt
fi fi
- name: config-net-reset
- name: benchmark run: |
export http_proxy=""
export https_proxy=""
- name: get-benchmark-result
run: | run: |
echo "start test" echo "start test"
cd tools && python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip badcase.json overall.json base_data.json cd tools && python text_badcase.py pdf_json_label_0306.json pdf_json_label_0229.json json_files.zip text_badcase text_overall base_data_text.json --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
python ocr_badcase.py pdf_json_label_0306.json ocr_dataset.json json_files.zip ocr_badcase ocr_overall base_data_ocr.json --s3_bucket_name llm-process-pperf --s3_file_directory qa-validate/pdf-datasets/badcase --AWS_ACCESS_KEY 7X9CWNHIVOHH3LXRD5WK --AWS_SECRET_KEY IHLyTsv7h4ArzReLWUGZNKvwqB7CMrRi6e7ZyUt0 --END_POINT_URL http://p-ceph-norm-inside.pjlab.org.cn:80
notify_to_feishu: notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }} if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
needs: [pdf-test] needs: [pdf-test]
runs-on: [pdf] runs-on: pdf
steps: steps:
- name: notify - name: notify
run: | run: |
......
...@@ -22,15 +22,15 @@ git clone https://github.com/magicpdf/Magic-PDF.git ...@@ -22,15 +22,15 @@ git clone https://github.com/magicpdf/Magic-PDF.git
2.Install the requirements 2.Install the requirements
```sh ```sh
cd Magic-PDF
pip install -r requirements.txt pip install -r requirements.txt
``` ```
3.Run the main script 3.Run the command line
```sh ```sh
use demo/text_demo.py export PYTHONPATH=.
or python magic_pdf/cli/magicpdf.py --help
use demo/ocr_demo.py
``` ```
### 版权说明 ### 版权说明
......
...@@ -15,7 +15,7 @@ from loguru import logger ...@@ -15,7 +15,7 @@ from loguru import logger
from magic_pdf.libs.config_reader import get_s3_config_dict from magic_pdf.libs.config_reader import get_s3_config_dict
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
from magic_pdf.spark.base import get_data_source from magic_pdf.spark.spark_api import get_data_source
def demo_parse_pdf(book_name=None, start_page_id=0, debug_mode=True): def demo_parse_pdf(book_name=None, start_page_id=0, debug_mode=True):
...@@ -67,9 +67,7 @@ def demo_classify_by_type(book_name=None, debug_mode=True): ...@@ -67,9 +67,7 @@ def demo_classify_by_type(book_name=None, debug_mode=True):
img_num_list = pdf_meta["imgs_per_page"] img_num_list = pdf_meta["imgs_per_page"]
text_len_list = pdf_meta["text_len_per_page"] text_len_list = pdf_meta["text_len_per_page"]
text_layout_list = pdf_meta["text_layout_per_page"] text_layout_list = pdf_meta["text_layout_per_page"]
pdf_path = json_object.get("file_location")
is_text_pdf, results = classify( is_text_pdf, results = classify(
pdf_path,
total_page, total_page,
page_width, page_width,
page_height, page_height,
...@@ -89,7 +87,7 @@ def demo_meta_scan(book_name=None, debug_mode=True): ...@@ -89,7 +87,7 @@ def demo_meta_scan(book_name=None, debug_mode=True):
s3_pdf_path = json_object.get("file_location") s3_pdf_path = json_object.get("file_location")
s3_config = get_s3_config_dict(s3_pdf_path) s3_config = get_s3_config_dict(s3_pdf_path)
pdf_bytes = read_file(s3_pdf_path, s3_config) pdf_bytes = read_file(s3_pdf_path, s3_config)
res = pdf_meta_scan(s3_pdf_path, pdf_bytes) res = pdf_meta_scan(pdf_bytes)
logger.info(json.dumps(res, ensure_ascii=False)) logger.info(json.dumps(res, ensure_ascii=False))
write_json_to_local(res, book_name) write_json_to_local(res, book_name)
......
...@@ -21,28 +21,175 @@ python magicpdf.py --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350 ...@@ -21,28 +21,175 @@ python magicpdf.py --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf
""" """
import os
import json as json_parse
import click
from loguru import logger
from pathlib import Path
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (
parse_s3path,
parse_s3_range_params,
remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import get_local_dir
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
def prepare_env(pdf_file_name):
    """Create (if needed) the local output directories for one PDF.

    Layout is ``<temp-output-dir>/magic-pdf/<pdf_file_name>`` for the
    markdown output, with an ``images`` subdirectory for extracted images.

    :param pdf_file_name: stem of the PDF, used as the per-document dir name
    :return: tuple ``(local_image_dir, local_md_dir)``
    """
    md_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name)
    image_dir = os.path.join(md_dir, "images")
    for directory in (image_dir, md_dir):
        os.makedirs(directory, exist_ok=True)
    return image_dir, md_dir
def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir):
if parse_method == "auto":
pipe = UNIPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
elif parse_method == "txt":
pipe = TXTPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
elif parse_method == "ocr":
pipe = OCRPipe(pdf_bytes, model_list, image_writer, image_dir, is_debug=True)
else:
print("unknow parse method")
os.exit(1)
pipe.pipe_classify()
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown()
#part_file_name = datetime.now().strftime("%H-%M-%S")
md_writer.write(
content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
)
md_writer.write(
content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
path=f"{pdf_file_name}.json",
mode=AbsReaderWriter.MODE_TXT,
)
# try:
# content_list = pipe.pipe_mk_uni_format()
# except Exception as e:
# logger.exception(e)
# md_writer.write(
# str(content_list), f"{part_file_name}.txt", AbsReaderWriter.MODE_TXT
# )
import click
@click.group() @click.group()
def cli(): def cli():
pass pass
@cli.command() @cli.command()
@click.option('--json', type=str, help='输入一个S3路径') @click.option("--json", type=str, help="输入一个S3路径")
def json_command(json): @click.option(
# 这里处理json相关的逻辑 "--method",
print(f'处理JSON: {json}') type=parse_pdf_methods,
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
default="auto",
)
def json_command(json, method):
if not json.startswith("s3://"):
print("usage: python magipdf.py --json s3://some_bucket/some_path")
os.exit(1)
def read_s3_path(s3path):
bucket, key = parse_s3path(s3path)
s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
s3_rw = S3ReaderWriter(
s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
)
may_range_params = parse_s3_range_params(s3path)
if may_range_params is None or 2 != len(may_range_params):
byte_start, byte_end = 0, None
else:
byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
byte_end += byte_start - 1
return s3_rw.read_jsonl(
remove_non_official_s3_args(s3path),
byte_start,
byte_end,
AbsReaderWriter.MODE_BIN,
)
jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
s3_file_path = jso["file_location"]
pdf_file_name = Path(s3_file_path).stem
pdf_data = read_s3_path(s3_file_path)
local_image_dir, local_md_dir = prepare_env(pdf_file_name)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
_do_parse(
pdf_file_name,
pdf_data,
jso["doc_layout_result"],
method,
local_image_rw,
local_md_rw,
os.path.basename(local_image_dir),
)
@cli.command() @cli.command()
@click.option('--pdf', type=click.Path(exists=True), required=True, help='PDF文件的路径') @click.option(
@click.option('--model', type=click.Path(exists=True), help='模型的路径') "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
def pdf_command(pdf, model): )
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
"--method",
type=parse_pdf_methods,
help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
default="auto",
)
def pdf_command(pdf, model, method):
# 这里处理pdf和模型相关的逻辑 # 这里处理pdf和模型相关的逻辑
print(f'处理PDF: {pdf}') if model is None:
print(f'加载模型: {model}') model = pdf.replace(".pdf", ".json")
if not os.path.exists(model):
print(f"make sure json file existed and place under {os.dirname(pdf)}")
os.exit(1)
def read_fn(path):
disk_rw = DiskReaderWriter(os.path.dirname(path))
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
pdf_data = read_fn(pdf)
jso = json_parse.loads(read_fn(model).decode("utf-8"))
pdf_file_name = Path(pdf).stem
local_image_dir, local_md_dir = prepare_env(pdf_file_name)
local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
local_md_dir
)
_do_parse(
pdf_file_name,
pdf_data,
jso,
method,
local_image_rw,
local_md_rw,
os.path.basename(local_image_dir),
)
if __name__ == '__main__': if __name__ == "__main__":
"""
python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
"""
cli() cli()
...@@ -2,6 +2,7 @@ import math ...@@ -2,6 +2,7 @@ import math
from loguru import logger from loguru import logger
from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType from magic_pdf.libs.ocr_content_type import ContentType
TYPE_INLINE_EQUATION = ContentType.InlineEquation TYPE_INLINE_EQUATION = ContentType.InlineEquation
...@@ -227,12 +228,12 @@ def __insert_before_para(text, type, element, content_list): ...@@ -227,12 +228,12 @@ def __insert_before_para(text, type, element, content_list):
logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}") logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
def mk_universal_format(para_dict: dict): def mk_universal_format(pdf_info_list: list, img_buket_path):
""" """
构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY 构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
""" """
content_lst = [] content_lst = []
for _, page_info in para_dict.items(): for page_info in pdf_info_list:
page_lst = [] # 一个page内的段落列表 page_lst = [] # 一个page内的段落列表
para_blocks = page_info.get("para_blocks") para_blocks = page_info.get("para_blocks")
pymu_raw_blocks = page_info.get("preproc_blocks") pymu_raw_blocks = page_info.get("preproc_blocks")
...@@ -249,7 +250,7 @@ def mk_universal_format(para_dict: dict): ...@@ -249,7 +250,7 @@ def mk_universal_format(para_dict: dict):
for img in all_page_images: for img in all_page_images:
content_node = { content_node = {
"type": "image", "type": "image",
"img_path": img['image_path'], "img_path": join_path(img_buket_path, img['image_path']),
"img_alt":"", "img_alt":"",
"img_title":"", "img_title":"",
"img_caption":"" "img_caption":""
...@@ -258,7 +259,7 @@ def mk_universal_format(para_dict: dict): ...@@ -258,7 +259,7 @@ def mk_universal_format(para_dict: dict):
for table in all_page_tables: for table in all_page_tables:
content_node = { content_node = {
"type": "table", "type": "table",
"img_path": table['image_path'], "img_path": join_path(img_buket_path, table['image_path']),
"table_latex": table.get("text"), "table_latex": table.get("text"),
"table_title": "", "table_title": "",
"table_caption": "", "table_caption": "",
......
from loguru import logger
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.language import detect_lang from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import ContentType from magic_pdf.libs.ocr_content_type import ContentType, BlockType
import wordninja import wordninja
import re import re
...@@ -16,90 +19,41 @@ def split_long_words(text): ...@@ -16,90 +19,41 @@ def split_long_words(text):
return ' '.join(segments) return ' '.join(segments)
def ocr_mk_nlp_markdown(pdf_info_dict: dict): def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
markdown = []
for _, page_info in pdf_info_dict.items():
blocks = page_info.get("preproc_blocks")
if not blocks:
continue
for block in blocks:
for line in block['lines']:
line_text = ''
for span in line['spans']:
if not span.get('content'):
continue
content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
if span['type'] == ContentType.InlineEquation:
content = f"${content}$"
elif span['type'] == ContentType.InterlineEquation:
content = f"$$\n{content}\n$$"
line_text += content + ' '
# 在行末添加两个空格以强制换行
markdown.append(line_text.strip() + ' ')
return '\n'.join(markdown)
def ocr_mk_mm_markdown(pdf_info_dict: dict):
markdown = []
for _, page_info in pdf_info_dict.items():
blocks = page_info.get("preproc_blocks")
if not blocks:
continue
for block in blocks:
for line in block['lines']:
line_text = ''
for span in line['spans']:
if not span.get('content'):
if not span.get('image_path'):
continue
else:
content = f"![]({span['image_path']})"
else:
content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
if span['type'] == ContentType.InlineEquation:
content = f"${content}$"
elif span['type'] == ContentType.InterlineEquation:
content = f"$$\n{content}\n$$"
line_text += content + ' '
# 在行末添加两个空格以强制换行
markdown.append(line_text.strip() + ' ')
return '\n'.join(markdown)
def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
markdown = [] markdown = []
for _, page_info in pdf_info_dict.items(): for page_info in pdf_info_list:
paras_of_layout = page_info.get("para_blocks") paras_of_layout = page_info.get("para_blocks")
page_markdown = ocr_mk_markdown_with_para_core(paras_of_layout, "mm") page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
markdown.extend(page_markdown) markdown.extend(page_markdown)
return '\n\n'.join(markdown) return '\n\n'.join(markdown)
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: dict): def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
markdown = [] markdown = []
for _, page_info in pdf_info_dict.items(): for page_info in pdf_info_dict:
paras_of_layout = page_info.get("para_blocks") paras_of_layout = page_info.get("para_blocks")
page_markdown = ocr_mk_markdown_with_para_core(paras_of_layout, "nlp") page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
markdown.extend(page_markdown) markdown.extend(page_markdown)
return '\n\n'.join(markdown) return '\n\n'.join(markdown)
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_path):
markdown_with_para_and_pagination = [] markdown_with_para_and_pagination = []
for page_no, page_info in pdf_info_dict.items(): page_no = 0
for page_info in pdf_info_dict:
paras_of_layout = page_info.get("para_blocks") paras_of_layout = page_info.get("para_blocks")
if not paras_of_layout: if not paras_of_layout:
continue continue
page_markdown = ocr_mk_markdown_with_para_core(paras_of_layout, "mm") page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
markdown_with_para_and_pagination.append({ markdown_with_para_and_pagination.append({
'page_no': page_no, 'page_no': page_no,
'md_content': '\n\n'.join(page_markdown) 'md_content': '\n\n'.join(page_markdown)
}) })
page_no += 1
return markdown_with_para_and_pagination return markdown_with_para_and_pagination
def ocr_mk_markdown_with_para_core(paras_of_layout, mode): def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
page_markdown = [] page_markdown = []
for paras in paras_of_layout: for paras in paras_of_layout:
for para in paras: for para in paras:
...@@ -122,7 +76,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode): ...@@ -122,7 +76,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
content = f"\n$$\n{span['content']}\n$$\n" content = f"\n$$\n{span['content']}\n$$\n"
elif span_type in [ContentType.Image, ContentType.Table]: elif span_type in [ContentType.Image, ContentType.Table]:
if mode == 'mm': if mode == 'mm':
content = f"\n![]({span['image_path']})\n" content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
elif mode == 'nlp': elif mode == 'nlp':
pass pass
if content != '': if content != '':
...@@ -137,10 +91,86 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode): ...@@ -137,10 +91,86 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
return page_markdown return page_markdown
def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
    """Render one page's paragraph blocks (v2 block schema) to markdown.

    :param paras_of_layout: list of paragraph blocks; each carries a 'type'
        (BlockType) and either 'lines' (text-like) or nested 'blocks'
        (image/table groups)
    :param mode: "mm" keeps images/tables as markdown image links,
        "nlp" skips them entirely
    :param img_buket_path: prefix joined onto span image paths in "mm" mode
    :return: list of markdown paragraph strings; blank paragraphs are dropped
    """
    page_markdown = []
    for para_block in paras_of_layout:
        para_text = ''
        para_type = para_block.get('type')
        if para_type == BlockType.Text:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Title:
            para_text = f"# {merge_para_with_text(para_block)}"
        elif para_type == BlockType.InterlineEquation:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Image:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                img_blocks = para_block.get('blocks')
                # first pass: the image body span becomes a markdown image link
                for img_block in img_blocks:
                    if img_block.get('type') == BlockType.ImageBody:
                        for line in img_block.get('lines'):
                            for span in line['spans']:
                                if span.get('type') == ContentType.Image:
                                    para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                # second pass: caption text is appended after the image link
                for img_block in img_blocks:
                    if img_block.get('type') == BlockType.ImageCaption:
                        para_text += merge_para_with_text(img_block)
        elif para_type == BlockType.Table:
            if mode == 'nlp':
                continue
            elif mode == 'mm':
                table_blocks = para_block.get('blocks')
                # table body spans carry an image_path and are emitted as a link
                for table_block in table_blocks:
                    if table_block.get('type') == BlockType.TableBody:
                        for line in table_block.get('lines'):
                            for span in line['spans']:
                                if span.get('type') == ContentType.Table:
                                    para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                # captions and footnotes follow the table image
                for table_block in table_blocks:
                    if table_block.get('type') == BlockType.TableCaption:
                        para_text += merge_para_with_text(table_block)
                    elif table_block.get('type') == BlockType.TableFootnote:
                        para_text += merge_para_with_text(table_block)
        if para_text.strip():
            page_markdown.append(para_text.strip() + ' ')
    return page_markdown
def merge_para_with_text(para):
    """Concatenate all span contents of one paragraph block into a string.

    Text spans are language-detected: English content is word-split and
    markdown-escaped and fragments are joined with spaces; other languages
    are escaped only and joined without separators. Inline equations are
    wrapped in ``$...$`` and interline equations in ``$$`` blocks.
    """
    pieces = []
    for line in para['lines']:
        for span in line['spans']:
            kind = span.get('type')
            text = ''
            lang = ''
            if kind == ContentType.Text:
                text = span['content']
                lang = detect_lang(text)
                if lang == 'en':  # only split long English words; CJK splitting loses text
                    text = ocr_escape_special_markdown_char(split_long_words(text))
                else:
                    text = ocr_escape_special_markdown_char(text)
            elif kind == ContentType.InlineEquation:
                text = f"${span['content']}$"
            elif kind == ContentType.InterlineEquation:
                text = f"\n$$\n{span['content']}\n$$\n"
            if text != '':
                # English fragments need a separating space; CJK does not
                pieces.append(text + ' ' if lang == 'en' else text)
    return ''.join(pieces)
def para_to_standard_format(para, img_buket_path):
para_content = {} para_content = {}
if len(para) == 1: if len(para) == 1:
para_content = line_to_standard_format(para[0]) para_content = line_to_standard_format(para[0], img_buket_path)
elif len(para) > 1: elif len(para) > 1:
para_text = '' para_text = ''
inline_equation_num = 0 inline_equation_num = 0
...@@ -148,6 +178,7 @@ def para_to_standard_format(para): ...@@ -148,6 +178,7 @@ def para_to_standard_format(para):
for span in line['spans']: for span in line['spans']:
language = '' language = ''
span_type = span.get('type') span_type = span.get('type')
content = ""
if span_type == ContentType.Text: if span_type == ContentType.Text:
content = span['content'] content = span['content']
language = detect_lang(content) language = detect_lang(content)
...@@ -170,20 +201,21 @@ def para_to_standard_format(para): ...@@ -170,20 +201,21 @@ def para_to_standard_format(para):
} }
return para_content return para_content
def make_standard_format_with_para(pdf_info_dict: dict):
def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
content_list = [] content_list = []
for _, page_info in pdf_info_dict.items(): for page_info in pdf_info_dict:
paras_of_layout = page_info.get("para_blocks") paras_of_layout = page_info.get("para_blocks")
if not paras_of_layout: if not paras_of_layout:
continue continue
for paras in paras_of_layout: for paras in paras_of_layout:
for para in paras: for para in paras:
para_content = para_to_standard_format(para) para_content = para_to_standard_format(para, img_buket_path)
content_list.append(para_content) content_list.append(para_content)
return content_list return content_list
def line_to_standard_format(line): def line_to_standard_format(line, img_buket_path):
line_text = "" line_text = ""
inline_equation_num = 0 inline_equation_num = 0
for span in line['spans']: for span in line['spans']:
...@@ -194,13 +226,13 @@ def line_to_standard_format(line): ...@@ -194,13 +226,13 @@ def line_to_standard_format(line):
if span['type'] == ContentType.Image: if span['type'] == ContentType.Image:
content = { content = {
'type': 'image', 'type': 'image',
'img_path': span['image_path'] 'img_path': join_path(img_buket_path, span['image_path'])
} }
return content return content
elif span['type'] == ContentType.Table: elif span['type'] == ContentType.Table:
content = { content = {
'type': 'table', 'type': 'table',
'img_path': span['image_path'] 'img_path': join_path(img_buket_path, span['image_path'])
} }
return content return content
else: else:
...@@ -226,7 +258,7 @@ def line_to_standard_format(line): ...@@ -226,7 +258,7 @@ def line_to_standard_format(line):
return content return content
def ocr_mk_mm_standard_format(pdf_info_dict: dict): def ocr_mk_mm_standard_format(pdf_info_dict: list):
""" """
content_list content_list
type string image/text/table/equation(行间的单独拿出来,行内的和text合并) type string image/text/table/equation(行间的单独拿出来,行内的和text合并)
...@@ -236,7 +268,7 @@ def ocr_mk_mm_standard_format(pdf_info_dict: dict): ...@@ -236,7 +268,7 @@ def ocr_mk_mm_standard_format(pdf_info_dict: dict):
img_path string s3://full/path/to/img.jpg img_path string s3://full/path/to/img.jpg
""" """
content_list = [] content_list = []
for _, page_info in pdf_info_dict.items(): for page_info in pdf_info_dict:
blocks = page_info.get("preproc_blocks") blocks = page_info.get("preproc_blocks")
if not blocks: if not blocks:
continue continue
......
...@@ -15,6 +15,7 @@ from collections import Counter ...@@ -15,6 +15,7 @@ from collections import Counter
import click import click
import numpy as np import numpy as np
from loguru import logger
from magic_pdf.libs.commons import mymax, get_top_percent_list from magic_pdf.libs.commons import mymax, get_top_percent_list
from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min
...@@ -298,7 +299,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list): ...@@ -298,7 +299,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
return narrow_strip_pages_ratio < 0.5 return narrow_strip_pages_ratio < 0.5
def classify(pdf_path, total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list): def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
""" """
这里的图片和页面长度单位是pts 这里的图片和页面长度单位是pts
:param total_page: :param total_page:
...@@ -323,7 +324,7 @@ def classify(pdf_path, total_page: int, page_width, page_height, img_sz_list: li ...@@ -323,7 +324,7 @@ def classify(pdf_path, total_page: int, page_width, page_height, img_sz_list: li
elif not any(results.values()): elif not any(results.values()):
return False, results return False, results
else: else:
print(f"WARNING: {pdf_path} is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法 logger.warning(f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}", file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
return False, results return False, results
...@@ -350,7 +351,7 @@ def main(json_file): ...@@ -350,7 +351,7 @@ def main(json_file):
is_needs_password = o['is_needs_password'] is_needs_password = o['is_needs_password']
if is_encrypted or total_page == 0 or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理 if is_encrypted or total_page == 0 or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理
continue continue
tag = classify(pdf_path, total_page, page_width, page_height, img_sz_list, text_len_list, text_layout_list) tag = classify(total_page, page_width, page_height, img_sz_list, text_len_list, text_layout_list)
o['is_text_pdf'] = tag o['is_text_pdf'] = tag
print(json.dumps(o, ensure_ascii=False)) print(json.dumps(o, ensure_ascii=False))
except Exception as e: except Exception as e:
......
...@@ -287,7 +287,7 @@ def get_language(doc: fitz.Document): ...@@ -287,7 +287,7 @@ def get_language(doc: fitz.Document):
return language return language
def pdf_meta_scan(s3_pdf_path: str, pdf_bytes: bytes): def pdf_meta_scan(pdf_bytes: bytes):
""" """
:param s3_pdf_path: :param s3_pdf_path:
:param pdf_bytes: pdf文件的二进制数据 :param pdf_bytes: pdf文件的二进制数据
...@@ -298,8 +298,8 @@ def pdf_meta_scan(s3_pdf_path: str, pdf_bytes: bytes): ...@@ -298,8 +298,8 @@ def pdf_meta_scan(s3_pdf_path: str, pdf_bytes: bytes):
is_encrypted = doc.is_encrypted is_encrypted = doc.is_encrypted
total_page = len(doc) total_page = len(doc)
if total_page == 0: if total_page == 0:
logger.warning(f"drop this pdf: {s3_pdf_path}, drop_reason: {DropReason.EMPTY_PDF}") logger.warning(f"drop this pdf, drop_reason: {DropReason.EMPTY_PDF}")
result = {"need_drop": True, "drop_reason": DropReason.EMPTY_PDF} result = {"_need_drop": True, "_drop_reason": DropReason.EMPTY_PDF}
return result return result
else: else:
page_width_pts, page_height_pts = get_pdf_page_size_pts(doc) page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
...@@ -322,7 +322,6 @@ def pdf_meta_scan(s3_pdf_path: str, pdf_bytes: bytes): ...@@ -322,7 +322,6 @@ def pdf_meta_scan(s3_pdf_path: str, pdf_bytes: bytes):
# 最后输出一条json # 最后输出一条json
res = { res = {
"pdf_path": s3_pdf_path,
"is_needs_password": is_needs_password, "is_needs_password": is_needs_password,
"is_encrypted": is_encrypted, "is_encrypted": is_encrypted,
"total_page": total_page, "total_page": total_page,
...@@ -350,7 +349,7 @@ def main(s3_pdf_path: str, s3_profile: str): ...@@ -350,7 +349,7 @@ def main(s3_pdf_path: str, s3_profile: str):
""" """
try: try:
file_content = read_file(s3_pdf_path, s3_profile) file_content = read_file(s3_pdf_path, s3_profile)
pdf_meta_scan(s3_pdf_path, file_content) pdf_meta_scan(file_content)
except Exception as e: except Exception as e:
print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr) print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr)
logger.exception(e) logger.exception(e)
......
from enum import Enum
class ModelBlockTypeEnum(Enum):
    """Numeric block-type codes for layout-model output.

    NOTE(review): the gaps in the numbering (3-7, 9-12) suggest these mirror
    an external model's category ids — confirm against the model's docs.
    """
    TITLE = 0
    PLAIN_TEXT = 1
    ABANDON = 2
    ISOLATE_FORMULA = 8
    EMBEDDING = 13
    ISOLATED = 14
\ No newline at end of file
from loguru import logger from loguru import logger
import math
def _is_in_or_part_overlap(box1, box2) -> bool: def _is_in_or_part_overlap(box1, box2) -> bool:
""" """
...@@ -332,3 +332,42 @@ def find_right_nearest_text_bbox(pymu_blocks, obj_bbox): ...@@ -332,3 +332,42 @@ def find_right_nearest_text_bbox(pymu_blocks, obj_bbox):
return right_boxes[0] return right_boxes[0]
else: else:
return None return None
def bbox_relative_pos(bbox1, bbox2):
    """Return four flags (left, right, bottom, top) locating bbox2 vs bbox1.

    Boxes are (x0, y0, x1, y1). Each flag is True when the boxes do not
    overlap on that side: e.g. ``left`` means bbox2's max-x is strictly
    below bbox1's min-x; ``bottom`` means bbox2's max-y is strictly below
    bbox1's min-y.
    """
    ax0, ay0, ax1, ay1 = bbox1
    bx0, by0, bx1, by1 = bbox2
    return (
        bx1 < ax0,  # left
        ax1 < bx0,  # right
        by1 < ay0,  # bottom
        ay1 < by0,  # top
    )
def bbox_distance(bbox1, bbox2):
    """Shortest distance between two axis-aligned boxes (0 if they overlap).

    Diagonal neighbours yield the Euclidean distance between their nearest
    corners; purely horizontal/vertical neighbours yield the axis gap.
    """
    ax0, ay0, ax1, ay1 = bbox1
    bx0, by0, bx1, by1 = bbox2

    # side flags: same comparisons as bbox_relative_pos, inlined here
    left = bx1 < ax0
    right = ax1 < bx0
    bottom = by1 < ay0
    top = ay1 < by0

    def corner_gap(p, q):
        # Euclidean distance between the two nearest corners
        return math.sqrt((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2)

    if top and left:
        return corner_gap((ax0, ay1), (bx1, by0))
    if left and bottom:
        return corner_gap((ax0, ay0), (bx1, by1))
    if bottom and right:
        return corner_gap((ax1, ay0), (bx0, by1))
    if right and top:
        return corner_gap((ax1, ay1), (bx0, by0))
    if left:
        return ax0 - bx1
    if right:
        return bx0 - ax1
    if bottom:
        return ay0 - by1
    if top:
        return by0 - ay1
    return 0  # rectangles intersect
\ No newline at end of file
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
根据bucket的名字返回对应的s3 AK, SK,endpoint三元组 根据bucket的名字返回对应的s3 AK, SK,endpoint三元组
""" """
import json import json
import os import os
...@@ -10,20 +11,24 @@ from loguru import logger ...@@ -10,20 +11,24 @@ from loguru import logger
from magic_pdf.libs.commons import parse_bucket_key from magic_pdf.libs.commons import parse_bucket_key
def get_s3_config(bucket_name: str): def read_config():
"""
~/magic-pdf.json 读出来
"""
home_dir = os.path.expanduser("~") home_dir = os.path.expanduser("~")
config_file = os.path.join(home_dir, "magic-pdf.json") config_file = os.path.join(home_dir, "magic-pdf.json")
if not os.path.exists(config_file): if not os.path.exists(config_file):
raise Exception("magic-pdf.json not found") raise Exception(f"{config_file} not found")
with open(config_file, "r") as f: with open(config_file, "r") as f:
config = json.load(f) config = json.load(f)
return config
def get_s3_config(bucket_name: str):
"""
~/magic-pdf.json 读出来
"""
config = read_config()
bucket_info = config.get("bucket_info") bucket_info = config.get("bucket_info")
if bucket_name not in bucket_info: if bucket_name not in bucket_info:
...@@ -49,5 +54,10 @@ def get_bucket_name(path): ...@@ -49,5 +54,10 @@ def get_bucket_name(path):
return bucket return bucket
if __name__ == '__main__': def get_local_dir():
config = read_config()
return config.get("temp-output-dir", "/tmp")
if __name__ == "__main__":
ak, sk, endpoint = get_s3_config("llm-raw") ak, sk, endpoint = get_s3_config("llm-raw")
def dict_to_list(input_dict):
    """Return the values of *input_dict* as a list, in insertion order.

    Idiom fix: the manual loop-and-append is exactly ``dict.values()``;
    kept as a named helper for readability at call sites.
    """
    return list(input_dict.values())
def get_scale_ratio(ocr_page_info, page): def get_scale_ratio(model_page_info, page):
pix = page.get_pixmap(dpi=72) pix = page.get_pixmap(dpi=72)
pymu_width = int(pix.w) pymu_width = int(pix.w)
pymu_height = int(pix.h) pymu_height = int(pix.h)
width_from_json = ocr_page_info['page_info']['width'] width_from_json = model_page_info['page_info']['width']
height_from_json = ocr_page_info['page_info']['height'] height_from_json = model_page_info['page_info']['height']
horizontal_scale_ratio = width_from_json / pymu_width horizontal_scale_ratio = width_from_json / pymu_width
vertical_scale_ratio = height_from_json / pymu_height vertical_scale_ratio = height_from_json / pymu_height
return horizontal_scale_ratio, vertical_scale_ratio return horizontal_scale_ratio, vertical_scale_ratio
from collections import Counter
from magic_pdf.libs.language import detect_lang
def get_language_from_model(model_list: list):
    """Return the dominant language across a model's per-page output.

    For every page, the text of layout detections whose category_id is 15
    is concatenated and its language detected; the language occurring on
    the most pages wins (ties resolved by first appearance).
    """
    page_languages = []
    for page_info in model_list:
        text_category_ids = [15]
        fragments = [
            det["text"]
            for det in page_info["layout_dets"]
            if det["category_id"] in text_category_ids
        ]
        page_languages.append(detect_lang("".join(fragments)))
    # Counter.most_common(1) gives the most frequent page-level language;
    # for equal counts it keeps first-seen order, matching max(..., key=get).
    return Counter(page_languages).most_common(1)[0][0]
...@@ -8,7 +8,7 @@ class DropReason: ...@@ -8,7 +8,7 @@ class DropReason:
HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs" # 特殊的SVG图,计算量太大,从而丢弃 HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs" # 特殊的SVG图,计算量太大,从而丢弃
HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages" # 计算量超过负荷,当前方法下计算量消耗过大 HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages" # 计算量超过负荷,当前方法下计算量消耗过大
MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result" # 版面分析失败 MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result" # 版面分析失败
Exception = "exception" # 解析中发生异常 Exception = "_exception" # 解析中发生异常
ENCRYPTED = "encrypted" # PDF是加密的 ENCRYPTED = "encrypted" # PDF是加密的
EMPTY_PDF = "total_page=0" # PDF页面总数为0 EMPTY_PDF = "total_page=0" # PDF页面总数为0
NOT_IS_TEXT_PDF = "not_is_text_pdf" # 不是文字版PDF,无法直接解析 NOT_IS_TEXT_PDF = "not_is_text_pdf" # 不是文字版PDF,无法直接解析
......
...@@ -16,3 +16,4 @@ class DropTag: ...@@ -16,3 +16,4 @@ class DropTag:
FOOTNOTE = "footnote" FOOTNOTE = "footnote"
NOT_IN_LAYOUT = "not_in_layout" NOT_IN_LAYOUT = "not_in_layout"
SPAN_OVERLAP = "span_overlap" SPAN_OVERLAP = "span_overlap"
BLOCK_OVERLAP = "block_overlap"
def float_gt(a, b):
    """Strict greater-than with a 1e-4 absolute tolerance band.

    Values within 0.0001 of each other are treated as equal, so the
    comparison returns False in that band.
    """
    return abs(a - b) > 0.0001 and a > b
def float_equal(a, b):
    """True when a and b differ by at most the 1e-4 absolute tolerance."""
    return abs(a - b) <= 0.0001
\ No newline at end of file
...@@ -4,4 +4,17 @@ class ContentType: ...@@ -4,4 +4,17 @@ class ContentType:
Text = "text" Text = "text"
InlineEquation = "inline_equation" InlineEquation = "inline_equation"
InterlineEquation = "interline_equation" InterlineEquation = "interline_equation"
class BlockType:
    """String identifiers naming the layout block types."""
    Image = "image"
    ImageBody = "image_body"
    ImageCaption = "image_caption"
    Table = "table"
    TableBody = "table_body"
    TableCaption = "table_caption"
    TableFootnote = "table_footnote"
    Text = "text"
    Title = "title"
    InterlineEquation = "interline_equation"
    Footnote = "footnote"
from s3pathlib import S3Path
def remove_non_official_s3_args(s3path):
    """Drop everything from the first '?' onward in an s3 path.

    example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
    """
    clean_path, _, _ = s3path.partition("?")
    return clean_path
def parse_s3path(s3path: str):
    """Split an s3 path into (bucket, key), ignoring any query suffix."""
    path_obj = S3Path(remove_non_official_s3_args(s3path))
    return path_obj.bucket, path_obj.key
def parse_s3_range_params(s3path: str):
    """Extract the byte-range values from an s3 path, or None when absent.

    Values come back as strings:
    example: s3://abc/xxxx.json?bytes=0,81350 ==> ["0", "81350"]
    """
    segments = s3path.split("?bytes=")
    return segments[1].split(",") if len(segments) > 1 else None
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import fitz from magic_pdf.libs.commons import fitz
from loguru import logger
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
from magic_pdf.libs.hash_utils import compute_sha256 from magic_pdf.libs.hash_utils import compute_sha256
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter): def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: AbsReaderWriter):
""" """
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径 从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。 save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
...@@ -28,49 +28,6 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri ...@@ -28,49 +28,6 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
byte_data = pix.tobytes(output='jpeg', jpg_quality=95) byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
imageWriter.write(data=byte_data, path=img_hash256_path, mode="binary") imageWriter.write(byte_data, img_hash256_path, AbsReaderWriter.MODE_BIN)
return img_hash256_path return img_hash256_path
def save_images_by_bboxes(page_num: int, page: fitz.Page, pdf_bytes_md5: str,
                          image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
                          equation_inline_bboxes: list,
                          equation_interline_bboxes: list, imageWriter) -> tuple:
    """Cut the given bboxes out of *page* and save each as a jpg.

    Images are stored as:
    {s3_or_local_path}/{pdf_bytes_md5}/{images|tables}/{page_num}_{x0}_{y0}_{x1}_{y1}.jpg

    Returns a 5-tuple (NOT a dict, as the old docstring claimed):
        (image_info, image_backup_info, table_info,
         inline_eq_info, interline_eq_info)
    where each element is a list of {"bbox": bbox, "image_path": path}.
    NOTE: the two equation bbox arguments are currently unused here; the
    last two lists are always empty — kept so the return shape is stable.
    """
    def return_path(subdir):
        # Images live under {pdf_md5}/{subdir}/...
        return join_path(pdf_bytes_md5, subdir)

    def _cut_valid_bboxes(bboxes, subdir, label):
        # Shared loop (deduplicates three identical copies): skip degenerate
        # boxes (non-positive width or height), cut the rest, record paths.
        infos = []
        for bbox in bboxes:
            if bbox[0] >= bbox[2] or bbox[1] >= bbox[3]:
                logger.warning(f"{label}: 错误的box, {bbox}")
                continue
            image_path = cut_image(bbox, page_num, page, return_path(subdir), imageWriter)
            infos.append({"bbox": bbox, "image_path": image_path})
        return infos

    image_info = _cut_valid_bboxes(image_bboxes, "images", "image_bboxes")
    image_backup_info = _cut_valid_bboxes(images_overlap_backup, "images", "images_overlap_backup")
    table_info = _cut_valid_bboxes(table_bboxes, "tables", "table_bboxes")
    inline_eq_info = []
    interline_eq_info = []
    return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment