"vscode:/vscode.git/clone" did not exist on "af2b136f73a8b2548039691ffd7cf97b3e013350"
Commit 8a52ada3 authored by 赵小蒙
Browse files

data_type/bookid/data_source兼容处理

parent 26c23782
...@@ -36,7 +36,7 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path): ...@@ -36,7 +36,7 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
for layout in page['layout_bboxes']: for layout in page['layout_bboxes']:
page_layout_list.append(layout['layout_bbox']) page_layout_list.append(layout['layout_bbox'])
layout_bbox_list.append(page_layout_list) layout_bbox_list.append(page_layout_list)
for drop_tag, dropped_bboxes in page['dropped_bboxes'].items(): for drop_tag, dropped_bboxes in page['droped_bboxes'].items():
for dropped_bbox in dropped_bboxes: for dropped_bbox in dropped_bboxes:
page_dropped_list.append(dropped_bbox) page_dropped_list.append(dropped_bbox)
dropped_bbox_list.append(page_dropped_list) dropped_bbox_list.append(page_dropped_list)
......
...@@ -23,6 +23,27 @@ def exception_handler(jso: dict, e): ...@@ -23,6 +23,27 @@ def exception_handler(jso: dict, e):
return jso return jso
def get_data_type(jso: dict):
    """Return the record's data type, accepting either field name.

    Reads ``jso['data_type']`` first. When that key is absent or its value
    is ``None``, falls back to ``jso['file_type']``.

    Args:
        jso: The record dictionary to read from.

    Returns:
        The value found under ``data_type`` or, failing that, ``file_type``;
        ``None`` when neither key yields a value.
    """
    data_type = jso.get('data_type')
    # Only a missing/None value triggers the fallback; other falsy values
    # (e.g. an empty string) are returned unchanged.
    if data_type is None:
        data_type = jso.get('file_type')
    return data_type
def get_bookid(jso: dict):
    """Return the record's book id, accepting either field name.

    Reads ``jso['bookid']`` first. When that key is absent or its value is
    ``None``, falls back to ``jso['original_file_id']``.

    Args:
        jso: The record dictionary to read from.

    Returns:
        The value found under ``bookid`` or, failing that,
        ``original_file_id``; ``None`` when neither key yields a value.
    """
    book_id = jso.get('bookid')
    # Only a missing/None value triggers the fallback; other falsy values
    # are returned unchanged.
    if book_id is None:
        book_id = jso.get('original_file_id')
    return book_id
def get_data_source(jso: dict):
    """Return the record's data source, accepting either field name.

    Reads ``jso['data_source']`` first. When that key is absent or its value
    is ``None``, falls back to ``jso['file_source']``.

    Args:
        jso: The record dictionary to read from.

    Returns:
        The value found under ``data_source`` or, failing that,
        ``file_source``; ``None`` when neither key yields a value.
    """
    data_source = jso.get('data_source')
    # Only a missing/None value triggers the fallback; other falsy values
    # (e.g. an empty string) are returned unchanged.
    if data_source is None:
        data_source = jso.get('file_source')
    return data_source
def meta_scan(jso: dict, doc_layout_check=True) -> dict: def meta_scan(jso: dict, doc_layout_check=True) -> dict:
s3_pdf_path = jso.get('file_location') s3_pdf_path = jso.get('file_location')
s3_config = get_s3_config(s3_pdf_path) s3_config = get_s3_config(s3_pdf_path)
...@@ -32,7 +53,7 @@ def meta_scan(jso: dict, doc_layout_check=True) -> dict: ...@@ -32,7 +53,7 @@ def meta_scan(jso: dict, doc_layout_check=True) -> dict:
jso['drop_reason'] = DropReason.MISS_DOC_LAYOUT_RESULT jso['drop_reason'] = DropReason.MISS_DOC_LAYOUT_RESULT
return jso return jso
try: try:
data_source = jso.get('data_source') data_source = get_data_source(jso)
file_id = jso.get('file_id') file_id = jso.get('file_id')
book_name = data_source + "/" + file_id book_name = data_source + "/" + file_id
...@@ -78,7 +99,7 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict: ...@@ -78,7 +99,7 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
# 开始正式逻辑 # 开始正式逻辑
try: try:
pdf_meta = jso.get('pdf_meta') pdf_meta = jso.get('pdf_meta')
data_source = jso.get('data_source') data_source = get_data_source(jso)
file_id = jso.get('file_id') file_id = jso.get('file_id')
book_name = data_source + "/" + file_id book_name = data_source + "/" + file_id
total_page = pdf_meta["total_page"] total_page = pdf_meta["total_page"]
...@@ -140,11 +161,11 @@ def save_tables_to_s3(jso: dict, debug_mode=False) -> dict: ...@@ -140,11 +161,11 @@ def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:
pass pass
else:# 如果debug没开,则检测是否有needdrop字段 else:# 如果debug没开,则检测是否有needdrop字段
if jso.get('need_drop', False): if jso.get('need_drop', False):
logger.info(f"book_name is:{jso['data_source']}/{jso['file_id']} need drop", file=sys.stderr) logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop", file=sys.stderr)
jso["dropped"] = True jso["dropped"] = True
return jso return jso
try: try:
data_source = jso.get('data_source') data_source = get_data_source(jso)
file_id = jso.get('file_id') file_id = jso.get('file_id')
book_name = data_source + "/" + file_id book_name = data_source + "/" + file_id
title = jso.get('title') title = jso.get('title')
...@@ -195,7 +216,7 @@ def save_tables_to_s3(jso: dict, debug_mode=False) -> dict: ...@@ -195,7 +216,7 @@ def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:
def drop_needdrop_pdf(jso: dict) -> dict: def drop_needdrop_pdf(jso: dict) -> dict:
if jso.get('need_drop', False): if jso.get('need_drop', False):
logger.info(f"book_name is:{jso['data_source']}/{jso['file_id']} need drop", file=sys.stderr) logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop", file=sys.stderr)
jso["dropped"] = True jso["dropped"] = True
return jso return jso
...@@ -206,7 +227,7 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict: ...@@ -206,7 +227,7 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
pass pass
else:# 如果debug没开,则检测是否有needdrop字段 else:# 如果debug没开,则检测是否有needdrop字段
if jso.get('need_drop', False): if jso.get('need_drop', False):
book_name = join_path(jso['data_source'], jso['file_id']) book_name = join_path(get_data_source(jso), jso['file_id'])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr) logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True jso["dropped"] = True
return jso return jso
...@@ -216,7 +237,7 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict: ...@@ -216,7 +237,7 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict) pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
markdown_content = mk_nlp_markdown(pdf_intermediate_dict) markdown_content = mk_nlp_markdown(pdf_intermediate_dict)
jso["content"] = markdown_content jso["content"] = markdown_content
logger.info(f"book_name is:{jso['data_source']}/{jso['file_id']},markdown content length is {len(markdown_content)}", file=sys.stderr) logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", file=sys.stderr)
# 把无用的信息清空 # 把无用的信息清空
jso["doc_layout_result"] = "" jso["doc_layout_result"] = ""
jso["pdf_intermediate_dict"] = "" jso["pdf_intermediate_dict"] = ""
...@@ -237,7 +258,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict: ...@@ -237,7 +258,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
s3_pdf_path = jso.get('file_location') s3_pdf_path = jso.get('file_location')
s3_config = get_s3_config(s3_pdf_path) s3_config = get_s3_config(s3_pdf_path)
model_output_json_list = jso.get('doc_layout_result') model_output_json_list = jso.get('doc_layout_result')
data_source = jso.get('data_source') data_source = get_data_source(jso)
file_id = jso.get('file_id') file_id = jso.get('file_id')
book_name = data_source + "/" + file_id book_name = data_source + "/" + file_id
...@@ -290,5 +311,9 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict: ...@@ -290,5 +311,9 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
return jso return jso
def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    """Placeholder for an OCR-based PDF parsing step; not implemented yet.

    The body is empty, so every call currently returns ``None`` regardless
    of the arguments, even though the signature is annotated ``-> dict``.

    Args:
        jso: The record dictionary (unused for now).
        start_page_id: Unused for now.
        debug_mode: Unused for now.
    """
    # TODO: implement; callers expecting a dict will receive None until then.
    pass
if __name__ == "__main__": if __name__ == "__main__":
pass pass
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment