Commit 6fb9fc49 authored by zhougaofeng
Browse files

Update magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_mkcontent.py,...

Update magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_server_72.py, magic_pdf/dict2md/ocr_vllm_client.py, magic_pdf/dict2md/ocr_vllm_server.py, magic_pdf/dict2md/tmp.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, magic_pdf/libs/vis_utils.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/pdf_parse_by_ocr.py, 
magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py, magic_pdf/model/pp_structure_v2.py files
parent 255a6ed0
......@@ -178,19 +178,20 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
elif span.get('html', ''):
para_text += f"\n\n{span['html']}\n\n"
else:
# 处理图片
# para_text += f"----------------这是ocr表格内容({join_path(img_buket_path, span['image_path'])})------------------- \n"
if status:
text = '解析图片内容,直接返回一段带有逻辑性的中文书面语描述,要求表达精准,不脱离图片中的实际内容,不要带换行,文中所有的名词不要用指代词'
start = time.time()
image_path = join_path(img_buket_path, span['image_path'])
compress_image(image_path)
generated_text = client.predict(image_path, text)
end = time.time()
logger.info(f'qwen解析{image_path}表格的内容为:{generated_text},耗时为:{end-start}')
para_text += generated_text
else:
para_text += f"----------------图片路径为({join_path(img_buket_path, span['image_path'])}),请检查qwen ocr服务,重新运行文件解析------------------- \n"
para_text += span['image_path']
# # 处理图片
# # para_text += f"----------------这是ocr表格内容({join_path(img_buket_path, span['image_path'])})------------------- \n"
# if status:
# # text = '解析图片内容,直接返回一段带有逻辑性的中文书面语描述,要求表达精准,不脱离图片中的实际内容,不要带换行,文中所有的名词不要用指代词'
# # start = time.time()
# # image_path = join_path(img_buket_path, span['image_path'])
# # compress_image(image_path)
# # generated_text = client.predict(image_path, text)
# # end = time.time()
# # logger.info(f'qwen解析{image_path}表格的内容为:{generated_text},耗时为:{end-start}')
# para_text += span['image_path']
# else:
# para_text += f"----------------图片路径为({join_path(img_buket_path, span['image_path'])}),请检查qwen ocr服务,重新运行文件解析------------------- \n"
for block in para_block['blocks']: # 3rd.拼table_footnote
if block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(block)
......@@ -431,17 +432,15 @@ def union_make(ocr_status:str,
drop_mode: str,
img_buket_path: str = ''):
output_content = []
global client
global status
config = configparser.ConfigParser()
config.read(config_path)
url = config.get('server', 'ocr_server')
# global client
# global status
# config = configparser.ConfigParser()
# config.read(config_path)
# url = config.get('server', 'ocr_server')
# logger.info(f'ocr_server:{url}')
client = PredictClient(url)
status = ocr_status
if not status:
logger.warning(f'Health check failed. The server at "{url}" is not responding as expected.')
logger.info(f'Qwen ocr解析服务无法正常运行,暂不使用qwen解析表格服务')
# # client = PredictClient(url)
# status = ocr_status
for page_info in pdf_info_dict:
if page_info.get('need_drop', False):
drop_reason = page_info.get('drop_reason')
......@@ -480,3 +479,4 @@ def union_make(ocr_status:str,
elif make_mode == MakeMode.STANDARD_FORMAT:
return output_content
......@@ -266,3 +266,4 @@ if __name__ == "__main__":
uvicorn.run(app, host=host, port=port)
......@@ -305,4 +305,4 @@ def draw_layout_on_page(raw_pdf_doc: fitz.Document, page_idx: int, page_layout:
else:
doc.saveIncr()
doc.close()
\ No newline at end of file
......@@ -302,7 +302,7 @@ class CustomPEKModel:
ocr_res_list.append(res)
elif int(res['category_id']) in [5]:
table_res_list.append(res)
#logger.info(f'table_res_list:\n{table_res_list}')
# Unified crop img logic
def crop_img(input_res, input_pil_img, crop_paste_x=0, crop_paste_y=0):
crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
......@@ -327,6 +327,7 @@ class CustomPEKModel:
# Process each area that requires OCR processing
for res in ocr_res_list:
new_image, useful_list = crop_img(res, pil_img, crop_paste_x=50, crop_paste_y=50)
# logger.info(f'------new_image:{new_image}')
paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list
# Adjust the coordinates of the formula area
adjusted_mfdetrec_res = []
......@@ -347,6 +348,7 @@ class CustomPEKModel:
# OCR recognition
new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
#logger.info(f'new_image:{new_image}')
ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0]
# logger.info(f'------------------------------------orc_res:\n{ocr_res}\n------------------------------------')
# Integration results
......@@ -368,7 +370,7 @@ class CustomPEKModel:
'text': text,
})
ocr_cost = time.time() - ocr_start
ocr_cost = round(time.time() - ocr_start, 2)
# logger.info(f"ocr cost: {ocr_cost}")
total_cost = round(total_cost + ocr_cost,2)
index = index + 1
......@@ -410,6 +412,7 @@ class CustomPEKModel:
logger.warning(f"------------table recognition processing fails----------")
table_cost = round(time.time() - table_start, 2)
logger.info(f"table cost: {table_cost}")
#logger.info(f'layout_res:{layout_res}')
return layout_res
......@@ -98,3 +98,4 @@ if __name__ == "__main__":
main()
from magic_pdf.pdf_parse_union_core import pdf_parse_union
def parse_pdf_by_ocr(pdf_bytes,
def parse_pdf_by_ocr(ocr_status,config_path,local_image_dir,pdf_bytes,
model_list,
imageWriter,
start_page_id=0,
end_page_id=None,
debug_mode=False,
):
return pdf_parse_union(pdf_bytes,
return pdf_parse_union(ocr_status,config_path,local_image_dir,pdf_bytes,
model_list,
imageWriter,
"ocr",
......@@ -16,3 +16,4 @@ def parse_pdf_by_ocr(pdf_bytes,
end_page_id=end_page_id,
debug_mode=debug_mode,
)
......@@ -92,7 +92,7 @@ def replace_text_span(pymu_spans, ocr_spans):
return list(filter(lambda x: x["type"] != ContentType.Text, ocr_spans)) + pymu_spans
def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode):
def parse_page_core(ocr_status,config_path,local_image_dir,pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode):
need_drop = False
drop_reason = []
......@@ -126,7 +126,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
'''删除重叠spans中较小的那些'''
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
'''对image和table截图'''
spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
spans = ocr_cut_image_and_table(ocr_status,config_path,local_image_dir,spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
'''将所有区块的bbox整理到一起'''
# interline_equation_blocks参数不够准,后面切换到interline_equations上
......@@ -210,7 +210,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
return page_info
def pdf_parse_union(pdf_bytes,
def pdf_parse_union(ocr_status,config_path,local_image_dir,pdf_bytes,
model_list,
imageWriter,
parse_mode,
......@@ -249,7 +249,7 @@ def pdf_parse_union(pdf_bytes,
'''解析pdf中的每一页'''
if start_page_id <= page_id <= end_page_id:
page_info = parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode)
page_info = parse_page_core(ocr_status,config_path,local_image_dir,pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode)
else:
page_w = page.rect.width
page_h = page.rect.height
......@@ -273,3 +273,4 @@ def pdf_parse_union(pdf_bytes,
if __name__ == '__main__':
pass
......@@ -37,13 +37,13 @@ class UNIPipe(AbsPipe):
self.model_list = doc_analyze(model,self.pdf_bytes, ocr=True,
start_page_id=self.start_page_id, end_page_id=self.end_page_id)
def pipe_parse(self):
def pipe_parse(self,ocr_status,config_path,local_image_dir):
if self.pdf_type == self.PIP_TXT:
self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
start_page_id=self.start_page_id, end_page_id=self.end_page_id)
elif self.pdf_type == self.PIP_OCR:
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
self.pdf_mid_data = parse_ocr_pdf(ocr_status,config_path,local_image_dir,self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug,
start_page_id=self.start_page_id, end_page_id=self.end_page_id)
......@@ -96,3 +96,4 @@ if __name__ == '__main__':
AbsReaderWriter.MODE_TXT)
md_writer.write(str(content_list), "19983-00.txt", AbsReaderWriter.MODE_TXT)
import configparser
import os
import time
from loguru import logger
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image
from multiprocessing import Pool
# vllm:
#from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image
# 普通 非vllm
from magic_pdf.dict2md.ocr_client import PredictClient,compress_image
text = '解析图片内容,直接返回一段带有逻辑性的中文书面语描述,要求表达精准,不脱离图片中的实际内容,不要带换行,文中所有的名词不要用指代词'
client = None
def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
def ocr_image(image_path):
    """Ask the module-level OCR client to describe the image at ``image_path``.

    The image is first handed to ``compress_image``; the path is then sent,
    together with the module-level prompt ``text``, to ``client.predict``.
    The reply is prefixed with a marker line carrying the current process id,
    logged along with the elapsed wall-clock time, and returned.

    Relies on the module-level ``client`` having been assigned before the
    call (it is initialised to ``None`` at import time).

    Args:
        image_path: Path of the image file to compress and describe.

    Returns:
        The process-id marker line followed by the text from the client.
    """
    began = time.time()
    compress_image(image_path)
    # The pid in the prefix shows which process produced this result.
    pid = os.getpid()
    generated_text = f'--【{pid}】--\n' + client.predict(image_path, text)
    elapsed = time.time() - began
    logger.info(f'qwen解析{image_path}表格的内容为:{generated_text},耗时为:{elapsed}')
    return generated_text
def ocr_cut_image_and_table(ocr_status,config_path,local_image_dir,spans, page, page_id, pdf_bytes_md5, imageWriter):
def return_path(type):
return join_path(pdf_bytes_md5, type)
pool = Pool(4)
global client
config = configparser.ConfigParser()
config.read(config_path)
url = config.get('server', 'ocr_server')
client = PredictClient(url)
if not ocr_status:
logger.warning(f'Health check failed. The server at "{url}" is not responding as expected.')
logger.info(f'Qwen ocr解析服务无法正常运行,暂不使用qwen解析表格服务')
for span in spans:
span_type = span['type']
if span_type == ContentType.Image:
......@@ -19,9 +53,18 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
elif span_type == ContentType.Table:
if not check_img_bbox(span['bbox']):
continue
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
image_path = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
imageWriter=imageWriter)
image_path = join_path(local_image_dir,image_path)
if ocr_status:
txt = pool.apply_async(ocr_image,args=(image_path,)).get()
span['image_path'] = str(txt)
# logger.info(f'image_path:{image_path} \t pool.apply_async::{txt}')
else:
span['image_path'] = f"----------------图片路径为({image_path}),请检查qwen ocr服务,重新运行文件解析------------------- \n"
pool.close()
pool.join()
return spans
......@@ -69,3 +112,4 @@ def check_img_bbox(bbox) -> bool:
logger.warning(f"image_bboxes: 错误的box, {bbox}")
return False
return True
......@@ -100,7 +100,7 @@ def do_parse(
logger.error('need model list input')
exit(2)
pipe.pipe_parse()
pipe.pipe_parse(ocr_status,config_path,local_image_dir)
pdf_info = pipe.pdf_mid_data['pdf_info']
if f_draw_layout_bbox:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
......@@ -136,3 +136,4 @@ def do_parse(
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
......@@ -161,3 +161,4 @@ if __name__ == '__main__':
......@@ -48,14 +48,14 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
return pdf_info_dict
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
def parse_ocr_pdf(ocr_status,config_path,local_image_dir,pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
start_page_id=0, end_page_id=None,
*args, **kwargs):
"""
解析ocr类pdf
"""
# print('---------------------------------------------------------这是解析ocr类pdf------------------------------------------------------------------')
pdf_info_dict = parse_pdf_by_ocr(
pdf_info_dict = parse_pdf_by_ocr(ocr_status,config_path,local_image_dir,
pdf_bytes,
pdf_models,
imageWriter,
......@@ -112,3 +112,4 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
# logger.info(f'这是pdf_union_pdf中的pdf_dict:\n{pdf_info_dict}\n-----------------------------------------------------------------------------------------')
return pdf_info_dict
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment