"...gpu/git@developer.sourcefind.cn:gaoqiong/migraphx.git" did not exist on "7543eacc541a14883cdc24650e94a3918f0df8c0"
Commit 6fb9fc49 authored by zhougaofeng
Browse files

Update magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_mkcontent.py,...

Update magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_server_72.py, magic_pdf/dict2md/ocr_vllm_client.py, magic_pdf/dict2md/ocr_vllm_server.py, magic_pdf/dict2md/tmp.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, magic_pdf/libs/vis_utils.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/pdf_parse_by_ocr.py, 
magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py, magic_pdf/model/pp_structure_v2.py files
parent 255a6ed0
...@@ -178,19 +178,20 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, ...@@ -178,19 +178,20 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
elif span.get('html', ''): elif span.get('html', ''):
para_text += f"\n\n{span['html']}\n\n" para_text += f"\n\n{span['html']}\n\n"
else: else:
# 处理图片 para_text += span['image_path']
# para_text += f"----------------这是ocr表格内容({join_path(img_buket_path, span['image_path'])})------------------- \n" # # 处理图片
if status: # # para_text += f"----------------这是ocr表格内容({join_path(img_buket_path, span['image_path'])})------------------- \n"
text = '解析图片内容,直接返回一段带有逻辑性的中文书面语描述,要求表达精准,不脱离图片中的实际内容,不要带换行,文中所有的名词不要用指代词' # if status:
start = time.time() # # text = '解析图片内容,直接返回一段带有逻辑性的中文书面语描述,要求表达精准,不脱离图片中的实际内容,不要带换行,文中所有的名词不要用指代词'
image_path = join_path(img_buket_path, span['image_path']) # # start = time.time()
compress_image(image_path) # # image_path = join_path(img_buket_path, span['image_path'])
generated_text = client.predict(image_path, text) # # compress_image(image_path)
end = time.time() # # generated_text = client.predict(image_path, text)
logger.info(f'qwen解析{image_path}表格的内容为:{generated_text},耗时为:{end-start}') # # end = time.time()
para_text += generated_text # # logger.info(f'qwen解析{image_path}表格的内容为:{generated_text},耗时为:{end-start}')
else: # para_text += span['image_path']
para_text += f"----------------图片路径为({join_path(img_buket_path, span['image_path'])}),请检查qwen ocr服务,重新运行文件解析------------------- \n" # else:
# para_text += f"----------------图片路径为({join_path(img_buket_path, span['image_path'])}),请检查qwen ocr服务,重新运行文件解析------------------- \n"
for block in para_block['blocks']: # 3rd.拼table_footnote for block in para_block['blocks']: # 3rd.拼table_footnote
if block['type'] == BlockType.TableFootnote: if block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(block) para_text += merge_para_with_text(block)
...@@ -431,17 +432,15 @@ def union_make(ocr_status:str, ...@@ -431,17 +432,15 @@ def union_make(ocr_status:str,
drop_mode: str, drop_mode: str,
img_buket_path: str = ''): img_buket_path: str = ''):
output_content = [] output_content = []
global client # global client
global status # global status
config = configparser.ConfigParser() # config = configparser.ConfigParser()
config.read(config_path) # config.read(config_path)
url = config.get('server', 'ocr_server') # url = config.get('server', 'ocr_server')
# logger.info(f'ocr_server:{url}') # logger.info(f'ocr_server:{url}')
client = PredictClient(url) # # client = PredictClient(url)
status = ocr_status # status = ocr_status
if not status:
logger.warning(f'Health check failed. The server at "{url}" is not responding as expected.')
logger.info(f'Qwen ocr解析服务无法正常运行,暂不使用qwen解析表格服务')
for page_info in pdf_info_dict: for page_info in pdf_info_dict:
if page_info.get('need_drop', False): if page_info.get('need_drop', False):
drop_reason = page_info.get('drop_reason') drop_reason = page_info.get('drop_reason')
...@@ -480,3 +479,4 @@ def union_make(ocr_status:str, ...@@ -480,3 +479,4 @@ def union_make(ocr_status:str,
elif make_mode == MakeMode.STANDARD_FORMAT: elif make_mode == MakeMode.STANDARD_FORMAT:
return output_content return output_content
...@@ -266,3 +266,4 @@ if __name__ == "__main__": ...@@ -266,3 +266,4 @@ if __name__ == "__main__":
uvicorn.run(app, host=host, port=port) uvicorn.run(app, host=host, port=port)
...@@ -302,7 +302,7 @@ class CustomPEKModel: ...@@ -302,7 +302,7 @@ class CustomPEKModel:
ocr_res_list.append(res) ocr_res_list.append(res)
elif int(res['category_id']) in [5]: elif int(res['category_id']) in [5]:
table_res_list.append(res) table_res_list.append(res)
#logger.info(f'table_res_list:\n{table_res_list}')
# Unified crop img logic # Unified crop img logic
def crop_img(input_res, input_pil_img, crop_paste_x=0, crop_paste_y=0): def crop_img(input_res, input_pil_img, crop_paste_x=0, crop_paste_y=0):
crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1]) crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
...@@ -327,6 +327,7 @@ class CustomPEKModel: ...@@ -327,6 +327,7 @@ class CustomPEKModel:
# Process each area that requires OCR processing # Process each area that requires OCR processing
for res in ocr_res_list: for res in ocr_res_list:
new_image, useful_list = crop_img(res, pil_img, crop_paste_x=50, crop_paste_y=50) new_image, useful_list = crop_img(res, pil_img, crop_paste_x=50, crop_paste_y=50)
# logger.info(f'------new_image:{new_image}')
paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list
# Adjust the coordinates of the formula area # Adjust the coordinates of the formula area
adjusted_mfdetrec_res = [] adjusted_mfdetrec_res = []
...@@ -347,6 +348,7 @@ class CustomPEKModel: ...@@ -347,6 +348,7 @@ class CustomPEKModel:
# OCR recognition # OCR recognition
new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR) new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
#logger.info(f'new_image:{new_image}')
ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0] ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0]
# logger.info(f'------------------------------------orc_res:\n{ocr_res}\n------------------------------------') # logger.info(f'------------------------------------orc_res:\n{ocr_res}\n------------------------------------')
# Integration results # Integration results
...@@ -368,7 +370,7 @@ class CustomPEKModel: ...@@ -368,7 +370,7 @@ class CustomPEKModel:
'text': text, 'text': text,
}) })
ocr_cost = time.time() - ocr_start ocr_cost = round(time.time() - ocr_start, 2)
# logger.info(f"ocr cost: {ocr_cost}") # logger.info(f"ocr cost: {ocr_cost}")
total_cost = round(total_cost + ocr_cost,2) total_cost = round(total_cost + ocr_cost,2)
index = index + 1 index = index + 1
...@@ -410,6 +412,7 @@ class CustomPEKModel: ...@@ -410,6 +412,7 @@ class CustomPEKModel:
logger.warning(f"------------table recognition processing fails----------") logger.warning(f"------------table recognition processing fails----------")
table_cost = round(time.time() - table_start, 2) table_cost = round(time.time() - table_start, 2)
logger.info(f"table cost: {table_cost}") logger.info(f"table cost: {table_cost}")
#logger.info(f'layout_res:{layout_res}')
return layout_res return layout_res
...@@ -98,3 +98,4 @@ if __name__ == "__main__": ...@@ -98,3 +98,4 @@ if __name__ == "__main__":
main() main()
from magic_pdf.pdf_parse_union_core import pdf_parse_union from magic_pdf.pdf_parse_union_core import pdf_parse_union
def parse_pdf_by_ocr(pdf_bytes, def parse_pdf_by_ocr(ocr_status,config_path,local_image_dir,pdf_bytes,
model_list, model_list,
imageWriter, imageWriter,
start_page_id=0, start_page_id=0,
end_page_id=None, end_page_id=None,
debug_mode=False, debug_mode=False,
): ):
return pdf_parse_union(pdf_bytes, return pdf_parse_union(ocr_status,config_path,local_image_dir,pdf_bytes,
model_list, model_list,
imageWriter, imageWriter,
"ocr", "ocr",
...@@ -16,3 +16,4 @@ def parse_pdf_by_ocr(pdf_bytes, ...@@ -16,3 +16,4 @@ def parse_pdf_by_ocr(pdf_bytes,
end_page_id=end_page_id, end_page_id=end_page_id,
debug_mode=debug_mode, debug_mode=debug_mode,
) )
...@@ -92,7 +92,7 @@ def replace_text_span(pymu_spans, ocr_spans): ...@@ -92,7 +92,7 @@ def replace_text_span(pymu_spans, ocr_spans):
return list(filter(lambda x: x["type"] != ContentType.Text, ocr_spans)) + pymu_spans return list(filter(lambda x: x["type"] != ContentType.Text, ocr_spans)) + pymu_spans
def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode): def parse_page_core(ocr_status,config_path,local_image_dir,pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode):
need_drop = False need_drop = False
drop_reason = [] drop_reason = []
...@@ -126,7 +126,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, ...@@ -126,7 +126,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
'''删除重叠spans中较小的那些''' '''删除重叠spans中较小的那些'''
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans) spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
'''对image和table截图''' '''对image和table截图'''
spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter) spans = ocr_cut_image_and_table(ocr_status,config_path,local_image_dir,spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
'''将所有区块的bbox整理到一起''' '''将所有区块的bbox整理到一起'''
# interline_equation_blocks参数不够准,后面切换到interline_equations上 # interline_equation_blocks参数不够准,后面切换到interline_equations上
...@@ -210,7 +210,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, ...@@ -210,7 +210,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
return page_info return page_info
def pdf_parse_union(pdf_bytes, def pdf_parse_union(ocr_status,config_path,local_image_dir,pdf_bytes,
model_list, model_list,
imageWriter, imageWriter,
parse_mode, parse_mode,
...@@ -249,7 +249,7 @@ def pdf_parse_union(pdf_bytes, ...@@ -249,7 +249,7 @@ def pdf_parse_union(pdf_bytes,
'''解析pdf中的每一页''' '''解析pdf中的每一页'''
if start_page_id <= page_id <= end_page_id: if start_page_id <= page_id <= end_page_id:
page_info = parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode) page_info = parse_page_core(ocr_status,config_path,local_image_dir,pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode)
else: else:
page_w = page.rect.width page_w = page.rect.width
page_h = page.rect.height page_h = page.rect.height
...@@ -273,3 +273,4 @@ def pdf_parse_union(pdf_bytes, ...@@ -273,3 +273,4 @@ def pdf_parse_union(pdf_bytes,
if __name__ == '__main__': if __name__ == '__main__':
pass pass
...@@ -37,13 +37,13 @@ class UNIPipe(AbsPipe): ...@@ -37,13 +37,13 @@ class UNIPipe(AbsPipe):
self.model_list = doc_analyze(model,self.pdf_bytes, ocr=True, self.model_list = doc_analyze(model,self.pdf_bytes, ocr=True,
start_page_id=self.start_page_id, end_page_id=self.end_page_id) start_page_id=self.start_page_id, end_page_id=self.end_page_id)
def pipe_parse(self): def pipe_parse(self,ocr_status,config_path,local_image_dir):
if self.pdf_type == self.PIP_TXT: if self.pdf_type == self.PIP_TXT:
self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer, self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty, is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
start_page_id=self.start_page_id, end_page_id=self.end_page_id) start_page_id=self.start_page_id, end_page_id=self.end_page_id)
elif self.pdf_type == self.PIP_OCR: elif self.pdf_type == self.PIP_OCR:
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, self.pdf_mid_data = parse_ocr_pdf(ocr_status,config_path,local_image_dir,self.pdf_bytes, self.model_list, self.image_writer,
is_debug=self.is_debug, is_debug=self.is_debug,
start_page_id=self.start_page_id, end_page_id=self.end_page_id) start_page_id=self.start_page_id, end_page_id=self.end_page_id)
...@@ -96,3 +96,4 @@ if __name__ == '__main__': ...@@ -96,3 +96,4 @@ if __name__ == '__main__':
AbsReaderWriter.MODE_TXT) AbsReaderWriter.MODE_TXT)
md_writer.write(str(content_list), "19983-00.txt", AbsReaderWriter.MODE_TXT) md_writer.write(str(content_list), "19983-00.txt", AbsReaderWriter.MODE_TXT)
import configparser
import os
import time
from loguru import logger from loguru import logger
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image from magic_pdf.libs.pdf_image_tools import cut_image
from multiprocessing import Pool
# vllm:
#from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image
# 普通 非vllm
from magic_pdf.dict2md.ocr_client import PredictClient,compress_image
text = '解析图片内容,直接返回一段带有逻辑性的中文书面语描述,要求表达精准,不脱离图片中的实际内容,不要带换行,文中所有的名词不要用指代词'
client = None
def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter): def ocr_image(image_path):
start = time.time()
compress_image(image_path)
txt = os.getpid()
# global client
#logger.info(f'image_path:{image_path}')
generated_text = f'--【{txt}】--\n'+client.predict(image_path, text)
end = time.time()
logger.info(f'qwen解析{image_path}表格的内容为:{generated_text},耗时为:{end - start}')
return generated_text
def ocr_cut_image_and_table(ocr_status,config_path,local_image_dir,spans, page, page_id, pdf_bytes_md5, imageWriter):
def return_path(type): def return_path(type):
return join_path(pdf_bytes_md5, type) return join_path(pdf_bytes_md5, type)
pool = Pool(4)
global client
config = configparser.ConfigParser()
config.read(config_path)
url = config.get('server', 'ocr_server')
client = PredictClient(url)
if not ocr_status:
logger.warning(f'Health check failed. The server at "{url}" is not responding as expected.')
logger.info(f'Qwen ocr解析服务无法正常运行,暂不使用qwen解析表格服务')
for span in spans: for span in spans:
span_type = span['type'] span_type = span['type']
if span_type == ContentType.Image: if span_type == ContentType.Image:
...@@ -19,9 +53,18 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter): ...@@ -19,9 +53,18 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
elif span_type == ContentType.Table: elif span_type == ContentType.Table:
if not check_img_bbox(span['bbox']): if not check_img_bbox(span['bbox']):
continue continue
span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'), image_path = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
imageWriter=imageWriter) imageWriter=imageWriter)
image_path = join_path(local_image_dir,image_path)
if ocr_status:
txt = pool.apply_async(ocr_image,args=(image_path,)).get()
span['image_path'] = str(txt)
# logger.info(f'image_path:{image_path} \t pool.apply_async::{txt}')
else:
span['image_path'] = f"----------------图片路径为({image_path}),请检查qwen ocr服务,重新运行文件解析------------------- \n"
pool.close()
pool.join()
return spans return spans
...@@ -69,3 +112,4 @@ def check_img_bbox(bbox) -> bool: ...@@ -69,3 +112,4 @@ def check_img_bbox(bbox) -> bool:
logger.warning(f"image_bboxes: 错误的box, {bbox}") logger.warning(f"image_bboxes: 错误的box, {bbox}")
return False return False
return True return True
...@@ -100,7 +100,7 @@ def do_parse( ...@@ -100,7 +100,7 @@ def do_parse(
logger.error('need model list input') logger.error('need model list input')
exit(2) exit(2)
pipe.pipe_parse() pipe.pipe_parse(ocr_status,config_path,local_image_dir)
pdf_info = pipe.pdf_mid_data['pdf_info'] pdf_info = pipe.pdf_mid_data['pdf_info']
if f_draw_layout_bbox: if f_draw_layout_bbox:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name) draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
...@@ -136,3 +136,4 @@ def do_parse( ...@@ -136,3 +136,4 @@ def do_parse(
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto']) parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
...@@ -161,3 +161,4 @@ if __name__ == '__main__': ...@@ -161,3 +161,4 @@ if __name__ == '__main__':
...@@ -48,14 +48,14 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit ...@@ -48,14 +48,14 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
return pdf_info_dict return pdf_info_dict
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, def parse_ocr_pdf(ocr_status,config_path,local_image_dir,pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
start_page_id=0, end_page_id=None, start_page_id=0, end_page_id=None,
*args, **kwargs): *args, **kwargs):
""" """
解析ocr类pdf 解析ocr类pdf
""" """
# print('---------------------------------------------------------这是解析ocr类pdf------------------------------------------------------------------') # print('---------------------------------------------------------这是解析ocr类pdf------------------------------------------------------------------')
pdf_info_dict = parse_pdf_by_ocr( pdf_info_dict = parse_pdf_by_ocr(ocr_status,config_path,local_image_dir,
pdf_bytes, pdf_bytes,
pdf_models, pdf_models,
imageWriter, imageWriter,
...@@ -112,3 +112,4 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr ...@@ -112,3 +112,4 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
# logger.info(f'这是pdf_union_pdf中的pdf_dict:\n{pdf_info_dict}\n-----------------------------------------------------------------------------------------') # logger.info(f'这是pdf_union_pdf中的pdf_dict:\n{pdf_info_dict}\n-----------------------------------------------------------------------------------------')
return pdf_info_dict return pdf_info_dict
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment