Update magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_mkcontent.py,...

Update magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_server_72.py, magic_pdf/dict2md/ocr_vllm_client.py, magic_pdf/dict2md/ocr_vllm_server.py, magic_pdf/dict2md/tmp.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, magic_pdf/libs/vis_utils.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/pdf_parse_by_ocr.py, 
magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py, magic_pdf/model/pp_structure_v2.py files

Update magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_mkcontent.py,...
Update magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_server_72.py, magic_pdf/dict2md/ocr_vllm_client.py, magic_pdf/dict2md/ocr_vllm_server.py, magic_pdf/dict2md/tmp.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, magic_pdf/libs/vis_utils.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/pdf_parse_by_ocr.py, 
magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py, magic_pdf/model/pp_structure_v2.py files
6fb9fc49 · zhougaofeng · 255a6ed0 · 6fb9fc49 · 6fb9fc49 · 6fb9fc49
Commit 6fb9fc49 authored Oct 30, 2024 by zhougaofeng
12 changed files
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -178,19 +178,20 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                                    elif span.get('html', ''):
                                        para_text += f"\n\n{span['html']}\n\n"
                                    else:
-                                        # 处理图片
-                                        # para_text += f"----------------这是ocr表格内容({join_path(img_buket_path, span['image_path'])})-------------------  \n"
-                                        if status:
-                                            text = '解析图片内容，直接返回一段带有逻辑性的中文书面语描述，要求表达精准，不脱离图片中的实际内容，不要带换行,文中所有的名词不要用指代词'
-                                            start = time.time()
-                                            image_path = join_path(img_buket_path, span['image_path'])
-                                            compress_image(image_path)
-                                            generated_text = client.predict(image_path, text)
-                                            end = time.time()
-                                            logger.info(f'qwen解析{image_path}表格的内容为：{generated_text},耗时为：{end-start}')
-                                            para_text += generated_text
-                                        else:
-                                            para_text += f"----------------图片路径为({join_path(img_buket_path, span['image_path'])})，请检查qwen ocr服务，重新运行文件解析-------------------  \n"
+                                        para_text += span['image_path']
+                                        # # 处理图片
+                                        # # para_text += f"----------------这是ocr表格内容({join_path(img_buket_path, span['image_path'])})-------------------  \n"
+                                        # if status:
+                                        #     # text = '解析图片内容，直接返回一段带有逻辑性的中文书面语描述，要求表达精准，不脱离图片中的实际内容，不要带换行,文中所有的名词不要用指代词'
+                                        #     # start = time.time()
+                                        #     # image_path = join_path(img_buket_path, span['image_path'])
+                                        #     # compress_image(image_path)
+                                        #     # generated_text = client.predict(image_path, text)
+                                        #     # end = time.time()
+                                        #     # logger.info(f'qwen解析{image_path}表格的内容为：{generated_text},耗时为：{end-start}')
+                                        #     para_text += span['image_path']
+                                        # else:
+                                        #     para_text += f"----------------图片路径为({join_path(img_buket_path, span['image_path'])})，请检查qwen ocr服务，重新运行文件解析-------------------  \n"
                for block in para_block['blocks']:  # 3rd.拼table_footnote
                    if block['type'] == BlockType.TableFootnote:
                        para_text += merge_para_with_text(block)
@@ -431,17 +432,15 @@ def union_make(ocr_status:str,
               drop_mode: str,
               img_buket_path: str = ''):
    output_content = []
-    global client
-    global status
-    config = configparser.ConfigParser()
-    config.read(config_path)
-    url = config.get('server', 'ocr_server')
+    # global client
+    # global status
+    # config = configparser.ConfigParser()
+    # config.read(config_path)
+    # url = config.get('server', 'ocr_server')
    # logger.info(f'ocr_server：{url}')
-    client = PredictClient(url)
-    status = ocr_status
-    if not status:
-        logger.warning(f'Health check failed. The server at "{url}" is not responding as expected.')
-        logger.info(f'Qwen ocr解析服务无法正常运行,暂不使用qwen解析表格服务')
+    # # client = PredictClient(url)
+    # status = ocr_status
+
    for page_info in pdf_info_dict:
        if page_info.get('need_drop', False):
            drop_reason = page_info.get('drop_reason')
@@ -480,3 +479,4 @@ def union_make(ocr_status:str,
    elif make_mode == MakeMode.STANDARD_FORMAT:
        return output_content

+
--- a/magic_pdf/dict2md/ocr_server.py
+++ b/magic_pdf/dict2md/ocr_server.py
@@ -266,3 +266,4 @@ if __name__ == "__main__":
    uvicorn.run(app, host=host, port=port)


+
--- a/magic_pdf/libs/vis_utils.py
+++ b/magic_pdf/libs/vis_utils.py
@@ -305,4 +305,4 @@ def draw_layout_on_page(raw_pdf_doc: fitz.Document,  page_idx: int, page_layout:
    else:
        doc.saveIncr()
    doc.close()
-    
\ No newline at end of file
+    
--- a/magic_pdf/model/pdf_extract_kit.py
+++ b/magic_pdf/model/pdf_extract_kit.py
@@ -302,7 +302,7 @@ class CustomPEKModel:
                ocr_res_list.append(res)
            elif int(res['category_id']) in [5]:
                table_res_list.append(res)
-
+        #logger.info(f'table_res_list:\n{table_res_list}')
        #  Unified crop img logic
        def crop_img(input_res, input_pil_img, crop_paste_x=0, crop_paste_y=0):
            crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
@@ -327,6 +327,7 @@ class CustomPEKModel:
            # Process each area that requires OCR processing
            for res in ocr_res_list:
                new_image, useful_list = crop_img(res, pil_img, crop_paste_x=50, crop_paste_y=50)
+               # logger.info(f'------new_image:{new_image}')
                paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list
                # Adjust the coordinates of the formula area
                adjusted_mfdetrec_res = []
@@ -347,6 +348,7 @@ class CustomPEKModel:

                # OCR recognition
                new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
+                #logger.info(f'new_image:{new_image}')
                ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0]
         #       logger.info(f'------------------------------------orc_res:\n{ocr_res}\n------------------------------------')
                # Integration results
@@ -368,7 +370,7 @@ class CustomPEKModel:
                            'text': text,
                        })

-            ocr_cost = time.time() - ocr_start
+            ocr_cost = round(time.time() - ocr_start, 2)
            # logger.info(f"ocr cost: {ocr_cost}")
            total_cost = round(total_cost + ocr_cost,2)
        index = index + 1
@@ -410,6 +412,7 @@ class CustomPEKModel:
                    logger.warning(f"------------table recognition processing fails----------")
            table_cost = round(time.time() - table_start, 2)
            logger.info(f"table cost: {table_cost}")
-
+        #logger.info(f'layout_res:{layout_res}')
        return layout_res

+
--- a/magic_pdf/parse/pdf_client.py
+++ b/magic_pdf/parse/pdf_client.py
@@ -98,3 +98,4 @@ if __name__ == "__main__":
    main()


+
--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
 from magic_pdf.pdf_parse_union_core import pdf_parse_union


-def parse_pdf_by_ocr(pdf_bytes,
+def parse_pdf_by_ocr(ocr_status,config_path,local_image_dir,pdf_bytes,
                     model_list,
                     imageWriter,
                     start_page_id=0,
                     end_page_id=None,
                     debug_mode=False,
                     ):
-    return pdf_parse_union(pdf_bytes,
+    return pdf_parse_union(ocr_status,config_path,local_image_dir,pdf_bytes,
                           model_list,
                           imageWriter,
                           "ocr",
@@ -16,3 +16,4 @@ def parse_pdf_by_ocr(pdf_bytes,
                           end_page_id=end_page_id,
                           debug_mode=debug_mode,
                           )
+
--- a/magic_pdf/pdf_parse_union_core.py
+++ b/magic_pdf/pdf_parse_union_core.py
@@ -92,7 +92,7 @@ def replace_text_span(pymu_spans, ocr_spans):
    return list(filter(lambda x: x["type"] != ContentType.Text, ocr_spans)) + pymu_spans


-def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode):
+def parse_page_core(ocr_status,config_path,local_image_dir,pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode):
    need_drop = False
    drop_reason = []

@@ -126,7 +126,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
    '''删除重叠spans中较小的那些'''
    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
    '''对image和table截图'''
-    spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
+    spans = ocr_cut_image_and_table(ocr_status,config_path,local_image_dir,spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)

    '''将所有区块的bbox整理到一起'''
    # interline_equation_blocks参数不够准，后面切换到interline_equations上
@@ -210,7 +210,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
    return page_info


-def pdf_parse_union(pdf_bytes,
+def pdf_parse_union(ocr_status,config_path,local_image_dir,pdf_bytes,
                    model_list,
                    imageWriter,
                    parse_mode,
@@ -249,7 +249,7 @@ def pdf_parse_union(pdf_bytes,

        '''解析pdf中的每一页'''
        if start_page_id <= page_id <= end_page_id:
-            page_info = parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode)
+            page_info = parse_page_core(ocr_status,config_path,local_image_dir,pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode)
        else:
            page_w = page.rect.width
            page_h = page.rect.height
@@ -273,3 +273,4 @@ def pdf_parse_union(pdf_bytes,
 if __name__ == '__main__':
    pass

+
--- a/magic_pdf/pipe/UNIPipe.py
+++ b/magic_pdf/pipe/UNIPipe.py
@@ -37,13 +37,13 @@ class UNIPipe(AbsPipe):
            self.model_list = doc_analyze(model,self.pdf_bytes, ocr=True,
                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)

-    def pipe_parse(self):
+    def pipe_parse(self,ocr_status,config_path,local_image_dir):
        if self.pdf_type == self.PIP_TXT:
            self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                                is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id)
        elif self.pdf_type == self.PIP_OCR:
-            self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
+            self.pdf_mid_data = parse_ocr_pdf(ocr_status,config_path,local_image_dir,self.pdf_bytes, self.model_list, self.image_writer,
                                              is_debug=self.is_debug,
                                              start_page_id=self.start_page_id, end_page_id=self.end_page_id)

@@ -96,3 +96,4 @@ if __name__ == '__main__':
                    AbsReaderWriter.MODE_TXT)
    md_writer.write(str(content_list), "19983-00.txt", AbsReaderWriter.MODE_TXT)

+
--- a/magic_pdf/pre_proc/cut_image.py
+++ b/magic_pdf/pre_proc/cut_image.py
+import configparser
+import os
+import time
+
 from loguru import logger

 from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.ocr_content_type import ContentType
 from magic_pdf.libs.pdf_image_tools import cut_image
+from multiprocessing import Pool
+# vllm：
+#from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image
+
+# 普通 非vllm
+from magic_pdf.dict2md.ocr_client import PredictClient,compress_image
+
+text = '解析图片内容，直接返回一段带有逻辑性的中文书面语描述，要求表达精准，不脱离图片中的实际内容，不要带换行,文中所有的名词不要用指代词'
+client = None


-def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
+def ocr_image(image_path):  # Compress the image at image_path, query the OCR client with the module-level prompt `text`, and return the generated text
+    start = time.time()  # wall-clock start; only used for the elapsed-time log line below
+    compress_image(image_path)  # NOTE(review): presumably shrinks the image file before it is sent - confirm in ocr_client
+    txt = os.getpid()  # PID of the process running this call; embedded in the returned text as a leading marker line
+    # global client
+    #logger.info(f'image_path:{image_path}')
+    generated_text = f'--【{txt}】--\n'+client.predict(image_path, text)  # NOTE(review): reads module-level `client`, which is None until ocr_cut_image_and_table assigns it; that function creates its Pool before the assignment, so confirm pool workers actually see a non-None client
+    end = time.time()
+    logger.info(f'qwen解析{image_path}表格的内容为：{generated_text},耗时为：{end - start}')  # logs the recognized text and the elapsed seconds
+    return generated_text
+
+
+def ocr_cut_image_and_table(ocr_status,config_path,local_image_dir,spans, page, page_id, pdf_bytes_md5, imageWriter):
    def return_path(type):
        return join_path(pdf_bytes_md5, type)
+    pool = Pool(4)
+    global client
+    config = configparser.ConfigParser()
+    config.read(config_path)
+    url = config.get('server', 'ocr_server')
+    client = PredictClient(url)

+    if not ocr_status:
+        logger.warning(f'Health check failed. The server at "{url}" is not responding as expected.')
+        logger.info(f'Qwen ocr解析服务无法正常运行,暂不使用qwen解析表格服务')
    for span in spans:
        span_type = span['type']
        if span_type == ContentType.Image:
@@ -19,9 +53,18 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
        elif span_type == ContentType.Table:
            if not check_img_bbox(span['bbox']):
                continue
-            span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
+            image_path = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
                                           imageWriter=imageWriter)
+            image_path = join_path(local_image_dir,image_path)
+            if ocr_status:

+                txt = pool.apply_async(ocr_image,args=(image_path,)).get()
+                span['image_path'] = str(txt)
+               # logger.info(f'image_path:{image_path} \t pool.apply_async::{txt}')
+            else:
+                span['image_path'] = f"----------------图片路径为({image_path})，请检查qwen ocr服务，重新运行文件解析-------------------  \n"
+    pool.close()
+    pool.join()
    return spans


@@ -69,3 +112,4 @@ def check_img_bbox(bbox) -> bool:
        logger.warning(f"image_bboxes: 错误的box, {bbox}")
        return False
    return True
+
--- a/magic_pdf/tools/common.py
+++ b/magic_pdf/tools/common.py
@@ -100,7 +100,7 @@ def do_parse(
            logger.error('need model list input')
            exit(2)

-    pipe.pipe_parse()
+    pipe.pipe_parse(ocr_status,config_path,local_image_dir)
    pdf_info = pipe.pdf_mid_data['pdf_info']
    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
@@ -136,3 +136,4 @@ def do_parse(
 parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])


+
--- a/magic_pdf/tools/pdf_server.py
+++ b/magic_pdf/tools/pdf_server.py
@@ -161,3 +161,4 @@ if __name__ == '__main__':



+
--- a/magic_pdf/user_api.py
+++ b/magic_pdf/user_api.py
@@ -48,14 +48,14 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
    return pdf_info_dict


-def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
+def parse_ocr_pdf(ocr_status,config_path,local_image_dir,pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
                  start_page_id=0, end_page_id=None,
                  *args, **kwargs):
    """
    解析ocr类pdf
    """
    # print('---------------------------------------------------------这是解析ocr类pdf------------------------------------------------------------------')
-    pdf_info_dict = parse_pdf_by_ocr(
+    pdf_info_dict = parse_pdf_by_ocr(ocr_status,config_path,local_image_dir,
        pdf_bytes,
        pdf_models,
        imageWriter,
@@ -112,3 +112,4 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
    # logger.info(f'这是pdf_union_pdf中的pdf_dict：\n{pdf_info_dict}\n-----------------------------------------------------------------------------------------')

    return pdf_info_dict
+