Merge pull request #1260 from opendatalab/dev

fix: dup classify pdf type & improve layout detection for DocLayout_YOLO model

Merge pull request #1260 from opendatalab/dev
fix: dup classify pdf type & improve layout detection for DocLayout_YOLO model
0440ee87 · Xiaomeng Zhao · GitHub · fb468671 · 327fdf90 · 0440ee87
Unverified commit 0440ee87, authored Dec 11, 2024 by Xiaomeng Zhao; committed by GitHub on Dec 11, 2024
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 25 additions and 4 deletions

magic_pdf/model/pdf_extract_kit.py +19 -1

magic_pdf/tools/common.py +6 -3

No files found.
--- a/magic_pdf/model/pdf_extract_kit.py
+++ b/magic_pdf/model/pdf_extract_kit.py
@@ -179,7 +179,25 @@ class CustomPEKModel:
            layout_res = self.layout_model(image, ignore_catids=[])
        elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO:
            # doclayout_yolo
-            layout_res = self.layout_model.predict(image)
+            img_pil = Image.fromarray(image)
+            width, height = img_pil.size
+            # logger.info(f'width: {width}, height: {height}')
+            input_res = {"poly":[0,0,width,0,width,height,0,height]}
+            new_image, useful_list = crop_img(input_res, img_pil, crop_paste_x=width//2, crop_paste_y=0)
+            paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list
+            layout_res = self.layout_model.predict(new_image)
+            for res in layout_res:
+                p1, p2, p3, p4, p5, p6, p7, p8 = res['poly']
+                p1 = p1 - paste_x + xmin
+                p2 = p2 - paste_y + ymin
+                p3 = p3 - paste_x + xmin
+                p4 = p4 - paste_y + ymin
+                p5 = p5 - paste_x + xmin
+                p6 = p6 - paste_y + ymin
+                p7 = p7 - paste_x + xmin
+                p8 = p8 - paste_y + ymin
+                res['poly'] = [p1, p2, p3, p4, p5, p6, p7, p8]
+
        layout_cost = round(time.time() - layout_start, 2)
        logger.info(f'layout detection time: {layout_cost}')


--- a/magic_pdf/tools/common.py
+++ b/magic_pdf/tools/common.py
@@ -123,6 +123,9 @@ def do_parse(
                        formula_enable=formula_enable,
                        table_enable=table_enable,
                    )
+                    pipe_result = infer_result.pipe_txt_mode(
+                        image_writer, debug_mode=True, lang=lang
+                    )
                else:
                    infer_result = ds.apply(
                        doc_analyze,
@@ -132,9 +135,9 @@ def do_parse(
                        formula_enable=formula_enable,
                        table_enable=table_enable,
                    )
-                pipe_result = infer_result.pipe_auto_mode(
-                    image_writer, debug_mode=True, lang=lang
-                )
+                    pipe_result = infer_result.pipe_ocr_mode(
+                        image_writer, debug_mode=True, lang=lang
+                    )

            elif parse_method == 'txt':
                infer_result = ds.apply(