Merge pull request #6 from opendatalab/dev

Dev

Merge pull request #6 from opendatalab/dev
Dev
ece7f8d5 · Kaiwen Liu · GitHub · 98362a6e · 702b6ac9 · ece7f8d5
Unverified Commit ece7f8d5 authored Oct 15, 2024 by Kaiwen Liu Committed by GitHub Oct 15, 2024
20 changed files
--- a/magic_pdf/libs/ocr_content_type.py
+++ b/magic_pdf/libs/ocr_content_type.py
@@ -20,6 +20,8 @@ class BlockType:
    InterlineEquation = 'interline_equation'
    Footnote = 'footnote'
    Discarded = 'discarded'
+    List = 'list'
+    Index = 'index'


 class CategoryId:

--- a/magic_pdf/libs/version.py
+++ b/magic_pdf/libs/version.py
-__version__ = "0.7.1"
+__version__ = "0.8.0"
--- a/magic_pdf/model/doc_analyze_by_custom_model.py
+++ b/magic_pdf/model/doc_analyze_by_custom_model.py
@@ -4,6 +4,7 @@ import fitz
 import numpy as np
 from loguru import logger

+from magic_pdf.libs.clean_memory import clean_memory
 from magic_pdf.libs.config_reader import get_local_models_dir, get_device, get_table_recog_config
 from magic_pdf.model.model_list import MODEL
 import magic_pdf.model as model_config
@@ -23,7 +24,7 @@ def remove_duplicates_dicts(lst):
    return unique_dicts


-def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
+def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
    try:
        from PIL import Image
    except ImportError:
@@ -32,7 +33,14 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:

    images = []
    with fitz.open("pdf", pdf_bytes) as doc:
+        pdf_page_num = doc.page_count
+        end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
+        if end_page_id > pdf_page_num - 1:
+            logger.warning("end_page_id is out of range, use images length")
+            end_page_id = pdf_page_num - 1
+
        for index in range(0, doc.page_count):
+            if start_page_id <= index <= end_page_id:
                page = doc[index]
                mat = fitz.Matrix(dpi / 72, dpi / 72)
                pm = page.get_pixmap(matrix=mat, alpha=False)
@@ -44,6 +52,9 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
                img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
                img = np.array(img)
                img_dict = {"img": img, "width": pm.width, "height": pm.height}
+            else:
+                img_dict = {"img": [], "width": 0, "height": 0}
+
            images.append(img_dict)
    return images

@@ -57,14 +68,14 @@ class ModelSingleton:
            cls._instance = super().__new__(cls)
        return cls._instance

-    def get_model(self, ocr: bool, show_log: bool):
-        key = (ocr, show_log)
+    def get_model(self, ocr: bool, show_log: bool, lang=None):
+        key = (ocr, show_log, lang)
        if key not in self._models:
-            self._models[key] = custom_model_init(ocr=ocr, show_log=show_log)
+            self._models[key] = custom_model_init(ocr=ocr, show_log=show_log, lang=lang)
        return self._models[key]


-def custom_model_init(ocr: bool = False, show_log: bool = False):
+def custom_model_init(ocr: bool = False, show_log: bool = False, lang=None):
    model = None

    if model_config.__model_mode__ == "lite":
@@ -78,7 +89,7 @@ def custom_model_init(ocr: bool = False, show_log: bool = False):
        model_init_start = time.time()
        if model == MODEL.Paddle:
            from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
-            custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)
+            custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log, lang=lang)
        elif model == MODEL.PEK:
            from magic_pdf.model.pdf_extract_kit import CustomPEKModel
            # 从配置文件读取model-dir和device
@@ -89,7 +100,9 @@ def custom_model_init(ocr: bool = False, show_log: bool = False):
                           "show_log": show_log,
                           "models_dir": local_models_dir,
                           "device": device,
-                           "table_config": table_config}
+                           "table_config": table_config,
+                           "lang": lang,
+                           }
            custom_model = CustomPEKModel(**model_input)
        else:
            logger.error("Not allow model_name!")
@@ -104,19 +117,19 @@ def custom_model_init(ocr: bool = False, show_log: bool = False):


 def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
-                start_page_id=0, end_page_id=None):
+                start_page_id=0, end_page_id=None, lang=None):

    model_manager = ModelSingleton()
-    custom_model = model_manager.get_model(ocr, show_log)
-
-    images = load_images_from_pdf(pdf_bytes)
+    custom_model = model_manager.get_model(ocr, show_log, lang)

-    # end_page_id = end_page_id if end_page_id else len(images) - 1
-    end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(images) - 1
-
-    if end_page_id > len(images) - 1:
+    with fitz.open("pdf", pdf_bytes) as doc:
+        pdf_page_num = doc.page_count
+        end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
+        if end_page_id > pdf_page_num - 1:
            logger.warning("end_page_id is out of range, use images length")
-        end_page_id = len(images) - 1
+            end_page_id = pdf_page_num - 1
+
+    images = load_images_from_pdf(pdf_bytes, start_page_id=start_page_id, end_page_id=end_page_id)

    model_json = []
    doc_analyze_start = time.time()
@@ -132,7 +145,15 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
        page_info = {"page_no": index, "height": page_height, "width": page_width}
        page_dict = {"layout_dets": result, "page_info": page_info}
        model_json.append(page_dict)
-    doc_analyze_cost = time.time() - doc_analyze_start
-    logger.info(f"doc analyze cost: {doc_analyze_cost}")
+
+    gc_start = time.time()
+    clean_memory()
+    gc_time = round(time.time() - gc_start, 2)
+    logger.info(f"gc time: {gc_time}")
+
+    doc_analyze_time = round(time.time() - doc_analyze_start, 2)
+    doc_analyze_speed = round( (end_page_id + 1 - start_page_id) / doc_analyze_time, 2)
+    logger.info(f"doc analyze time: {round(time.time() - doc_analyze_start, 2)},"
+                f" speed: {doc_analyze_speed} pages/second")

    return model_json
--- a/magic_pdf/model/magic_model.py
+++ b/magic_pdf/model/magic_model.py
 import json

 from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
-                                    bbox_relative_pos, calculate_iou,
-                                    calculate_overlap_area_in_bbox1_area_ratio)
+                                    bbox_relative_pos, box_area, calculate_iou,
+                                    calculate_overlap_area_in_bbox1_area_ratio,
+                                    get_overlap_area)
 from magic_pdf.libs.commons import fitz, join_path
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.local_math import float_gt
@@ -12,6 +13,7 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter

 CAPATION_OVERLAP_AREA_RATIO = 0.6
+MERGE_BOX_OVERLAP_AREA_RATIO = 1.1


 class MagicModel:
@@ -108,6 +110,24 @@ class MagicModel:
        self.__fix_by_remove_high_iou_and_low_confidence()
        self.__fix_footnote()

+    def _bbox_distance(self, bbox1, bbox2):
+        """Return the distance between two boxes, or infinity if they must not be paired.
+
+        Boxes are indexed as [x0, y0, x1, y1]. The pair is rejected (infinite
+        distance) when:
+
+        * more than one of the four flags returned by ``bbox_relative_pos``
+          is set, or
+        * the side of ``bbox2`` facing ``bbox1`` is more than 50% longer than
+          the matching side of ``bbox1``.
+
+        Otherwise the result of ``bbox_distance(bbox1, bbox2)`` is returned.
+        """
+        left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
+        flags = [left, right, bottom, top]
+        count = sum([1 if v else 0 for v in flags])
+        if count > 1:
+            return float('inf')
+        if left or right:
+            # Side-by-side boxes: compare their heights.
+            l1 = bbox1[3] - bbox1[1]
+            l2 = bbox2[3] - bbox2[1]
+        else:
+            # Otherwise compare their widths.
+            l1 = bbox1[2] - bbox1[0]
+            l2 = bbox2[2] - bbox2[0]
+
+        # A degenerate bbox1 (zero extent) used to raise ZeroDivisionError in
+        # the relative-growth check; any longer bbox2 is then treated as
+        # "too long", which is the limit of the ratio as l1 approaches zero.
+        if l2 > l1 and (l1 == 0 or (l2 - l1) / l1 > 0.5):
+            return float('inf')
+
+        return bbox_distance(bbox1, bbox2)
+
    def __fix_footnote(self):
        # 3: figure, 5: table, 7: footnote
        for model_page_info in self.__model_list:
@@ -142,7 +162,7 @@ class MagicModel:
                    if pos_flag_count > 1:
                        continue
                    dis_figure_footnote[i] = min(
-                            bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
+                        self._bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
                        dis_figure_footnote.get(i, float('inf')),
                    )
            for i in range(len(footnotes)):
@@ -161,10 +181,12 @@ class MagicModel:
                        continue

                    dis_table_footnote[i] = min(
-                            bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
+                        self._bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
                        dis_table_footnote.get(i, float('inf')),
                    )
            for i in range(len(footnotes)):
+                if i not in dis_figure_footnote:
+                    continue
                if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
                    footnotes[i]['category_id'] = CategoryId.ImageFootnote

@@ -191,6 +213,44 @@ class MagicModel:
        筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
        再求出筛选出的 subjects 和 object 的最短距离
        """
+        def search_overlap_between_boxes(subject_idx, object_idx):
+            """Measure how much foreign content lies between a subject and an object.
+
+            The two boxes are merged into their common bounding rectangle.
+            Every detection on the page whose category is neither the subject
+            nor the object category is intersected with that rectangle, and
+            the overlap area is divided by the area of the object box.
+
+            Returns the largest ratio seen; scanning stops as soon as the
+            ratio reaches MERGE_BOX_OVERLAP_AREA_RATIO.
+            """
+            subject_bbox = all_bboxes[subject_idx]['bbox']
+            object_bbox = all_bboxes[object_idx]['bbox']
+
+            # Smallest rectangle covering both boxes.
+            merged_bbox = [
+                min(subject_bbox[0], object_bbox[0]),
+                min(subject_bbox[1], object_bbox[1]),
+                max(subject_bbox[2], object_bbox[2]),
+                max(subject_bbox[3], object_bbox[3]),
+            ]
+
+            # Detections of any category other than the pair being matched.
+            paired_categories = (object_category_id, subject_category_id)
+            other_objects = [
+                {'bbox': det['bbox'], 'score': det['score']}
+                for det in self.__model_list[page_no]['layout_dets']
+                if det['category_id'] not in paired_categories
+            ]
+
+            ratio = 0
+            for other_object in other_objects:
+                overlap = get_overlap_area(merged_bbox, other_object['bbox'])
+                ratio = max(
+                    ratio,
+                    overlap * 1.0 / box_area(all_bboxes[object_idx]['bbox']),
+                )
+                if ratio >= MERGE_BOX_OVERLAP_AREA_RATIO:
+                    break
+
+            return ratio

        def may_find_other_nearest_bbox(subject_idx, object_idx):
            ret = float('inf')
@@ -299,7 +359,16 @@ class MagicModel:
                ):
                    continue

-                dis[i][j] = bbox_distance(all_bboxes[i]['bbox'], all_bboxes[j]['bbox'])
+                subject_idx, object_idx = i, j
+                if all_bboxes[j]['category_id'] == subject_category_id:
+                    subject_idx, object_idx = j, i
+
+                if search_overlap_between_boxes(subject_idx, object_idx) >= MERGE_BOX_OVERLAP_AREA_RATIO:
+                    dis[i][j] = float('inf')
+                    dis[j][i] = dis[i][j]
+                    continue
+
+                dis[i][j] = self._bbox_distance(all_bboxes[subject_idx]['bbox'], all_bboxes[object_idx]['bbox'])
                dis[j][i] = dis[i][j]

        used = set()
@@ -627,13 +696,13 @@ class MagicModel:
                    span['type'] = ContentType.Image
                elif category_id == 5:
                    # 获取table模型结果
-                    latex = layout_det.get("latex", None)
-                    html = layout_det.get("html", None)
+                    latex = layout_det.get('latex', None)
+                    html = layout_det.get('html', None)
                    if latex:
-                        span["latex"] = latex
+                        span['latex'] = latex
                    elif html:
-                        span["html"] = html
-                    span["type"] = ContentType.Table
+                        span['html'] = html
+                    span['type'] = ContentType.Table
                elif category_id == 13:
                    span['content'] = layout_det['latex']
                    span['type'] = ContentType.InlineEquation

--- a/magic_pdf/model/pdf_extract_kit.py
+++ b/magic_pdf/model/pdf_extract_kit.py
@@ -3,9 +3,11 @@ import os
 import time

 from magic_pdf.libs.Constants import *
+from magic_pdf.libs.clean_memory import clean_memory
 from magic_pdf.model.model_list import AtomicModel

 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
+os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger
 try:
    import cv2
    import yaml
@@ -32,7 +34,7 @@ except ImportError as e:
    exit(1)

 from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
-from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
+from magic_pdf.model.pek_sub_modules.post_process import latex_rm_whitespace
 from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
 from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
 from magic_pdf.model.ppTableModel import ppTableModel
@@ -58,12 +60,13 @@ def mfd_model_init(weight):
 def mfr_model_init(weight_dir, cfg_path, _device_='cpu'):
    args = argparse.Namespace(cfg_path=cfg_path, options=None)
    cfg = Config(args)
-    cfg.config.model.pretrained = os.path.join(weight_dir, "pytorch_model.bin")
+    cfg.config.model.pretrained = os.path.join(weight_dir, "pytorch_model.pth")
    cfg.config.model.model_config.model_name = weight_dir
    cfg.config.model.tokenizer_config.path = weight_dir
    task = tasks.setup_task(cfg)
    model = task.build_model(cfg)
-    model = model.to(_device_)
+    model.to(_device_)
+    model.eval()
    vis_processor = load_processor('formula_image_eval', cfg.config.datasets.formula_rec_eval.vis_processor.eval)
    mfr_transform = transforms.Compose([vis_processor, ])
    return [model, mfr_transform]
@@ -74,7 +77,10 @@ def layout_model_init(weight, config_file, device):
    return model


-def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3):
+def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None):
+    if lang is not None:
+        model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, lang=lang)
+    else:
        model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh)
    return model

@@ -134,7 +140,8 @@ def atom_model_init(model_name: str, **kwargs):
    elif model_name == AtomicModel.OCR:
        atom_model = ocr_model_init(
            kwargs.get("ocr_show_log"),
-            kwargs.get("det_db_box_thresh")
+            kwargs.get("det_db_box_thresh"),
+            kwargs.get("lang")
        )
    elif model_name == AtomicModel.Table:
        atom_model = table_model_init(
@@ -150,6 +157,23 @@ def atom_model_init(model_name: str, **kwargs):
    return atom_model


+#  Unified crop img logic
+def crop_img(input_res, input_pil_img, crop_paste_x=0, crop_paste_y=0):
+    """Crop a detection out of a page image and paste it onto a white canvas.
+
+    :param input_res: detection dict; ``poly[0], poly[1]`` are read as the
+        top-left corner and ``poly[4], poly[5]`` as the bottom-right corner
+    :param input_pil_img: image object providing ``crop``
+    :param crop_paste_x: horizontal margin added on each side of the crop
+    :param crop_paste_y: vertical margin added on each side of the crop
+    :return: ``(canvas, info)`` where ``info`` is
+        ``[crop_paste_x, crop_paste_y, xmin, ymin, xmax, ymax, width, height]``
+        and ``width``/``height`` are the canvas dimensions
+    """
+    poly = input_res['poly']
+    crop_xmin, crop_ymin = int(poly[0]), int(poly[1])
+    crop_xmax, crop_ymax = int(poly[4]), int(poly[5])
+
+    # The canvas is the crop size plus the requested margin on both sides.
+    canvas_width = crop_xmax - crop_xmin + crop_paste_x * 2
+    canvas_height = crop_ymax - crop_ymin + crop_paste_y * 2
+    canvas = Image.new('RGB', (canvas_width, canvas_height), 'white')
+
+    # Cut the region out of the page and place it inside the margins.
+    region = input_pil_img.crop((crop_xmin, crop_ymin, crop_xmax, crop_ymax))
+    canvas.paste(region, (crop_paste_x, crop_paste_y))
+
+    return canvas, [
+        crop_paste_x, crop_paste_y,
+        crop_xmin, crop_ymin, crop_xmax, crop_ymax,
+        canvas_width, canvas_height,
+    ]
+
+
 class CustomPEKModel:

    def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs):
@@ -177,9 +201,10 @@ class CustomPEKModel:
        self.table_max_time = self.table_config.get("max_time", TABLE_MAX_TIME_VALUE)
        self.table_model_type = self.table_config.get("model", TABLE_MASTER)
        self.apply_ocr = ocr
+        self.lang = kwargs.get("lang", None)
        logger.info(
-            "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}, apply_table: {}".format(
-                self.apply_layout, self.apply_formula, self.apply_ocr, self.apply_table
+            "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}, apply_table: {}, lang: {}".format(
+                self.apply_layout, self.apply_formula, self.apply_ocr, self.apply_table, self.lang
            )
        )
        assert self.apply_layout, "DocAnalysis must contain layout model."
@@ -225,11 +250,13 @@ class CustomPEKModel:
        )
        # 初始化ocr
        if self.apply_ocr:
+
            # self.ocr_model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=0.3)
            self.ocr_model = atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.OCR,
                ocr_show_log=show_log,
-                det_db_box_thresh=0.3
+                det_db_box_thresh=0.3,
+                lang=self.lang
            )
        # init table model
        if self.apply_table:
@@ -243,10 +270,13 @@ class CustomPEKModel:
                table_max_time=self.table_max_time,
                device=self.device
            )
+
        logger.info('DocAnalysis init done!')

    def __call__(self, image):

+        page_start = time.time()
+
        latex_filling_list = []
        mf_image_list = []

@@ -254,11 +284,15 @@ class CustomPEKModel:
        layout_start = time.time()
        layout_res = self.layout_model(image, ignore_catids=[])
        layout_cost = round(time.time() - layout_start, 2)
-        logger.info(f"layout detection cost: {layout_cost}")
+        logger.info(f"layout detection time: {layout_cost}")
+
+        pil_img = Image.fromarray(image)

        if self.apply_formula:
            # 公式检测
+            mfd_start = time.time()
            mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
+            logger.info(f"mfd time: {round(time.time() - mfd_start, 2)}")
            for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
                xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
                new_item = {
@@ -269,7 +303,8 @@ class CustomPEKModel:
                }
                layout_res.append(new_item)
                latex_filling_list.append(new_item)
-                bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
+                # bbox_img = get_croped_image(pil_img, [xmin, ymin, xmax, ymax])
+                bbox_img = pil_img.crop((xmin, ymin, xmax, ymax))
                mf_image_list.append(bbox_img)

            # 公式识别
@@ -279,6 +314,7 @@ class CustomPEKModel:
            mfr_res = []
            for mf_img in dataloader:
                mf_img = mf_img.to(self.device)
+                with torch.no_grad():
                    output = self.mfr_model.generate({'image': mf_img})
                mfr_res.extend(output['pred_str'])
            for res, latex in zip(latex_filling_list, mfr_res):
@@ -301,23 +337,14 @@ class CustomPEKModel:
            elif int(res['category_id']) in [5]:
                table_res_list.append(res)

-        #  Unified crop img logic
-        def crop_img(input_res, input_pil_img, crop_paste_x=0, crop_paste_y=0):
-            crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
-            crop_xmax, crop_ymax = int(input_res['poly'][4]), int(input_res['poly'][5])
-            # Create a white background with an additional width and height of 50
-            crop_new_width = crop_xmax - crop_xmin + crop_paste_x * 2
-            crop_new_height = crop_ymax - crop_ymin + crop_paste_y * 2
-            return_image = Image.new('RGB', (crop_new_width, crop_new_height), 'white')
-
-            # Crop image
-            crop_box = (crop_xmin, crop_ymin, crop_xmax, crop_ymax)
-            cropped_img = input_pil_img.crop(crop_box)
-            return_image.paste(cropped_img, (crop_paste_x, crop_paste_y))
-            return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width, crop_new_height]
-            return return_image, return_list
-
-        pil_img = Image.fromarray(image)
+        if torch.cuda.is_available():
+            properties = torch.cuda.get_device_properties(self.device)
+            total_memory = properties.total_memory / (1024 ** 3)  # 将字节转换为 GB
+            if total_memory <= 10:
+                gc_start = time.time()
+                clean_memory()
+                gc_time = round(time.time() - gc_start, 2)
+                logger.info(f"gc time: {gc_time}")

        # ocr识别
        if self.apply_ocr:
@@ -367,7 +394,7 @@ class CustomPEKModel:
                        })

            ocr_cost = round(time.time() - ocr_start, 2)
-            logger.info(f"ocr cost: {ocr_cost}")
+            logger.info(f"ocr time: {ocr_cost}")

        # 表格识别 table recognition
        if self.apply_table:
@@ -375,7 +402,7 @@ class CustomPEKModel:
            for res in table_res_list:
                new_image, _ = crop_img(res, pil_img)
                single_table_start_time = time.time()
-                logger.info("------------------table recognition processing begins-----------------")
+                # logger.info("------------------table recognition processing begins-----------------")
                latex_code = None
                html_code = None
                if self.table_model_type == STRUCT_EQTABLE:
@@ -383,8 +410,9 @@ class CustomPEKModel:
                        latex_code = self.table_model.image2latex(new_image)[0]
                else:
                    html_code = self.table_model.img2html(new_image)
+
                run_time = time.time() - single_table_start_time
-                logger.info(f"------------table recognition processing ends within {run_time}s-----")
+                # logger.info(f"------------table recognition processing ends within {run_time}s-----")
                if run_time > self.table_max_time:
                    logger.warning(f"------------table recognition processing exceeds max time {self.table_max_time}s----------")
                # 判断是否返回正常
@@ -395,12 +423,13 @@ class CustomPEKModel:
                    if expected_ending:
                        res["latex"] = latex_code
                    else:
-                        logger.warning(f"------------table recognition processing fails----------")
+                        logger.warning(f"table recognition processing fails, not found expected LaTeX table end")
                elif html_code:
                    res["html"] = html_code
                else:
-                    logger.warning(f"------------table recognition processing fails----------")
-            table_cost = round(time.time() - table_start, 2)
-            logger.info(f"table cost: {table_cost}")
+                    logger.warning(f"table recognition processing fails, not get latex or html return")
+            logger.info(f"table time: {round(time.time() - table_start, 2)}")
+
+        logger.info(f"-----page total time: {round(time.time() - page_start, 2)}-----")

        return layout_res
--- a/magic_pdf/model/pp_structure_v2.py
+++ b/magic_pdf/model/pp_structure_v2.py
@@ -18,7 +18,10 @@ def region_to_bbox(region):


 class CustomPaddleModel:
-    def __init__(self, ocr: bool = False, show_log: bool = False):
+    def __init__(self, ocr: bool = False, show_log: bool = False, lang=None):
+        if lang is not None:
+            self.model = PPStructure(table=False, ocr=ocr, show_log=show_log, lang=lang)
+        else:
            self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)

    def __call__(self, img):

--- a/projects/web_api/tests/__init__.py
+++ b/projects/web_api/tests/__init__.py
--- a/magic_pdf/model/v3/helpers.py
+++ b/magic_pdf/model/v3/helpers.py
+from collections import defaultdict
+from typing import List, Dict
+
+import torch
+from transformers import LayoutLMv3ForTokenClassification
+
+# Maximum number of boxes/labels kept per sample; DataCollator clips longer
+# inputs to this length before adding the leading CLS and trailing EOS token.
+MAX_LEN = 510
+# Token id placed at the start of every sequence.
+CLS_TOKEN_ID = 0
+# Token id used as the input id for every box.
+UNK_TOKEN_ID = 3
+# Token id placed at the end of every sequence and used to pad input_ids.
+EOS_TOKEN_ID = 2
+
+
+class DataCollator:
+    """Turn a list of samples into one padded batch of tensors.
+
+    Each sample supplies ``source_boxes`` and ``target_index``. Both are
+    clipped to MAX_LEN entries, framed with a CLS and an EOS position, and
+    padded to the longest sequence in the batch.
+    """
+
+    def __call__(self, features: List[dict]) -> Dict[str, torch.Tensor]:
+        # Step 1: read and clip every sample; boxes and labels must pair up.
+        clipped = []
+        for feature in features:
+            boxes = feature["source_boxes"]
+            if len(boxes) > MAX_LEN:
+                boxes = boxes[:MAX_LEN]
+            targets = feature["target_index"]
+            if len(targets) > MAX_LEN:
+                targets = targets[:MAX_LEN]
+            assert len(boxes) == len(targets)
+            clipped.append((boxes, targets))
+
+        # Step 2: frame each sequence with the CLS and EOS positions. Every
+        # box is represented by the UNK token id; frame labels are ignored
+        # (-100).
+        bbox = [[[0, 0, 0, 0]] + boxes + [[0, 0, 0, 0]] for boxes, _ in clipped]
+        labels = [[-100] + targets + [-100] for _, targets in clipped]
+        input_ids = [
+            [CLS_TOKEN_ID] + [UNK_TOKEN_ID] * len(boxes) + [EOS_TOKEN_ID]
+            for boxes, _ in clipped
+        ]
+        attention_mask = [[1] + [1] * len(boxes) + [1] for boxes, _ in clipped]
+
+        # Step 3: right-pad everything to the longest sequence in the batch.
+        max_len = max(len(x) for x in bbox)
+
+        def pad(seq, filler):
+            return seq + [filler] * (max_len - len(seq))
+
+        bbox = [pad(seq, [0, 0, 0, 0]) for seq in bbox]
+        labels = [pad(seq, -100) for seq in labels]
+        input_ids = [pad(seq, EOS_TOKEN_ID) for seq in input_ids]
+        attention_mask = [pad(seq, 0) for seq in attention_mask]
+
+        ret = {
+            "bbox": torch.tensor(bbox),
+            "attention_mask": torch.tensor(attention_mask),
+            "labels": torch.tensor(labels),
+            "input_ids": torch.tensor(input_ids),
+        }
+        batch_labels = ret["labels"]
+        # Labels pointing beyond the clipped length cannot be predicted.
+        batch_labels[batch_labels > MAX_LEN] = -100
+        # Incoming labels are 1-indexed; shift the positive ones down by one.
+        batch_labels[batch_labels > 0] -= 1
+        return ret
+
+
+def boxes2inputs(boxes: List[List[int]]) -> Dict[str, torch.Tensor]:
+    """Build a single-sample batch of model inputs from a list of boxes.
+
+    A zero box is added before and after the given boxes, matching the CLS
+    and EOS token ids placed around one UNK token id per box. Every position
+    is attended to.
+    """
+    framed_boxes = [[0, 0, 0, 0]] + boxes + [[0, 0, 0, 0]]
+    box_count = len(boxes)
+    token_ids = [CLS_TOKEN_ID] + [UNK_TOKEN_ID] * box_count + [EOS_TOKEN_ID]
+    mask = [1] + [1] * box_count + [1]
+
+    inputs = {}
+    inputs["bbox"] = torch.tensor([framed_boxes])
+    inputs["attention_mask"] = torch.tensor([mask])
+    inputs["input_ids"] = torch.tensor([token_ids])
+    return inputs
+
+
+def prepare_inputs(
+    inputs: Dict[str, torch.Tensor], model: LayoutLMv3ForTokenClassification
+) -> Dict[str, torch.Tensor]:
+    """Return a copy of ``inputs`` with every tensor moved to the model's device.
+
+    Floating-point tensors are additionally cast to the model's dtype; other
+    tensors keep their own dtype.
+    """
+    prepared = {}
+    for name, tensor in inputs.items():
+        on_device = tensor.to(model.device)
+        prepared[name] = (
+            on_device.to(model.dtype)
+            if torch.is_floating_point(on_device)
+            else on_device
+        )
+    return prepared
+
+
+def parse_logits(logits: torch.Tensor, length: int) -> List[int]:
+    """
+    Turn model logits into one distinct order per input element.
+
+    Each row first takes its highest-scoring order. While two or more rows
+    claim the same order, the row with the highest logit for that order keeps
+    it and every other claimant falls back to its next-best candidate.
+
+    :param logits: logits from model
+    :param length: input length
+    :return: orders
+    """
+    # Drop the leading position and keep one row/column per input element.
+    scores = logits[1 : length + 1, :length]
+    # Per row: candidate orders sorted ascending, so pop() yields the best left.
+    candidates = scores.argsort(descending=False).tolist()
+    assigned = [row.pop() for row in candidates]
+
+    while True:
+        claimants = defaultdict(list)
+        for row_idx, order in enumerate(assigned):
+            claimants[order].append(row_idx)
+        contested = {
+            order: rows for order, rows in claimants.items() if len(rows) > 1
+        }
+        if not contested:
+            return assigned
+
+        for order, rows in contested.items():
+            # Strongest claimant first; everyone after it must move on.
+            ranked = sorted(rows, key=lambda r: scores[r, order], reverse=True)
+            for loser in ranked[1:]:
+                assigned[loser] = candidates[loser].pop()
+
+
+def check_duplicate(a: List[int]) -> bool:
+    """Return True when at least one value occurs more than once in ``a``."""
+    distinct = set(a)
+    return len(distinct) < len(a)
--- a/magic_pdf/para/para_split_v3.py
+++ b/magic_pdf/para/para_split_v3.py
+import copy
+
+from loguru import logger
+
+from magic_pdf.libs.Constants import LINES_DELETED, CROSS_PAGE
+from magic_pdf.libs.ocr_content_type import BlockType, ContentType
+
+# Punctuation marks, in half- and full-width forms.
+# NOTE(review): presumably the characters that may end a line of running text; not used in the visible part of this file.
+LINE_STOP_FLAG = ('.', '!', '?', '。', '！', '？', ')', '）', '"', '”', ':', '：', ';', '；')
+# Characters that count as a list-item ending when list blocks are detected.
+LIST_END_FLAG = ('.', '。', ';', '；')
+
+
+class ListLineTag:
+    """Keys set to True on line dicts of a block that was recognised as a list."""
+
+    # Set on lines that start at the block's left edge.
+    IS_LIST_START_LINE = "is_list_start_line"
+    # Set on lines whose right edge is more than one line height away from the block's right edge.
+    IS_LIST_END_LINE = "is_list_end_line"
+
+
+def __process_blocks(blocks):
+    """Split the text blocks of a page into groups.
+
+    Text blocks are collected in reading order. The running group is closed
+    (even when empty) whenever the following block is a title or an
+    interline equation. Every text block also receives a ``bbox_fs`` entry:
+    the bounding box of its lines, or a copy of its own ``bbox`` when it has
+    no lines.
+    """
+    groups = []
+    pending = []
+    total = len(blocks)
+
+    for position, block in enumerate(blocks):
+        # Only text blocks are collected.
+        if block['type'] == 'text':
+            block["bbox_fs"] = copy.deepcopy(block["bbox"])
+            if 'lines' in block and len(block["lines"]) > 0:
+                line_boxes = [line['bbox'] for line in block['lines']]
+                block['bbox_fs'] = [
+                    min(box[0] for box in line_boxes),
+                    min(box[1] for box in line_boxes),
+                    max(box[2] for box in line_boxes),
+                    max(box[3] for box in line_boxes),
+                ]
+            pending.append(block)
+
+        # A title or interline equation coming next closes the current group.
+        has_next = position + 1 < total
+        if has_next and blocks[position + 1]['type'] in ['title', 'interline_equation']:
+            groups.append(pending)
+            pending = []
+
+    # Flush whatever is left after the last block.
+    if pending:
+        groups.append(pending)
+
+    return groups
+
+
+def __is_list_block(block):
+    """Decide whether a text block is laid out as a list and tag its lines.
+
+    A block is a list when it has at least three lines, at least two of them
+    start at the block's left edge, and at least one of the following holds:
+
+    * two or more lines stop well short of the right edge (ragged right),
+    * at least 80% of the lines end with a LIST_END_FLAG character,
+    * two or more lines are indented from the left edge.
+
+    On a positive result the lines are tagged in place with the ListLineTag
+    keys and True is returned; otherwise False is returned.
+    """
+    lines = block['lines']
+    if len(lines) < 3:
+        return False
+
+    # The height of the first line is the yardstick for every tolerance.
+    first_bbox = lines[0]['bbox']
+    line_height = first_bbox[3] - first_bbox[1]
+    block_left = block['bbox_fs'][0]
+    block_right = block['bbox_fs'][2]
+    # Rule-of-thumb threshold: 30% of the block width counts as "not flush right".
+    right_gap_limit = 0.3 * (block_right - block_left)
+
+    left_close_num = 0
+    left_not_close_num = 0
+    right_not_close_num = 0
+    num_end_count = 0
+    for line in lines:
+        # Only text spans contribute to the line's text.
+        line_text = "".join(
+            span['content'].strip()
+            for span in line['spans']
+            if span['type'] == ContentType.Text
+        )
+        if line_text.endswith(LIST_END_FLAG):
+            num_end_count += 1
+
+        # Flush left means within half a line height of the block's left edge.
+        left_offset = line['bbox'][0] - block_left
+        if abs(left_offset) < line_height / 2:
+            left_close_num += 1
+        elif left_offset > line_height:
+            left_not_close_num += 1
+
+        if block_right - line['bbox'][2] > right_gap_limit:
+            right_not_close_num += 1
+
+    line_end_flag = num_end_count / len(lines) >= 0.8
+
+    looks_like_list = left_close_num >= 2 and (
+        right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2
+    )
+    if not looks_like_list:
+        return False
+
+    for line in lines:
+        if abs(block_left - line['bbox'][0]) < line_height / 2:
+            line[ListLineTag.IS_LIST_START_LINE] = True
+        if abs(block_right - line['bbox'][2]) > line_height:
+            line[ListLineTag.IS_LIST_END_LINE] = True
+
+    return True
+
+
+def __is_index_block(block):
+    # 一个block如果是index block 应该同时满足以下特征
+    # 1.block内有多个line 2.block 内有多个line两侧均顶格写 3.line的开头或者结尾均为数字
+    if len(block['lines']) >= 3:
+        first_line = block['lines'][0]
+        line_height = first_line['bbox'][3] - first_line['bbox'][1]
+
+        left_close_num = 0
+        right_close_num = 0
+
+        lines_text_list = []
+        for line in block['lines']:
+
+            # 计算line左侧顶格数量是否大于2，是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
+            if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
+                left_close_num += 1
+
+            # 计算右侧是否不顶格
+            if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height / 2:
+                right_close_num += 1
+
+            line_text = ""
+
+            for span in line['spans']:
+                span_type = span['type']
+                if span_type == ContentType.Text:
+                    line_text += span['content'].strip()
+
+            lines_text_list.append(line_text)
+
+        # 判断lines_text_list中的元素是否有超过80%都以数字开头或都以数字结尾
+        line_num_flag = False
+        if len(lines_text_list) > 0:
+            num_start_count = 0
+            num_end_count = 0
+            for line_text in lines_text_list:
+                if len(line_text) > 0:
+                    if line_text[0].isdigit():
+                        num_start_count += 1
+                    if line_text[-1].isdigit():
+                        num_end_count += 1
+
+            if num_start_count / len(lines_text_list) >= 0.8 or num_end_count / len(lines_text_list) >= 0.8:
+                line_num_flag = True
+
+        if left_close_num >= 2 and right_close_num >= 2 and line_num_flag:
+            for line in block['lines']:
+                line[ListLineTag.IS_LIST_START_LINE] = True
+
+            return True
+        else:
+            return False
+    else:
+        return False
+
+
+def __merge_2_text_blocks(block1, block2):
+    if len(block1['lines']) > 0:
+        first_line = block1['lines'][0]
+        line_height = first_line['bbox'][3] - first_line['bbox'][1]
+        if abs(block1['bbox_fs'][0] - first_line['bbox'][0]) < line_height/2:
+            last_line = block2['lines'][-1]
+            if len(last_line['spans']) > 0:
+                last_span = last_line['spans'][-1]
+                line_height = last_line['bbox'][3] - last_line['bbox'][1]
+                if abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height and not last_span['content'].endswith(LINE_STOP_FLAG):
+                    if block1['page_num'] != block2['page_num']:
+                        for line in block1['lines']:
+                            for span in line['spans']:
+                                span[CROSS_PAGE] = True
+                    block2['lines'].extend(block1['lines'])
+                    block1['lines'] = []
+                    block1[LINES_DELETED] = True
+
+    return block1, block2
+
+
+def __merge_2_list_blocks(block1, block2):
+
+    if block1['page_num'] != block2['page_num']:
+        for line in block1['lines']:
+            for span in line['spans']:
+                span[CROSS_PAGE] = True
+    block2['lines'].extend(block1['lines'])
+    block1['lines'] = []
+    block1[LINES_DELETED] = True
+
+    return block1, block2
+
+
+def __para_merge_page(blocks):
+    page_text_blocks_groups = __process_blocks(blocks)
+    for text_blocks_group in page_text_blocks_groups:
+
+        if len(text_blocks_group) > 0:
+            # 需要先在合并前对所有block判断是否为list block
+            for block in text_blocks_group:
+                if __is_list_block(block):
+                    block['type'] = BlockType.List
+                elif __is_index_block(block):
+                    block['type'] = BlockType.Index
+
+        if len(text_blocks_group) > 1:
+            # 倒序遍历
+            for i in range(len(text_blocks_group)-1, -1, -1):
+                current_block = text_blocks_group[i]
+
+                # 检查是否有前一个块
+                if i - 1 >= 0:
+                    prev_block = text_blocks_group[i - 1]
+
+                    if current_block['type'] == 'text' and prev_block['type'] == 'text':
+                        __merge_2_text_blocks(current_block, prev_block)
+                    if current_block['type'] == BlockType.List and prev_block['type'] == BlockType.List:
+                        __merge_2_list_blocks(current_block, prev_block)
+                    if current_block['type'] == BlockType.Index and prev_block['type'] == BlockType.Index:
+                        __merge_2_list_blocks(current_block, prev_block)
+        else:
+            continue
+
+
+def para_split(pdf_info_dict, debug_mode=False):
+    all_blocks = []
+    for page_num, page in pdf_info_dict.items():
+        blocks = copy.deepcopy(page['preproc_blocks'])
+        for block in blocks:
+            block['page_num'] = page_num
+        all_blocks.extend(blocks)
+
+    __para_merge_page(all_blocks)
+    for page_num, page in pdf_info_dict.items():
+        page['para_blocks'] = []
+        for block in all_blocks:
+            if block['page_num'] == page_num:
+                page['para_blocks'].append(block)
+
+
+if __name__ == '__main__':
+    input_blocks = [{'type': 'text', 'bbox': [19, 79, 285, 95], 'lines': [{'bbox': [21.360000610351562, 81.50750732421875, 287.69000244140625, 93.62750244140625], 'spans': [{'bbox': [21.360000610351562, 81.62750244140625, 170.3000030517578, 93.62750244140625], 'content': '嘉和美康（688246）/计算机', 'type': 'text', 'score': 1.0}, {'bbox': [170.3000030517578, 81.62750244140625, 176.3000030517578, 93.62750244140625], 'content': ' ', 'type': 'text', 'score': 1.0}, {'bbox': [181.22000122070312, 81.50750732421875, 281.8052062988281, 93.50750732421875], 'content': '证券研究报告/公司点评', 'type': 'text', 'score': 1.0}, {'bbox': [281.69000244140625, 81.50750732421875, 287.69000244140625, 93.50750732421875], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 0}], 'index': 0, 'page_num': 'page_0', 'bbox_fs': [21.360000610351562, 81.50750732421875, 287.69000244140625, 93.62750244140625]}, {'type': 'title', 'bbox': [18, 109, 124, 123], 'lines': [{'bbox': [21.360000610351562, 101.70799255371094, 98.47967529296875, 116.21743774414062], 'spans': [{'bbox': [21.360000610351562, 101.70799255371094, 98.47967529296875, 116.21743774414062], 'content': '[Table_Industry] ', 'type': 'text', 'score': 1.0}], 'index': 1}, {'bbox': [21.1200008392334, 110.3074951171875, 129.5640106201172, 122.3074951171875], 'spans': [{'bbox': [21.1200008392334, 110.3074951171875, 129.5640106201172, 122.3074951171875], 'content': '评级：买入（维持）', 'type': 'text', 'score': 1.0}], 'index': 2}], 'index': 1.5, 'page_num': 'page_0'}, {'type': 'text', 'bbox': [20, 126, 117, 137], 'lines': [{'bbox': [21.1200008392334, 127.40557861328125, 116.18000030517578, 136.40557861328125], 'spans': [{'bbox': [21.1200008392334, 127.40557861328125, 116.18000030517578, 136.40557861328125], 'content': '市场价格：16.62 元/股', 'type': 'text', 'score': 1.0}], 'index': 3}], 'index': 3, 'page_num': 'page_0', 'bbox_fs': [21.1200008392334, 127.40557861328125, 116.18000030517578, 136.40557861328125]}, {'type': 'text', 'bbox': [19, 144, 158, 172], 'lines': 
[{'bbox': [21.1200008392334, 144.1099853515625, 86.88600158691406, 156.50299072265625], 'spans': [{'bbox': [21.1200008392334, 146.005615234375, 84.33599853515625, 155.005615234375], 'content': '分析师：闻学臣', 'type': 'text', 'score': 1.0}, {'bbox': [84.38400268554688, 144.1099853515625, 86.88600158691406, 156.50299072265625], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 4}, {'bbox': [21.1200008392334, 159.7099609375, 157.9219970703125, 172.10296630859375], 'spans': [{'bbox': [21.1200008392334, 161.6055908203125, 84.33599853515625, 170.6055908203125], 'content': '执业证书编号：', 'type': 'text', 'score': 1.0}, {'bbox': [84.50399780273438, 159.7099609375, 155.45095825195312, 172.10296630859375], 'content': 'S0740519090007', 'type': 'text', 'score': 1.0}, {'bbox': [155.4199981689453, 159.7099609375, 157.9219970703125, 172.10296630859375], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 5}], 'index': 4.5, 'page_num': 'page_0', 'bbox_fs': [21.1200008392334, 144.1099853515625, 157.9219970703125, 172.10296630859375]}, {'type': 'text', 'bbox': [18, 194, 157, 241], 'lines': [{'bbox': [21.1200008392334, 193.86497497558594, 86.88600158691406, 206.23097229003906], 'spans': [{'bbox': [21.1200008392334, 195.80560302734375, 84.33599853515625, 204.80560302734375], 'content': '分析师：何柄谕', 'type': 'text', 'score': 1.0}, {'bbox': [84.38400268554688, 193.86497497558594, 86.88600158691406, 206.23097229003906], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 6}, {'bbox': [21.1200008392334, 211.07000732421875, 157.9219970703125, 223.4630126953125], 'spans': [{'bbox': [21.1200008392334, 212.96563720703125, 84.33599853515625, 221.96563720703125], 'content': '执业证书编号：', 'type': 'text', 'score': 1.0}, {'bbox': [84.50399780273438, 211.07000732421875, 155.44796752929688, 223.4630126953125], 'content': 'S0740519090003', 'type': 'text', 'score': 1.0}, {'bbox': [155.4199981689453, 211.07000732421875, 157.9219970703125, 223.4630126953125], 'content': ' ', 'type': 'text', 'score': 
1.0}], 'index': 7}, {'bbox': [21.1200008392334, 228.0649871826172, 126.84199523925781, 240.4309844970703], 'spans': [{'bbox': [21.1200008392334, 228.0649871826172, 43.73700714111328, 240.4309844970703], 'content': 'Email', 'type': 'text', 'score': 1.0}, {'bbox': [43.79999923706055, 230.005615234375, 52.79999923706055, 239.005615234375], 'content': '：', 'type': 'text', 'score': 1.0}, {'bbox': [52.68000030517578, 228.0649871826172, 124.41200256347656, 240.4309844970703], 'content': 'heby@zts.com.cn', 'type': 'text', 'score': 1.0}, {'bbox': [124.33999633789062, 228.0649871826172, 126.84199523925781, 240.4309844970703], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 8}], 'index': 7, 'page_num': 'page_0', 'bbox_fs': [21.1200008392334, 193.86497497558594, 157.9219970703125, 240.4309844970703]}, {'type': 'table', 'bbox': [18, 338, 169, 418], 'blocks': [{'bbox': [18, 356, 169, 418], 'type': 'table_body', 'lines': [{'bbox': [18, 356, 169, 418], 'spans': [{'bbox': [18, 356, 169, 418], 'score': 0.8198961019515991, 'type': 'table', 'image_path': '4123619a2e8de87ebe695a4e7703d09d957670491c939b1050c96bbf4104210e.jpg'}]}]}, {'bbox': [19, 338, 70, 352], 'type': 'table_caption', 'lines': [{'bbox': [21.1200008392334, 335.9779968261719, 85.39967346191406, 350.4874267578125], 'spans': [{'bbox': [21.1200008392334, 335.9779968261719, 85.39967346191406, 350.4874267578125], 'content': '[Table_Profit] ', 'type': 'text', 'score': 1.0}]}]}], 'index': 9.5, 'page_num': 'page_0'}, {'type': 'image', 'bbox': [19, 426, 163, 558], 'blocks': [{'bbox': [21, 452, 163, 558], 'type': 'image_body', 'lines': [{'bbox': [21, 452, 163, 558], 'spans': [{'bbox': [21, 452, 163, 558], 'score': 0.9999651312828064, 'type': 'image', 'image_path': '0e63ab24cdc2ac4cb0c46bf1ff7b9f094c092b9c5707810cbc2b7e30964cf8a1.jpg'}]}]}, {'bbox': [19, 426, 160, 440], 'type': 'image_caption', 'lines': [{'bbox': [21.1200008392334, 427.8774719238281, 165.74000549316406, 439.8774719238281], 'spans': [{'bbox': 
[21.1200008392334, 427.8774719238281, 165.74000549316406, 439.8774719238281], 'content': '股价与行业-市场走势对比 ', 'type': 'text', 'score': 1.0}]}]}], 'index': 11.5, 'page_num': 'page_0'}, {'type': 'title', 'bbox': [20, 569, 70, 583], 'lines': [{'bbox': [21.1200008392334, 570.70751953125, 75.38400268554688, 582.70751953125], 'spans': [{'bbox': [21.1200008392334, 570.70751953125, 75.38400268554688, 582.70751953125], 'content': '相关报告 ', 'type': 'text', 'score': 1.0}], 'index': 13}], 'index': 13, 'page_num': 'page_0'}, {'type': 'text', 'bbox': [20, 586, 168, 629], 'lines': [{'bbox': [21.1200008392334, 585.9849853515625, 166.1840057373047, 598.3509521484375], 'spans': [{'bbox': [21.1200008392334, 585.9849853515625, 28.661998748779297, 598.3509521484375], 'content': '1 ', 'type': 'text', 'score': 1.0}, {'bbox': [30.239999771118164, 587.9255981445312, 83.76300048828125, 596.9255981445312], 'content': '《嘉和美康（', 'type': 'text', 'score': 1.0}, {'bbox': [83.78399658203125, 585.9849853515625, 113.72698211669922, 598.3509521484375], 'content': '688246', 'type': 'text', 'score': 1.0}, {'bbox': [113.77999877929688, 587.9255981445312, 131.3000030517578, 596.9255981445312], 'content': '）：', 'type': 'text', 'score': 1.0}, {'bbox': [130.82000732421875, 585.9849853515625, 140.74400329589844, 598.3509521484375], 'content': '24', 'type': 'text', 'score': 1.0}, {'bbox': [140.74400329589844, 587.9255981445312, 151.94000244140625, 596.9255981445312], 'content': ' 年', 'type': 'text', 'score': 1.0}, {'bbox': [154.22000122070312, 585.9849853515625, 166.1840057373047, 598.3509521484375], 'content': 'Q1', 'type': 'text', 'score': 1.0}], 'index': 14}, {'bbox': [21.1200008392334, 603.525634765625, 165.1199951171875, 612.525634765625], 'spans': [{'bbox': [21.1200008392334, 603.525634765625, 165.1199951171875, 612.525634765625], 'content': '收入显著改善，医疗大模型产品落地', 'type': 'text', 'score': 1.0}], 'index': 15}, {'bbox': [21.1200008392334, 617.1849975585938, 50.62199783325195, 629.5509643554688], 'spans': 
[{'bbox': [21.1200008392334, 619.1256103515625, 48.119998931884766, 628.1256103515625], 'content': '良好》', 'type': 'text', 'score': 1.0}, {'bbox': [48.119998931884766, 617.1849975585938, 50.62199783325195, 629.5509643554688], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 16}], 'index': 15, 'page_num': 'page_0', 'bbox_fs': [21.1200008392334, 585.9849853515625, 166.1840057373047, 629.5509643554688]}, {'type': 'text', 'bbox': [19, 648, 167, 677], 'lines': [{'bbox': [21.1200008392334, 648.385009765625, 166.21701049804688, 660.7509765625], 'spans': [{'bbox': [21.1200008392334, 648.385009765625, 28.662002563476562, 660.7509765625], 'content': '2 ', 'type': 'text', 'score': 1.0}, {'bbox': [30.1200008392334, 650.3256225585938, 83.51700592041016, 659.3256225585938], 'content': '《嘉和美康（', 'type': 'text', 'score': 1.0}, {'bbox': [83.54399871826172, 648.385009765625, 113.48698425292969, 660.7509765625], 'content': '688246', 'type': 'text', 'score': 1.0}, {'bbox': [113.54000091552734, 650.3256225585938, 166.21701049804688, 659.3256225585938], 'content': '）：收入逐季', 'type': 'text', 'score': 1.0}], 'index': 17}, {'bbox': [21.1200008392334, 663.9849853515625, 153.6020050048828, 676.3509521484375], 'spans': [{'bbox': [21.1200008392334, 665.9255981445312, 111.12000274658203, 674.9255981445312], 'content': '度加速，继续加大医疗', 'type': 'text', 'score': 1.0}, {'bbox': [113.41999816894531, 663.9849853515625, 121.9219970703125, 676.3509521484375], 'content': 'AI', 'type': 'text', 'score': 1.0}, {'bbox': [121.9219970703125, 665.9255981445312, 151.10299682617188, 674.9255981445312], 'content': ' 投入》', 'type': 'text', 'score': 1.0}, {'bbox': [151.10000610351562, 663.9849853515625, 153.6020050048828, 676.3509521484375], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 18}], 'index': 17.5, 'page_num': 'page_0', 'bbox_fs': [21.1200008392334, 648.385009765625, 166.21701049804688, 676.3509521484375]}, {'type': 'text', 'bbox': [19, 695, 167, 738], 'lines': [{'bbox': [21.1200008392334, 
695.1849975585938, 166.21701049804688, 707.5509643554688], 'spans': [{'bbox': [21.1200008392334, 695.1849975585938, 28.661998748779297, 707.5509643554688], 'content': '3 ', 'type': 'text', 'score': 1.0}, {'bbox': [30.1200008392334, 697.1256103515625, 83.51700592041016, 706.1256103515625], 'content': '《嘉和美康（', 'type': 'text', 'score': 1.0}, {'bbox': [83.54399871826172, 695.1849975585938, 113.48698425292969, 707.5509643554688], 'content': '688246', 'type': 'text', 'score': 1.0}, {'bbox': [113.54000091552734, 697.1256103515625, 166.21701049804688, 706.1256103515625], 'content': '）：回购彰显', 'type': 'text', 'score': 1.0}], 'index': 19}, {'bbox': [21.1200008392334, 710.7849731445312, 160.22000122070312, 723.1509399414062], 'spans': [{'bbox': [21.1200008392334, 712.7255859375, 138.1199951171875, 721.7255859375], 'content': '公司发展信心，公司加大医疗', 'type': 'text', 'score': 1.0}, {'bbox': [140.4199981689453, 710.7849731445312, 148.9219970703125, 723.1509399414062], 'content': 'AI', 'type': 'text', 'score': 1.0}, {'bbox': [148.9219970703125, 712.7255859375, 160.22000122070312, 721.7255859375], 'content': ' 投', 'type': 'text', 'score': 1.0}], 'index': 20}, {'bbox': [21.1200008392334, 726.4049682617188, 41.62199783325195, 738.7709350585938], 'spans': [{'bbox': [21.1200008392334, 728.3455810546875, 39.12000274658203, 737.3455810546875], 'content': '入》', 'type': 'text', 'score': 1.0}, {'bbox': [39.119998931884766, 726.4049682617188, 41.62199783325195, 738.7709350585938], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 21}], 'index': 20, 'page_num': 'page_0', 'bbox_fs': [21.1200008392334, 695.1849975585938, 166.21701049804688, 738.7709350585938]}, {'type': 'text', 'bbox': [427, 80, 506, 94], 'lines': [{'bbox': [429.54998779296875, 81.50750732421875, 509.739990234375, 93.50750732421875], 'spans': [{'bbox': [429.54998779296875, 81.50750732421875, 503.8600158691406, 93.50750732421875], 'content': '2024 年8 月28 日', 'type': 'text', 'score': 1.0}, {'bbox': [503.739990234375, 
81.50750732421875, 509.739990234375, 93.50750732421875], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 22}], 'index': 22, 'page_num': 'page_0', 'bbox_fs': [429.54998779296875, 81.50750732421875, 509.739990234375, 93.50750732421875]}, {'type': 'table', 'bbox': [184, 108, 568, 273], 'blocks': [{'bbox': [184, 124, 568, 249], 'type': 'table_body', 'lines': [{'bbox': [184, 124, 568, 249], 'spans': [{'bbox': [184, 124, 568, 249], 'score': 0.9999539852142334, 'type': 'table', 'image_path': 'feabef6394c4fd70ba64aece3701cd1fc49a0b7deb4ea0693dd63131f182fb9c.jpg'}]}]}, {'bbox': [184, 108, 295, 122], 'type': 'table_caption', 'lines': [{'bbox': [186.5, 110.3074951171875, 294.9320068359375, 122.3074951171875], 'spans': [{'bbox': [186.5, 110.3074951171875, 294.9320068359375, 122.3074951171875], 'content': '公司盈利预测及估值', 'type': 'text', 'score': 1.0}]}]}, {'bbox': [184, 262, 344, 273], 'type': 'table_footnote', 'lines': [{'bbox': [186.5, 262.17498779296875, 343.1300048828125, 274.5409851074219], 'spans': [{'bbox': [186.5, 264.1156005859375, 213.5, 273.1156005859375], 'content': '备注：', 'type': 'text', 'score': 1.0}, {'bbox': [213.52999877929688, 264.1156005859375, 240.52999877929688, 273.1156005859375], 'content': '股价为', 'type': 'text', 'score': 1.0}, {'bbox': [242.80999755859375, 262.17498779296875, 262.8139953613281, 274.5409851074219], 'content': '2024', 'type': 'text', 'score': 1.0}, {'bbox': [262.8139953613281, 264.1156005859375, 274.1300048828125, 273.1156005859375], 'content': ' 年', 'type': 'text', 'score': 1.0}, {'bbox': [276.4100036621094, 262.17498779296875, 281.41400146484375, 274.5409851074219], 'content': '8', 'type': 'text', 'score': 1.0}, {'bbox': [281.41400146484375, 264.1156005859375, 292.6099853515625, 273.1156005859375], 'content': ' 月', 'type': 'text', 'score': 1.0}, {'bbox': [294.8900146484375, 262.17498779296875, 304.93402099609375, 274.5409851074219], 'content': '27', 'type': 'text', 'score': 1.0}, {'bbox': [304.93402099609375, 264.1156005859375, 
343.1300048828125, 273.1156005859375], 'content': ' 日收盘价', 'type': 'text', 'score': 1.0}]}]}], 'index': 24, 'page_num': 'page_0'}, {'type': 'title', 'bbox': [180, 285, 230, 300], 'lines': [{'bbox': [186.5, 277.7750244140625, 189.0019989013672, 290.1410217285156], 'spans': [{'bbox': [186.5, 277.7750244140625, 189.0019989013672, 290.1410217285156], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 26}, {'bbox': [180.86000061035156, 280.41796875, 183.79568481445312, 294.9273986816406], 'spans': [{'bbox': [180.86000061035156, 280.41796875, 183.79568481445312, 294.9273986816406], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 27}, {'bbox': [180.86000061035156, 287.09747314453125, 235.1300048828125, 299.09747314453125], 'spans': [{'bbox': [180.86000061035156, 287.09747314453125, 235.1300048828125, 299.09747314453125], 'content': '投资要点 ', 'type': 'text', 'score': 1.0}], 'index': 28}], 'index': 27, 'page_num': 'page_0'}, {'type': 'text', 'bbox': [198, 302, 578, 331], 'lines': [{'bbox': [201.88999938964844, 302.3030090332031, 575.02001953125, 315.988037109375], 'spans': [{'bbox': [201.88999938964844, 304.45062255859375, 292.0099792480469, 314.41064453125], 'content': '投资事件：公司发布', 'type': 'text', 'score': 1.0}, {'bbox': [294.6499938964844, 302.3030090332031, 316.8507995605469, 315.988037109375], 'content': '2024', 'type': 'text', 'score': 1.0}, {'bbox': [316.8507995605469, 304.45062255859375, 429.3785705566406, 314.41064453125], 'content': ' 年中报：营业收入规模达', 'type': 'text', 'score': 1.0}, {'bbox': [432.07000732421875, 302.3030090332031, 451.5318298339844, 315.988037109375], 'content': '3.00', 'type': 'text', 'score': 1.0}, {'bbox': [451.5318298339844, 304.45062255859375, 524.1190795898438, 314.41064453125], 'content': ' 亿元，同比增长', 'type': 'text', 'score': 1.0}, {'bbox': [525, 303, 556, 314], 'score': 0.82, 'content': '2.92\\%', 'type': 'inline_equation'}, {'bbox': [555.0999755859375, 304.45062255859375, 575.02001953125, 314.41064453125], 'content': '，归', 'type': 
'text', 'score': 1.0}], 'index': 29}, {'bbox': [201.88999938964844, 317.9029846191406, 329.118896484375, 331.6676940917969], 'spans': [{'bbox': [201.88999938964844, 320.05059814453125, 271.7195739746094, 330.0106201171875], 'content': '母净利润为亏损', 'type': 'text', 'score': 1.0}, {'bbox': [274.3699951171875, 317.9029846191406, 293.69873046875, 331.5880126953125], 'content': '0.27', 'type': 'text', 'score': 1.0}, {'bbox': [293.69873046875, 320.05059814453125, 326.31951904296875, 330.0106201171875], 'content': ' 亿元。', 'type': 'text', 'score': 1.0}, {'bbox': [326.3500061035156, 317.9527893066406, 329.118896484375, 331.6676940917969], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 30}], 'index': 29.5, 'page_num': 'page_0', 'bbox_fs': [201.88999938964844, 302.3030090332031, 575.02001953125, 331.6676940917969]}, {'type': 'text', 'bbox': [199, 349, 576, 425], 'lines': [{'bbox': [201.88999938964844, 351.2506103515625, 574.9908447265625, 361.21063232421875], 'spans': [{'bbox': [201.88999938964844, 351.2506103515625, 574.9908447265625, 361.21063232421875], 'content': '收入小幅增长，毛利率改善。报告期内，公司医疗临床业务、医疗数据业务等业务板', 'type': 'text', 'score': 1.0}], 'index': 31}, {'bbox': [201.88999938964844, 364.7029724121094, 577.1592407226562, 378.38800048828125], 'spans': [{'bbox': [201.88999938964844, 366.8505859375, 331.8081970214844, 376.81060791015625], 'content': '块平稳发展，整体收入规模达', 'type': 'text', 'score': 1.0}, {'bbox': [334.3900146484375, 364.7029724121094, 353.71875, 378.38800048828125], 'content': '3.00', 'type': 'text', 'score': 1.0}, {'bbox': [353.71875, 366.8505859375, 426.17950439453125, 376.81060791015625], 'content': ' 亿元，同比增长', 'type': 'text', 'score': 1.0}, {'bbox': [427, 365, 457, 377], 'score': 0.92, 'content': '2.92\\%', 'type': 'inline_equation'}, {'bbox': [457.17999267578125, 366.8505859375, 577.1592407226562, 376.81060791015625], 'content': '，整体收入实现平稳增长。', 'type': 'text', 'score': 1.0}], 'index': 32}, {'bbox': [201.88999938964844, 382.4505920410156, 580.0416259765625, 
392.4106140136719], 'spans': [{'bbox': [201.88999938964844, 382.4505920410156, 580.0416259765625, 392.4106140136719], 'content': '由于公司优化产品结构，改进实施交付管理，公司业务毛利空间有所提升。报告期内，', 'type': 'text', 'score': 1.0}], 'index': 33}, {'bbox': [201.88999938964844, 395.9229736328125, 574.8645629882812, 409.6080017089844], 'spans': [{'bbox': [201.88999938964844, 398.0705871582031, 291.7491149902344, 408.0306091308594], 'content': '公司综合毛利率达到', 'type': 'text', 'score': 1.0}, {'bbox': [293, 397, 328, 409], 'score': 0.89, 'content': '48.03\\%', 'type': 'inline_equation'}, {'bbox': [328.2699890136719, 398.0705871582031, 386.6952819824219, 408.0306091308594], 'content': '，去年同期为', 'type': 'text', 'score': 1.0}, {'bbox': [388, 397, 423, 409], 'score': 0.89, 'content': '45.52\\%', 'type': 'inline_equation'}, {'bbox': [423.30999755859375, 398.0705871582031, 471.7752990722656, 408.0306091308594], 'content': '，同比提升', 'type': 'text', 'score': 1.0}, {'bbox': [474.3399963378906, 395.9229736328125, 493.80181884765625, 409.6080017089844], 'content': '2.51', 'type': 'text', 'score': 1.0}, {'bbox': [493.80181884765625, 398.0705871582031, 574.8645629882812, 408.0306091308594], 'content': ' 个百分点，公司毛', 'type': 'text', 'score': 1.0}], 'index': 34}, {'bbox': [201.88999938964844, 411.5229797363281, 279.6589050292969, 425.2080078125], 'spans': [{'bbox': [201.88999938964844, 413.67059326171875, 271.7195739746094, 423.630615234375], 'content': '利率明显改善。', 'type': 'text', 'score': 1.0}, {'bbox': [271.7300109863281, 411.5229797363281, 279.6589050292969, 425.2080078125], 'content': '  ', 'type': 'text', 'score': 1.0}], 'index': 35}], 'index': 33, 'page_num': 'page_0'}, {'type': 'text', 'bbox': [199, 427, 577, 503], 'lines': [{'bbox': [201.88999938964844, 429.2705993652344, 574.9743041992188, 439.2306213378906], 'spans': [{'bbox': [201.88999938964844, 429.2705993652344, 574.9743041992188, 439.2306213378906], 'content': '降本增效成效显著，管理、销售费用率下降。报告期内，公司注重内控管理、人员能', 'type': 'text', 'score': 1.0}], 'index': 36}, {'bbox': 
[201.88999938964844, 442.7229919433594, 575.1400146484375, 456.40802001953125], 'spans': [{'bbox': [201.88999938964844, 444.87060546875, 530.7092895507812, 454.83062744140625], 'content': '效提升，加强管理方式优化及费用控制，公司运营管理方面降本增效明显。', 'type': 'text', 'score': 1.0}, {'bbox': [530.3800048828125, 442.7229919433594, 552.600830078125, 456.40802001953125], 'content': '2024', 'type': 'text', 'score': 1.0}, {'bbox': [552.600830078125, 444.87060546875, 575.1400146484375, 454.83062744140625], 'content': ' 年上', 'type': 'text', 'score': 1.0}], 'index': 37}, {'bbox': [201.88999938964844, 458.3229675292969, 575.1334838867188, 472.00799560546875], 'spans': [{'bbox': [201.88999938964844, 460.4705810546875, 310.71295166015625, 470.43060302734375], 'content': '半年，公司销售费用率为', 'type': 'text', 'score': 1.0}, {'bbox': [312, 459, 348, 471], 'score': 0.91, 'content': '16.39\\%', 'type': 'inline_equation'}, {'bbox': [347.3500061035156, 460.4705810546875, 406.1438293457031, 470.43060302734375], 'content': '，去年同期为', 'type': 'text', 'score': 1.0}, {'bbox': [407, 459, 443, 471], 'score': 0.9, 'content': '17.57\\%', 'type': 'inline_equation'}, {'bbox': [442.75, 460.4705810546875, 501.5438232421875, 470.43060302734375], 'content': '，同比下降个', 'type': 'text', 'score': 1.0}, {'bbox': [504.2200012207031, 458.3229675292969, 523.5487670898438, 472.00799560546875], 'content': '1.18', 'type': 'text', 'score': 1.0}, {'bbox': [523.5487670898438, 460.4705810546875, 575.1334838867188, 470.43060302734375], 'content': ' 百分点；管', 'type': 'text', 'score': 1.0}], 'index': 38}, {'bbox': [201.88999938964844, 473.9229736328125, 575.0936279296875, 487.6080017089844], 'spans': [{'bbox': [201.88999938964844, 476.0705871582031, 251.79959106445312, 486.0306091308594], 'content': '理费用率为', 'type': 'text', 'score': 1.0}, {'bbox': [253, 474, 288, 487], 'score': 0.89, 'content': '16.21\\%', 'type': 'inline_equation'}, {'bbox': [288.2900085449219, 476.0705871582031, 346.8248596191406, 486.0306091308594], 'content': '，去年同期为', 'type': 
'text', 'score': 1.0}, {'bbox': [348, 474, 384, 487], 'score': 0.89, 'content': '17.79\\%', 'type': 'inline_equation'}, {'bbox': [383.3500061035156, 476.0705871582031, 431.9348449707031, 486.0306091308594], 'content': '，同比下降', 'type': 'text', 'score': 1.0}, {'bbox': [434.5899963378906, 473.9229736328125, 453.9187316894531, 487.6080017089844], 'content': '1.58', 'type': 'text', 'score': 1.0}, {'bbox': [453.9187316894531, 476.0705871582031, 575.0936279296875, 486.0306091308594], 'content': ' 个百分点。公司管理费用率', 'type': 'text', 'score': 1.0}], 'index': 39}, {'bbox': [201.88999938964844, 489.5727844238281, 434.7189025878906, 503.2876892089844], 'spans': [{'bbox': [201.88999938964844, 491.67059326171875, 431.7367858886719, 501.630615234375], 'content': '及销售费用率均实现下降，公司运营效率明显提升。', 'type': 'text', 'score': 1.0}, {'bbox': [431.95001220703125, 489.5727844238281, 434.7189025878906, 503.2876892089844], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 40}], 'index': 38, 'page_num': 'page_0'}, {'type': 'text', 'bbox': [199, 505, 577, 628], 'lines': [{'bbox': [201.88999938964844, 505.1727600097656, 575.0682983398438, 518.8876953125], 'spans': [{'bbox': [201.88999938964844, 507.27056884765625, 241.9491424560547, 517.2305908203125], 'content': '公司加大', 'type': 'text', 'score': 1.0}, {'bbox': [245.2100067138672, 505.1727600097656, 255.1788787841797, 518.8876953125], 'content': 'AI', 'type': 'text', 'score': 1.0}, {'bbox': [255.1788787841797, 507.27056884765625, 328.44818115234375, 517.2305908203125], 'content': ' 投入力度，医疗', 'type': 'text', 'score': 1.0}, {'bbox': [331.75, 505.1727600097656, 341.7189025878906, 518.8876953125], 'content': 'AI', 'type': 'text', 'score': 1.0}, {'bbox': [341.7189025878906, 507.27056884765625, 575.0682983398438, 517.2305908203125], 'content': ' 产品落地情况良好。公司继续加大研发投入力度，尤', 'type': 'text', 'score': 1.0}], 'index': 41}, {'bbox': [201.88999938964844, 520.7230224609375, 575.057861328125, 534.4080200195312], 'spans': [{'bbox': [201.88999938964844, 
522.87060546875, 241.83958435058594, 532.8306274414062], 'content': '其是医疗', 'type': 'text', 'score': 1.0}, {'bbox': [244.97000122070312, 520.7230224609375, 254.45889282226562, 534.4080200195312], 'content': 'AI', 'type': 'text', 'score': 1.0}, {'bbox': [254.45889282226562, 522.87060546875, 407.5176696777344, 532.8306274414062], 'content': ' 投入力度。报告期内，公司新申请', 'type': 'text', 'score': 1.0}, {'bbox': [410.8299865722656, 520.7230224609375, 421.8876953125, 534.4080200195312], 'content': '26', 'type': 'text', 'score': 1.0}, {'bbox': [421.8876953125, 522.87060546875, 575.057861328125, 532.8306274414062], 'content': ' 项发明专利，主要集中在医疗数据', 'type': 'text', 'score': 1.0}], 'index': 42}, {'bbox': [201.88999938964844, 536.322998046875, 574.8896484375, 550.0079956054688], 'spans': [{'bbox': [201.88999938964844, 538.4705810546875, 231.77001953125, 548.4306030273438], 'content': '利用和', 'type': 'text', 'score': 1.0}, {'bbox': [234.77000427246094, 536.322998046875, 244.13890075683594, 550.0079956054688], 'content': 'AI', 'type': 'text', 'score': 1.0}, {'bbox': [244.13890075683594, 538.4705810546875, 306.98907470703125, 548.4306030273438], 'content': ' 领域，并获得', 'type': 'text', 'score': 1.0}, {'bbox': [309.8900146484375, 536.322998046875, 315.4277648925781, 550.0079956054688], 'content': '1', 'type': 'text', 'score': 1.0}, {'bbox': [315.4277648925781, 538.4705810546875, 368.31951904296875, 548.4306030273438], 'content': ' 项核心技术', 'type': 'text', 'score': 1.0}, {'bbox': [368.3500061035156, 538.013427734375, 371.6667785644531, 549.140625], 'content': '“', 'type': 'text', 'score': 1.0}, {'bbox': [371.7099914550781, 538.4705810546875, 521.548095703125, 548.4306030273438], 'content': '大模型辅助电子病历自动生成技术', 'type': 'text', 'score': 1.0}, {'bbox': [521.6199951171875, 538.013427734375, 524.936767578125, 549.140625], 'content': '”', 'type': 'text', 'score': 1.0}, {'bbox': [524.97998046875, 538.4705810546875, 574.8896484375, 548.4306030273438], 'content': '。依托公司', 'type': 'text', 'score': 1.0}], 
'index': 43}, {'bbox': [201.88999938964844, 551.9229736328125, 574.925048828125, 565.6080322265625], 'spans': [{'bbox': [201.88999938964844, 551.9229736328125, 211.25889587402344, 565.6080322265625], 'content': 'AI', 'type': 'text', 'score': 1.0}, {'bbox': [211.25889587402344, 554.0706176757812, 332.65252685546875, 564.0306396484375], 'content': ' 技术的积累，公司推出医疗', 'type': 'text', 'score': 1.0}, {'bbox': [335.2300109863281, 551.9229736328125, 344.5989074707031, 565.6080322265625], 'content': 'AI', 'type': 'text', 'score': 1.0}, {'bbox': [344.5989074707031, 554.0706176757812, 574.925048828125, 564.0306396484375], 'content': ' 应用开发平台，打造全院智慧化服务接入底座，实现', 'type': 'text', 'score': 1.0}], 'index': 44}, {'bbox': [201.88999938964844, 567.552978515625, 574.8896484375, 581.238037109375], 'spans': [{'bbox': [201.88999938964844, 569.7006225585938, 291.7491149902344, 579.66064453125], 'content': '多技术框架、多业务', 'type': 'text', 'score': 1.0}, {'bbox': [294.4100036621094, 567.552978515625, 303.7789001464844, 581.238037109375], 'content': 'AI', 'type': 'text', 'score': 1.0}, {'bbox': [303.7789001464844, 569.7006225585938, 356.19952392578125, 579.66064453125], 'content': ' 应用接入。', 'type': 'text', 'score': 1.0}, {'bbox': [356.3500061035156, 567.552978515625, 378.5508117675781, 581.238037109375], 'content': '2024', 'type': 'text', 'score': 1.0}, {'bbox': [378.5508117675781, 569.7006225585938, 391.0299987792969, 579.66064453125], 'content': ' 年', 'type': 'text', 'score': 1.0}, {'bbox': [393.54998779296875, 567.552978515625, 399.0877380371094, 581.238037109375], 'content': '7', 'type': 'text', 'score': 1.0}, {'bbox': [399.0877380371094, 569.7006225585938, 531.5186157226562, 579.66064453125], 'content': ' 月，公司与北医三院联合发布', 'type': 'text', 'score': 1.0}, {'bbox': [531.5800170898438, 569.2434692382812, 534.8967895507812, 580.3706665039062], 'content': '“', 'type': 'text', 'score': 1.0}, {'bbox': [534.9400024414062, 569.7006225585938, 574.8896484375, 579.66064453125], 'content': '三生大模', 'type': 
'text', 'score': 1.0}], 'index': 45}, {'bbox': [201.88999938964844, 583.1529541015625, 575.1400146484375, 596.8380126953125], 'spans': [{'bbox': [201.88999938964844, 585.3005981445312, 211.85000610351562, 595.2606201171875], 'content': '型', 'type': 'text', 'score': 1.0}, {'bbox': [211.85000610351562, 584.8434448242188, 215.16676330566406, 595.9706420898438], 'content': '”', 'type': 'text', 'score': 1.0}, {'bbox': [215.2100067138672, 585.3005981445312, 540.5800170898438, 595.2606201171875], 'content': '，以大模型为底座的多业务场景得到落地验证并且应用效果良好，比如新型', 'type': 'text', 'score': 1.0}, {'bbox': [543.219970703125, 583.1529541015625, 552.5888671875, 596.8380126953125], 'content': 'AI', 'type': 'text', 'score': 1.0}, {'bbox': [552.5888671875, 585.3005981445312, 575.1400146484375, 595.2606201171875], 'content': ' 产品', 'type': 'text', 'score': 1.0}], 'index': 46}, {'bbox': [201.88999938964844, 600.9005737304688, 575.1082763671875, 610.860595703125], 'spans': [{'bbox': [201.88999938964844, 600.9005737304688, 575.1082763671875, 610.860595703125], 'content': '可以将医务人员曾经数小时的病历书写工作缩减至半小时内完成，大幅提升书写内容', 'type': 'text', 'score': 1.0}], 'index': 47}, {'bbox': [201.88999938964844, 614.4027709960938, 304.618896484375, 628.1177368164062], 'spans': [{'bbox': [201.88999938964844, 616.5006103515625, 301.7195129394531, 626.4606323242188], 'content': '的准确率及工作效率。', 'type': 'text', 'score': 1.0}, {'bbox': [301.8500061035156, 614.4027709960938, 304.618896484375, 628.1177368164062], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 48}], 'index': 44.5, 'page_num': 'page_0'}, {'type': 'text', 'bbox': [200, 646, 577, 690], 'lines': [{'bbox': [201.88999938964844, 645.552978515625, 574.8973999023438, 659.238037109375], 'spans': [{'bbox': [201.88999938964844, 647.7006225585938, 310.00994873046875, 657.66064453125], 'content': '投资建议：我们预计公司', 'type': 'text', 'score': 1.0}, {'bbox': [312.5299987792969, 645.552978515625, 384.72003173828125, 659.238037109375], 'content': '2024/2025/2026', 'type': 'text', 'score': 
1.0}, {'bbox': [384.72003173828125, 647.7006225585938, 447.2890625, 657.66064453125], 'content': ' 年收入分别为', 'type': 'text', 'score': 1.0}, {'bbox': [449.8299865722656, 645.552978515625, 526.9088745117188, 659.238037109375], 'content': '9.03/11.48/14.47 ', 'type': 'text', 'score': 1.0}, {'bbox': [526.9000244140625, 647.7006225585938, 574.8973999023438, 657.66064453125], 'content': '亿元，净利', 'type': 'text', 'score': 1.0}], 'index': 49}, {'bbox': [201.88999938964844, 661.1529541015625, 574.98876953125, 674.8380126953125], 'spans': [{'bbox': [201.88999938964844, 663.3005981445312, 241.7300262451172, 673.2606201171875], 'content': '润分别为', 'type': 'text', 'score': 1.0}, {'bbox': [241.85000610351562, 661.1529541015625, 311.3388977050781, 674.8380126953125], 'content': ' 0.95/1.20/1.60 ', 'type': 'text', 'score': 1.0}, {'bbox': [311.3299865722656, 663.3005981445312, 361.239501953125, 673.2606201171875], 'content': '亿元，对应', 'type': 'text', 'score': 1.0}, {'bbox': [364.989990234375, 661.1529541015625, 378.35333251953125, 674.8380126953125], 'content': 'PE', 'type': 'text', 'score': 1.0}, {'bbox': [378.35333251953125, 663.3005981445312, 411.8995361328125, 673.2606201171875], 'content': ' 分别为', 'type': 'text', 'score': 1.0}, {'bbox': [411.9100036621094, 661.1529541015625, 481.42193603515625, 674.8380126953125], 'content': '  24.1/19.0/14.3', 'type': 'text', 'score': 1.0}, {'bbox': [481.42193603515625, 663.3005981445312, 574.98876953125, 673.2606201171875], 'content': ' 倍。考虑公司业绩高', 'type': 'text', 'score': 1.0}], 'index': 50}, {'bbox': [201.88999938964844, 676.802734375, 431.35888671875, 690.5177001953125], 'spans': [{'bbox': [201.88999938964844, 678.9005737304688, 371.7577209472656, 688.860595703125], 'content': '增长以及估值处于较低水平，给予公司', 'type': 'text', 'score': 1.0}, {'bbox': [371.8299865722656, 678.4434204101562, 375.1467590332031, 689.5706176757812], 'content': '“', 'type': 'text', 'score': 1.0}, {'bbox': [375.19000244140625, 678.9005737304688, 395.1099853515625, 
688.860595703125], 'content': '买入', 'type': 'text', 'score': 1.0}, {'bbox': [395.1099853515625, 678.4434204101562, 398.4267578125, 689.5706176757812], 'content': '”', 'type': 'text', 'score': 1.0}, {'bbox': [398.4700012207031, 678.9005737304688, 428.45953369140625, 688.860595703125], 'content': '评级。', 'type': 'text', 'score': 1.0}, {'bbox': [428.5899963378906, 676.802734375, 431.35888671875, 690.5177001953125], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 51}], 'index': 50, 'page_num': 'page_0'}, {'type': 'text', 'bbox': [200, 708, 404, 721], 'lines': [{'bbox': [201.88999938964844, 708.0027465820312, 404.9588928222656, 721.7177124023438], 'spans': [{'bbox': [201.88999938964844, 710.1005859375, 402.00811767578125, 720.0606079101562], 'content': '风险提示：业务发展不及预期，政策推进缓慢', 'type': 'text', 'score': 1.0}, {'bbox': [402.19000244140625, 708.0027465820312, 404.9588928222656, 721.7177124023438], 'content': ' ', 'type': 'text', 'score': 1.0}], 'index': 52}], 'index': 52, 'page_num': 'page_0'}]
+    # 调用函数
+    groups = __process_blocks(input_blocks)
+    for group_index, group in enumerate(groups):
+        print(f"Group {group_index}: {group}")
--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
-from magic_pdf.pdf_parse_union_core import pdf_parse_union
+from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union


 def parse_pdf_by_ocr(pdf_bytes,

--- a/magic_pdf/pdf_parse_by_txt.py
+++ b/magic_pdf/pdf_parse_by_txt.py
-from magic_pdf.pdf_parse_union_core import pdf_parse_union
+from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union


 def parse_pdf_by_txt(

--- a/magic_pdf/pdf_parse_union_core_v2.py
+++ b/magic_pdf/pdf_parse_union_core_v2.py
+import os
+import statistics
+import time
+
+from loguru import logger
+
+from typing import List
+
+import torch
+
+from magic_pdf.libs.clean_memory import clean_memory
+from magic_pdf.libs.commons import fitz, get_delta_time
+from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
+from magic_pdf.libs.convert_utils import dict_to_list
+from magic_pdf.libs.drop_reason import DropReason
+from magic_pdf.libs.hash_utils import compute_md5
+from magic_pdf.libs.local_math import float_equal
+from magic_pdf.libs.ocr_content_type import ContentType
+from magic_pdf.model.magic_model import MagicModel
+from magic_pdf.para.para_split_v3 import para_split
+from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
+from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
+from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
+from magic_pdf.pre_proc.equations_replace import remove_chars_in_text_blocks, replace_equations_in_textblock, \
+    combine_chars_to_pymudict
+from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
+from magic_pdf.pre_proc.ocr_dict_merge import  fill_spans_in_blocks, fix_block_spans, fix_discarded_block
+from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2, \
+    remove_overlaps_low_confidence_spans
+from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
+
+
+def remove_horizontal_overlap_block_which_smaller(all_bboxes):
+    """Drop the smaller of two horizontally overlapping blocks from ``all_bboxes``.
+
+    The first four items of every entry are wrapped as ``{"bbox": ...}`` and handed to
+    ``check_useful_block_horizontal_overlap``. When that check reports an overlap, a
+    warning is logged and every entry whose first four items equal the reported smaller
+    bbox is removed from ``all_bboxes`` in place.
+
+    Args:
+        all_bboxes (list): entries whose first four items are bbox coordinates.
+
+    Returns:
+        tuple: ``(overlap_flag, all_bboxes)`` - the flag returned by the overlap check and
+        the (possibly shortened) input list.
+    """
+    candidate_blocks = [{"bbox": entry[:4]} for entry in all_bboxes]
+    overlap_found, smaller_bbox, bigger_bbox = check_useful_block_horizontal_overlap(candidate_blocks)
+    if not overlap_found:
+        return overlap_found, all_bboxes
+
+    logger.warning(
+        f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}")
+    # Filter in place so that callers holding a reference to the list see the removal.
+    all_bboxes[:] = [entry for entry in all_bboxes if not (smaller_bbox == entry[:4])]
+    return overlap_found, all_bboxes
+
+
+def __replace_STX_ETX(text_str:str):
+    """Replace the STX (U+0002) and ETX (U+0003) control characters with apostrophes.
+
+    Per the original author's note, these characters show up garbled in text extracted
+    with pymupdf and were originally quotation marks; the problem has so far only been
+    observed in English text, not in Chinese text.
+
+    Args:
+        text_str (str): raw text
+
+    Returns:
+        str: the text with both control characters replaced by ``'``; a falsy input
+        (empty string or ``None``) is returned unchanged.
+    """
+    if not text_str:
+        return text_str
+    cleaned = text_str
+    for control_char in ('\u0002', '\u0003'):
+        cleaned = cleaned.replace(control_char, "'")
+    return cleaned
+
+
+def txt_spans_extract(pdf_page, inline_equations, interline_equations):
+    """Build text spans for a page from the text PyMuPDF extracts from it.
+
+    The page's "dict" and "rawdict" text blocks are combined, then passed through the
+    equation replacement, citation-marker removal and char removal helpers. Every
+    remaining span that is neither degenerate (zero width or zero height, as judged by
+    ``float_equal``) nor typed as an inline/interline equation is emitted.
+
+    Args:
+        pdf_page: page object providing ``get_text``.
+        inline_equations: forwarded to ``replace_equations_in_textblock``.
+        interline_equations: forwarded to ``replace_equations_in_textblock``.
+
+    Returns:
+        list[dict]: spans with keys ``bbox``, ``content``, ``type`` (always
+        ``ContentType.Text``) and ``score`` (always ``1.0``).
+    """
+    block_dicts = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
+    char_block_dicts = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
+
+    text_blocks = combine_chars_to_pymudict(block_dicts, char_block_dicts)
+    text_blocks = replace_equations_in_textblock(text_blocks, inline_equations, interline_equations)
+    text_blocks = remove_citation_marker(text_blocks)
+    text_blocks = remove_chars_in_text_blocks(text_blocks)
+
+    spans = []
+    for text_block in text_blocks:
+        for line in text_block["lines"]:
+            for span in line["spans"]:
+                bbox = span["bbox"]
+                # Skip spans that collapse to a vertical or horizontal segment.
+                if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
+                    continue
+                # Equation spans are not emitted as text.
+                if span.get('type') in (ContentType.InlineEquation, ContentType.InterlineEquation):
+                    continue
+                spans.append({
+                    "bbox": list(span["bbox"]),
+                    "content": __replace_STX_ETX(span["text"]),
+                    "type": ContentType.Text,
+                    "score": 1.0,
+                })
+    return spans
+
+
+def replace_text_span(pymu_spans, ocr_spans):
+    """Return the non-text spans of ``ocr_spans`` followed by all of ``pymu_spans``.
+
+    Spans in ``ocr_spans`` whose ``"type"`` equals ``ContentType.Text`` are dropped, so
+    the text spans of the result come only from ``pymu_spans``.
+    """
+    non_text_spans = [span for span in ocr_spans if span["type"] != ContentType.Text]
+    return non_text_spans + pymu_spans
+
+
+def model_init(model_name: str):
+    """Load the model identified by ``model_name`` and put it in eval mode.
+
+    Only ``"layoutreader"`` is accepted; any other name logs an error and exits the
+    process with status 1. The weights are loaded from the directory returned by
+    ``get_local_layoutreader_model_dir()`` when that path exists, otherwise from the
+    ``"hantian/layoutreader"`` identifier. The model is moved to CUDA when available
+    (CPU otherwise) and converted to bfloat16 when CUDA reports bf16 support.
+
+    Args:
+        model_name (str): name of the model to load.
+
+    Returns:
+        The loaded ``LayoutLMv3ForTokenClassification`` model.
+    """
+    from transformers import LayoutLMv3ForTokenClassification
+
+    cuda_available = torch.cuda.is_available()
+    device = torch.device("cuda") if cuda_available else torch.device("cpu")
+    # bf16 support is only queried when CUDA is present.
+    supports_bfloat16 = True if cuda_available and torch.cuda.is_bf16_supported() else False
+
+    if model_name != "layoutreader":
+        logger.error("model name not allow")
+        exit(1)
+
+    # Prefer the local model directory when it exists.
+    layoutreader_model_dir = get_local_layoutreader_model_dir()
+    if os.path.exists(layoutreader_model_dir):
+        model_source = layoutreader_model_dir
+    else:
+        logger.warning(
+            f"local layoutreader model not exists, use online model from huggingface")
+        model_source = "hantian/layoutreader"
+    model = LayoutLMv3ForTokenClassification.from_pretrained(model_source)
+
+    # Switch to bfloat16 only when the device supports it.
+    if supports_bfloat16:
+        model.bfloat16()
+    model.to(device).eval()
+    return model
+
+
+class ModelSingleton:
+    """Single shared instance that lazily creates and caches models by name."""
+
+    # The one instance handed out by __new__ (created on first construction).
+    _instance = None
+    # Cache of already initialised models, keyed by model name.
+    _models = {}
+
+    def __new__(cls, *args, **kwargs):
+        """Return the shared instance, creating it on the first call."""
+        shared = cls._instance
+        if shared is None:
+            shared = cls._instance = super().__new__(cls)
+        return shared
+
+    def get_model(self, model_name: str):
+        """Return the cached model for ``model_name``, initialising it via ``model_init`` if absent."""
+        cache = self._models
+        if model_name in cache:
+            return cache[model_name]
+        cache[model_name] = model_init(model_name=model_name)
+        return cache[model_name]
+
+
+def do_predict(boxes: List[List[int]], model) -> List[int]:
+    """Run ``model`` on ``boxes`` and return the order parsed from its logits.
+
+    Args:
+        boxes: list of ``[left, top, right, bottom]`` boxes.
+        model: callable model whose output exposes ``.logits``.
+
+    Returns:
+        The result of ``parse_logits`` for the logits and ``len(boxes)``.
+    """
+    from magic_pdf.model.v3.helpers import prepare_inputs, boxes2inputs, parse_logits
+    model_inputs = prepare_inputs(boxes2inputs(boxes), model)
+    output = model(**model_inputs)
+    # Move the logits to CPU and drop the leading dimension before parsing.
+    page_logits = output.logits.cpu().squeeze(0)
+    return parse_logits(page_logits, len(boxes))
+
+
+def cal_block_index(fix_blocks, sorted_bboxes):
+    """Assign a reading-order ``index`` to every block (and to each of its lines).
+
+    A line's index is the position of its bbox in ``sorted_bboxes``. A block with lines
+    gets the median of its line indices; a block without lines gets the position of its
+    own bbox. For ``table`` and ``image`` blocks the ``lines`` key is deleted afterwards.
+
+    Args:
+        fix_blocks (list[dict]): blocks with ``type``, ``bbox`` and ``lines`` keys;
+            modified in place.
+        sorted_bboxes (list): bboxes in sorted order.
+
+    Returns:
+        list[dict]: the same ``fix_blocks`` list.
+
+    Raises:
+        ValueError: if a bbox is not present in ``sorted_bboxes``.
+    """
+    for block in fix_blocks:
+        block_lines = block['lines']
+        if len(block_lines) > 0:
+            positions = []
+            for line in block_lines:
+                position = sorted_bboxes.index(line['bbox'])
+                line['index'] = position
+                positions.append(position)
+            block['index'] = statistics.median(positions)
+        else:
+            block['index'] = sorted_bboxes.index(block['bbox'])
+
+        # Remove the virtual line info from table/image blocks.
+        if block['type'] in ['table', 'image']:
+            del block['lines']
+
+    return fix_blocks
+
+
+def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
+    """Split a block bbox into vertically stacked pseudo-line bboxes.
+
+    Rules, in order:
+      * a block no taller than three ``line_height`` is returned as a single line;
+      * a tall block (more than 25% of the page height) whose width is between 25% and
+        50% of the page width is sliced at ``line_height`` granularity, rounding the line
+        count up so the block is fully covered;
+      * otherwise a block wider than 40% of the page width is cut into 3 equal lines, one
+        wider than 25% into 2 equal lines;
+      * a narrower block is kept whole when its height/width ratio exceeds 1.2, else it is
+        cut into 2 equal lines.
+
+    No returned line extends past ``y1``: the bottom of each line is clamped to the
+    block's bottom edge.
+
+    Args:
+        block_bbox: ``(x0, y0, x1, y1)`` of the block; lines are stacked from ``y0`` to ``y1``.
+        line_height: typical text-line height used for the fine-grained split.
+        page_w: page width.
+        page_h: page height.
+
+    Returns:
+        list[list]: ``[x0, top, x1, bottom]`` pseudo-lines, ordered from ``y0`` towards ``y1``.
+    """
+    x0, y0, x1, y1 = block_bbox
+
+    block_height = y1 - y0
+    block_width = x1 - x0
+
+    # Blocks no taller than three text lines are not split.
+    if line_height * 3 >= block_height:
+        return [[x0, y0, x1, y1]]
+
+    looks_like_column = block_height > page_h * 0.25 and page_w * 0.5 > block_width > page_w * 0.25
+    if looks_like_column and line_height > 0:
+        # Possibly one column of a two-column layout: slice finely. Round the count up
+        # only when a remainder is left, so no line starts at or below the block bottom.
+        lines = int(block_height / line_height)
+        if lines * line_height < block_height:
+            lines += 1
+    else:
+        if block_width > page_w * 0.4:
+            lines = 3
+        elif block_width > page_w * 0.25:
+            lines = 2
+        elif block_width == 0 or block_height / block_width > 1.2:
+            # Slender (or zero-width) blocks are kept whole.
+            return [[x0, y0, x1, y1]]
+        else:
+            lines = 2
+        line_height = block_height / lines
+
+    lines_positions = []
+    current_y = y0
+    for _ in range(lines):
+        # Clamp the bottom so the last slice never runs past the block itself.
+        lines_positions.append([x0, current_y, x1, min(current_y + line_height, y1)])
+        current_y += line_height
+    return lines_positions
+
+
+def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
+    """Collect the line bboxes of a page and order them with the "layoutreader" model.
+
+    ``text``/``title``/``interline_equation`` blocks contribute their existing lines; when
+    such a block has none, pseudo-lines from ``insert_lines_into_block`` are appended to
+    it. ``table``/``image`` blocks always get their ``lines`` replaced by pseudo-lines.
+    Each bbox is clamped to the page, scaled to a 0-1000 grid and rounded before being
+    passed to the model.
+
+    Args:
+        fix_blocks (list[dict]): page blocks; their ``lines`` may be modified in place.
+        page_w: page width.
+        page_h: page height.
+        line_height: text-line height forwarded to ``insert_lines_into_block``.
+
+    Returns:
+        list: the collected (unscaled) line bboxes in the order predicted by the model.
+    """
+    page_line_list = []
+    for block in fix_blocks:
+        block_type = block['type']
+        if block_type in ['text', 'title', 'interline_equation']:
+            if len(block['lines']) > 0:
+                page_line_list.extend(line['bbox'] for line in block['lines'])
+                continue
+            pseudo_lines = insert_lines_into_block(block['bbox'], line_height, page_w, page_h)
+            block['lines'].extend({'bbox': pseudo_line, 'spans': []} for pseudo_line in pseudo_lines)
+            page_line_list.extend(pseudo_lines)
+        elif block_type in ['table', 'image']:
+            pseudo_lines = insert_lines_into_block(block['bbox'], line_height, page_w, page_h)
+            block['lines'] = [{'bbox': pseudo_line, 'spans': []} for pseudo_line in pseudo_lines]
+            page_line_list.extend(pseudo_lines)
+
+    # Order the lines with layoutreader, which works on a 0-1000 coordinate grid.
+    x_scale = 1000.0 / page_w
+    y_scale = 1000.0 / page_h
+    boxes = []
+    for left, top, right, bottom in page_line_list:
+        # Clamp to the page one edge at a time; each warning reports the values as they
+        # are at that moment, so the order of these checks matters.
+        if left < 0:
+            logger.warning(
+                f"left < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}")
+            left = 0
+        if right > page_w:
+            logger.warning(
+                f"right > page_w, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}")
+            right = page_w
+        if top < 0:
+            logger.warning(
+                f"top < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}")
+            top = 0
+        if bottom > page_h:
+            logger.warning(
+                f"bottom > page_h, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}")
+            bottom = page_h
+
+        left, right = round(left * x_scale), round(right * x_scale)
+        top, bottom = round(top * y_scale), round(bottom * y_scale)
+        assert (
+                1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
+        ), f"Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}"
+        boxes.append([left, top, right, bottom])
+
+    model = ModelSingleton().get_model("layoutreader")
+    with torch.no_grad():
+        orders = do_predict(boxes, model)
+
+    return [page_line_list[i] for i in orders]
+
+
+def get_line_height(blocks):
+    """Return the median line height of the page's text-like blocks.
+
+    Only ``text``, ``title`` and ``interline_equation`` blocks are considered; each line
+    height is ``bbox[3] - bbox[1]`` truncated to an int.
+
+    Args:
+        blocks (list[dict]): blocks with ``type`` and, for text-like blocks, ``lines``.
+
+    Returns:
+        The median of the truncated line heights, or ``10`` when there are no lines.
+    """
+    heights = [
+        int(line['bbox'][3] - line['bbox'][1])
+        for block in blocks
+        if block['type'] in ['text', 'title', 'interline_equation']
+        for line in block['lines']
+    ]
+    if not heights:
+        return 10
+    return statistics.median(heights)
+
+
+def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode):
+    """Parse a single page into the structure built by ``ocr_construct_page_component_v2``.
+
+    Args:
+        pdf_docs: indexable document; ``pdf_docs[page_id]`` is handed to the text
+            extraction and image/table cropping helpers.
+        magic_model: model-result accessor queried for this page's blocks, equations,
+            spans and page size.
+        page_id: index of the page to parse.
+        pdf_bytes_md5: MD5 of the PDF bytes, forwarded to ``ocr_cut_image_and_table``.
+        imageWriter: writer forwarded to ``ocr_cut_image_and_table``.
+        parse_mode (str): ``"txt"`` replaces the model's text spans with spans extracted
+            from the PDF page itself; ``"ocr"`` keeps the model's spans unchanged.
+
+    Returns:
+        The value returned by ``ocr_construct_page_component_v2`` for this page.
+
+    Raises:
+        Exception: if ``parse_mode`` is neither ``"txt"`` nor ``"ocr"``.
+    """
+    need_drop = False
+    drop_reason = []
+
+    # Fetch the block information used below from the magic_model object.
+    img_blocks = magic_model.get_imgs(page_id)
+    table_blocks = magic_model.get_tables(page_id)
+    discarded_blocks = magic_model.get_discarded(page_id)
+    text_blocks = magic_model.get_text_blocks(page_id)
+    title_blocks = magic_model.get_title_blocks(page_id)
+    # The third item (interline equation blocks) is deliberately unused: per the original
+    # author's note it is not accurate enough, so interline_equations is used instead.
+    inline_equations, interline_equations, _ = magic_model.get_equations(page_id)
+
+    page_w, page_h = magic_model.get_page_size(page_id)
+
+    spans = magic_model.get_all_spans(page_id)
+
+    # Build the spans according to parse_mode.
+    if parse_mode == "txt":
+        # Replace the text-type spans from the model with the pymupdf spans.
+        pymu_spans = txt_spans_extract(
+            pdf_docs[page_id], inline_equations, interline_equations
+        )
+        spans = replace_text_span(pymu_spans, spans)
+    elif parse_mode == "ocr":
+        pass
+    else:
+        raise Exception("parse_mode must be txt or ocr")
+
+    # Among overlapping spans, drop the ones with lower confidence.
+    spans, _ = remove_overlaps_low_confidence_spans(spans)
+    # Among overlapping spans, drop the smaller ones.
+    spans, _ = remove_overlaps_min_spans(spans)
+    # Crop images and tables.
+    spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
+
+    # Gather the bboxes of all blocks.
+    all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
+        img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
+        interline_equations, page_w, page_h)
+
+    # Handle the discarded blocks first; they need no layout ordering.
+    discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)
+    fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
+
+    # Skip the page when it has no bbox at all.
+    if len(all_bboxes) == 0:
+        logger.warning(f"skip this page, not found useful bbox, page_id: {page_id}")
+        return ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [],
+                                               [], [], interline_equations, fix_discarded_blocks,
+                                               need_drop, drop_reason)
+
+    # Fill the spans into the blocks.
+    block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.3)
+
+    # Apply the fix step to the blocks.
+    fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
+
+    # Compute the body-text line height from all lines.
+    line_height = get_line_height(fix_blocks)
+
+    # Collect all lines and sort them.
+    sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height)
+
+    # Derive each block's order from the median index of its lines.
+    fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)
+
+    # Reorder the blocks.
+    sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
+
+    # Collect the lists that QA needs exposed separately.
+    images, tables, interline_equations = get_qa_need_list_v2(sorted_blocks)
+
+    # Build the page info dict.
+    page_info = ocr_construct_page_component_v2(sorted_blocks, [], page_id, page_w, page_h, [],
+                                                images, tables, interline_equations, fix_discarded_blocks,
+                                                need_drop, drop_reason)
+    return page_info
+
+
+def pdf_parse_union(pdf_bytes,
+                    model_list,
+                    imageWriter,
+                    parse_mode,
+                    start_page_id=0,
+                    end_page_id=None,
+                    debug_mode=False,
+                    ):
+    """Parse a PDF page by page and return ``{"pdf_info": [...]}``.
+
+    Pages with ``start_page_id <= page_id <= end_page_id`` are parsed with
+    ``parse_page_core``; every other page gets an empty page component flagged as dropped
+    with reason ``"skip page"``. The per-page results are then passed through
+    ``para_split`` and converted to a list.
+
+    Args:
+        pdf_bytes (bytes): raw PDF content.
+        model_list: model output used to build the ``MagicModel``.
+        imageWriter: writer forwarded to ``parse_page_core``.
+        parse_mode (str): ``"txt"`` or ``"ocr"``, forwarded to ``parse_page_core``.
+        start_page_id (int): first page index to parse.
+        end_page_id (int | None): last page index to parse (inclusive). ``None`` or a
+            negative value means the last page; a value past the last page is clamped
+            with a warning.
+        debug_mode (bool): log the time spent on each page and forward the flag to
+            ``para_split``.
+
+    Returns:
+        dict: ``{"pdf_info": pdf_info_list}``.
+    """
+    pdf_bytes_md5 = compute_md5(pdf_bytes)
+
+    # Start from an empty pdf_info_dict.
+    pdf_info_dict = {}
+
+    # Open the document in a with-block so it is closed even when a page fails to parse.
+    with fitz.open("pdf", pdf_bytes) as pdf_docs:
+        # Initialise magic_model from model_list and the opened document.
+        magic_model = MagicModel(model_list, pdf_docs)
+
+        # Resolve the page range to parse.
+        last_page_id = len(pdf_docs) - 1
+        end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else last_page_id
+
+        if end_page_id > last_page_id:
+            logger.warning("end_page_id is out of range, use pdf_docs length")
+            end_page_id = last_page_id
+
+        # Reference time for the per-page timing log.
+        start_time = time.time()
+
+        for page_id, page in enumerate(pdf_docs):
+            # In debug mode, log how long the previous page took.
+            if debug_mode:
+                time_now = time.time()
+                logger.info(
+                    f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
+                )
+                start_time = time_now
+
+            # Parse pages inside the requested range; emit a dropped placeholder otherwise.
+            if start_page_id <= page_id <= end_page_id:
+                page_info = parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode)
+            else:
+                page_w = page.rect.width
+                page_h = page.rect.height
+                page_info = ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [],
+                                                            [], [], [], [],
+                                                            True, "skip page")
+            pdf_info_dict[f"page_{page_id}"] = page_info
+
+    # Paragraph splitting.
+    para_split(pdf_info_dict, debug_mode=debug_mode)
+
+    # Convert the dict to a list.
+    pdf_info_list = dict_to_list(pdf_info_dict)
+    new_pdf_info_dict = {
+        "pdf_info": pdf_info_list,
+    }
+
+    clean_memory()
+
+    return new_pdf_info_dict
+
+
+if __name__ == '__main__':
+    # Running this module directly does nothing.
+    pass
--- a/magic_pdf/pipe/AbsPipe.py
+++ b/magic_pdf/pipe/AbsPipe.py
@@ -17,7 +17,7 @@ class AbsPipe(ABC):
    PIP_TXT = "txt"

    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None):
+                 start_page_id=0, end_page_id=None, lang=None):
        self.pdf_bytes = pdf_bytes
        self.model_list = model_list
        self.image_writer = image_writer
@@ -25,6 +25,7 @@ class AbsPipe(ABC):
        self.is_debug = is_debug
        self.start_page_id = start_page_id
        self.end_page_id = end_page_id
+        self.lang = lang
    
    def get_compress_pdf_mid_data(self):
        return JsonCompressor.compress_json(self.pdf_mid_data)
@@ -94,7 +95,9 @@ class AbsPipe(ABC):
        """
        pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
        pdf_info_list = pdf_mid_data["pdf_info"]
-        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
+        parse_type = pdf_mid_data["_parse_type"]
+        lang = pdf_mid_data.get("_lang", None)
+        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path, parse_type, lang)
        return content_list

    @staticmethod
@@ -104,7 +107,9 @@ class AbsPipe(ABC):
        """
        pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
        pdf_info_list = pdf_mid_data["pdf_info"]
-        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
+        parse_type = pdf_mid_data["_parse_type"]
+        lang = pdf_mid_data.get("_lang", None)
+        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path, parse_type, lang)
        return md_content


--- a/magic_pdf/pipe/OCRPipe.py
+++ b/magic_pdf/pipe/OCRPipe.py
@@ -10,19 +10,21 @@ from magic_pdf.user_api import parse_ocr_pdf
 class OCRPipe(AbsPipe):

    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
+                 start_page_id=0, end_page_id=None, lang=None):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang)

    def pipe_classify(self):
        pass

    def pipe_analyze(self):
        self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
-                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                      lang=self.lang)

    def pipe_parse(self):
        self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                          lang=self.lang)

    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)

--- a/magic_pdf/pipe/TXTPipe.py
+++ b/magic_pdf/pipe/TXTPipe.py
@@ -11,19 +11,21 @@ from magic_pdf.user_api import parse_txt_pdf
 class TXTPipe(AbsPipe):

    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
+                 start_page_id=0, end_page_id=None, lang=None):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang)

    def pipe_classify(self):
        pass

    def pipe_analyze(self):
        self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
-                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                      lang=self.lang)

    def pipe_parse(self):
        self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                          lang=self.lang)

    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)

--- a/magic_pdf/pipe/UNIPipe.py
+++ b/magic_pdf/pipe/UNIPipe.py
@@ -14,9 +14,9 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 class UNIPipe(AbsPipe):

    def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None):
+                 start_page_id=0, end_page_id=None, lang=None):
        self.pdf_type = jso_useful_key["_pdf_type"]
-        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id)
+        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id, lang)
        if len(self.model_list) == 0:
            self.input_model_is_empty = True
        else:
@@ -28,22 +28,26 @@ class UNIPipe(AbsPipe):
    def pipe_analyze(self):
        if self.pdf_type == self.PIP_TXT:
            self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                          lang=self.lang)
        elif self.pdf_type == self.PIP_OCR:
            self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                          lang=self.lang)

    def pipe_parse(self):
        if self.pdf_type == self.PIP_TXT:
            self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                                is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
-                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                                lang=self.lang)
        elif self.pdf_type == self.PIP_OCR:
            self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                              is_debug=self.is_debug,
-                                              start_page_id=self.start_page_id, end_page_id=self.end_page_id)
+                                              start_page_id=self.start_page_id, end_page_id=self.end_page_id,
+                                              lang=self.lang)

-    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
+    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON):
        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
        logger.info("uni_pipe mk content list finished")
        return result

--- a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
@@ -60,6 +60,59 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
    return all_bboxes, all_discarded_blocks, drop_reasons


+def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_blocks, text_blocks,
+                                        title_blocks, interline_equation_blocks, page_w, page_h):
+    all_bboxes = []
+    all_discarded_blocks = []
+    for image in img_blocks:
+        x0, y0, x1, y1 = image['bbox']
+        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None, image["score"]])
+
+    for table in table_blocks:
+        x0, y0, x1, y1 = table['bbox']
+        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Table, None, None, None, None, table["score"]])
+
+    for text in text_blocks:
+        x0, y0, x1, y1 = text['bbox']
+        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Text, None, None, None, None, text["score"]])
+
+    for title in title_blocks:
+        x0, y0, x1, y1 = title['bbox']
+        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Title, None, None, None, None, title["score"]])
+
+    for interline_equation in interline_equation_blocks:
+        x0, y0, x1, y1 = interline_equation['bbox']
+        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.InterlineEquation, None, None, None, None, interline_equation["score"]])
+
+    '''block嵌套问题解决'''
+    '''文本框与标题框重叠，优先信任文本框'''
+    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
+    '''任何框体与舍弃框重叠，优先信任舍弃框'''
+    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
+
+    # interline_equation 与title或text框冲突的情况，分两种情况处理
+    '''interline_equation框与文本类型框iou比较接近1的时候，信任行间公式框'''
+    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
+    '''interline_equation框被包含在文本类型框内，且interline_equation比文本区块小很多时信任文本框，这时需要舍弃公式框'''
+    # 通过后续大框套小框逻辑删除
+
+    '''discarded_blocks中只保留宽度超过1/3页面宽度的，高度超过10的，处于页面下半50%区域的（限定footnote）'''
+    for discarded in discarded_blocks:
+        x0, y0, x1, y1 = discarded['bbox']
+        all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None, discarded["score"]])
+        # 将footnote加入到all_bboxes中，用来计算layout
+        # if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
+        #     all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None, discarded["score"]])
+
+    '''经过以上处理后，还存在大框套小框的情况，则删除小框'''
+    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
+    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
+    '''将剩余的bbox做分离处理，防止后面分layout时出错'''
+    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
+
+    return all_bboxes, all_discarded_blocks
+
+
 def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
    # 先提取所有text和interline block
    text_blocks = []

--- a/magic_pdf/pre_proc/ocr_dict_merge.py
+++ b/magic_pdf/pre_proc/ocr_dict_merge.py
@@ -49,8 +49,7 @@ def merge_spans_to_line(spans):
                continue

            # 如果当前的span与当前行的最后一个span在y轴上重叠，则添加到当前行
-            if __is_overlaps_y_exceeds_threshold(span['bbox'],
-                                                 current_line[-1]['bbox']):
+            if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], 0.6):
                current_line.append(span)
            else:
                # 否则，开始新行

--- a/magic_pdf/resources/model_config/UniMERNet/demo.yaml
+++ b/magic_pdf/resources/model_config/UniMERNet/demo.yaml
@@ -2,13 +2,13 @@ model:
  arch: unimernet
  model_type: unimernet
  model_config:
-    model_name: ./models
-    max_seq_len: 1024
-    length_aware: False
+    model_name: ./models/unimernet_base
+    max_seq_len: 1536
+
  load_pretrained: True
-  pretrained: ./models/pytorch_model.bin
+  pretrained: './models/unimernet_base/pytorch_model.pth'
  tokenizer_config:
-    path: ./models
+    path: ./models/unimernet_base

 datasets:
  formula_rec_eval:

--- a/magic_pdf/resources/model_config/model_configs.yaml
+++ b/magic_pdf/resources/model_config/model_configs.yaml
@@ -10,6 +10,6 @@ config:
 weights:
  layout: Layout/model_final.pth
  mfd: MFD/weights.pt
-  mfr: MFR/UniMERNet
+  mfr: MFR/unimernet_small
  struct_eqtable: TabRec/StructEqTable
  TableMaster: TabRec/TableMaster
\ No newline at end of file