Merge branch 'master' of https://github.com/opendatalab/MinerU

4a823359 · quyuan · 611e2f59 · b6df9b18 · 4a823359 · 4a823359
Commit 4a823359 authored Jul 12, 2024 by quyuan
18 changed files
--- a/magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py
+++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+import colorsys
+import logging
+import math
+import numpy as np
+from enum import Enum, unique
+import cv2
+import matplotlib as mpl
+import matplotlib.colors as mplc
+import matplotlib.figure as mplfigure
+import pycocotools.mask as mask_util
+import torch
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+from PIL import Image
+
+from detectron2.data import MetadataCatalog
+from detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes
+from detectron2.utils.file_io import PathManager
+
+from detectron2.utils.colormap import random_color
+
+import pdb
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# Names exported by a star-import of this module.
+__all__ = ["ColorMode", "VisImage", "Visualizer"]
+
+
+# Box-area cutoff (multiplied by the output scale) under which
+# Visualizer.overlay_instances moves an instance's label to the side of its box.
+_SMALL_OBJECT_AREA_THRESH = 1000
+# NOTE(review): not referenced in the visible part of this file; presumably an
+# area cutoff for large masks -- confirm against its users.
+_LARGE_MASK_AREA_THRESH = 120000
+# Color tuples with components in [0, 1].
+# _OFF_WHITE is the edge color passed to draw_binary_mask for "stuff" segments.
+_OFF_WHITE = (1.0, 1.0, 240.0 / 255)
+_BLACK = (0, 0, 0)
+# _RED is the color of the circles drawn for keypoints.
+_RED = (1.0, 0, 0)
+
+# Initial value of Visualizer.keypoint_threshold: a keypoint is drawn only when
+# its third component (probability) is greater than this.
+_KEYPOINT_THRESHOLD = 0.05
+
+#CLASS_NAMES = ["footnote", "footer", "header"]
+
+@unique
+class ColorMode(Enum):
+    """
+    Enum of different color modes to use for instance visualizations.
+
+    Each member below is followed by a bare string that describes it.
+    ``@unique`` makes the class definition fail if two members are given the
+    same value.
+    """
+
+    IMAGE = 0
+    """
+    Picks a random color for every instance and overlay segmentations with low opacity.
+    """
+    SEGMENTATION = 1
+    """
+    Let instances of the same category have similar colors
+    (from metadata.thing_colors), and overlay them with
+    high opacity. This provides more attention on the quality of segmentation.
+    """
+    IMAGE_BW = 2
+    """
+    Same as IMAGE, but convert all areas without masks to gray-scale.
+    Only available for drawing per-instance mask predictions.
+    """
+
+
+class GenericMask:
+    """
+    Attribute:
+        polygons (list[ndarray]): list[ndarray]: polygons for this mask.
+            Each ndarray has format [x, y, x, y, ...]
+        mask (ndarray): a binary mask
+    """
+
+    def __init__(self, mask_or_polygons, height, width):
+        self._mask = self._polygons = self._has_holes = None
+        self.height = height
+        self.width = width
+
+        m = mask_or_polygons
+        if isinstance(m, dict):
+            # RLEs
+            assert "counts" in m and "size" in m
+            if isinstance(m["counts"], list):  # uncompressed RLEs
+                h, w = m["size"]
+                assert h == height and w == width
+                m = mask_util.frPyObjects(m, h, w)
+            self._mask = mask_util.decode(m)[:, :]
+            return
+
+        if isinstance(m, list):  # list[ndarray]
+            self._polygons = [np.asarray(x).reshape(-1) for x in m]
+            return
+
+        if isinstance(m, np.ndarray):  # assumed to be a binary mask
+            assert m.shape[1] != 2, m.shape
+            assert m.shape == (
+                height,
+                width,
+            ), f"mask shape: {m.shape}, target dims: {height}, {width}"
+            self._mask = m.astype("uint8")
+            return
+
+        raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m)))
+
+    @property
+    def mask(self):
+        if self._mask is None:
+            self._mask = self.polygons_to_mask(self._polygons)
+        return self._mask
+
+    @property
+    def polygons(self):
+        if self._polygons is None:
+            self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
+        return self._polygons
+
+    @property
+    def has_holes(self):
+        if self._has_holes is None:
+            if self._mask is not None:
+                self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
+            else:
+                self._has_holes = False  # if original format is polygon, does not have holes
+        return self._has_holes
+
+    def mask_to_polygons(self, mask):
+        # cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level
+        # hierarchy. External contours (boundary) of the object are placed in hierarchy-1.
+        # Internal contours (holes) are placed in hierarchy-2.
+        # cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours.
+        mask = np.ascontiguousarray(mask)  # some versions of cv2 does not support incontiguous arr
+        res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
+        hierarchy = res[-1]
+        if hierarchy is None:  # empty mask
+            return [], False
+        has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0
+        res = res[-2]
+        res = [x.flatten() for x in res]
+        # These coordinates from OpenCV are integers in range [0, W-1 or H-1].
+        # We add 0.5 to turn them into real-value coordinate space. A better solution
+        # would be to first +0.5 and then dilate the returned polygon by 0.5.
+        res = [x + 0.5 for x in res if len(x) >= 6]
+        return res, has_holes
+
+    def polygons_to_mask(self, polygons):
+        rle = mask_util.frPyObjects(polygons, self.height, self.width)
+        rle = mask_util.merge(rle)
+        return mask_util.decode(rle)[:, :]
+
+    def area(self):
+        return self.mask.sum()
+
+    def bbox(self):
+        p = mask_util.frPyObjects(self.polygons, self.height, self.width)
+        p = mask_util.merge(p)
+        bbox = mask_util.toBbox(p)
+        bbox[2] += bbox[0]
+        bbox[3] += bbox[1]
+        return bbox
+
+
+class _PanopticPrediction:
+    """
+    Unify different panoptic annotation/prediction formats
+    """
+
+    def __init__(self, panoptic_seg, segments_info, metadata=None):
+        if segments_info is None:
+            assert metadata is not None
+            # If "segments_info" is None, we assume "panoptic_img" is a
+            # H*W int32 image storing the panoptic_id in the format of
+            # category_id * label_divisor + instance_id. We reserve -1 for
+            # VOID label.
+            label_divisor = metadata.label_divisor
+            segments_info = []
+            for panoptic_label in np.unique(panoptic_seg.numpy()):
+                if panoptic_label == -1:
+                    # VOID region.
+                    continue
+                pred_class = panoptic_label // label_divisor
+                isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values()
+                segments_info.append(
+                    {
+                        "id": int(panoptic_label),
+                        "category_id": int(pred_class),
+                        "isthing": bool(isthing),
+                    }
+                )
+        del metadata
+
+        self._seg = panoptic_seg
+
+        self._sinfo = {s["id"]: s for s in segments_info}  # seg id -> seg info
+        segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True)
+        areas = areas.numpy()
+        sorted_idxs = np.argsort(-areas)
+        self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs]
+        self._seg_ids = self._seg_ids.tolist()
+        for sid, area in zip(self._seg_ids, self._seg_areas):
+            if sid in self._sinfo:
+                self._sinfo[sid]["area"] = float(area)
+
+    def non_empty_mask(self):
+        """
+        Returns:
+            (H, W) array, a mask for all pixels that have a prediction
+        """
+        empty_ids = []
+        for id in self._seg_ids:
+            if id not in self._sinfo:
+                empty_ids.append(id)
+        if len(empty_ids) == 0:
+            return np.zeros(self._seg.shape, dtype=np.uint8)
+        assert (
+            len(empty_ids) == 1
+        ), ">1 ids corresponds to no labels. This is currently not supported"
+        return (self._seg != empty_ids[0]).numpy().astype(np.bool)
+
+    def semantic_masks(self):
+        for sid in self._seg_ids:
+            sinfo = self._sinfo.get(sid)
+            if sinfo is None or sinfo["isthing"]:
+                # Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions.
+                continue
+            yield (self._seg == sid).numpy().astype(np.bool), sinfo
+
+    def instance_masks(self):
+        for sid in self._seg_ids:
+            sinfo = self._sinfo.get(sid)
+            if sinfo is None or not sinfo["isthing"]:
+                continue
+            mask = (self._seg == sid).numpy().astype(np.bool)
+            if mask.sum() > 0:
+                yield mask, sinfo
+
+
+def _create_text_labels(classes, scores, class_names, is_crowd=None):
+    """
+    Args:
+        classes (list[int] or None):
+        scores (list[float] or None):
+        class_names (list[str] or None):
+        is_crowd (list[bool] or None):
+
+    Returns:
+        list[str] or None
+    """
+    #class_names = CLASS_NAMES
+    labels = None
+    if classes is not None:
+        if class_names is not None and len(class_names) > 0:
+            labels = [class_names[i] for i in classes]
+        else:
+            labels = [str(i) for i in classes]
+            
+    if scores is not None:
+        if labels is None:
+            labels = ["{:.0f}%".format(s * 100) for s in scores]
+        else:
+            labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
+    if labels is not None and is_crowd is not None:
+        labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
+    return labels
+
+
+class VisImage:
+    def __init__(self, img, scale=1.0):
+        """
+        Args:
+            img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
+            scale (float): scale the input image
+        """
+        self.img = img
+        self.scale = scale
+        self.width, self.height = img.shape[1], img.shape[0]
+        self._setup_figure(img)
+
+    def _setup_figure(self, img):
+        """
+        Args:
+            Same as in :meth:`__init__()`.
+
+        Returns:
+            fig (matplotlib.pyplot.figure): top level container for all the image plot elements.
+            ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system.
+        """
+        fig = mplfigure.Figure(frameon=False)
+        self.dpi = fig.get_dpi()
+        # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
+        # (https://github.com/matplotlib/matplotlib/issues/15363)
+        fig.set_size_inches(
+            (self.width * self.scale + 1e-2) / self.dpi,
+            (self.height * self.scale + 1e-2) / self.dpi,
+        )
+        self.canvas = FigureCanvasAgg(fig)
+        # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
+        ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
+        ax.axis("off")
+        self.fig = fig
+        self.ax = ax
+        self.reset_image(img)
+
+    def reset_image(self, img):
+        """
+        Args:
+            img: same as in __init__
+        """
+        img = img.astype("uint8")
+        self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")
+
+    def save(self, filepath):
+        """
+        Args:
+            filepath (str): a string that contains the absolute path, including the file name, where
+                the visualized image will be saved.
+        """
+        self.fig.savefig(filepath)
+
+    def get_image(self):
+        """
+        Returns:
+            ndarray:
+                the visualized image of shape (H, W, 3) (RGB) in uint8 type.
+                The shape is scaled w.r.t the input image using the given `scale` argument.
+        """
+        canvas = self.canvas
+        s, (width, height) = canvas.print_to_buffer()
+        # buf = io.BytesIO()  # works for cairo backend
+        # canvas.print_rgba(buf)
+        # width, height = self.width, self.height
+        # s = buf.getvalue()
+
+        buffer = np.frombuffer(s, dtype="uint8")
+
+        img_rgba = buffer.reshape(height, width, 4)
+        rgb, alpha = np.split(img_rgba, [3], axis=2)
+        return rgb.astype("uint8")
+
+
+class Visualizer:
+    """
+    Visualizer that draws data about detection/segmentation on images.
+
+    It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}`
+    that draw primitive objects to images, as well as high-level wrappers like
+    `draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}`
+    that draw composite data in some pre-defined style.
+
+    Note that the exact visualization style for the high-level wrappers are subject to change.
+    Style such as color, opacity, label contents, visibility of labels, or even the visibility
+    of objects themselves (e.g. when the object is too small) may change according
+    to different heuristics, as long as the results still look visually reasonable.
+
+    To obtain a consistent style, you can implement custom drawing functions with the
+    abovementioned primitive methods instead. If you need more customized visualization
+    styles, you can process the data yourself following their format documented in
+    tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not
+    intend to satisfy everyone's preference on drawing styles.
+
+    This visualizer focuses on high rendering quality rather than performance. It is not
+    designed to be used for real-time applications.
+    """
+
+    # TODO implement a fast, rasterized version using OpenCV
+
+    def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE):
+        """
+        Args:
+            img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
+                the height and width of the image respectively. C is the number of
+                color channels. The image is required to be in RGB format since that
+                is a requirement of the Matplotlib library. The image is also expected
+                to be in the range [0, 255].
+            metadata (Metadata): dataset metadata (e.g. class names and colors)
+            instance_mode (ColorMode): defines one of the pre-defined style for drawing
+                instances on an image.
+        """
+        self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
+        if metadata is None:
+            metadata = MetadataCatalog.get("__nonexist__")
+        self.metadata = metadata
+        self.output = VisImage(self.img, scale=scale)
+        self.cpu_device = torch.device("cpu")
+
+        # too small texts are useless, therefore clamp to 9
+        self._default_font_size = max(
+            np.sqrt(self.output.height * self.output.width) // 90, 10 // scale
+        )
+        self._instance_mode = instance_mode
+        self.keypoint_threshold = _KEYPOINT_THRESHOLD
+
+    def draw_instance_predictions(self, predictions):
+        """
+        Draw instance-level prediction results on an image.
+
+        Args:
+            predictions (Instances): the output of an instance detection/segmentation
+                model. Following fields will be used to draw:
+                "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
+        scores = predictions.scores if predictions.has("scores") else None
+        classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None
+        labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
+        keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None
+
+        if predictions.has("pred_masks"):
+            masks = np.asarray(predictions.pred_masks)
+            masks = [GenericMask(x, self.output.height, self.output.width) for x in masks]
+        else:
+            masks = None
+
+        if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
+            colors = [
+                self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes
+            ]
+            alpha = 0.8
+        else:
+            colors = None
+            alpha = 0.5
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            self.output.reset_image(
+                self._create_grayscale_image(
+                    (predictions.pred_masks.any(dim=0) > 0).numpy()
+                    if predictions.has("pred_masks")
+                    else None
+                )
+            )
+            alpha = 0.3
+
+        self.overlay_instances(
+            masks=masks,
+            boxes=boxes,
+            labels=labels,
+            keypoints=keypoints,
+            assigned_colors=colors,
+            alpha=alpha,
+        )
+        return self.output
+
+    def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8):
+        """
+        Draw semantic segmentation predictions/labels.
+
+        Args:
+            sem_seg (Tensor or ndarray): the segmentation of shape (H, W).
+                Each value is the integer label of the pixel.
+            area_threshold (int): segments with less than `area_threshold` are not drawn.
+            alpha (float): the larger it is, the more opaque the segmentations are.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        if isinstance(sem_seg, torch.Tensor):
+            sem_seg = sem_seg.numpy()
+        labels, areas = np.unique(sem_seg, return_counts=True)
+        sorted_idxs = np.argsort(-areas).tolist()
+        labels = labels[sorted_idxs]
+        for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels):
+            try:
+                mask_color = [x / 255 for x in self.metadata.stuff_colors[label]]
+            except (AttributeError, IndexError):
+                mask_color = None
+
+            binary_mask = (sem_seg == label).astype(np.uint8)
+            text = self.metadata.stuff_classes[label]
+            self.draw_binary_mask(
+                binary_mask,
+                color=mask_color,
+                edge_color=_OFF_WHITE,
+                text=text,
+                alpha=alpha,
+                area_threshold=area_threshold,
+            )
+        return self.output
+
+    def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7):
+        """
+        Draw panoptic prediction annotations or results.
+
+        Args:
+            panoptic_seg (Tensor): of shape (height, width) where the values are ids for each
+                segment.
+            segments_info (list[dict] or None): Describe each segment in `panoptic_seg`.
+                If it is a ``list[dict]``, each dict contains keys "id", "category_id".
+                If None, category id of each pixel is computed by
+                ``pixel // metadata.label_divisor``.
+            area_threshold (int): stuff segments with less than `area_threshold` are not drawn.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata)
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask()))
+
+        # draw mask for all semantic segments first i.e. "stuff"
+        for mask, sinfo in pred.semantic_masks():
+            category_idx = sinfo["category_id"]
+            try:
+                mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
+            except AttributeError:
+                mask_color = None
+
+            text = self.metadata.stuff_classes[category_idx]
+            self.draw_binary_mask(
+                mask,
+                color=mask_color,
+                edge_color=_OFF_WHITE,
+                text=text,
+                alpha=alpha,
+                area_threshold=area_threshold,
+            )
+
+        # draw mask for all instances second
+        all_instances = list(pred.instance_masks())
+        if len(all_instances) == 0:
+            return self.output
+        masks, sinfo = list(zip(*all_instances))
+        category_ids = [x["category_id"] for x in sinfo]
+
+        try:
+            scores = [x["score"] for x in sinfo]
+        except KeyError:
+            scores = None
+        labels = _create_text_labels(
+            category_ids, scores, self.metadata.thing_classes, [x.get("iscrowd", 0) for x in sinfo]
+        )
+
+        try:
+            colors = [
+                self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in category_ids
+            ]
+        except AttributeError:
+            colors = None
+        self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha)
+
+        return self.output
+
+    draw_panoptic_seg_predictions = draw_panoptic_seg  # backward compatibility
+
+    def draw_dataset_dict(self, dic):
+        """
+        Draw annotations/segmentaions in Detectron2 Dataset format.
+
+        Args:
+            dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        annos = dic.get("annotations", None)
+        if annos:
+            if "segmentation" in annos[0]:
+                masks = [x["segmentation"] for x in annos]
+            else:
+                masks = None
+            if "keypoints" in annos[0]:
+                keypts = [x["keypoints"] for x in annos]
+                keypts = np.array(keypts).reshape(len(annos), -1, 3)
+            else:
+                keypts = None
+
+            boxes = [
+                BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS)
+                if len(x["bbox"]) == 4
+                else x["bbox"]
+                for x in annos
+            ]
+
+            colors = None
+            category_ids = [x["category_id"] for x in annos]
+            if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
+                colors = [
+                    self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
+                    for c in category_ids
+                ]
+            names = self.metadata.get("thing_classes", None)
+            labels = _create_text_labels(
+                category_ids,
+                scores=None,
+                class_names=names,
+                is_crowd=[x.get("iscrowd", 0) for x in annos],
+            )
+            self.overlay_instances(
+                labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors
+            )
+
+        sem_seg = dic.get("sem_seg", None)
+        if sem_seg is None and "sem_seg_file_name" in dic:
+            with PathManager.open(dic["sem_seg_file_name"], "rb") as f:
+                sem_seg = Image.open(f)
+                sem_seg = np.asarray(sem_seg, dtype="uint8")
+        if sem_seg is not None:
+            self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5)
+
+        pan_seg = dic.get("pan_seg", None)
+        if pan_seg is None and "pan_seg_file_name" in dic:
+            with PathManager.open(dic["pan_seg_file_name"], "rb") as f:
+                pan_seg = Image.open(f)
+                pan_seg = np.asarray(pan_seg)
+                from panopticapi.utils import rgb2id
+
+                pan_seg = rgb2id(pan_seg)
+        if pan_seg is not None:
+            segments_info = dic["segments_info"]
+            pan_seg = torch.tensor(pan_seg)
+            self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.5)
+        return self.output
+
+    def overlay_instances(
+        self,
+        *,
+        boxes=None,
+        labels=None,
+        masks=None,
+        keypoints=None,
+        assigned_colors=None,
+        alpha=0.5,
+    ):
+        """
+        Draw boxes, mask polygons, text labels and keypoints for a set of instances.
+
+        All arguments are keyword-only. The number of instances is taken from the
+        first of `boxes`, `masks`, `keypoints` that is given; the others (and
+        `labels`) are asserted to have the same length.
+
+        Args:
+            boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`,
+                or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
+                or a :class:`RotatedBoxes`,
+                or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
+                for the N objects in a single image,
+            labels (list[str]): the text to be displayed for each instance.
+            masks (masks-like object): Supported types are:
+
+                * :class:`detectron2.structures.PolygonMasks`,
+                  :class:`detectron2.structures.BitMasks`.
+                * list[list[ndarray]]: contains the segmentation masks for all objects in one image.
+                  The first level of the list corresponds to individual instances. The second
+                  level to all the polygon that compose the instance, and the third level
+                  to the polygon coordinates. The third level should have the format of
+                  [x0, y0, x1, y1, ..., xn, yn] (n >= 3).
+                * list[ndarray]: each ndarray is a binary mask of shape (H, W).
+                * list[dict]: each dict is a COCO-style RLE.
+            keypoints (Keypoint or array like): an array-like object of shape (N, K, 3),
+                where the N is the number of instances and K is the number of keypoints.
+                The last dimension corresponds to (x, y, visibility or score).
+            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
+                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
+                for full list of formats that the colors are accepted in.
+                When None, a random color is generated per instance.
+            alpha (float): opacity passed to `draw_polygon` for the mask polygons.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        num_instances = 0
+        if boxes is not None:
+            boxes = self._convert_boxes(boxes)
+            num_instances = len(boxes)
+        if masks is not None:
+            masks = self._convert_masks(masks)
+            if num_instances:
+                assert len(masks) == num_instances
+            else:
+                num_instances = len(masks)
+        if keypoints is not None:
+            if num_instances:
+                assert len(keypoints) == num_instances
+            else:
+                num_instances = len(keypoints)
+            keypoints = self._convert_keypoints(keypoints)
+        if labels is not None:
+            assert len(labels) == num_instances
+        if assigned_colors is None:
+            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
+        if num_instances == 0:
+            return self.output
+        # Boxes with 5 columns are rotated boxes and are handled separately; only
+        # boxes, labels and colors are forwarded (masks and keypoints are not).
+        if boxes is not None and boxes.shape[1] == 5:
+            return self.overlay_rotated_instances(
+                boxes=boxes, labels=labels, assigned_colors=assigned_colors
+            )
+
+        # Display in largest to smallest order to reduce occlusion.
+        # Areas come from the boxes when given, otherwise from the masks; with
+        # keypoints only, no reordering takes place.
+        areas = None
+        if boxes is not None:
+            areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
+        elif masks is not None:
+            areas = np.asarray([x.area() for x in masks])
+
+        if areas is not None:
+            sorted_idxs = np.argsort(-areas).tolist()
+            # Re-order overlapped instances in descending order.
+            boxes = boxes[sorted_idxs] if boxes is not None else None
+            labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+            masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None
+            assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
+            keypoints = keypoints[sorted_idxs] if keypoints is not None else None
+
+        for i in range(num_instances):
+            color = assigned_colors[i]
+            if boxes is not None:
+                self.draw_box(boxes[i], edge_color=color)
+
+            if masks is not None:
+                for segment in masks[i].polygons:
+                    self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha)
+
+            if labels is not None:
+                # first get a box
+                if boxes is not None:
+                    x0, y0, x1, y1 = boxes[i]
+                    text_pos = (x0, y0)  # if drawing boxes, put text on the box corner.
+                    horiz_align = "left"
+                elif masks is not None:
+                    # skip small mask without polygon
+                    if len(masks[i].polygons) == 0:
+                        continue
+
+                    x0, y0, x1, y1 = masks[i].bbox()
+
+                    # draw text in the center (defined by median) when box is not drawn
+                    # median is less sensitive to outliers.
+                    # nonzero() yields (rows, cols); [::-1] turns the medians into (x, y).
+                    text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1]
+                    horiz_align = "center"
+                else:
+                    continue  # drawing the box confidence for keypoints isn't very useful.
+                # for small objects, draw text at the side to avoid occlusion
+                instance_area = (y1 - y0) * (x1 - x0)
+                if (
+                    instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale
+                    or y1 - y0 < 40 * self.output.scale
+                ):
+                    # Near the bottom edge of the image the text goes to the top-right
+                    # corner of the box, otherwise to its bottom-left corner.
+                    if y1 >= self.output.height - 5:
+                        text_pos = (x1, y0)
+                    else:
+                        text_pos = (x0, y1)
+
+                # Font size grows with the instance height relative to the image:
+                # the factor is clipped to [1.2, 2] and then halved.
+                height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width)
+                lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+                font_size = (
+                    np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
+                    * 0.5
+                    * self._default_font_size
+                )
+                self.draw_text(
+                    labels[i],
+                    text_pos,
+                    color=lighter_color,
+                    horizontal_alignment=horiz_align,
+                    font_size=font_size,
+                )
+
+        # draw keypoints
+        if keypoints is not None:
+            for keypoints_per_instance in keypoints:
+                self.draw_and_connect_keypoints(keypoints_per_instance)
+
+        return self.output
+
+    def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None):
+        """
+        Draw rotated boxes, and optionally their labels, largest box first.
+
+        Args:
+            boxes (ndarray): an Nx5 numpy array of
+                (x_center, y_center, width, height, angle_degrees) format
+                for the N objects in a single image. ``None`` (the default) is
+                treated as "no instances" and nothing is drawn.
+            labels (list[str]): the text to be displayed for each instance.
+            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
+                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
+                for full list of formats that the colors are accepted in. When None,
+                a random color is picked for every instance.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        # `boxes` defaults to None; calling len() on it (and later reading `areas`,
+        # which was only bound when boxes was not None) used to crash.
+        if boxes is None:
+            return self.output
+
+        num_instances = len(boxes)
+        if num_instances == 0:
+            return self.output
+
+        if assigned_colors is None:
+            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
+
+        # Display in largest to smallest order to reduce occlusion.
+        areas = boxes[:, 2] * boxes[:, 3]
+        sorted_idxs = np.argsort(-areas).tolist()
+
+        # Re-order boxes, labels and colors with the same permutation.
+        boxes = boxes[sorted_idxs]
+        labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+        colors = [assigned_colors[idx] for idx in sorted_idxs]
+
+        for i in range(num_instances):
+            self.draw_rotated_box_with_label(
+                boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None
+            )
+
+        return self.output
+
+    def draw_and_connect_keypoints(self, keypoints):
+        """
+        Draw the confident keypoints of one instance and the lines connecting them.
+
+        Connections come from ``metadata.keypoint_connection_rules``; in addition,
+        nose -> mid-shoulder and mid-shoulder -> mid-hip lines are drawn when the
+        corresponding named keypoints are visible.
+
+        Args:
+            keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints
+                and the last dimension corresponds to (x, y, probability).
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        names = self.metadata.get("keypoint_names")
+        shown = {}
+        for index, (px, py, score) in enumerate(keypoints):
+            # only keypoints above the confidence threshold are drawn / remembered
+            if score > self.keypoint_threshold:
+                self.draw_circle((px, py), color=_RED)
+                if names:
+                    shown[names[index]] = (px, py)
+
+        if self.metadata.get("keypoint_connection_rules"):
+            for first, second, color in self.metadata.keypoint_connection_rules:
+                if first in shown and second in shown:
+                    start_x, start_y = shown[first]
+                    end_x, end_y = shown[second]
+                    # rule colors are 0-255; matplotlib expects 0-1
+                    rgb = tuple(channel / 255.0 for channel in color)
+                    self.draw_line([start_x, end_x], [start_y, end_y], color=rgb)
+
+        # The remaining lines are specific to person keypoints; for any other
+        # keypoint set the names below are absent and nothing more is drawn.
+        if "left_shoulder" in shown and "right_shoulder" in shown:
+            ls_x, ls_y = shown["left_shoulder"]
+            rs_x, rs_y = shown["right_shoulder"]
+            mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2
+
+            # nose -> mid-shoulder
+            nose_x, nose_y = shown.get("nose", (None, None))
+            if nose_x is not None:
+                self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED)
+
+            # mid-shoulder -> mid-hip
+            if "left_hip" in shown and "right_hip" in shown:
+                lh_x, lh_y = shown["left_hip"]
+                rh_x, rh_y = shown["right_hip"]
+                mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2
+                self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED)
+        return self.output
+
+    """
+    Primitive drawing functions:
+    """
+
+    def draw_text(
+        self,
+        text,
+        position,
+        *,
+        font_size=None,
+        color="g",
+        horizontal_alignment="center",
+        rotation=0,
+    ):
+        """
+        Render a text label on a dark, semi-transparent background box.
+
+        Args:
+            text (str): class label
+            position (tuple): a tuple of the x and y coordinates to place text on image.
+            font_size (int, optional): font of the text. If not provided (or falsy),
+                the visualizer's default font size is used.
+            color: color of the text. Refer to `matplotlib.colors` for full list
+                of formats that are accepted.
+            horizontal_alignment (str): see `matplotlib.text.Text`
+            rotation: rotation angle in degrees CCW
+
+        Returns:
+            output (VisImage): image object with text drawn.
+        """
+        size = font_size or self._default_font_size
+
+        # The background is dark, so keep the text bright: every channel is lifted
+        # to at least 0.2 and the strongest channel to at least 0.8.
+        rgb = np.maximum(list(mplc.to_rgb(color)), 0.2)
+        strongest = np.argmax(rgb)
+        rgb[strongest] = max(0.8, rgb[strongest])
+
+        x, y = position
+        background = {"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"}
+        self.output.ax.text(
+            x,
+            y,
+            text,
+            size=size * self.output.scale,
+            family="sans-serif",
+            bbox=background,
+            verticalalignment="top",
+            horizontalalignment=horizontal_alignment,
+            color=rgb,
+            zorder=10,
+            rotation=rotation,
+        )
+        return self.output
+
+    def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
+        """
+        Draw the outline of an axis-aligned box.
+
+        Args:
+            box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
+                are the coordinates of the box's top left corner. x1 and y1 are the
+                coordinates of the box's bottom right corner.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
+                for full list of formats that are accepted.
+            line_style (string): the string to use to create the outline of the boxes.
+
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        x0, y0, x1, y1 = box_coord
+
+        # outline thickness follows the default font size, but never below 1
+        outline_width = max(self._default_font_size / 4, 1)
+
+        outline = mpl.patches.Rectangle(
+            (x0, y0),
+            x1 - x0,
+            y1 - y0,
+            fill=False,
+            edgecolor=edge_color,
+            linewidth=outline_width * self.output.scale,
+            alpha=alpha,
+            linestyle=line_style,
+        )
+        self.output.ax.add_patch(outline)
+        return self.output
+
+    def draw_rotated_box_with_label(
+        self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None
+    ):
+        """
+        Draw a rotated box edge by edge, with an optional label at its top-left corner.
+
+        Args:
+            rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle),
+                where cnt_x and cnt_y are the center coordinates of the box.
+                w and h are the width and height of the box. angle represents how
+                many degrees the box is rotated CCW with regard to the 0-degree box.
+            alpha (float): accepted for signature parity with ``draw_box``; the edges
+                are drawn through ``draw_line``, which does not receive it.
+            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
+                for full list of formats that are accepted.
+            line_style (string): the string to use to create the outline of the boxes.
+            label (string): label for rotated box. It will not be rendered when set to None.
+
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        cnt_x, cnt_y, w, h, angle = rotated_box
+
+        # use thinner lines when the box is small
+        is_small = w * h < _SMALL_OBJECT_AREA_THRESH * self.output.scale
+        linewidth = self._default_font_size / (6 if is_small else 3)
+
+        theta = angle * math.pi / 180.0
+        c, s = math.cos(theta), math.sin(theta)
+
+        # corners of the un-rotated box around the origin, then rotated and
+        # shifted to the center (x: left->right ; y: top->down)
+        local_corners = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)]
+        corners = [
+            (s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in local_corners
+        ]
+
+        # pair each corner with the next one (wrapping around) to get the 4 edges;
+        # the second edge is always dashed
+        for k, (start, end) in enumerate(zip(corners, corners[1:] + corners[:1])):
+            self.draw_line(
+                [start[0], end[0]],
+                [start[1], end[1]],
+                color=edge_color,
+                linestyle="--" if k == 1 else line_style,
+                linewidth=linewidth,
+            )
+
+        if label is None:
+            return self.output
+
+        text_pos = corners[1]  # topleft corner
+        height_ratio = h / np.sqrt(self.output.height * self.output.width)
+        label_color = self._change_color_brightness(edge_color, brightness_factor=0.7)
+        font_size = (
+            np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size
+        )
+        self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle)
+        return self.output
+
+    def draw_circle(self, circle_coord, color, radius=3):
+        """
+        Draw a filled circle.
+
+        Args:
+            circle_coord (list(int) or tuple(int)): contains the x and y coordinates
+                of the center of the circle.
+            color: color of the circle. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            radius (int): radius of the circle.
+
+        Returns:
+            output (VisImage): image object with circle drawn.
+        """
+        # unpacking is kept only to reject coordinates that are not an (x, y) pair
+        _center_x, _center_y = circle_coord
+        dot = mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color)
+        self.output.ax.add_patch(dot)
+        return self.output
+
+    def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None):
+        """
+        Draw a polyline through the given points.
+
+        Args:
+            x_data (list[int]): a list containing x values of all the points being drawn.
+                Length of list should match the length of y_data.
+            y_data (list[int]): a list containing y values of all the points being drawn.
+                Length of list should match the length of x_data.
+            color: color of the line. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            linestyle: style of the line. Refer to `matplotlib.lines.Line2D`
+                for a full list of formats that are accepted.
+            linewidth (float or None): width of the line. When it's None,
+                a third of the default font size is used. Values below 1 are raised to 1.
+
+        Returns:
+            output (VisImage): image object with line drawn.
+        """
+        width = self._default_font_size / 3 if linewidth is None else linewidth
+        width = max(width, 1)
+        segment = mpl.lines.Line2D(
+            x_data,
+            y_data,
+            linewidth=width * self.output.scale,
+            color=color,
+            linestyle=linestyle,
+        )
+        self.output.ax.add_line(segment)
+        return self.output
+
+    def draw_binary_mask(
+        self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=0
+    ):
+        """
+        Draw a binary mask, either as filled polygons (mask without holes) or as an
+        RGBA overlay image (mask with holes), and optionally label it.
+
+        Args:
+            binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and
+                W is the image width. Each value in the array is either a 0 or 1 value of uint8
+                type.
+            color: color of the mask. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted. If None, will pick a random color.
+            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
+                full list of formats that are accepted.
+            text (str): if not None, it is drawn at the median pixel position of the
+                largest connected component, and of every other component whose area
+                exceeds `_LARGE_MASK_AREA_THRESH`. Nothing is drawn when no segment
+                passed the area threshold.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            area_threshold (float): a connected component smaller than this will not be shown.
+                Only applied on the polygon path (masks without holes).
+
+        Returns:
+            output (VisImage): image object with mask drawn.
+        """
+        if color is None:
+            color = random_color(rgb=True, maximum=1)
+        color = mplc.to_rgb(color)
+
+        # becomes True once at least one polygon (or the RGBA overlay) was drawn
+        has_valid_segment = False
+        binary_mask = binary_mask.astype("uint8")  # opencv needs uint8
+        mask = GenericMask(binary_mask, self.output.height, self.output.width)
+        shape2d = (binary_mask.shape[0], binary_mask.shape[1])
+
+        if not mask.has_holes:
+            # draw polygons for regular masks
+            for segment in mask.polygons:
+                area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1]))
+                # `area_threshold or 0` also tolerates area_threshold=None
+                if area < (area_threshold or 0):
+                    continue
+                has_valid_segment = True
+                segment = segment.reshape(-1, 2)
+                self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha)
+        else:
+            # Masks with holes are rendered as an RGBA image: the RGB channels hold
+            # the mask color and the alpha channel is `alpha` on mask pixels, 0 elsewhere.
+            # TODO: Use Path/PathPatch to draw vector graphics:
+            # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon
+            rgba = np.zeros(shape2d + (4,), dtype="float32")
+            rgba[:, :, :3] = color
+            rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha
+            has_valid_segment = True
+            self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0))
+
+        if text is not None and has_valid_segment:
+            # TODO sometimes drawn on wrong objects. the heuristics here can improve.
+            lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+            _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8)
+            # row 0 of `stats` is skipped (component ids start at 1 below); the last
+            # column is used as the component area
+            largest_component_id = np.argmax(stats[1:, -1]) + 1
+
+            # draw text on the largest component, as well as other very large components.
+            for cid in range(1, _num_cc):
+                if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH:
+                    # median is more stable than centroid
+                    # center = centroids[largest_component_id]
+                    # nonzero() yields (rows, cols); [::-1] flips the medians to (x, y)
+                    center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1]
+                    self.draw_text(text, center, color=lighter_color)
+        return self.output
+
+    def draw_polygon(self, segment, color, edge_color=None, alpha=0.5):
+        """
+        Draw a filled polygon with an opaque outline.
+
+        Args:
+            segment: numpy array of shape Nx2, containing all the points in the polygon.
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
+                full list of formats that are accepted. If not provided, the polygon
+                color is used, darkened when `alpha` is above 0.8.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+
+        Returns:
+            output (VisImage): image object with polygon drawn.
+        """
+        if edge_color is None:
+            # a nearly opaque fill would hide an outline of the same color,
+            # so darken the outline in that case
+            edge_color = (
+                self._change_color_brightness(color, brightness_factor=-0.7)
+                if alpha > 0.8
+                else color
+            )
+        outline_rgba = mplc.to_rgb(edge_color) + (1,)
+        fill_rgba = mplc.to_rgb(color) + (alpha,)
+
+        patch = mpl.patches.Polygon(
+            segment,
+            fill=True,
+            facecolor=fill_rgba,
+            edgecolor=outline_rgba,
+            linewidth=max(self._default_font_size // 15 * self.output.scale, 1),
+        )
+        self.output.ax.add_patch(patch)
+        return self.output
+
+    """
+    Internal methods:
+    """
+
+    def _jitter(self, color):
+        """
+        Return a slightly different color by adding a random offset of length 0.5.
+
+        Args:
+            color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
+                picked. The values in the list are in the [0.0, 1.0] range.
+
+        Returns:
+            jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
+                color after being jittered. The values in the list are in the [0.0, 1.0] range.
+        """
+        base = mplc.to_rgb(color)
+        # random non-negative direction, rescaled to length 0.5
+        # (better to do it in another color space)
+        offset = np.random.rand(3)
+        offset = offset / np.linalg.norm(offset) * 0.5
+        return tuple(np.clip(offset + base, 0, 1))
+
+    def _create_grayscale_image(self, mask=None):
+        """
+        Build a 3-channel float32 grayscale copy of ``self.img`` (channel mean).
+        Pixels selected by `mask`, if given, keep their original colors.
+        """
+        gray = self.img.astype("f4").mean(axis=2)
+        img_bw = np.stack((gray, gray, gray), axis=2)
+        if mask is None:
+            return img_bw
+        img_bw[mask] = self.img[mask]
+        return img_bw
+
+    def _change_color_brightness(self, color, brightness_factor):
+        """
+        Return a lighter or darker version of `color` by scaling its HLS lightness;
+        hue and saturation are left untouched.
+
+        Args:
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
+                0 will correspond to no change, a factor in [-1.0, 0) range will result in
+                a darker color and a factor in (0, 1.0] range will result in a lighter color.
+
+        Returns:
+            modified_color (tuple[double]): a tuple containing the RGB values of the
+                modified color. Each value in the tuple is in the [0.0, 1.0] range.
+        """
+        assert -1.0 <= brightness_factor <= 1.0
+        hue, lightness, saturation = colorsys.rgb_to_hls(*mplc.to_rgb(color))
+        # scale lightness relative to its current value, then clamp to [0, 1]
+        scaled = lightness + (brightness_factor * lightness)
+        scaled = min(max(scaled, 0.0), 1.0)
+        return colorsys.hls_to_rgb(hue, scaled, saturation)
+
+    def _convert_boxes(self, boxes):
+        """
+        Normalize boxes to an NxB numpy array, where B = 4 or 5 is the box dimension.
+        `Boxes` / `RotatedBoxes` are unwrapped from their tensor; anything else goes
+        through `np.asarray`.
+        """
+        if isinstance(boxes, (Boxes, RotatedBoxes)):
+            return boxes.tensor.detach().numpy()
+        return np.asarray(boxes)
+
+    def _convert_masks(self, masks_or_polygons):
+        """
+        Normalize the supported mask / polygon containers to a list of GenericMask.
+
+        Returns:
+            list[GenericMask]:
+        """
+        items = masks_or_polygons
+        # unwrap container types down to something iterable per instance
+        if isinstance(items, PolygonMasks):
+            items = items.polygons
+        if isinstance(items, BitMasks):
+            items = items.tensor.numpy()
+        if isinstance(items, torch.Tensor):
+            items = items.numpy()
+
+        # entries that already are GenericMask are passed through untouched
+        return [
+            item
+            if isinstance(item, GenericMask)
+            else GenericMask(item, self.output.height, self.output.width)
+            for item in items
+        ]
+
+    def _convert_keypoints(self, keypoints):
+        """Return keypoints as a numpy array, unwrapping a `Keypoints` container first."""
+        raw = keypoints.tensor if isinstance(keypoints, Keypoints) else keypoints
+        return np.asarray(raw)
+
+    def get_output(self):
+        """
+        Return the image being drawn on.
+
+        Returns:
+            output (VisImage): the image output containing the visualizations added
+            to the image.
+        """
+        return self.output
--- a/magic_pdf/model/pek_sub_modules/post_process.py
+++ b/magic_pdf/model/pek_sub_modules/post_process.py
+import re
+
+def layout_rm_equation(layout_res):
+    """Drop, in place, every detection whose ``category_id`` equals 10 and return ``layout_res``."""
+    dets = layout_res['layout_dets']
+    # slice assignment keeps the same list object, matching in-place deletion
+    dets[:] = [ele for ele in dets if not (ele['category_id'] == 10)]
+    return layout_res
+
+
+def get_croped_image(image_pil, bbox):
+    """Crop ``image_pil`` to ``bbox`` given as (x_min, y_min, x_max, y_max)."""
+    left, top, right, bottom = bbox
+    return image_pil.crop((left, top, right, bottom))
+
+
+def latex_rm_whitespace(s: str):
+    """Remove unnecessary whitespace from LaTeX code.
+
+    Spaces inside ``\\operatorname``/``\\mathrm``/``\\text``/``\\mathbf`` groups are
+    removed first. Then whitespace between two non-letters, between a non-letter
+    and a letter, and between a letter and a non-letter is removed repeatedly
+    until the string stops changing. Whitespace between two letters is kept.
+
+    Args:
+        s: LaTeX source.
+
+    Returns:
+        The LaTeX source with the redundant whitespace removed.
+    """
+    # Raw strings: '\W' / '\d' in a normal string literal are invalid escape
+    # sequences (SyntaxWarning on Python 3.12+).
+    text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})'
+    letter = r'[a-zA-Z]'
+    noletter = r'[\W_^\d]'
+
+    # Strip the spaces inside every text-like group directly in the callback.
+    s = re.sub(text_reg, lambda match: match.group(1).replace(' ', ''), s)
+
+    # Applied in this order on every pass; compiled once instead of per iteration.
+    rules = (
+        re.compile(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter)),
+        re.compile(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter)),
+        re.compile(r'(%s)\s+?(%s)' % (letter, noletter)),
+    )
+    while True:
+        news = s
+        for rule in rules:
+            news = rule.sub(r'\1\2', news)
+        if news == s:
+            return s
+        s = news
\ No newline at end of file
--- a/magic_pdf/model/pek_sub_modules/self_modify.py
+++ b/magic_pdf/model/pek_sub_modules/self_modify.py
+import time
+import copy
+import base64
+import cv2
+import numpy as np
+from io import BytesIO
+from PIL import Image
+
+from paddleocr import PaddleOCR
+from paddleocr.ppocr.utils.logging import get_logger
+from paddleocr.ppocr.utils.utility import check_and_read, alpha_to_color, binarize_img
+from paddleocr.tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop
+logger = get_logger()
+
+def img_decode(content: bytes):
+    """Decode an encoded image byte string with OpenCV, keeping its channels unchanged."""
+    raw = np.frombuffer(content, dtype=np.uint8)
+    decoded = cv2.imdecode(raw, cv2.IMREAD_UNCHANGED)
+    return decoded
+
+def check_img(img):
+    """Load ``img`` into a form the OCR pipeline can consume.
+
+    Args:
+        img: encoded image bytes, a file path (str), or an already-loaded object
+            (e.g. ndarray or list), which is passed through.
+
+    Returns:
+        The loaded image, or None when a file path could not be loaded.
+        A 2-D (grayscale) ndarray is converted to 3-channel BGR.
+    """
+    if isinstance(img, bytes):
+        img = img_decode(img)
+    if isinstance(img, str):
+        image_file = img
+        img, flag_gif, flag_pdf = check_and_read(image_file)
+        if not flag_gif and not flag_pdf:
+            with open(image_file, 'rb') as f:
+                img_str = f.read()
+            img = img_decode(img_str)
+            if img is None:
+                # OpenCV could not decode the file: let PIL read it, re-encode
+                # it as JPEG in memory and decode that instead.
+                try:
+                    buf = BytesIO()
+                    Image.open(BytesIO(img_str)).convert('RGB').save(buf, 'jpeg')
+                    img_array = np.frombuffer(buf.getvalue(), np.uint8)
+                    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
+                except Exception:
+                    # Not a bare `except:`, which would also swallow
+                    # KeyboardInterrupt / SystemExit.
+                    logger.error("error in loading image:{}".format(image_file))
+                    return None
+        if img is None:
+            logger.error("error in loading image:{}".format(image_file))
+            return None
+    if isinstance(img, np.ndarray) and len(img.shape) == 2:
+        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+
+    return img
+
+def sorted_boxes(dt_boxes):
+    """
+    Order text boxes top to bottom, then left to right.
+
+    Boxes are first sorted by the (y, x) of their first corner; afterwards a box
+    is bubbled in front of its predecessor while both first corners are less
+    than 10 px apart vertically and the box starts further left.
+
+    args:
+        dt_boxes(array): detected text boxes, indexed as [box][corner][x/y]
+    return:
+        list of the same boxes in reading order
+    """
+    count = dt_boxes.shape[0]
+    ordered = sorted(dt_boxes, key=lambda box: (box[0][1], box[0][0]))
+
+    for i in range(count - 1):
+        j = i
+        while j >= 0:
+            upper, lower = ordered[j], ordered[j + 1]
+            same_row = abs(lower[0][1] - upper[0][1]) < 10
+            if not (same_row and lower[0][0] < upper[0][0]):
+                break
+            ordered[j], ordered[j + 1] = lower, upper
+            j -= 1
+    return ordered
+
+
+def formula_in_text(mf_bbox, text_bbox):
+    """Check whether a formula box cuts into a text box on the same line.
+
+    Args:
+        mf_bbox: formula box as (x1, y1, x2, y2).
+        text_bbox: text box as four [x, y] corners; corner 0 and corner 2 are
+            used as its opposite corners.
+
+    Returns:
+        (drop_origin, left_box, right_box). ``drop_origin`` is True when the text
+        box should be replaced. ``left_box`` / ``right_box`` are float32 4x2
+        arrays holding the part of the text box left / right of the formula
+        (with a 1 px gap), or None when there is no such part.
+    """
+    x1, y1, x2, y2 = mf_bbox
+    x3, y3 = text_bbox[0]
+    x4, y4 = text_bbox[2]
+
+    # A degenerate (zero-height) text box cannot be on the same line as anything;
+    # without this guard the ratio below divides by zero.
+    text_height = abs(y4 - y3)
+    if text_height == 0:
+        return False, None, None
+
+    # Same line: vertical centers differ by less than 20% of the text height.
+    same_line = abs((y1 + y2) / 2 - (y3 + y4) / 2) / text_height < 0.2
+    if not same_line:
+        return False, None, None
+
+    left_x = x1 - 1
+    right_x = x2 + 1
+
+    def _left_part():
+        # text box truncated at the formula's left edge
+        return np.array(
+            [text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]
+        ).astype('float32')
+
+    def _right_part():
+        # text box starting at the formula's right edge
+        return np.array(
+            [[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]
+        ).astype('float32')
+
+    # The four cases are mutually exclusive.
+    if x3 < x1 and x2 < x4:
+        # formula strictly inside the text box: keep both sides
+        return True, _left_part(), _right_part()
+    if x3 < x1 and x1 <= x4 <= x2:
+        # formula covers the right end of the text box
+        return True, _left_part(), None
+    if x1 <= x3 <= x2 and x2 < x4:
+        # formula covers the left end of the text box
+        return True, None, _right_part()
+    if x1 <= x3 < x4 <= x2:
+        # text box entirely covered by the formula
+        return True, None, None
+    return False, None, None
+
+    
+def update_det_boxes(dt_boxes, mfdetrec_res):
+    """Split text boxes around detected formulas.
+
+    For each formula, the first text box that `formula_in_text` reports as
+    overlapping is removed and its left / right remainders (when present) are
+    appended. ``dt_boxes`` is modified in place and also returned.
+    """
+    boxes = dt_boxes
+    for formula in mfdetrec_res:
+        for idx, text_box in enumerate(boxes):
+            hit, left_box, right_box = formula_in_text(formula['bbox'], text_box)
+            if not hit:
+                continue
+            boxes.pop(idx)
+            boxes.extend(part for part in (left_box, right_box) if part is not None)
+            # the list was just mutated, so stop scanning it for this formula
+            break
+
+    return boxes
+
+class ModifiedPaddleOCR(PaddleOCR):
+    """PaddleOCR variant whose detected text boxes can be split around formula boxes."""
+
+    def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, mfd_res=None, alpha_color=(255, 255, 255)):
+        """
+        OCR with PaddleOCR
+        Args:
+            img: img for OCR, support ndarray, img_path and list or ndarray
+            det: use text detection or not. If False, only rec will be exec. Default is True
+            rec: use text recognition or not. If False, only det will be exec. Default is True
+            cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
+            bin: binarize image to black and white. Default is False.
+            inv: invert image colors. Default is False.
+            mfd_res: formula detection results forwarded to ``__call__`` when both det
+                and rec are enabled; each entry provides a 'bbox' used to split
+                overlapping text boxes. Default is None (no splitting).
+            alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white.
+        Returns:
+            A list with one entry per image: [box, rec_result] pairs (det and rec),
+            boxes only (det without rec), recognition results (rec without det), or
+            the classifier results when neither det nor rec is requested. An entry is
+            None when nothing was detected in that image.
+        """
+        assert isinstance(img, (np.ndarray, list, str, bytes))
+        if isinstance(img, list) and det:
+            logger.error('When input a list of images, det must be false')
+            exit(0)
+        # When cls is requested but the angle classifier was not initialised it is
+        # silently skipped (see the `self.use_angle_cls and cls` checks below).
+
+        img = check_img(img)
+        # for infer pdf file
+        if isinstance(img, list):
+            if self.page_num > len(img) or self.page_num == 0:
+                self.page_num = len(img)
+            imgs = img[:self.page_num]
+        else:
+            imgs = [img]
+
+        def preprocess_image(_image):
+            _image = alpha_to_color(_image, alpha_color)
+            if inv:
+                _image = cv2.bitwise_not(_image)
+            if bin:
+                _image = binarize_img(_image)
+            return _image
+
+        if det and rec:
+            ocr_res = []
+            for img in imgs:
+                img = preprocess_image(img)
+                dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
+                if not dt_boxes and not rec_res:
+                    ocr_res.append(None)
+                    continue
+                tmp_res = [[box.tolist(), res]
+                           for box, res in zip(dt_boxes, rec_res)]
+                ocr_res.append(tmp_res)
+            return ocr_res
+        elif det and not rec:
+            ocr_res = []
+            for img in imgs:
+                img = preprocess_image(img)
+                dt_boxes, elapse = self.text_detector(img)
+                # Explicit emptiness test: `not dt_boxes` raises ValueError when the
+                # detector returns an ndarray holding more than one element.
+                if dt_boxes is None or len(dt_boxes) == 0:
+                    ocr_res.append(None)
+                    continue
+                tmp_res = [box.tolist() for box in dt_boxes]
+                ocr_res.append(tmp_res)
+            return ocr_res
+        else:
+            ocr_res = []
+            cls_res = []
+            for img in imgs:
+                if not isinstance(img, list):
+                    img = preprocess_image(img)
+                    img = [img]
+                if self.use_angle_cls and cls:
+                    img, cls_res_tmp, elapse = self.text_classifier(img)
+                    if not rec:
+                        cls_res.append(cls_res_tmp)
+                rec_res, elapse = self.text_recognizer(img)
+                ocr_res.append(rec_res)
+            if not rec:
+                return cls_res
+            return ocr_res
+
+    def __call__(self, img, cls=True, mfd_res=None):
+        """
+        Detect, optionally split around formulas, classify and recognise text in one image.
+
+        Args:
+            img: image to process; None yields an empty result.
+            cls: run the angle classifier on the crops when it is available.
+            mfd_res: formula detection results; when truthy, text boxes overlapping
+                a formula are split via ``update_det_boxes``.
+
+        Returns:
+            (boxes, rec_results, time_dict). boxes / rec_results only contain entries
+            whose score is at least ``self.drop_score``; both are None when there is
+            no image or no detection output. time_dict holds the elapsed time of the
+            'det', 'rec', 'cls' stages and of the whole call ('all').
+        """
+        time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
+
+        if img is None:
+            logger.debug("no valid image provided")
+            return None, None, time_dict
+
+        start = time.time()
+        ori_im = img.copy()
+        dt_boxes, elapse = self.text_detector(img)
+        time_dict['det'] = elapse
+
+        if dt_boxes is None:
+            logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
+            end = time.time()
+            time_dict['all'] = end - start
+            return None, None, time_dict
+        else:
+            logger.debug("dt_boxes num : {}, elapsed : {}".format(
+                len(dt_boxes), elapse))
+        img_crop_list = []
+
+        dt_boxes = sorted_boxes(dt_boxes)
+        if mfd_res:
+            # cut the formula regions out of the text boxes before recognition
+            bef = time.time()
+            dt_boxes = update_det_boxes(dt_boxes, mfd_res)
+            aft = time.time()
+            logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
+                len(dt_boxes), aft-bef))
+
+        for box in dt_boxes:
+            # crop from a copy of the box so the box itself is never altered
+            tmp_box = copy.deepcopy(box)
+            if self.args.det_box_type == "quad":
+                img_crop = get_rotate_crop_image(ori_im, tmp_box)
+            else:
+                img_crop = get_minarea_rect_crop(ori_im, tmp_box)
+            img_crop_list.append(img_crop)
+        if self.use_angle_cls and cls:
+            img_crop_list, angle_list, elapse = self.text_classifier(
+                img_crop_list)
+            time_dict['cls'] = elapse
+            logger.debug("cls num  : {}, elapsed : {}".format(
+                len(img_crop_list), elapse))
+
+        rec_res, elapse = self.text_recognizer(img_crop_list)
+        time_dict['rec'] = elapse
+        logger.debug("rec_res num  : {}, elapsed : {}".format(
+            len(rec_res), elapse))
+        if self.args.save_crop_res:
+            self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list,
+                                   rec_res)
+        # keep only the results that reach the confidence threshold
+        filter_boxes, filter_rec_res = [], []
+        for box, rec_result in zip(dt_boxes, rec_res):
+            text, score = rec_result
+            if score >= self.drop_score:
+                filter_boxes.append(box)
+                filter_rec_res.append(rec_result)
+        end = time.time()
+        time_dict['all'] = end - start
+        return filter_boxes, filter_rec_res, time_dict
\ No newline at end of file
--- a/magic_pdf/model/pp_structure_v2.py
+++ b/magic_pdf/model/pp_structure_v2.py
@@ -22,6 +22,13 @@ class CustomPaddleModel:
        self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)

    def __call__(self, img):
+        try:
+            import cv2
+        except ImportError:
+            logger.error("opencv-python not installed, please install by pip.")
+            exit(1)
+        # 将RGB图片转换为BGR格式适配paddle
+        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        result = self.model(img)
        spans = []
        for line in result:

--- a/magic_pdf/pipe/AbsPipe.py
+++ b/magic_pdf/pipe/AbsPipe.py
@@ -47,19 +47,13 @@ class AbsPipe(ABC):
        """
        raise NotImplementedError

-    @abstractmethod
-    def pipe_mk_uni_format(self, img_parent_path, drop_mode):
-        """
-        有状态的组装统一格式
-        """
-        raise NotImplementedError
+    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
+        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
+        return content_list

-    @abstractmethod
-    def pipe_mk_markdown(self, img_parent_path, drop_mode):
-        """
-        有状态的组装markdown
-        """
-        raise NotImplementedError
+    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
+        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode)
+        return md_content

    @staticmethod
    def classify(pdf_bytes: bytes) -> str:
@@ -101,13 +95,13 @@ class AbsPipe(ABC):
        return content_list

    @staticmethod
-    def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
+    def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
        """
        根据pdf类型，markdown
        """
        pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
        pdf_info_list = pdf_mid_data["pdf_info"]
-        md_content = union_make(pdf_info_list, MakeMode.MM_MD, drop_mode, img_buket_path)
+        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
        return md_content


--- a/magic_pdf/pipe/OCRPipe.py
+++ b/magic_pdf/pipe/OCRPipe.py
-from magic_pdf.libs.MakeContentConfig import DropMode
+from loguru import logger
+
+from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.pipe.AbsPipe import AbsPipe
@@ -7,7 +9,7 @@ from magic_pdf.user_api import parse_ocr_pdf

 class OCRPipe(AbsPipe):

-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
        super().__init__(pdf_bytes, model_list, image_writer, is_debug)

    def pipe_classify(self):
@@ -20,9 +22,11 @@ class OCRPipe(AbsPipe):
        self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)

    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return content_list
-
-    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return md_content
+        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
+        logger.info("ocr_pipe mk content list finished")
+        return result
+
+    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
+        result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
+        logger.info(f"ocr_pipe mk {md_make_mode} finished")
+        return result
--- a/magic_pdf/pipe/TXTPipe.py
+++ b/magic_pdf/pipe/TXTPipe.py
-from magic_pdf.libs.MakeContentConfig import DropMode
+from loguru import logger
+
+from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.libs.json_compressor import JsonCompressor
@@ -8,7 +10,7 @@ from magic_pdf.user_api import parse_txt_pdf

 class TXTPipe(AbsPipe):

-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
        super().__init__(pdf_bytes, model_list, image_writer, is_debug)

    def pipe_classify(self):
@@ -21,9 +23,11 @@ class TXTPipe(AbsPipe):
        self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)

    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return content_list
-
-    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return md_content
+        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
+        logger.info("txt_pipe mk content list finished")
+        return result
+
+    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
+        result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
+        logger.info(f"txt_pipe mk {md_make_mode} finished")
+        return result
--- a/magic_pdf/pipe/UNIPipe.py
+++ b/magic_pdf/pipe/UNIPipe.py
@@ -2,7 +2,7 @@ import json

 from loguru import logger

-from magic_pdf.libs.MakeContentConfig import DropMode
+from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
@@ -39,12 +39,14 @@ class UNIPipe(AbsPipe):
                                              is_debug=self.is_debug)

    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return content_list
-
-    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
-        markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
-        return markdown_content
+        result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
+        logger.info("uni_pipe mk content list finished")
+        return result
+
+    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
+        result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
+        logger.info(f"uni_pipe mk {md_make_mode} finished")
+        return result


 if __name__ == '__main__':

--- a/magic_pdf/resources/model_config/UniMERNet/demo.yaml
+++ b/magic_pdf/resources/model_config/UniMERNet/demo.yaml
+model:
+  arch: unimernet
+  model_type: unimernet
+  model_config:
+    model_name: ./models
+    max_seq_len: 1024
+    length_aware: False
+  load_pretrained: True
+  pretrained: ./models/pytorch_model.bin
+  tokenizer_config:
+    path: ./models
+
+datasets:
+  formula_rec_eval:
+    vis_processor:
+      eval:
+        name: "formula_image_eval"
+        image_size:
+          - 192
+          - 672
+   
+run:
+  runner: runner_iter
+  task: unimernet_train
+
+  batch_size_train: 64
+  batch_size_eval: 64
+  num_workers: 1
+
+  iters_per_inner_epoch: 2000
+  max_iters: 60000
+
+  seed: 42
+  output_dir: "../output/demo"
+
+  evaluate: True
+  test_splits: [ "eval" ]
+
+  device: "cuda"
+  world_size: 1
+  dist_url: "env://"
+  distributed: True
+  distributed_type: ddp  # or fsdp when training an LLM
+
+  generate_cfg:
+    temperature: 0.0
--- a/magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml
+++ b/magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml
+AUG:
+  DETR: true
+CACHE_DIR: /mnt/localdata/users/yupanhuang/cache/huggingface
+CUDNN_BENCHMARK: false
+DATALOADER:
+  ASPECT_RATIO_GROUPING: true
+  FILTER_EMPTY_ANNOTATIONS: false
+  NUM_WORKERS: 4
+  REPEAT_THRESHOLD: 0.0
+  SAMPLER_TRAIN: TrainingSampler
+DATASETS:
+  PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000
+  PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000
+  PROPOSAL_FILES_TEST: []
+  PROPOSAL_FILES_TRAIN: []
+  TEST:
+  - scihub_train
+  TRAIN:
+  - scihub_train
+GLOBAL:
+  HACK: 1.0
+ICDAR_DATA_DIR_TEST: ''
+ICDAR_DATA_DIR_TRAIN: ''
+INPUT:
+  CROP:
+    ENABLED: true
+    SIZE:
+    - 384
+    - 600
+    TYPE: absolute_range
+  FORMAT: RGB
+  MASK_FORMAT: polygon
+  MAX_SIZE_TEST: 1333
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MIN_SIZE_TRAIN:
+  - 480
+  - 512
+  - 544
+  - 576
+  - 608
+  - 640
+  - 672
+  - 704
+  - 736
+  - 768
+  - 800
+  MIN_SIZE_TRAIN_SAMPLING: choice
+  RANDOM_FLIP: horizontal
+MODEL:
+  ANCHOR_GENERATOR:
+    ANGLES:
+    - - -90
+      - 0
+      - 90
+    ASPECT_RATIOS:
+    - - 0.5
+      - 1.0
+      - 2.0
+    NAME: DefaultAnchorGenerator
+    OFFSET: 0.0
+    SIZES:
+    - - 32
+    - - 64
+    - - 128
+    - - 256
+    - - 512
+  BACKBONE:
+    FREEZE_AT: 2
+    NAME: build_vit_fpn_backbone
+  CONFIG_PATH: ''
+  DEVICE: cuda
+  FPN:
+    FUSE_TYPE: sum
+    IN_FEATURES:
+    - layer3
+    - layer5
+    - layer7
+    - layer11
+    NORM: ''
+    OUT_CHANNELS: 256
+  IMAGE_ONLY: true
+  KEYPOINT_ON: false
+  LOAD_PROPOSALS: false
+  MASK_ON: true
+  META_ARCHITECTURE: VLGeneralizedRCNN
+  PANOPTIC_FPN:
+    COMBINE:
+      ENABLED: true
+      INSTANCES_CONFIDENCE_THRESH: 0.5
+      OVERLAP_THRESH: 0.5
+      STUFF_AREA_LIMIT: 4096
+    INSTANCE_LOSS_WEIGHT: 1.0
+  PIXEL_MEAN:
+  - 127.5
+  - 127.5
+  - 127.5
+  PIXEL_STD:
+  - 127.5
+  - 127.5
+  - 127.5
+  PROPOSAL_GENERATOR:
+    MIN_SIZE: 0
+    NAME: RPN
+  RESNETS:
+    DEFORM_MODULATED: false
+    DEFORM_NUM_GROUPS: 1
+    DEFORM_ON_PER_STAGE:
+    - false
+    - false
+    - false
+    - false
+    DEPTH: 50
+    NORM: FrozenBN
+    NUM_GROUPS: 1
+    OUT_FEATURES:
+    - res4
+    RES2_OUT_CHANNELS: 256
+    RES5_DILATION: 1
+    STEM_OUT_CHANNELS: 64
+    STRIDE_IN_1X1: true
+    WIDTH_PER_GROUP: 64
+  RETINANET:
+    BBOX_REG_LOSS_TYPE: smooth_l1
+    BBOX_REG_WEIGHTS:
+    - 1.0
+    - 1.0
+    - 1.0
+    - 1.0
+    FOCAL_LOSS_ALPHA: 0.25
+    FOCAL_LOSS_GAMMA: 2.0
+    IN_FEATURES:
+    - p3
+    - p4
+    - p5
+    - p6
+    - p7
+    IOU_LABELS:
+    - 0
+    - -1
+    - 1
+    IOU_THRESHOLDS:
+    - 0.4
+    - 0.5
+    NMS_THRESH_TEST: 0.5
+    NORM: ''
+    NUM_CLASSES: 10
+    NUM_CONVS: 4
+    PRIOR_PROB: 0.01
+    SCORE_THRESH_TEST: 0.05
+    SMOOTH_L1_LOSS_BETA: 0.1
+    TOPK_CANDIDATES_TEST: 1000
+  ROI_BOX_CASCADE_HEAD:
+    BBOX_REG_WEIGHTS:
+    - - 10.0
+      - 10.0
+      - 5.0
+      - 5.0
+    - - 20.0
+      - 20.0
+      - 10.0
+      - 10.0
+    - - 30.0
+      - 30.0
+      - 15.0
+      - 15.0
+    IOUS:
+    - 0.5
+    - 0.6
+    - 0.7
+  ROI_BOX_HEAD:
+    BBOX_REG_LOSS_TYPE: smooth_l1
+    BBOX_REG_LOSS_WEIGHT: 1.0
+    BBOX_REG_WEIGHTS:
+    - 10.0
+    - 10.0
+    - 5.0
+    - 5.0
+    CLS_AGNOSTIC_BBOX_REG: true
+    CONV_DIM: 256
+    FC_DIM: 1024
+    NAME: FastRCNNConvFCHead
+    NORM: ''
+    NUM_CONV: 0
+    NUM_FC: 2
+    POOLER_RESOLUTION: 7
+    POOLER_SAMPLING_RATIO: 0
+    POOLER_TYPE: ROIAlignV2
+    SMOOTH_L1_BETA: 0.0
+    TRAIN_ON_PRED_BOXES: false
+  ROI_HEADS:
+    BATCH_SIZE_PER_IMAGE: 512
+    IN_FEATURES:
+    - p2
+    - p3
+    - p4
+    - p5
+    IOU_LABELS:
+    - 0
+    - 1
+    IOU_THRESHOLDS:
+    - 0.5
+    NAME: CascadeROIHeads
+    NMS_THRESH_TEST: 0.5
+    NUM_CLASSES: 10
+    POSITIVE_FRACTION: 0.25
+    PROPOSAL_APPEND_GT: true
+    SCORE_THRESH_TEST: 0.05
+  ROI_KEYPOINT_HEAD:
+    CONV_DIMS:
+    - 512
+    - 512
+    - 512
+    - 512
+    - 512
+    - 512
+    - 512
+    - 512
+    LOSS_WEIGHT: 1.0
+    MIN_KEYPOINTS_PER_IMAGE: 1
+    NAME: KRCNNConvDeconvUpsampleHead
+    NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true
+    NUM_KEYPOINTS: 17
+    POOLER_RESOLUTION: 14
+    POOLER_SAMPLING_RATIO: 0
+    POOLER_TYPE: ROIAlignV2
+  ROI_MASK_HEAD:
+    CLS_AGNOSTIC_MASK: false
+    CONV_DIM: 256
+    NAME: MaskRCNNConvUpsampleHead
+    NORM: ''
+    NUM_CONV: 4
+    POOLER_RESOLUTION: 14
+    POOLER_SAMPLING_RATIO: 0
+    POOLER_TYPE: ROIAlignV2
+  RPN:
+    BATCH_SIZE_PER_IMAGE: 256
+    BBOX_REG_LOSS_TYPE: smooth_l1
+    BBOX_REG_LOSS_WEIGHT: 1.0
+    BBOX_REG_WEIGHTS:
+    - 1.0
+    - 1.0
+    - 1.0
+    - 1.0
+    BOUNDARY_THRESH: -1
+    CONV_DIMS:
+    - -1
+    HEAD_NAME: StandardRPNHead
+    IN_FEATURES:
+    - p2
+    - p3
+    - p4
+    - p5
+    - p6
+    IOU_LABELS:
+    - 0
+    - -1
+    - 1
+    IOU_THRESHOLDS:
+    - 0.3
+    - 0.7
+    LOSS_WEIGHT: 1.0
+    NMS_THRESH: 0.7
+    POSITIVE_FRACTION: 0.5
+    POST_NMS_TOPK_TEST: 1000
+    POST_NMS_TOPK_TRAIN: 2000
+    PRE_NMS_TOPK_TEST: 1000
+    PRE_NMS_TOPK_TRAIN: 2000
+    SMOOTH_L1_BETA: 0.0
+  SEM_SEG_HEAD:
+    COMMON_STRIDE: 4
+    CONVS_DIM: 128
+    IGNORE_VALUE: 255
+    IN_FEATURES:
+    - p2
+    - p3
+    - p4
+    - p5
+    LOSS_WEIGHT: 1.0
+    NAME: SemSegFPNHead
+    NORM: GN
+    NUM_CLASSES: 10
+  VIT:
+    DROP_PATH: 0.1
+    IMG_SIZE:
+    - 224
+    - 224
+    NAME: layoutlmv3_base
+    OUT_FEATURES:
+    - layer3
+    - layer5
+    - layer7
+    - layer11
+    POS_TYPE: abs
+  WEIGHTS: 
+OUTPUT_DIR: 
+SCIHUB_DATA_DIR_TRAIN: /mnt/petrelfs/share_data/zhaozhiyuan/publaynet/layout_scihub/train
+SEED: 42
+SOLVER:
+  AMP:
+    ENABLED: true
+  BACKBONE_MULTIPLIER: 1.0
+  BASE_LR: 0.0002
+  BIAS_LR_FACTOR: 1.0
+  CHECKPOINT_PERIOD: 2000
+  CLIP_GRADIENTS:
+    CLIP_TYPE: full_model
+    CLIP_VALUE: 1.0
+    ENABLED: true
+    NORM_TYPE: 2.0
+  GAMMA: 0.1
+  GRADIENT_ACCUMULATION_STEPS: 1
+  IMS_PER_BATCH: 32
+  LR_SCHEDULER_NAME: WarmupCosineLR
+  MAX_ITER: 20000
+  MOMENTUM: 0.9
+  NESTEROV: false
+  OPTIMIZER: ADAMW
+  REFERENCE_WORLD_SIZE: 0
+  STEPS:
+  - 10000
+  WARMUP_FACTOR: 0.01
+  WARMUP_ITERS: 333
+  WARMUP_METHOD: linear
+  WEIGHT_DECAY: 0.05
+  WEIGHT_DECAY_BIAS: null
+  WEIGHT_DECAY_NORM: 0.0
+TEST:
+  AUG:
+    ENABLED: false
+    FLIP: true
+    MAX_SIZE: 4000
+    MIN_SIZES:
+    - 400
+    - 500
+    - 600
+    - 700
+    - 800
+    - 900
+    - 1000
+    - 1100
+    - 1200
+  DETECTIONS_PER_IMAGE: 100
+  EVAL_PERIOD: 1000
+  EXPECTED_RESULTS: []
+  KEYPOINT_OKS_SIGMAS: []
+  PRECISE_BN:
+    ENABLED: false
+    NUM_ITER: 200
+VERSION: 2
+VIS_PERIOD: 0
--- a/magic_pdf/resources/model_config/model_configs.yaml
+++ b/magic_pdf/resources/model_config/model_configs.yaml
+config:
+  device: cpu
+  layout: True
+  formula: True
+
+weights:
+  layout: Layout/model_final.pth
+  mfd: MFD/weights.pt
+  mfr: MFR/UniMERNet
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,8 +4,8 @@ click>=8.1.7
 PyMuPDF>=1.24.7
 loguru>=0.6.0
 numpy>=1.21.6
-fast-langdetect>=0.1.1
+fast-langdetect>=0.2.1
 wordninja>=2.0.0
 scikit-learn>=1.0.2
 pdfminer.six>=20231228
-# requirements.txt 须保证只引入必需的外部依赖,如有新依赖添加请联系项目管理员
\ No newline at end of file
+# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
--- a/setup.py
+++ b/setup.py
@@ -26,16 +26,20 @@ if __name__ == '__main__':
    setup(
        name="magic_pdf",  # 项目名
        version=__version__,  # 自动从tag中获取版本号
-        packages=find_packages(),  # 包含所有的包
+        packages=find_packages() + ["magic_pdf.resources"],  # 包含所有的包
+        package_data={
+            "magic_pdf.resources": ["**"],  # 包含magic_pdf.resources目录下的所有文件
+        },
        install_requires=parse_requirements('requirements.txt'),  # 项目依赖的第三方库
        extras_require={
            "gpu": ["paddleocr", "paddlepaddle-gpu"],
            "cpu": ["paddleocr", "paddlepaddle"],
+            "full-cpu": ["unimernet", "matplotlib", "ultralytics", "paddleocr", "paddlepaddle"],
        },
        description="A practical tool for converting PDF to Markdown",  # 简短描述
        long_description=long_description,  # 详细描述
        long_description_content_type="text/markdown",  # 如果README是Markdown格式
-        url="https://github.com/magicpdf/Magic-PDF",
+        url="https://github.com/opendatalab/MinerU",
        python_requires=">=3.9",  # 项目依赖的 Python 版本
        entry_points={
            "console_scripts": [

--- a/tests/test_cli/test_cli.py
+++ b/tests/test_cli/test_cli.py
@@ -19,32 +19,32 @@ class TestCli:
        #common.count_folders_and_check_contents(pdf_res_path)      
   

-    def test_pdf_specify_jsonl(self):
-        """
-        输入jsonl, 默认方式解析
-        """
-        cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py  json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972'" % (code_path)
-        logging.info(cmd)
-        common.check_shell(cmd)
-        #common.count_folders_and_check_contents(pdf_res_path)
+    # def test_pdf_specify_jsonl(self):
+    #     """
+    #     输入jsonl, 默认方式解析
+    #     """
+    #     cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py  json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972'" % (code_path)
+    #     logging.info(cmd)
+    #     common.check_shell(cmd)
+    #     #common.count_folders_and_check_contents(pdf_res_path)

-    def test_pdf_specify_jsonl_txt(self):
-        """
-        输入jsonl, txt方式解析  
-        """
-        cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py  json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method txt" % (code_path)
-        logging.info(cmd)
-        common.check_shell(cmd)
-        #common.count_folders_and_check_contents(pdf_res_path)
-    
-    def test_pdf_specify_jsonl_ocr(self):
-        """
-        输入jsonl, ocr方式解析
-        """
-        cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py  json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method ocr" % (code_path)
-        logging.info(cmd)
-        common.check_shell(cmd)
-        #common.count_folders_and_check_contents(pdf_res_path)
+    # def test_pdf_specify_jsonl_txt(self):
+    #     """
+    #     输入jsonl, txt方式解析
+    #     """
+    #     cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py  json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method txt" % (code_path)
+    #     logging.info(cmd)
+    #     common.check_shell(cmd)
+    #     #common.count_folders_and_check_contents(pdf_res_path)
+    #
+    # def test_pdf_specify_jsonl_ocr(self):
+    #     """
+    #     输入jsonl, ocr方式解析
+    #     """
+    #     cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py  json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method ocr" % (code_path)
+    #     logging.info(cmd)
+    #     common.check_shell(cmd)
+    #     #common.count_folders_and_check_contents(pdf_res_path)
 
 
 if __name__ == "__main__":

--- a/tools/benchmark.py
+++ b/tools/benchmark.py
-import zipfile
-import os
-import shutil
-import json
-import markdown_calculate
-code_path = os.environ.get('GITHUB_WORKSPACE')
-#数据集存放路径
-pdf_dev_path = "/share/quyuan/mineru/data/"
-#magicpdf最终结果
-pdf_res_path = "/share/quyuan/mineru/data/mineru"
-file_types = ["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
-def test_cli():
-    #magicpdf模型输出结果
-    magicpdf_path = os.path.join(pdf_dev_path, "output")
-    rm_cmd = "rm -rf %s" % (pdf_res_path)
-    os.system(rm_cmd)
-    os.makedirs(pdf_res_path)
-    cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py  pdf-command  --pdf {}' % (code_path, magicpdf_path)
-    os.system(cmd)
-    for root, dirs, files in os.walk(pdf_res_path):
-         for magic_file in files:
-            for file_type in file_types:
-                target_dir = os.path.join(pdf_dev_path, "ci", file_type, "magicpdf")
-                if magic_file.endswith(".md") and magic_file.startswith(file_type):
-                    source_file = os.path.join(root, magic_file)
-                    target_file = os.path.join(pdf_dev_path, "ci", file_type, "magicpdf", magic_file)
-                    if not os.path.exists(target_dir):
-                         os.makedirs(target_dir) 
-                    shutil.copy(source_file, target_file)   
-
-def calculate_score():
-    data_path = os.path.join(pdf_dev_path, "ci")
-    cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name annotations --download_dir %s" % (code_path, data_path)
-    os.system(cmd)
-    cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir %s" % (code_path, data_path)
-    os.system(cmd)
-    score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))
-    score.calculate_similarity_total("magicpdf", file_types, data_path)
-    res = score.summary_scores()
-    return res
-
-
-def extrat_zip(zip_file_path, extract_to_path):
-    if zipfile.is_zipfile(zip_file_path):
-        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
-            zip_ref.extractall(extract_to_path)
-        print(f'Files extracted to {extract_to_path}')
-    else:
-        print(f'{zip_file_path} is not a zip file')
-
-
-def ci_ben():
-    fr = open(os.path.join(pdf_dev_path, "ci", "result.json"), "r")
-    lines = fr.readlines()
-    last_line = lines[-1].strip()
-    last_score = json.loads(last_line)
-    print ("last_score:", last_score)
-    last_simscore = last_score["average_sim_score"]
-    last_editdistance = last_score["average_edit_distance"]
-    last_bleu = last_score["average_bleu_score"]
-    extrat_zip(os.path.join(pdf_dev_path, 'output.zip'), os.path.join(pdf_dev_path))
-    test_cli()
-    now_score = calculate_score()
-    print ("now_score:", now_score)
-    now_simscore = now_score["average_sim_score"]
-    now_editdistance = now_score["average_edit_distance"]
-    now_bleu = now_score["average_bleu_score"]
-    assert last_simscore <= now_simscore
-    assert last_editdistance <= now_editdistance
-    assert last_bleu <= now_bleu
-
-
-if __name__ == "__main__":
-    ci_ben()
--- a/tools/clean_photo.py
+++ b/tools/clean_photo.py
-import pypandoc
-import re  
-import htmltabletomd
-import os  
-import argparse
-import zipfile
-
-parser = argparse.ArgumentParser(description="get tool type")
-parser.add_argument(
-    "--tool_name",
-    type=str,
-    required=True,
-    help="input tool name",
-)
-parser.add_argument(
-    "--download_dir",
-    type=str,
-    required=True,
-    help="input download dir",
-)
-args = parser.parse_args()
-
-def clean_markdown_images(content):  
-    pattern = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE)  
-    cleaned_content = pattern.sub('', content)   
-    return cleaned_content
-   
-def clean_ocrmath_photo(content):
-    pattern = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)  
-    cleaned_content = pattern.sub('', content)   
-    return cleaned_content
-
-def convert_html_table_to_md(html_table):  
-    lines = html_table.strip().split('\n')  
-    md_table = ''  
-    if lines and '<tr>' in lines[0]:  
-        in_thead = True  
-        for line in lines:  
-            if '<th>' in line:  
-                cells = re.findall(r'<th>(.*?)</th>', line)  
-                md_table += '| ' + ' | '.join(cells) + ' |\n'  
-                in_thead = False  
-            elif '<td>' in line and not in_thead:  
-                cells = re.findall(r'<td>(.*?)</td>', line)  
-                md_table += '| ' + ' | '.join(cells) + ' |\n'  
-        md_table = md_table.rstrip() + '\n'    
-    return md_table  
- 
-def convert_latext_to_md(content):  
-    tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)  
-    placeholders = []  
-    for table in tables:  
-        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"  
-        replace_str = f"\\begin{{tabular}}{table}cl\\end{{tabular}}"
-        content = content.replace(replace_str, placeholder)  
-        try:
-            pypandoc.convert_text(replace_str,  format="latex", to="md", outputfile="output.md", encoding="utf-8")
-        except:
-            markdown_string = replace_str
-        else: 
-            markdown_string = open('output.md', 'r', encoding='utf-8').read()
-        placeholders.append((placeholder, markdown_string)) 
-    new_content = content  
-    for placeholder, md_table in placeholders:  
-        new_content = new_content.replace(placeholder, md_table)  
-        # 写入文件  
-    return new_content
-
- 
-def convert_htmltale_to_md(content):  
-    tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)  
-    placeholders = []  
-    for table in tables:  
-        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"  
-        content = content.replace(f"<table>{table}</table>", placeholder)  
-        try:
-            convert_table = htmltabletomd.convert_table(table)
-        except:
-            convert_table = table
-        placeholders.append((placeholder,convert_table)) 
-    new_content = content  
-    for placeholder, md_table in placeholders:  
-        new_content = new_content.replace(placeholder, md_table)  
-        # 写入文件  
-    return new_content
-
-def clean_data(prod_type, download_dir):
-    file_type = ["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
-    for filetype in file_type:
-        tgt_dir = os.path.join(download_dir, filetype, prod_type, "cleaned")
-        if not os.path.exists(tgt_dir):  
-            os.makedirs(tgt_dir) 
-        source_dir = os.path.join(download_dir, filetype, prod_type)
-        filenames = os.listdir(source_dir)
-        for filename in filenames:
-            if filename.endswith('.md'):
-                input_file = os.path.join(source_dir, filename)
-                output_file = os.path.join(tgt_dir, "cleaned_" + filename)
-                with open(input_file, 'r', encoding='utf-8') as fr:
-                    content = fr.read()
-                    new_content = convert_htmltale_to_md(content)
-                    new_content = clean_markdown_images(new_content)
-                    new_content = clean_ocrmath_photo(new_content)
-                    new_content = convert_latext_to_md(new_content)
-                    with open(output_file, 'w', encoding='utf-8') as fw:
-                        fw.write(new_content)
-
-
-if __name__ == '__main__':
-    tool_type = args.tool_name
-    download_dir = args.download_dir
-    clean_data(tool_type, download_dir)
--- a/tools/markdown_calculate.py
+++ b/tools/markdown_calculate.py
-import os  
-from Levenshtein import distance  
-from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
-from nltk.tokenize import word_tokenize  
-import json 
-import re
-import scoring
-import argparse
-import nltk
-nltk.download('punkt')
-# 初始化列表来存储编辑距离和BLEU分数  
-class Scoring:
-    def __init__(self, result_path):
-        self.edit_distances = []
-        self.bleu_scores = []
-        self.sim_scores = []
-        self.filenames = []
-        self.score_dict = {}
-        self.anntion_cnt = 0
-        self.fw = open(result_path, "w+")
-    def simple_bleu_score(self, candidate, reference):  
-        candidate_tokens = word_tokenize(candidate)  
-        reference_tokens = word_tokenize(reference) 
-        return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1) 
-
-
-    def preprocess_string(self, s):  
-        sub_enter = re.sub(r'\n+', '\n', s)
-        return re.sub(r'  ', ' ', sub_enter)
-    
-    def calculate_similarity(self, annotion, actual, tool_type):
-        class_dict = {}
-        edit_distances = []
-        bleu_scores = []
-        sim_scores = list()
-        total_file = 0
-        for filename in os.listdir(annotion):  
-            if filename.endswith('.md') and not filename.startswith('.'):  # 忽略隐藏文件  
-                total_file = total_file + 1
-                # 读取A目录中的文件  
-                with open(os.path.join(annotion, filename), 'r', encoding='utf-8') as file_a:  
-                    content_a = file_a.read()
-                self.anntion_cnt = self.anntion_cnt + 1
-                filepath_b = os.path.join(actual, filename)  
-                if os.path.exists(filepath_b):  
-                    with open(filepath_b, 'r', encoding='utf-8') as file_b:  
-                        content_b = file_b.read()
-                        self.filenames.append(filename)
-                        # 计算编辑距离
-                        edit_dist = distance(self.preprocess_string(content_b),self.preprocess_string(content_a)) / max(len(content_a), len(content_b))
-                        self.edit_distances.append(edit_dist)  
-                        edit_distances.append(edit_dist)
-                        #计算BLUE分数
-                        bleu_score = self.simple_bleu_score(content_b, content_a)  
-                        bleu_scores.append(bleu_score)
-                        self.bleu_scores.append(bleu_score)  
-                        #计算marker分数
-                        score = scoring.score_text(content_b, content_a)
-                        sim_scores.append(score)
-                        self.sim_scores.append(score)
-                        class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
-                        self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
-                else:  
-                    print(f"File {filename} not found in actual directory.")  
-        # 计算每类平均值
-        class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0  
-        class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0  
-        class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
-        self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
-        ratio = len(class_dict)/total_file
-        self.fw.write(f"{tool_type} extract ratio:  {ratio}" + "\n")
-        self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
-        self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
-        self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
-
-        print (f"{tool_type} extract ratio: {ratio}")
-        print (f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
-        print (f"{tool_type} Average BLEU Score: {class_average_bleu_score}")
-        print (f"{tool_type} Average Sim Score: {class_average_sim_score}")
-        return self.score_dict
-    
-    def summary_scores(self):
-        """Average every metric accumulated so far, log the result, and return it.
-
-        Reads ``self.edit_distances``, ``self.bleu_scores`` and
-        ``self.sim_scores``; a metric whose list is empty is reported as 0.
-        The summary is also written to ``self.fw`` as one JSON line.
-
-        Returns:
-            dict: the keys ``average_edit_distance``, ``average_bleu_score``
-            and ``average_sim_score`` mapped to their overall means.
-        """
-        def _mean_or_zero(values):
-            # An empty list yields 0 instead of raising ZeroDivisionError.
-            return sum(values) / len(values) if values else 0
-
-        over_all_dict = {
-            "average_edit_distance": _mean_or_zero(self.edit_distances),
-            "average_bleu_score": _mean_or_zero(self.bleu_scores),
-            "average_sim_score": _mean_or_zero(self.sim_scores),
-        }
-        summary_line = json.dumps(over_all_dict, ensure_ascii=False)
-        self.fw.write(summary_line + "\n")
-        return over_all_dict
-
-    def calculate_similarity_total(self, tool_type, file_types, download_dir):
-        """Call ``calculate_similarity`` once for each entry of *file_types*.
-
-        For every file type the annotation directory is
-        ``<download_dir>/<file_type>/annotations/cleaned`` and the directory
-        compared against it is ``<download_dir>/<file_type>/<tool_type>/cleaned``.
-        The file type itself is passed as the third argument.
-
-        Args:
-            tool_type: name of the sub-directory holding the tool's output.
-            file_types: iterable of file-type directory names.
-            download_dir: root directory containing one folder per file type.
-        """
-        leaf = "cleaned"
-        for category in file_types:
-            reference_dir = os.path.join(download_dir, category, "annotations", leaf)
-            produced_dir = os.path.join(download_dir, category, tool_type, leaf)
-            self.calculate_similarity(reference_dir, produced_dir, category)
-
--- a/tools/scoring.py
+++ b/tools/scoring.py
-import math
-
-from rapidfuzz import fuzz
-import re
-import regex
-from statistics import mean
-
-# A chunk must be strictly longer than this many characters to be kept.
-CHUNK_MIN_CHARS = 25
-
-def chunk_text(text, chunk_len=500):
-    """Split *text* into consecutive pieces of at most ``chunk_len`` characters.
-
-    Pieces that are empty after stripping whitespace, or whose length does
-    not exceed ``CHUNK_MIN_CHARS``, are discarded.
-
-    Args:
-        text: the text to split.
-        chunk_len: size of each piece; the final piece may be shorter.
-
-    Returns:
-        list: the surviving pieces, in their original order.
-    """
-    kept = []
-    for start in range(0, len(text), chunk_len):
-        piece = text[start:start + chunk_len]
-        # Same predicate order as a "strip() and len() > min" filter.
-        if not piece.strip() or len(piece) <= CHUNK_MIN_CHARS:
-            continue
-        kept.append(piece)
-    return kept
-
-
-def overlap_score(hypothesis_chunks, reference_chunks):
-    """Score each hypothesis chunk against a window of nearby reference chunks.
-
-    For hypothesis chunk ``i`` the window is centred on
-    ``int(i * len(hypothesis_chunks) / len(reference_chunks))`` and extends
-    ``max(len(reference_chunks) // 5, 10)`` chunks before it (clamped at 0)
-    and up to, but not including, that many after it. The best
-    ``fuzz.ratio`` found in the window, scaled to 0-1, is kept.
-
-    Args:
-        hypothesis_chunks: chunks of the text being evaluated.
-        reference_chunks: chunks of the ground-truth text.
-
-    Returns:
-        list: one score per hypothesis chunk, in order. A chunk scores 0 when
-        its window is empty or no comparison yields a positive ratio
-        (rapidfuzz reports ratios below ``score_cutoff=30`` as 0).
-    """
-    ref_count = len(reference_chunks)
-    # With no reference chunks every window below is empty, so the modifier
-    # value is irrelevant; 0 just avoids a division by zero.
-    length_modifier = len(hypothesis_chunks) / ref_count if ref_count > 0 else 0
-    # NOTE(review): this scales a hypothesis index by hypothesis/reference;
-    # a proportional position in the reference list would use the inverse
-    # ratio. Left as-is because changing it alters scores - confirm intent.
-    search_distance = max(ref_count // 5, 10)
-    chunk_scores = []
-    for i, hyp_chunk in enumerate(hypothesis_chunks):
-        i_offset = int(i * length_modifier)
-        # The upper bound is always positive, so slicing clamps to the list
-        # end exactly like min(len(reference_chunks), ...) would.
-        window = reference_chunks[max(0, i_offset - search_distance):i_offset + search_distance]
-        max_score = 0
-        for ref_chunk in window:
-            score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
-            if score > max_score:
-                max_score = score
-        chunk_scores.append(max_score)
-    return chunk_scores
-
-
-def score_text(hypothesis, reference):
-    """Return a 0-1 alignment score between *hypothesis* and *reference*.
-
-    Both texts are split with ``chunk_text``; ``overlap_score`` then rates
-    every hypothesis chunk and the ratings are averaged.
-
-    Returns:
-        The mean chunk score, or 0 when there are no chunk scores to average.
-    """
-    per_chunk = overlap_score(chunk_text(hypothesis), chunk_text(reference))
-    if not per_chunk:
-        # mean() raises on empty input, so an empty result scores 0.
-        return 0
-    return mean(per_chunk)
\ No newline at end of file