"tools/docker/ubuntu_2204.dockerfile" did not exist on "9852aaef3b8b2df28a74ae32ecb12dcb85f30362"
Commit 4a823359 authored by quyuan's avatar quyuan
Browse files

Merge branch 'master' of https://github.com/opendatalab/MinerU

parents 611e2f59 b6df9b18
# Copyright (c) Facebook, Inc. and its affiliates.
import colorsys
import logging
import math
import numpy as np
from enum import Enum, unique
import cv2
import matplotlib as mpl
import matplotlib.colors as mplc
import matplotlib.figure as mplfigure
import pycocotools.mask as mask_util
import torch
from matplotlib.backends.backend_agg import FigureCanvasAgg
from PIL import Image
from detectron2.data import MetadataCatalog
from detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes
from detectron2.utils.file_io import PathManager
from detectron2.utils.colormap import random_color
import pdb
logger = logging.getLogger(__name__)
__all__ = ["ColorMode", "VisImage", "Visualizer"]
_SMALL_OBJECT_AREA_THRESH = 1000
_LARGE_MASK_AREA_THRESH = 120000
_OFF_WHITE = (1.0, 1.0, 240.0 / 255)
_BLACK = (0, 0, 0)
_RED = (1.0, 0, 0)
_KEYPOINT_THRESHOLD = 0.05
#CLASS_NAMES = ["footnote", "footer", "header"]
@unique
class ColorMode(Enum):
"""
Enum of different color modes to use for instance visualizations.
"""
IMAGE = 0
"""
Picks a random color for every instance and overlay segmentations with low opacity.
"""
SEGMENTATION = 1
"""
Let instances of the same category have similar colors
(from metadata.thing_colors), and overlay them with
high opacity. This provides more attention on the quality of segmentation.
"""
IMAGE_BW = 2
"""
Same as IMAGE, but convert all areas without masks to gray-scale.
Only available for drawing per-instance mask predictions.
"""
class GenericMask:
"""
Attribute:
polygons (list[ndarray]): list[ndarray]: polygons for this mask.
Each ndarray has format [x, y, x, y, ...]
mask (ndarray): a binary mask
"""
def __init__(self, mask_or_polygons, height, width):
self._mask = self._polygons = self._has_holes = None
self.height = height
self.width = width
m = mask_or_polygons
if isinstance(m, dict):
# RLEs
assert "counts" in m and "size" in m
if isinstance(m["counts"], list): # uncompressed RLEs
h, w = m["size"]
assert h == height and w == width
m = mask_util.frPyObjects(m, h, w)
self._mask = mask_util.decode(m)[:, :]
return
if isinstance(m, list): # list[ndarray]
self._polygons = [np.asarray(x).reshape(-1) for x in m]
return
if isinstance(m, np.ndarray): # assumed to be a binary mask
assert m.shape[1] != 2, m.shape
assert m.shape == (
height,
width,
), f"mask shape: {m.shape}, target dims: {height}, {width}"
self._mask = m.astype("uint8")
return
raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m)))
@property
def mask(self):
if self._mask is None:
self._mask = self.polygons_to_mask(self._polygons)
return self._mask
@property
def polygons(self):
if self._polygons is None:
self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
return self._polygons
@property
def has_holes(self):
if self._has_holes is None:
if self._mask is not None:
self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
else:
self._has_holes = False # if original format is polygon, does not have holes
return self._has_holes
def mask_to_polygons(self, mask):
# cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level
# hierarchy. External contours (boundary) of the object are placed in hierarchy-1.
# Internal contours (holes) are placed in hierarchy-2.
# cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours.
mask = np.ascontiguousarray(mask) # some versions of cv2 does not support incontiguous arr
res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
hierarchy = res[-1]
if hierarchy is None: # empty mask
return [], False
has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0
res = res[-2]
res = [x.flatten() for x in res]
# These coordinates from OpenCV are integers in range [0, W-1 or H-1].
# We add 0.5 to turn them into real-value coordinate space. A better solution
# would be to first +0.5 and then dilate the returned polygon by 0.5.
res = [x + 0.5 for x in res if len(x) >= 6]
return res, has_holes
def polygons_to_mask(self, polygons):
rle = mask_util.frPyObjects(polygons, self.height, self.width)
rle = mask_util.merge(rle)
return mask_util.decode(rle)[:, :]
def area(self):
return self.mask.sum()
def bbox(self):
p = mask_util.frPyObjects(self.polygons, self.height, self.width)
p = mask_util.merge(p)
bbox = mask_util.toBbox(p)
bbox[2] += bbox[0]
bbox[3] += bbox[1]
return bbox
class _PanopticPrediction:
"""
Unify different panoptic annotation/prediction formats
"""
def __init__(self, panoptic_seg, segments_info, metadata=None):
if segments_info is None:
assert metadata is not None
# If "segments_info" is None, we assume "panoptic_img" is a
# H*W int32 image storing the panoptic_id in the format of
# category_id * label_divisor + instance_id. We reserve -1 for
# VOID label.
label_divisor = metadata.label_divisor
segments_info = []
for panoptic_label in np.unique(panoptic_seg.numpy()):
if panoptic_label == -1:
# VOID region.
continue
pred_class = panoptic_label // label_divisor
isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values()
segments_info.append(
{
"id": int(panoptic_label),
"category_id": int(pred_class),
"isthing": bool(isthing),
}
)
del metadata
self._seg = panoptic_seg
self._sinfo = {s["id"]: s for s in segments_info} # seg id -> seg info
segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True)
areas = areas.numpy()
sorted_idxs = np.argsort(-areas)
self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs]
self._seg_ids = self._seg_ids.tolist()
for sid, area in zip(self._seg_ids, self._seg_areas):
if sid in self._sinfo:
self._sinfo[sid]["area"] = float(area)
def non_empty_mask(self):
"""
Returns:
(H, W) array, a mask for all pixels that have a prediction
"""
empty_ids = []
for id in self._seg_ids:
if id not in self._sinfo:
empty_ids.append(id)
if len(empty_ids) == 0:
return np.zeros(self._seg.shape, dtype=np.uint8)
assert (
len(empty_ids) == 1
), ">1 ids corresponds to no labels. This is currently not supported"
return (self._seg != empty_ids[0]).numpy().astype(np.bool)
def semantic_masks(self):
for sid in self._seg_ids:
sinfo = self._sinfo.get(sid)
if sinfo is None or sinfo["isthing"]:
# Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions.
continue
yield (self._seg == sid).numpy().astype(np.bool), sinfo
def instance_masks(self):
for sid in self._seg_ids:
sinfo = self._sinfo.get(sid)
if sinfo is None or not sinfo["isthing"]:
continue
mask = (self._seg == sid).numpy().astype(np.bool)
if mask.sum() > 0:
yield mask, sinfo
def _create_text_labels(classes, scores, class_names, is_crowd=None):
"""
Args:
classes (list[int] or None):
scores (list[float] or None):
class_names (list[str] or None):
is_crowd (list[bool] or None):
Returns:
list[str] or None
"""
#class_names = CLASS_NAMES
labels = None
if classes is not None:
if class_names is not None and len(class_names) > 0:
labels = [class_names[i] for i in classes]
else:
labels = [str(i) for i in classes]
if scores is not None:
if labels is None:
labels = ["{:.0f}%".format(s * 100) for s in scores]
else:
labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
if labels is not None and is_crowd is not None:
labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
return labels
class VisImage:
def __init__(self, img, scale=1.0):
"""
Args:
img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
scale (float): scale the input image
"""
self.img = img
self.scale = scale
self.width, self.height = img.shape[1], img.shape[0]
self._setup_figure(img)
def _setup_figure(self, img):
"""
Args:
Same as in :meth:`__init__()`.
Returns:
fig (matplotlib.pyplot.figure): top level container for all the image plot elements.
ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system.
"""
fig = mplfigure.Figure(frameon=False)
self.dpi = fig.get_dpi()
# add a small 1e-2 to avoid precision lost due to matplotlib's truncation
# (https://github.com/matplotlib/matplotlib/issues/15363)
fig.set_size_inches(
(self.width * self.scale + 1e-2) / self.dpi,
(self.height * self.scale + 1e-2) / self.dpi,
)
self.canvas = FigureCanvasAgg(fig)
# self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
ax.axis("off")
self.fig = fig
self.ax = ax
self.reset_image(img)
def reset_image(self, img):
"""
Args:
img: same as in __init__
"""
img = img.astype("uint8")
self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")
def save(self, filepath):
"""
Args:
filepath (str): a string that contains the absolute path, including the file name, where
the visualized image will be saved.
"""
self.fig.savefig(filepath)
def get_image(self):
"""
Returns:
ndarray:
the visualized image of shape (H, W, 3) (RGB) in uint8 type.
The shape is scaled w.r.t the input image using the given `scale` argument.
"""
canvas = self.canvas
s, (width, height) = canvas.print_to_buffer()
# buf = io.BytesIO() # works for cairo backend
# canvas.print_rgba(buf)
# width, height = self.width, self.height
# s = buf.getvalue()
buffer = np.frombuffer(s, dtype="uint8")
img_rgba = buffer.reshape(height, width, 4)
rgb, alpha = np.split(img_rgba, [3], axis=2)
return rgb.astype("uint8")
class Visualizer:
"""
Visualizer that draws data about detection/segmentation on images.
It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}`
that draw primitive objects to images, as well as high-level wrappers like
`draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}`
that draw composite data in some pre-defined style.
Note that the exact visualization style for the high-level wrappers are subject to change.
Style such as color, opacity, label contents, visibility of labels, or even the visibility
of objects themselves (e.g. when the object is too small) may change according
to different heuristics, as long as the results still look visually reasonable.
To obtain a consistent style, you can implement custom drawing functions with the
abovementioned primitive methods instead. If you need more customized visualization
styles, you can process the data yourself following their format documented in
tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not
intend to satisfy everyone's preference on drawing styles.
This visualizer focuses on high rendering quality rather than performance. It is not
designed to be used for real-time applications.
"""
# TODO implement a fast, rasterized version using OpenCV
def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE):
"""
Args:
img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
the height and width of the image respectively. C is the number of
color channels. The image is required to be in RGB format since that
is a requirement of the Matplotlib library. The image is also expected
to be in the range [0, 255].
metadata (Metadata): dataset metadata (e.g. class names and colors)
instance_mode (ColorMode): defines one of the pre-defined style for drawing
instances on an image.
"""
self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
if metadata is None:
metadata = MetadataCatalog.get("__nonexist__")
self.metadata = metadata
self.output = VisImage(self.img, scale=scale)
self.cpu_device = torch.device("cpu")
# too small texts are useless, therefore clamp to 9
self._default_font_size = max(
np.sqrt(self.output.height * self.output.width) // 90, 10 // scale
)
self._instance_mode = instance_mode
self.keypoint_threshold = _KEYPOINT_THRESHOLD
def draw_instance_predictions(self, predictions):
"""
Draw instance-level prediction results on an image.
Args:
predictions (Instances): the output of an instance detection/segmentation
model. Following fields will be used to draw:
"pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
Returns:
output (VisImage): image object with visualizations.
"""
boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
scores = predictions.scores if predictions.has("scores") else None
classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None
labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None
if predictions.has("pred_masks"):
masks = np.asarray(predictions.pred_masks)
masks = [GenericMask(x, self.output.height, self.output.width) for x in masks]
else:
masks = None
if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
colors = [
self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes
]
alpha = 0.8
else:
colors = None
alpha = 0.5
if self._instance_mode == ColorMode.IMAGE_BW:
self.output.reset_image(
self._create_grayscale_image(
(predictions.pred_masks.any(dim=0) > 0).numpy()
if predictions.has("pred_masks")
else None
)
)
alpha = 0.3
self.overlay_instances(
masks=masks,
boxes=boxes,
labels=labels,
keypoints=keypoints,
assigned_colors=colors,
alpha=alpha,
)
return self.output
def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8):
"""
Draw semantic segmentation predictions/labels.
Args:
sem_seg (Tensor or ndarray): the segmentation of shape (H, W).
Each value is the integer label of the pixel.
area_threshold (int): segments with less than `area_threshold` are not drawn.
alpha (float): the larger it is, the more opaque the segmentations are.
Returns:
output (VisImage): image object with visualizations.
"""
if isinstance(sem_seg, torch.Tensor):
sem_seg = sem_seg.numpy()
labels, areas = np.unique(sem_seg, return_counts=True)
sorted_idxs = np.argsort(-areas).tolist()
labels = labels[sorted_idxs]
for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels):
try:
mask_color = [x / 255 for x in self.metadata.stuff_colors[label]]
except (AttributeError, IndexError):
mask_color = None
binary_mask = (sem_seg == label).astype(np.uint8)
text = self.metadata.stuff_classes[label]
self.draw_binary_mask(
binary_mask,
color=mask_color,
edge_color=_OFF_WHITE,
text=text,
alpha=alpha,
area_threshold=area_threshold,
)
return self.output
def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7):
"""
Draw panoptic prediction annotations or results.
Args:
panoptic_seg (Tensor): of shape (height, width) where the values are ids for each
segment.
segments_info (list[dict] or None): Describe each segment in `panoptic_seg`.
If it is a ``list[dict]``, each dict contains keys "id", "category_id".
If None, category id of each pixel is computed by
``pixel // metadata.label_divisor``.
area_threshold (int): stuff segments with less than `area_threshold` are not drawn.
Returns:
output (VisImage): image object with visualizations.
"""
pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata)
if self._instance_mode == ColorMode.IMAGE_BW:
self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask()))
# draw mask for all semantic segments first i.e. "stuff"
for mask, sinfo in pred.semantic_masks():
category_idx = sinfo["category_id"]
try:
mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
except AttributeError:
mask_color = None
text = self.metadata.stuff_classes[category_idx]
self.draw_binary_mask(
mask,
color=mask_color,
edge_color=_OFF_WHITE,
text=text,
alpha=alpha,
area_threshold=area_threshold,
)
# draw mask for all instances second
all_instances = list(pred.instance_masks())
if len(all_instances) == 0:
return self.output
masks, sinfo = list(zip(*all_instances))
category_ids = [x["category_id"] for x in sinfo]
try:
scores = [x["score"] for x in sinfo]
except KeyError:
scores = None
labels = _create_text_labels(
category_ids, scores, self.metadata.thing_classes, [x.get("iscrowd", 0) for x in sinfo]
)
try:
colors = [
self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in category_ids
]
except AttributeError:
colors = None
self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha)
return self.output
draw_panoptic_seg_predictions = draw_panoptic_seg # backward compatibility
def draw_dataset_dict(self, dic):
"""
Draw annotations/segmentaions in Detectron2 Dataset format.
Args:
dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
Returns:
output (VisImage): image object with visualizations.
"""
annos = dic.get("annotations", None)
if annos:
if "segmentation" in annos[0]:
masks = [x["segmentation"] for x in annos]
else:
masks = None
if "keypoints" in annos[0]:
keypts = [x["keypoints"] for x in annos]
keypts = np.array(keypts).reshape(len(annos), -1, 3)
else:
keypts = None
boxes = [
BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS)
if len(x["bbox"]) == 4
else x["bbox"]
for x in annos
]
colors = None
category_ids = [x["category_id"] for x in annos]
if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
colors = [
self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
for c in category_ids
]
names = self.metadata.get("thing_classes", None)
labels = _create_text_labels(
category_ids,
scores=None,
class_names=names,
is_crowd=[x.get("iscrowd", 0) for x in annos],
)
self.overlay_instances(
labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors
)
sem_seg = dic.get("sem_seg", None)
if sem_seg is None and "sem_seg_file_name" in dic:
with PathManager.open(dic["sem_seg_file_name"], "rb") as f:
sem_seg = Image.open(f)
sem_seg = np.asarray(sem_seg, dtype="uint8")
if sem_seg is not None:
self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5)
pan_seg = dic.get("pan_seg", None)
if pan_seg is None and "pan_seg_file_name" in dic:
with PathManager.open(dic["pan_seg_file_name"], "rb") as f:
pan_seg = Image.open(f)
pan_seg = np.asarray(pan_seg)
from panopticapi.utils import rgb2id
pan_seg = rgb2id(pan_seg)
if pan_seg is not None:
segments_info = dic["segments_info"]
pan_seg = torch.tensor(pan_seg)
self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.5)
return self.output
def overlay_instances(
self,
*,
boxes=None,
labels=None,
masks=None,
keypoints=None,
assigned_colors=None,
alpha=0.5,
):
"""
Args:
boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`,
or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
or a :class:`RotatedBoxes`,
or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
for the N objects in a single image,
labels (list[str]): the text to be displayed for each instance.
masks (masks-like object): Supported types are:
* :class:`detectron2.structures.PolygonMasks`,
:class:`detectron2.structures.BitMasks`.
* list[list[ndarray]]: contains the segmentation masks for all objects in one image.
The first level of the list corresponds to individual instances. The second
level to all the polygon that compose the instance, and the third level
to the polygon coordinates. The third level should have the format of
[x0, y0, x1, y1, ..., xn, yn] (n >= 3).
* list[ndarray]: each ndarray is a binary mask of shape (H, W).
* list[dict]: each dict is a COCO-style RLE.
keypoints (Keypoint or array like): an array-like object of shape (N, K, 3),
where the N is the number of instances and K is the number of keypoints.
The last dimension corresponds to (x, y, visibility or score).
assigned_colors (list[matplotlib.colors]): a list of colors, where each color
corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
for full list of formats that the colors are accepted in.
Returns:
output (VisImage): image object with visualizations.
"""
num_instances = 0
if boxes is not None:
boxes = self._convert_boxes(boxes)
num_instances = len(boxes)
if masks is not None:
masks = self._convert_masks(masks)
if num_instances:
assert len(masks) == num_instances
else:
num_instances = len(masks)
if keypoints is not None:
if num_instances:
assert len(keypoints) == num_instances
else:
num_instances = len(keypoints)
keypoints = self._convert_keypoints(keypoints)
if labels is not None:
assert len(labels) == num_instances
if assigned_colors is None:
assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
if num_instances == 0:
return self.output
if boxes is not None and boxes.shape[1] == 5:
return self.overlay_rotated_instances(
boxes=boxes, labels=labels, assigned_colors=assigned_colors
)
# Display in largest to smallest order to reduce occlusion.
areas = None
if boxes is not None:
areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
elif masks is not None:
areas = np.asarray([x.area() for x in masks])
if areas is not None:
sorted_idxs = np.argsort(-areas).tolist()
# Re-order overlapped instances in descending order.
boxes = boxes[sorted_idxs] if boxes is not None else None
labels = [labels[k] for k in sorted_idxs] if labels is not None else None
masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None
assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
keypoints = keypoints[sorted_idxs] if keypoints is not None else None
for i in range(num_instances):
color = assigned_colors[i]
if boxes is not None:
self.draw_box(boxes[i], edge_color=color)
if masks is not None:
for segment in masks[i].polygons:
self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha)
if labels is not None:
# first get a box
if boxes is not None:
x0, y0, x1, y1 = boxes[i]
text_pos = (x0, y0) # if drawing boxes, put text on the box corner.
horiz_align = "left"
elif masks is not None:
# skip small mask without polygon
if len(masks[i].polygons) == 0:
continue
x0, y0, x1, y1 = masks[i].bbox()
# draw text in the center (defined by median) when box is not drawn
# median is less sensitive to outliers.
text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1]
horiz_align = "center"
else:
continue # drawing the box confidence for keypoints isn't very useful.
# for small objects, draw text at the side to avoid occlusion
instance_area = (y1 - y0) * (x1 - x0)
if (
instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale
or y1 - y0 < 40 * self.output.scale
):
if y1 >= self.output.height - 5:
text_pos = (x1, y0)
else:
text_pos = (x0, y1)
height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width)
lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
font_size = (
np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
* 0.5
* self._default_font_size
)
self.draw_text(
labels[i],
text_pos,
color=lighter_color,
horizontal_alignment=horiz_align,
font_size=font_size,
)
# draw keypoints
if keypoints is not None:
for keypoints_per_instance in keypoints:
self.draw_and_connect_keypoints(keypoints_per_instance)
return self.output
def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None):
"""
Args:
boxes (ndarray): an Nx5 numpy array of
(x_center, y_center, width, height, angle_degrees) format
for the N objects in a single image.
labels (list[str]): the text to be displayed for each instance.
assigned_colors (list[matplotlib.colors]): a list of colors, where each color
corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
for full list of formats that the colors are accepted in.
Returns:
output (VisImage): image object with visualizations.
"""
num_instances = len(boxes)
if assigned_colors is None:
assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
if num_instances == 0:
return self.output
# Display in largest to smallest order to reduce occlusion.
if boxes is not None:
areas = boxes[:, 2] * boxes[:, 3]
sorted_idxs = np.argsort(-areas).tolist()
# Re-order overlapped instances in descending order.
boxes = boxes[sorted_idxs]
labels = [labels[k] for k in sorted_idxs] if labels is not None else None
colors = [assigned_colors[idx] for idx in sorted_idxs]
for i in range(num_instances):
self.draw_rotated_box_with_label(
boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None
)
return self.output
def draw_and_connect_keypoints(self, keypoints):
"""
Draws keypoints of an instance and follows the rules for keypoint connections
to draw lines between appropriate keypoints. This follows color heuristics for
line color.
Args:
keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints
and the last dimension corresponds to (x, y, probability).
Returns:
output (VisImage): image object with visualizations.
"""
visible = {}
keypoint_names = self.metadata.get("keypoint_names")
for idx, keypoint in enumerate(keypoints):
# draw keypoint
x, y, prob = keypoint
if prob > self.keypoint_threshold:
self.draw_circle((x, y), color=_RED)
if keypoint_names:
keypoint_name = keypoint_names[idx]
visible[keypoint_name] = (x, y)
if self.metadata.get("keypoint_connection_rules"):
for kp0, kp1, color in self.metadata.keypoint_connection_rules:
if kp0 in visible and kp1 in visible:
x0, y0 = visible[kp0]
x1, y1 = visible[kp1]
color = tuple(x / 255.0 for x in color)
self.draw_line([x0, x1], [y0, y1], color=color)
# draw lines from nose to mid-shoulder and mid-shoulder to mid-hip
# Note that this strategy is specific to person keypoints.
# For other keypoints, it should just do nothing
try:
ls_x, ls_y = visible["left_shoulder"]
rs_x, rs_y = visible["right_shoulder"]
mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2
except KeyError:
pass
else:
# draw line from nose to mid-shoulder
nose_x, nose_y = visible.get("nose", (None, None))
if nose_x is not None:
self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED)
try:
# draw line from mid-shoulder to mid-hip
lh_x, lh_y = visible["left_hip"]
rh_x, rh_y = visible["right_hip"]
except KeyError:
pass
else:
mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2
self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED)
return self.output
"""
Primitive drawing functions:
"""
def draw_text(
self,
text,
position,
*,
font_size=None,
color="g",
horizontal_alignment="center",
rotation=0,
):
"""
Args:
text (str): class label
position (tuple): a tuple of the x and y coordinates to place text on image.
font_size (int, optional): font of the text. If not provided, a font size
proportional to the image width is calculated and used.
color: color of the text. Refer to `matplotlib.colors` for full list
of formats that are accepted.
horizontal_alignment (str): see `matplotlib.text.Text`
rotation: rotation angle in degrees CCW
Returns:
output (VisImage): image object with text drawn.
"""
if not font_size:
font_size = self._default_font_size
# since the text background is dark, we don't want the text to be dark
color = np.maximum(list(mplc.to_rgb(color)), 0.2)
color[np.argmax(color)] = max(0.8, np.max(color))
x, y = position
self.output.ax.text(
x,
y,
text,
size=font_size * self.output.scale,
family="sans-serif",
bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
verticalalignment="top",
horizontalalignment=horizontal_alignment,
color=color,
zorder=10,
rotation=rotation,
)
return self.output
def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
"""
Args:
box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
are the coordinates of the image's top left corner. x1 and y1 are the
coordinates of the image's bottom right corner.
alpha (float): blending efficient. Smaller values lead to more transparent masks.
edge_color: color of the outline of the box. Refer to `matplotlib.colors`
for full list of formats that are accepted.
line_style (string): the string to use to create the outline of the boxes.
Returns:
output (VisImage): image object with box drawn.
"""
x0, y0, x1, y1 = box_coord
width = x1 - x0
height = y1 - y0
linewidth = max(self._default_font_size / 4, 1)
self.output.ax.add_patch(
mpl.patches.Rectangle(
(x0, y0),
width,
height,
fill=False,
edgecolor=edge_color,
linewidth=linewidth * self.output.scale,
alpha=alpha,
linestyle=line_style,
)
)
return self.output
def draw_rotated_box_with_label(
self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None
):
"""
Draw a rotated box with label on its top-left corner.
Args:
rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle),
where cnt_x and cnt_y are the center coordinates of the box.
w and h are the width and height of the box. angle represents how
many degrees the box is rotated CCW with regard to the 0-degree box.
alpha (float): blending efficient. Smaller values lead to more transparent masks.
edge_color: color of the outline of the box. Refer to `matplotlib.colors`
for full list of formats that are accepted.
line_style (string): the string to use to create the outline of the boxes.
label (string): label for rotated box. It will not be rendered when set to None.
Returns:
output (VisImage): image object with box drawn.
"""
cnt_x, cnt_y, w, h, angle = rotated_box
area = w * h
# use thinner lines when the box is small
linewidth = self._default_font_size / (
6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3
)
theta = angle * math.pi / 180.0
c = math.cos(theta)
s = math.sin(theta)
rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)]
# x: left->right ; y: top->down
rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect]
for k in range(4):
j = (k + 1) % 4
self.draw_line(
[rotated_rect[k][0], rotated_rect[j][0]],
[rotated_rect[k][1], rotated_rect[j][1]],
color=edge_color,
linestyle="--" if k == 1 else line_style,
linewidth=linewidth,
)
if label is not None:
text_pos = rotated_rect[1] # topleft corner
height_ratio = h / np.sqrt(self.output.height * self.output.width)
label_color = self._change_color_brightness(edge_color, brightness_factor=0.7)
font_size = (
np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size
)
self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle)
return self.output
def draw_circle(self, circle_coord, color, radius=3):
"""
Args:
circle_coord (list(int) or tuple(int)): contains the x and y coordinates
of the center of the circle.
color: color of the polygon. Refer to `matplotlib.colors` for a full list of
formats that are accepted.
radius (int): radius of the circle.
Returns:
output (VisImage): image object with box drawn.
"""
x, y = circle_coord
self.output.ax.add_patch(
mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color)
)
return self.output
def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None):
"""
Args:
x_data (list[int]): a list containing x values of all the points being drawn.
Length of list should match the length of y_data.
y_data (list[int]): a list containing y values of all the points being drawn.
Length of list should match the length of x_data.
color: color of the line. Refer to `matplotlib.colors` for a full list of
formats that are accepted.
linestyle: style of the line. Refer to `matplotlib.lines.Line2D`
for a full list of formats that are accepted.
linewidth (float or None): width of the line. When it's None,
a default value will be computed and used.
Returns:
output (VisImage): image object with line drawn.
"""
if linewidth is None:
linewidth = self._default_font_size / 3
linewidth = max(linewidth, 1)
self.output.ax.add_line(
mpl.lines.Line2D(
x_data,
y_data,
linewidth=linewidth * self.output.scale,
color=color,
linestyle=linestyle,
)
)
return self.output
def draw_binary_mask(
self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=0
):
"""
Args:
binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and
W is the image width. Each value in the array is either a 0 or 1 value of uint8
type.
color: color of the mask. Refer to `matplotlib.colors` for a full list of
formats that are accepted. If None, will pick a random color.
edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
full list of formats that are accepted.
text (str): if None, will be drawn in the object's center of mass.
alpha (float): blending efficient. Smaller values lead to more transparent masks.
area_threshold (float): a connected component small than this will not be shown.
Returns:
output (VisImage): image object with mask drawn.
"""
if color is None:
color = random_color(rgb=True, maximum=1)
color = mplc.to_rgb(color)
has_valid_segment = False
binary_mask = binary_mask.astype("uint8") # opencv needs uint8
mask = GenericMask(binary_mask, self.output.height, self.output.width)
shape2d = (binary_mask.shape[0], binary_mask.shape[1])
if not mask.has_holes:
# draw polygons for regular masks
for segment in mask.polygons:
area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1]))
if area < (area_threshold or 0):
continue
has_valid_segment = True
segment = segment.reshape(-1, 2)
self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha)
else:
# TODO: Use Path/PathPatch to draw vector graphics:
# https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon
rgba = np.zeros(shape2d + (4,), dtype="float32")
rgba[:, :, :3] = color
rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha
has_valid_segment = True
self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0))
if text is not None and has_valid_segment:
# TODO sometimes drawn on wrong objects. the heuristics here can improve.
lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
_num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8)
largest_component_id = np.argmax(stats[1:, -1]) + 1
# draw text on the largest component, as well as other very large components.
for cid in range(1, _num_cc):
if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH:
# median is more stable than centroid
# center = centroids[largest_component_id]
center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1]
self.draw_text(text, center, color=lighter_color)
return self.output
def draw_polygon(self, segment, color, edge_color=None, alpha=0.5):
"""
Args:
segment: numpy array of shape Nx2, containing all the points in the polygon.
color: color of the polygon. Refer to `matplotlib.colors` for a full list of
formats that are accepted.
edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
full list of formats that are accepted. If not provided, a darker shade
of the polygon color will be used instead.
alpha (float): blending efficient. Smaller values lead to more transparent masks.
Returns:
output (VisImage): image object with polygon drawn.
"""
if edge_color is None:
# make edge color darker than the polygon color
if alpha > 0.8:
edge_color = self._change_color_brightness(color, brightness_factor=-0.7)
else:
edge_color = color
edge_color = mplc.to_rgb(edge_color) + (1,)
polygon = mpl.patches.Polygon(
segment,
fill=True,
facecolor=mplc.to_rgb(color) + (alpha,),
edgecolor=edge_color,
linewidth=max(self._default_font_size // 15 * self.output.scale, 1),
)
self.output.ax.add_patch(polygon)
return self.output
"""
Internal methods:
"""
def _jitter(self, color):
"""
Randomly modifies given color to produce a slightly different color than the color given.
Args:
color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
picked. The values in the list are in the [0.0, 1.0] range.
Returns:
jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
color after being jittered. The values in the list are in the [0.0, 1.0] range.
"""
color = mplc.to_rgb(color)
vec = np.random.rand(3)
# better to do it in another color space
vec = vec / np.linalg.norm(vec) * 0.5
res = np.clip(vec + color, 0, 1)
return tuple(res)
def _create_grayscale_image(self, mask=None):
"""
Create a grayscale version of the original image.
The colors in masked area, if given, will be kept.
"""
img_bw = self.img.astype("f4").mean(axis=2)
img_bw = np.stack([img_bw] * 3, axis=2)
if mask is not None:
img_bw[mask] = self.img[mask]
return img_bw
def _change_color_brightness(self, color, brightness_factor):
"""
Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
less or more saturation than the original color.
Args:
color: color of the polygon. Refer to `matplotlib.colors` for a full list of
formats that are accepted.
brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
0 will correspond to no change, a factor in [-1.0, 0) range will result in
a darker color and a factor in (0, 1.0] range will result in a lighter color.
Returns:
modified_color (tuple[double]): a tuple containing the RGB values of the
modified color. Each value in the tuple is in the [0.0, 1.0] range.
"""
assert brightness_factor >= -1.0 and brightness_factor <= 1.0
color = mplc.to_rgb(color)
polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color))
modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1])
modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness
modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness
modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2])
return modified_color
def _convert_boxes(self, boxes):
"""
Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension.
"""
if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes):
return boxes.tensor.detach().numpy()
else:
return np.asarray(boxes)
def _convert_masks(self, masks_or_polygons):
"""
Convert different format of masks or polygons to a tuple of masks and polygons.
Returns:
list[GenericMask]:
"""
m = masks_or_polygons
if isinstance(m, PolygonMasks):
m = m.polygons
if isinstance(m, BitMasks):
m = m.tensor.numpy()
if isinstance(m, torch.Tensor):
m = m.numpy()
ret = []
for x in m:
if isinstance(x, GenericMask):
ret.append(x)
else:
ret.append(GenericMask(x, self.output.height, self.output.width))
return ret
def _convert_keypoints(self, keypoints):
if isinstance(keypoints, Keypoints):
keypoints = keypoints.tensor
keypoints = np.asarray(keypoints)
return keypoints
def get_output(self):
"""
Returns:
output (VisImage): the image output containing the visualizations added
to the image.
"""
return self.output
import re
def layout_rm_equation(layout_res):
rm_idxs = []
for idx, ele in enumerate(layout_res['layout_dets']):
if ele['category_id'] == 10:
rm_idxs.append(idx)
for idx in rm_idxs[::-1]:
del layout_res['layout_dets'][idx]
return layout_res
def get_croped_image(image_pil, bbox):
x_min, y_min, x_max, y_max = bbox
croped_img = image_pil.crop((x_min, y_min, x_max, y_max))
return croped_img
def latex_rm_whitespace(s: str):
"""Remove unnecessary whitespace from LaTeX code.
"""
text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})'
letter = '[a-zA-Z]'
noletter = '[\W_^\d]'
names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)]
s = re.sub(text_reg, lambda match: str(names.pop(0)), s)
news = s
while True:
s = news
news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s)
news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news)
news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news)
if news == s:
break
return s
\ No newline at end of file
import time
import copy
import base64
import cv2
import numpy as np
from io import BytesIO
from PIL import Image
from paddleocr import PaddleOCR
from paddleocr.ppocr.utils.logging import get_logger
from paddleocr.ppocr.utils.utility import check_and_read, alpha_to_color, binarize_img
from paddleocr.tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop
logger = get_logger()
def img_decode(content: bytes):
np_arr = np.frombuffer(content, dtype=np.uint8)
return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
def check_img(img):
if isinstance(img, bytes):
img = img_decode(img)
if isinstance(img, str):
image_file = img
img, flag_gif, flag_pdf = check_and_read(image_file)
if not flag_gif and not flag_pdf:
with open(image_file, 'rb') as f:
img_str = f.read()
img = img_decode(img_str)
if img is None:
try:
buf = BytesIO()
image = BytesIO(img_str)
im = Image.open(image)
rgb = im.convert('RGB')
rgb.save(buf, 'jpeg')
buf.seek(0)
image_bytes = buf.read()
data_base64 = str(base64.b64encode(image_bytes),
encoding="utf-8")
image_decode = base64.b64decode(data_base64)
img_array = np.frombuffer(image_decode, np.uint8)
img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
except:
logger.error("error in loading image:{}".format(image_file))
return None
if img is None:
logger.error("error in loading image:{}".format(image_file))
return None
if isinstance(img, np.ndarray) and len(img.shape) == 2:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
return img
def sorted_boxes(dt_boxes):
"""
Sort text boxes in order from top to bottom, left to right
args:
dt_boxes(array):detected text boxes with shape [4, 2]
return:
sorted boxes(array) with shape [4, 2]
"""
num_boxes = dt_boxes.shape[0]
sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
_boxes = list(sorted_boxes)
for i in range(num_boxes - 1):
for j in range(i, -1, -1):
if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \
(_boxes[j + 1][0][0] < _boxes[j][0][0]):
tmp = _boxes[j]
_boxes[j] = _boxes[j + 1]
_boxes[j + 1] = tmp
else:
break
return _boxes
def formula_in_text(mf_bbox, text_bbox):
x1, y1, x2, y2 = mf_bbox
x3, y3 = text_bbox[0]
x4, y4 = text_bbox[2]
left_box, right_box = None, None
same_line = abs((y1+y2)/2 - (y3+y4)/2) / abs(y4-y3) < 0.2
if not same_line:
return False, left_box, right_box
else:
drop_origin = False
left_x = x1 - 1
right_x = x2 + 1
if x3 < x1 and x2 < x4:
drop_origin = True
left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32')
right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32')
if x3 < x1 and x1 <= x4 <= x2:
drop_origin = True
left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32')
if x1 <= x3 <= x2 and x2 < x4:
drop_origin = True
right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32')
if x1 <= x3 < x4 <= x2:
drop_origin = True
return drop_origin, left_box, right_box
def update_det_boxes(dt_boxes, mfdetrec_res):
new_dt_boxes = dt_boxes
for mf_box in mfdetrec_res:
flag, left_box, right_box = False, None, None
for idx, text_box in enumerate(new_dt_boxes):
ret, left_box, right_box = formula_in_text(mf_box['bbox'], text_box)
if ret:
new_dt_boxes.pop(idx)
if left_box is not None:
new_dt_boxes.append(left_box)
if right_box is not None:
new_dt_boxes.append(right_box)
break
return new_dt_boxes
class ModifiedPaddleOCR(PaddleOCR):
def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, mfd_res=None, alpha_color=(255, 255, 255)):
"""
OCR with PaddleOCR
args:
img: img for OCR, support ndarray, img_path and list or ndarray
det: use text detection or not. If False, only rec will be exec. Default is True
rec: use text recognition or not. If False, only det will be exec. Default is True
cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
bin: binarize image to black and white. Default is False.
inv: invert image colors. Default is False.
alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white.
"""
assert isinstance(img, (np.ndarray, list, str, bytes))
if isinstance(img, list) and det == True:
logger.error('When input a list of images, det must be false')
exit(0)
if cls == True and self.use_angle_cls == False:
pass
# logger.warning(
# 'Since the angle classifier is not initialized, it will not be used during the forward process'
# )
img = check_img(img)
# for infer pdf file
if isinstance(img, list):
if self.page_num > len(img) or self.page_num == 0:
self.page_num = len(img)
imgs = img[:self.page_num]
else:
imgs = [img]
def preprocess_image(_image):
_image = alpha_to_color(_image, alpha_color)
if inv:
_image = cv2.bitwise_not(_image)
if bin:
_image = binarize_img(_image)
return _image
if det and rec:
ocr_res = []
for idx, img in enumerate(imgs):
img = preprocess_image(img)
dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
if not dt_boxes and not rec_res:
ocr_res.append(None)
continue
tmp_res = [[box.tolist(), res]
for box, res in zip(dt_boxes, rec_res)]
ocr_res.append(tmp_res)
return ocr_res
elif det and not rec:
ocr_res = []
for idx, img in enumerate(imgs):
img = preprocess_image(img)
dt_boxes, elapse = self.text_detector(img)
if not dt_boxes:
ocr_res.append(None)
continue
tmp_res = [box.tolist() for box in dt_boxes]
ocr_res.append(tmp_res)
return ocr_res
else:
ocr_res = []
cls_res = []
for idx, img in enumerate(imgs):
if not isinstance(img, list):
img = preprocess_image(img)
img = [img]
if self.use_angle_cls and cls:
img, cls_res_tmp, elapse = self.text_classifier(img)
if not rec:
cls_res.append(cls_res_tmp)
rec_res, elapse = self.text_recognizer(img)
ocr_res.append(rec_res)
if not rec:
return cls_res
return ocr_res
def __call__(self, img, cls=True, mfd_res=None):
time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
if img is None:
logger.debug("no valid image provided")
return None, None, time_dict
start = time.time()
ori_im = img.copy()
dt_boxes, elapse = self.text_detector(img)
time_dict['det'] = elapse
if dt_boxes is None:
logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
end = time.time()
time_dict['all'] = end - start
return None, None, time_dict
else:
logger.debug("dt_boxes num : {}, elapsed : {}".format(
len(dt_boxes), elapse))
img_crop_list = []
dt_boxes = sorted_boxes(dt_boxes)
if mfd_res:
bef = time.time()
dt_boxes = update_det_boxes(dt_boxes, mfd_res)
aft = time.time()
logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
len(dt_boxes), aft-bef))
for bno in range(len(dt_boxes)):
tmp_box = copy.deepcopy(dt_boxes[bno])
if self.args.det_box_type == "quad":
img_crop = get_rotate_crop_image(ori_im, tmp_box)
else:
img_crop = get_minarea_rect_crop(ori_im, tmp_box)
img_crop_list.append(img_crop)
if self.use_angle_cls and cls:
img_crop_list, angle_list, elapse = self.text_classifier(
img_crop_list)
time_dict['cls'] = elapse
logger.debug("cls num : {}, elapsed : {}".format(
len(img_crop_list), elapse))
rec_res, elapse = self.text_recognizer(img_crop_list)
time_dict['rec'] = elapse
logger.debug("rec_res num : {}, elapsed : {}".format(
len(rec_res), elapse))
if self.args.save_crop_res:
self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list,
rec_res)
filter_boxes, filter_rec_res = [], []
for box, rec_result in zip(dt_boxes, rec_res):
text, score = rec_result
if score >= self.drop_score:
filter_boxes.append(box)
filter_rec_res.append(rec_result)
end = time.time()
time_dict['all'] = end - start
return filter_boxes, filter_rec_res, time_dict
\ No newline at end of file
......@@ -22,6 +22,13 @@ class CustomPaddleModel:
self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
def __call__(self, img):
try:
import cv2
except ImportError:
logger.error("opencv-python not installed, please install by pip.")
exit(1)
# 将RGB图片转换为BGR格式适配paddle
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
result = self.model(img)
spans = []
for line in result:
......
......@@ -47,19 +47,13 @@ class AbsPipe(ABC):
"""
raise NotImplementedError
@abstractmethod
def pipe_mk_uni_format(self, img_parent_path, drop_mode):
"""
有状态的组装统一格式
"""
raise NotImplementedError
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return content_list
@abstractmethod
def pipe_mk_markdown(self, img_parent_path, drop_mode):
"""
有状态的组装markdown
"""
raise NotImplementedError
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode)
return md_content
@staticmethod
def classify(pdf_bytes: bytes) -> str:
......@@ -101,13 +95,13 @@ class AbsPipe(ABC):
return content_list
@staticmethod
def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
"""
根据pdf类型,markdown
"""
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
pdf_info_list = pdf_mid_data["pdf_info"]
md_content = union_make(pdf_info_list, MakeMode.MM_MD, drop_mode, img_buket_path)
md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
return md_content
from magic_pdf.libs.MakeContentConfig import DropMode
from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.pipe.AbsPipe import AbsPipe
......@@ -7,7 +9,7 @@ from magic_pdf.user_api import parse_ocr_pdf
class OCRPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
super().__init__(pdf_bytes, model_list, image_writer, is_debug)
def pipe_classify(self):
......@@ -20,9 +22,11 @@ class OCRPipe(AbsPipe):
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return content_list
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return md_content
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
logger.info("ocr_pipe mk content list finished")
return result
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
logger.info(f"ocr_pipe mk {md_make_mode} finished")
return result
from magic_pdf.libs.MakeContentConfig import DropMode
from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor
......@@ -8,7 +10,7 @@ from magic_pdf.user_api import parse_txt_pdf
class TXTPipe(AbsPipe):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool=False):
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
super().__init__(pdf_bytes, model_list, image_writer, is_debug)
def pipe_classify(self):
......@@ -21,9 +23,11 @@ class TXTPipe(AbsPipe):
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return content_list
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return md_content
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
logger.info("txt_pipe mk content list finished")
return result
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
logger.info(f"txt_pipe mk {md_make_mode} finished")
return result
......@@ -2,7 +2,7 @@ import json
from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
......@@ -39,12 +39,14 @@ class UNIPipe(AbsPipe):
is_debug=self.is_debug)
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return content_list
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
markdown_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
return markdown_content
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
logger.info("uni_pipe mk content list finished")
return result
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
logger.info(f"uni_pipe mk {md_make_mode} finished")
return result
if __name__ == '__main__':
......
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models
max_seq_len: 1024
length_aware: False
load_pretrained: True
pretrained: ./models/pytorch_model.bin
tokenizer_config:
path: ./models
datasets:
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
run:
runner: runner_iter
task: unimernet_train
batch_size_train: 64
batch_size_eval: 64
num_workers: 1
iters_per_inner_epoch: 2000
max_iters: 60000
seed: 42
output_dir: "../output/demo"
evaluate: True
test_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when train llm
generate_cfg:
temperature: 0.0
AUG:
DETR: true
CACHE_DIR: /mnt/localdata/users/yupanhuang/cache/huggingface
CUDNN_BENCHMARK: false
DATALOADER:
ASPECT_RATIO_GROUPING: true
FILTER_EMPTY_ANNOTATIONS: false
NUM_WORKERS: 4
REPEAT_THRESHOLD: 0.0
SAMPLER_TRAIN: TrainingSampler
DATASETS:
PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000
PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000
PROPOSAL_FILES_TEST: []
PROPOSAL_FILES_TRAIN: []
TEST:
- scihub_train
TRAIN:
- scihub_train
GLOBAL:
HACK: 1.0
ICDAR_DATA_DIR_TEST: ''
ICDAR_DATA_DIR_TRAIN: ''
INPUT:
CROP:
ENABLED: true
SIZE:
- 384
- 600
TYPE: absolute_range
FORMAT: RGB
MASK_FORMAT: polygon
MAX_SIZE_TEST: 1333
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MIN_SIZE_TRAIN:
- 480
- 512
- 544
- 576
- 608
- 640
- 672
- 704
- 736
- 768
- 800
MIN_SIZE_TRAIN_SAMPLING: choice
RANDOM_FLIP: horizontal
MODEL:
ANCHOR_GENERATOR:
ANGLES:
- - -90
- 0
- 90
ASPECT_RATIOS:
- - 0.5
- 1.0
- 2.0
NAME: DefaultAnchorGenerator
OFFSET: 0.0
SIZES:
- - 32
- - 64
- - 128
- - 256
- - 512
BACKBONE:
FREEZE_AT: 2
NAME: build_vit_fpn_backbone
CONFIG_PATH: ''
DEVICE: cuda
FPN:
FUSE_TYPE: sum
IN_FEATURES:
- layer3
- layer5
- layer7
- layer11
NORM: ''
OUT_CHANNELS: 256
IMAGE_ONLY: true
KEYPOINT_ON: false
LOAD_PROPOSALS: false
MASK_ON: true
META_ARCHITECTURE: VLGeneralizedRCNN
PANOPTIC_FPN:
COMBINE:
ENABLED: true
INSTANCES_CONFIDENCE_THRESH: 0.5
OVERLAP_THRESH: 0.5
STUFF_AREA_LIMIT: 4096
INSTANCE_LOSS_WEIGHT: 1.0
PIXEL_MEAN:
- 127.5
- 127.5
- 127.5
PIXEL_STD:
- 127.5
- 127.5
- 127.5
PROPOSAL_GENERATOR:
MIN_SIZE: 0
NAME: RPN
RESNETS:
DEFORM_MODULATED: false
DEFORM_NUM_GROUPS: 1
DEFORM_ON_PER_STAGE:
- false
- false
- false
- false
DEPTH: 50
NORM: FrozenBN
NUM_GROUPS: 1
OUT_FEATURES:
- res4
RES2_OUT_CHANNELS: 256
RES5_DILATION: 1
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: true
WIDTH_PER_GROUP: 64
RETINANET:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
FOCAL_LOSS_ALPHA: 0.25
FOCAL_LOSS_GAMMA: 2.0
IN_FEATURES:
- p3
- p4
- p5
- p6
- p7
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.4
- 0.5
NMS_THRESH_TEST: 0.5
NORM: ''
NUM_CLASSES: 10
NUM_CONVS: 4
PRIOR_PROB: 0.01
SCORE_THRESH_TEST: 0.05
SMOOTH_L1_LOSS_BETA: 0.1
TOPK_CANDIDATES_TEST: 1000
ROI_BOX_CASCADE_HEAD:
BBOX_REG_WEIGHTS:
- - 10.0
- 10.0
- 5.0
- 5.0
- - 20.0
- 20.0
- 10.0
- 10.0
- - 30.0
- 30.0
- 15.0
- 15.0
IOUS:
- 0.5
- 0.6
- 0.7
ROI_BOX_HEAD:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 10.0
- 10.0
- 5.0
- 5.0
CLS_AGNOSTIC_BBOX_REG: true
CONV_DIM: 256
FC_DIM: 1024
NAME: FastRCNNConvFCHead
NORM: ''
NUM_CONV: 0
NUM_FC: 2
POOLER_RESOLUTION: 7
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
SMOOTH_L1_BETA: 0.0
TRAIN_ON_PRED_BOXES: false
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 512
IN_FEATURES:
- p2
- p3
- p4
- p5
IOU_LABELS:
- 0
- 1
IOU_THRESHOLDS:
- 0.5
NAME: CascadeROIHeads
NMS_THRESH_TEST: 0.5
NUM_CLASSES: 10
POSITIVE_FRACTION: 0.25
PROPOSAL_APPEND_GT: true
SCORE_THRESH_TEST: 0.05
ROI_KEYPOINT_HEAD:
CONV_DIMS:
- 512
- 512
- 512
- 512
- 512
- 512
- 512
- 512
LOSS_WEIGHT: 1.0
MIN_KEYPOINTS_PER_IMAGE: 1
NAME: KRCNNConvDeconvUpsampleHead
NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true
NUM_KEYPOINTS: 17
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
ROI_MASK_HEAD:
CLS_AGNOSTIC_MASK: false
CONV_DIM: 256
NAME: MaskRCNNConvUpsampleHead
NORM: ''
NUM_CONV: 4
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
RPN:
BATCH_SIZE_PER_IMAGE: 256
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
BOUNDARY_THRESH: -1
CONV_DIMS:
- -1
HEAD_NAME: StandardRPNHead
IN_FEATURES:
- p2
- p3
- p4
- p5
- p6
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.3
- 0.7
LOSS_WEIGHT: 1.0
NMS_THRESH: 0.7
POSITIVE_FRACTION: 0.5
POST_NMS_TOPK_TEST: 1000
POST_NMS_TOPK_TRAIN: 2000
PRE_NMS_TOPK_TEST: 1000
PRE_NMS_TOPK_TRAIN: 2000
SMOOTH_L1_BETA: 0.0
SEM_SEG_HEAD:
COMMON_STRIDE: 4
CONVS_DIM: 128
IGNORE_VALUE: 255
IN_FEATURES:
- p2
- p3
- p4
- p5
LOSS_WEIGHT: 1.0
NAME: SemSegFPNHead
NORM: GN
NUM_CLASSES: 10
VIT:
DROP_PATH: 0.1
IMG_SIZE:
- 224
- 224
NAME: layoutlmv3_base
OUT_FEATURES:
- layer3
- layer5
- layer7
- layer11
POS_TYPE: abs
WEIGHTS:
OUTPUT_DIR:
SCIHUB_DATA_DIR_TRAIN: /mnt/petrelfs/share_data/zhaozhiyuan/publaynet/layout_scihub/train
SEED: 42
SOLVER:
AMP:
ENABLED: true
BACKBONE_MULTIPLIER: 1.0
BASE_LR: 0.0002
BIAS_LR_FACTOR: 1.0
CHECKPOINT_PERIOD: 2000
CLIP_GRADIENTS:
CLIP_TYPE: full_model
CLIP_VALUE: 1.0
ENABLED: true
NORM_TYPE: 2.0
GAMMA: 0.1
GRADIENT_ACCUMULATION_STEPS: 1
IMS_PER_BATCH: 32
LR_SCHEDULER_NAME: WarmupCosineLR
MAX_ITER: 20000
MOMENTUM: 0.9
NESTEROV: false
OPTIMIZER: ADAMW
REFERENCE_WORLD_SIZE: 0
STEPS:
- 10000
WARMUP_FACTOR: 0.01
WARMUP_ITERS: 333
WARMUP_METHOD: linear
WEIGHT_DECAY: 0.05
WEIGHT_DECAY_BIAS: null
WEIGHT_DECAY_NORM: 0.0
TEST:
AUG:
ENABLED: false
FLIP: true
MAX_SIZE: 4000
MIN_SIZES:
- 400
- 500
- 600
- 700
- 800
- 900
- 1000
- 1100
- 1200
DETECTIONS_PER_IMAGE: 100
EVAL_PERIOD: 1000
EXPECTED_RESULTS: []
KEYPOINT_OKS_SIGMAS: []
PRECISE_BN:
ENABLED: false
NUM_ITER: 200
VERSION: 2
VIS_PERIOD: 0
config:
device: cpu
layout: True
formula: True
weights:
layout: Layout/model_final.pth
mfd: MFD/weights.pt
mfr: MFR/UniMERNet
......@@ -4,8 +4,8 @@ click>=8.1.7
PyMuPDF>=1.24.7
loguru>=0.6.0
numpy>=1.21.6
fast-langdetect>=0.1.1
fast-langdetect>=0.2.1
wordninja>=2.0.0
scikit-learn>=1.0.2
pdfminer.six>=20231228
# requirements.txt 须保证只引入必需的外部依赖,如有新依赖添加请联系项目管理员
\ No newline at end of file
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
......@@ -26,16 +26,20 @@ if __name__ == '__main__':
setup(
name="magic_pdf", # 项目名
version=__version__, # 自动从tag中获取版本号
packages=find_packages(), # 包含所有的包
packages=find_packages() + ["magic_pdf.resources"], # 包含所有的包
package_data={
"magic_pdf.resources": ["**"], # 包含magic_pdf.resources目录下的所有文件
},
install_requires=parse_requirements('requirements.txt'), # 项目依赖的第三方库
extras_require={
"gpu": ["paddleocr", "paddlepaddle-gpu"],
"cpu": ["paddleocr", "paddlepaddle"],
"full-cpu": ["unimernet", "matplotlib", "ultralytics", "paddleocr", "paddlepaddle"],
},
description="A practical tool for converting PDF to Markdown", # 简短描述
long_description=long_description, # 详细描述
long_description_content_type="text/markdown", # 如果README是Markdown格式
url="https://github.com/magicpdf/Magic-PDF",
url="https://github.com/opendatalab/MinerU",
python_requires=">=3.9", # 项目依赖的 Python 版本
entry_points={
"console_scripts": [
......
......@@ -19,32 +19,32 @@ class TestCli:
#common.count_folders_and_check_contents(pdf_res_path)
def test_pdf_specify_jsonl(self):
"""
输入jsonl, 默认方式解析
"""
cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972'" % (code_path)
logging.info(cmd)
common.check_shell(cmd)
#common.count_folders_and_check_contents(pdf_res_path)
# def test_pdf_specify_jsonl(self):
# """
# 输入jsonl, 默认方式解析
# """
# cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972'" % (code_path)
# logging.info(cmd)
# common.check_shell(cmd)
# #common.count_folders_and_check_contents(pdf_res_path)
def test_pdf_specify_jsonl_txt(self):
"""
输入jsonl, txt方式解析
"""
cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method txt" % (code_path)
logging.info(cmd)
common.check_shell(cmd)
#common.count_folders_and_check_contents(pdf_res_path)
def test_pdf_specify_jsonl_ocr(self):
"""
输入jsonl, ocr方式解析
"""
cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method ocr" % (code_path)
logging.info(cmd)
common.check_shell(cmd)
#common.count_folders_and_check_contents(pdf_res_path)
# def test_pdf_specify_jsonl_txt(self):
# """
# 输入jsonl, txt方式解析
# """
# cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method txt" % (code_path)
# logging.info(cmd)
# common.check_shell(cmd)
# #common.count_folders_and_check_contents(pdf_res_path)
#
# def test_pdf_specify_jsonl_ocr(self):
# """
# 输入jsonl, ocr方式解析
# """
# cmd = "cd %s && export PYTHONPATH=. && python magic_pdf/cli/magicpdf.py json-command --json 's3://llm-process-pperf/ebook_index_textbook_40k/中高考&竞赛知识点/part-663f1ef5e7c1-009416.jsonl?bytes=0,1133972' --method ocr" % (code_path)
# logging.info(cmd)
# common.check_shell(cmd)
# #common.count_folders_and_check_contents(pdf_res_path)
if __name__ == "__main__":
......
import zipfile
import os
import shutil
import json
import markdown_calculate
code_path = os.environ.get('GITHUB_WORKSPACE')
#数据集存放路径
pdf_dev_path = "/share/quyuan/mineru/data/"
#magicpdf最终结果
pdf_res_path = "/share/quyuan/mineru/data/mineru"
file_types = ["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
def test_cli():
#magicpdf模型输出结果
magicpdf_path = os.path.join(pdf_dev_path, "output")
rm_cmd = "rm -rf %s" % (pdf_res_path)
os.system(rm_cmd)
os.makedirs(pdf_res_path)
cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py pdf-command --pdf {}' % (code_path, magicpdf_path)
os.system(cmd)
for root, dirs, files in os.walk(pdf_res_path):
for magic_file in files:
for file_type in file_types:
target_dir = os.path.join(pdf_dev_path, "ci", file_type, "magicpdf")
if magic_file.endswith(".md") and magic_file.startswith(file_type):
source_file = os.path.join(root, magic_file)
target_file = os.path.join(pdf_dev_path, "ci", file_type, "magicpdf", magic_file)
if not os.path.exists(target_dir):
os.makedirs(target_dir)
shutil.copy(source_file, target_file)
def calculate_score():
data_path = os.path.join(pdf_dev_path, "ci")
cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name annotations --download_dir %s" % (code_path, data_path)
os.system(cmd)
cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir %s" % (code_path, data_path)
os.system(cmd)
score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))
score.calculate_similarity_total("magicpdf", file_types, data_path)
res = score.summary_scores()
return res
def extrat_zip(zip_file_path, extract_to_path):
if zipfile.is_zipfile(zip_file_path):
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
zip_ref.extractall(extract_to_path)
print(f'Files extracted to {extract_to_path}')
else:
print(f'{zip_file_path} is not a zip file')
def ci_ben():
fr = open(os.path.join(pdf_dev_path, "ci", "result.json"), "r")
lines = fr.readlines()
last_line = lines[-1].strip()
last_score = json.loads(last_line)
print ("last_score:", last_score)
last_simscore = last_score["average_sim_score"]
last_editdistance = last_score["average_edit_distance"]
last_bleu = last_score["average_bleu_score"]
extrat_zip(os.path.join(pdf_dev_path, 'output.zip'), os.path.join(pdf_dev_path))
test_cli()
now_score = calculate_score()
print ("now_score:", now_score)
now_simscore = now_score["average_sim_score"]
now_editdistance = now_score["average_edit_distance"]
now_bleu = now_score["average_bleu_score"]
assert last_simscore <= now_simscore
assert last_editdistance <= now_editdistance
assert last_bleu <= now_bleu
if __name__ == "__main__":
ci_ben()
import pypandoc
import re
import htmltabletomd
import os
import argparse
import zipfile
parser = argparse.ArgumentParser(description="get tool type")
parser.add_argument(
"--tool_name",
type=str,
required=True,
help="input tool name",
)
parser.add_argument(
"--download_dir",
type=str,
required=True,
help="input download dir",
)
args = parser.parse_args()
def clean_markdown_images(content):
pattern = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE)
cleaned_content = pattern.sub('', content)
return cleaned_content
def clean_ocrmath_photo(content):
pattern = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)
cleaned_content = pattern.sub('', content)
return cleaned_content
def convert_html_table_to_md(html_table):
lines = html_table.strip().split('\n')
md_table = ''
if lines and '<tr>' in lines[0]:
in_thead = True
for line in lines:
if '<th>' in line:
cells = re.findall(r'<th>(.*?)</th>', line)
md_table += '| ' + ' | '.join(cells) + ' |\n'
in_thead = False
elif '<td>' in line and not in_thead:
cells = re.findall(r'<td>(.*?)</td>', line)
md_table += '| ' + ' | '.join(cells) + ' |\n'
md_table = md_table.rstrip() + '\n'
return md_table
def convert_latext_to_md(content):
tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)
placeholders = []
for table in tables:
placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
replace_str = f"\\begin{{tabular}}{table}cl\\end{{tabular}}"
content = content.replace(replace_str, placeholder)
try:
pypandoc.convert_text(replace_str, format="latex", to="md", outputfile="output.md", encoding="utf-8")
except:
markdown_string = replace_str
else:
markdown_string = open('output.md', 'r', encoding='utf-8').read()
placeholders.append((placeholder, markdown_string))
new_content = content
for placeholder, md_table in placeholders:
new_content = new_content.replace(placeholder, md_table)
# 写入文件
return new_content
def convert_htmltale_to_md(content):
tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)
placeholders = []
for table in tables:
placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"
content = content.replace(f"<table>{table}</table>", placeholder)
try:
convert_table = htmltabletomd.convert_table(table)
except:
convert_table = table
placeholders.append((placeholder,convert_table))
new_content = content
for placeholder, md_table in placeholders:
new_content = new_content.replace(placeholder, md_table)
# 写入文件
return new_content
def clean_data(prod_type, download_dir):
file_type = ["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
for filetype in file_type:
tgt_dir = os.path.join(download_dir, filetype, prod_type, "cleaned")
if not os.path.exists(tgt_dir):
os.makedirs(tgt_dir)
source_dir = os.path.join(download_dir, filetype, prod_type)
filenames = os.listdir(source_dir)
for filename in filenames:
if filename.endswith('.md'):
input_file = os.path.join(source_dir, filename)
output_file = os.path.join(tgt_dir, "cleaned_" + filename)
with open(input_file, 'r', encoding='utf-8') as fr:
content = fr.read()
new_content = convert_htmltale_to_md(content)
new_content = clean_markdown_images(new_content)
new_content = clean_ocrmath_photo(new_content)
new_content = convert_latext_to_md(new_content)
with open(output_file, 'w', encoding='utf-8') as fw:
fw.write(new_content)
if __name__ == '__main__':
tool_type = args.tool_name
download_dir = args.download_dir
clean_data(tool_type, download_dir)
import os
from Levenshtein import distance
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
from nltk.tokenize import word_tokenize
import json
import re
import scoring
import argparse
import nltk
nltk.download('punkt')
# 初始化列表来存储编辑距离和BLEU分数
class Scoring:
def __init__(self, result_path):
self.edit_distances = []
self.bleu_scores = []
self.sim_scores = []
self.filenames = []
self.score_dict = {}
self.anntion_cnt = 0
self.fw = open(result_path, "w+")
def simple_bleu_score(self, candidate, reference):
candidate_tokens = word_tokenize(candidate)
reference_tokens = word_tokenize(reference)
return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1)
def preprocess_string(self, s):
sub_enter = re.sub(r'\n+', '\n', s)
return re.sub(r' ', ' ', sub_enter)
def calculate_similarity(self, annotion, actual, tool_type):
class_dict = {}
edit_distances = []
bleu_scores = []
sim_scores = list()
total_file = 0
for filename in os.listdir(annotion):
if filename.endswith('.md') and not filename.startswith('.'): # 忽略隐藏文件
total_file = total_file + 1
# 读取A目录中的文件
with open(os.path.join(annotion, filename), 'r', encoding='utf-8') as file_a:
content_a = file_a.read()
self.anntion_cnt = self.anntion_cnt + 1
filepath_b = os.path.join(actual, filename)
if os.path.exists(filepath_b):
with open(filepath_b, 'r', encoding='utf-8') as file_b:
content_b = file_b.read()
self.filenames.append(filename)
# 计算编辑距离
edit_dist = distance(self.preprocess_string(content_b),self.preprocess_string(content_a)) / max(len(content_a), len(content_b))
self.edit_distances.append(edit_dist)
edit_distances.append(edit_dist)
#计算BLUE分数
bleu_score = self.simple_bleu_score(content_b, content_a)
bleu_scores.append(bleu_score)
self.bleu_scores.append(bleu_score)
#计算marker分数
score = scoring.score_text(content_b, content_a)
sim_scores.append(score)
self.sim_scores.append(score)
class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score}
else:
print(f"File {filename} not found in actual directory.")
# 计算每类平均值
class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0
class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0
self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n")
ratio = len(class_dict)/total_file
self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n")
self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n")
self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n")
self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n")
print (f"{tool_type} extract ratio: {ratio}")
print (f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}")
print (f"{tool_type} Average BLEU Score: {class_average_bleu_score}")
print (f"{tool_type} Average Sim Score: {class_average_sim_score}")
return self.score_dict
def summary_scores(self):
# 计算整体平均值
over_all_dict = dict()
average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0
average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0
average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0
over_all_dict["average_edit_distance"] = average_edit_distance
over_all_dict["average_bleu_score"] = average_bleu_score
over_all_dict["average_sim_score"] = average_sim_score
self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n")
return over_all_dict
def calculate_similarity_total(self, tool_type, file_types, download_dir):
for file_type in file_types:
annotion = os.path.join(download_dir, file_type, "annotations", "cleaned")
actual = os.path.join(download_dir, file_type, tool_type, "cleaned")
self.calculate_similarity(annotion, actual, file_type)
import math
from rapidfuzz import fuzz
import re
import regex
from statistics import mean
CHUNK_MIN_CHARS = 25
def chunk_text(text, chunk_len=500):
chunks = [text[i:i+chunk_len] for i in range(0, len(text), chunk_len)]
chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS]
return chunks
def overlap_score(hypothesis_chunks, reference_chunks):
if len(reference_chunks) > 0:
length_modifier = len(hypothesis_chunks) / len(reference_chunks)
else:
length_modifier = 0
search_distance = max(len(reference_chunks) // 5, 10)
chunk_scores = []
for i, hyp_chunk in enumerate(hypothesis_chunks):
max_score = 0
total_len = 0
i_offset = int(i * length_modifier)
chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance))
for j in chunk_range:
ref_chunk = reference_chunks[j]
score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
if score > max_score:
max_score = score
total_len = len(ref_chunk)
chunk_scores.append(max_score)
return chunk_scores
def score_text(hypothesis, reference):
# Returns a 0-1 alignment score
hypothesis_chunks = chunk_text(hypothesis)
reference_chunks = chunk_text(reference)
chunk_scores = overlap_score(hypothesis_chunks, reference_chunks)
if len(chunk_scores) > 0:
mean_score = mean(chunk_scores)
return mean_score
else:
return 0
#return mean(chunk_scores)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment