Unverified commit 2e30cc17, authored by Xiaomeng Zhao, committed by GitHub
Browse files

Merge pull request #2767 from opendatalab/dev-yolo-update

Dev yolo update
parents af7dee49 4243b0ea
...@@ -9,7 +9,7 @@ from ...utils.config_reader import get_formula_enable, get_table_enable ...@@ -9,7 +9,7 @@ from ...utils.config_reader import get_formula_enable, get_table_enable
from ...utils.model_utils import crop_img, get_res_list_from_layout_res from ...utils.model_utils import crop_img, get_res_list_from_layout_res
from ...utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_list, OcrConfidence from ...utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_list, OcrConfidence
YOLO_LAYOUT_BASE_BATCH_SIZE = 1 YOLO_LAYOUT_BASE_BATCH_SIZE = 8
MFD_BASE_BATCH_SIZE = 1 MFD_BASE_BATCH_SIZE = 1
MFR_BASE_BATCH_SIZE = 16 MFR_BASE_BATCH_SIZE = 16
......
from typing import List, Dict, Union
from doclayout_yolo import YOLOv10 from doclayout_yolo import YOLOv10
from tqdm import tqdm from tqdm import tqdm
import numpy as np
from PIL import Image
class DocLayoutYOLOModel(object): class DocLayoutYOLOModel:
def __init__(self, weight, device): def __init__(
self.model = YOLOv10(weight) self,
weight: str,
device: str = "cuda",
imgsz: int = 1280,
conf: float = 0.1,
iou: float = 0.45,
):
self.model = YOLOv10(weight).to(device)
self.device = device self.device = device
self.imgsz = imgsz
self.conf = conf
self.iou = iou
def predict(self, image): def _parse_prediction(self, prediction) -> List[Dict]:
layout_res = [] layout_res = []
doclayout_yolo_res = self.model.predict(
image, # 容错处理
imgsz=1280, if not hasattr(prediction, "boxes") or prediction.boxes is None:
conf=0.10, return layout_res
iou=0.45,
verbose=False, device=self.device for xyxy, conf, cls in zip(
)[0] prediction.boxes.xyxy.cpu(),
for xyxy, conf, cla in zip( prediction.boxes.conf.cpu(),
doclayout_yolo_res.boxes.xyxy.cpu(), prediction.boxes.cls.cpu(),
doclayout_yolo_res.boxes.conf.cpu(),
doclayout_yolo_res.boxes.cls.cpu(),
): ):
xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy] coords = list(map(int, xyxy.tolist()))
new_item = { xmin, ymin, xmax, ymax = coords
"category_id": int(cla.item()), layout_res.append({
"category_id": int(cls.item()),
"poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax], "poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
"score": round(float(conf.item()), 3), "score": round(float(conf.item()), 3),
} })
layout_res.append(new_item)
return layout_res return layout_res
def batch_predict(self, images: list, batch_size: int) -> list: def predict(self, image: Union[np.ndarray, Image.Image]) -> List[Dict]:
images_layout_res = [] prediction = self.model.predict(
# for index in range(0, len(images), batch_size): image,
for index in tqdm(range(0, len(images), batch_size), desc="Layout Predict"): imgsz=self.imgsz,
doclayout_yolo_res = [ conf=self.conf,
image_res.cpu() iou=self.iou,
for image_res in self.model.predict( verbose=False
images[index : index + batch_size], )[0]
imgsz=1280, return self._parse_prediction(prediction)
conf=0.10,
iou=0.45, def batch_predict(
self,
images: List[Union[np.ndarray, Image.Image]],
batch_size: int = 4
) -> List[List[Dict]]:
results = []
with tqdm(total=len(images), desc="Layout Predict") as pbar:
for idx in range(0, len(images), batch_size):
batch = images[idx: idx + batch_size]
predictions = self.model.predict(
batch,
imgsz=self.imgsz,
conf=self.conf,
iou=self.iou,
verbose=False, verbose=False,
device=self.device,
) )
] for pred in predictions:
for image_res in doclayout_yolo_res: results.append(self._parse_prediction(pred))
layout_res = [] pbar.update(len(batch))
for xyxy, conf, cla in zip( return results
image_res.boxes.xyxy, \ No newline at end of file
image_res.boxes.conf,
image_res.boxes.cls,
):
xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
new_item = {
"category_id": int(cla.item()),
"poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
"score": round(float(conf.item()), 3),
}
layout_res.append(new_item)
images_layout_res.append(layout_res)
return images_layout_res
from typing import List, Union
from tqdm import tqdm from tqdm import tqdm
from ultralytics import YOLO from ultralytics import YOLO
import numpy as np
from PIL import Image
class YOLOv8MFDModel(object): class YOLOv8MFDModel:
def __init__(self, weight, device="cpu"): def __init__(
self.mfd_model = YOLO(weight) self,
weight: str,
device: str = "cpu",
imgsz: int = 1888,
conf: float = 0.25,
iou: float = 0.45,
):
self.model = YOLO(weight).to(device)
self.device = device self.device = device
self.imgsz = imgsz
self.conf = conf
self.iou = iou
def predict(self, image): def _run_predict(
mfd_res = self.mfd_model.predict( self,
image, imgsz=1888, conf=0.25, iou=0.45, verbose=False, device=self.device inputs: Union[np.ndarray, Image.Image, List],
)[0] is_batch: bool = False
return mfd_res ) -> List:
preds = self.model.predict(
def batch_predict(self, images: list, batch_size: int) -> list: inputs,
images_mfd_res = [] imgsz=self.imgsz,
# for index in range(0, len(images), batch_size): conf=self.conf,
for index in tqdm(range(0, len(images), batch_size), desc="MFD Predict"): iou=self.iou,
mfd_res = [
image_res.cpu()
for image_res in self.mfd_model.predict(
images[index : index + batch_size],
imgsz=1888,
conf=0.25,
iou=0.45,
verbose=False, verbose=False,
device=self.device, device=self.device
) )
] return [pred.cpu() for pred in preds] if is_batch else preds[0].cpu()
for image_res in mfd_res:
images_mfd_res.append(image_res) def predict(self, image: Union[np.ndarray, Image.Image]):
return images_mfd_res return self._run_predict(image)
def batch_predict(
    self,
    images: List[Union[np.ndarray, Image.Image]],
    batch_size: int = 4
) -> List:
    """Run formula detection over ``images`` in slices of ``batch_size``.

    Each slice is handed to ``self._run_predict(..., is_batch=True)`` and
    whatever that call returns is appended, in order, to one flat list.
    A tqdm bar labelled "MFD Predict" advances by the number of images in
    each slice, so it counts images rather than batches.

    Args:
        images: Input images, processed in the order given.
        batch_size: Maximum number of images passed to the model per call.

    Returns:
        A flat list holding the per-slice results, concatenated in input
        order.
    """
    collected = []
    with tqdm(total=len(images), desc="MFD Predict") as progress:
        for start in range(0, len(images), batch_size):
            chunk = images[start: start + batch_size]
            collected.extend(self._run_predict(chunk, is_batch=True))
            # The final slice may be shorter than batch_size.
            progress.update(len(chunk))
    return collected
\ No newline at end of file
...@@ -15,7 +15,7 @@ def page_to_image( ...@@ -15,7 +15,7 @@ def page_to_image(
scale = dpi / 72 scale = dpi / 72
long_side_length = max(*page.get_size()) long_side_length = max(*page.get_size())
if long_side_length > max_width_or_height: if (long_side_length*scale) > max_width_or_height:
scale = max_width_or_height / long_side_length scale = max_width_or_height / long_side_length
bitmap: PdfBitmap = page.render(scale=scale) # type: ignore bitmap: PdfBitmap = page.render(scale=scale) # type: ignore
......
...@@ -343,6 +343,14 @@ ...@@ -343,6 +343,14 @@
"created_at": "2025-06-18T11:27:23Z", "created_at": "2025-06-18T11:27:23Z",
"repoId": 765083837, "repoId": 765083837,
"pullRequestNo": 2727 "pullRequestNo": 2727
},
{
"name": "QIN2DIM",
"id": 62018067,
"comment_id": 2992279796,
"created_at": "2025-06-20T17:04:59Z",
"repoId": 765083837,
"pullRequestNo": 2758
} }
] ]
} }
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment