Commit 0063a668 authored by chenzk

v1.0
from typing import List, Optional, Union, Tuple
import cv2
import numpy as np
from supervision.detection.core import Detections
from supervision.draw.color import Color, ColorPalette
class BoxAnnotator:
"""
A class for drawing bounding boxes on an image using detections provided.
Attributes:
color (Union[Color, ColorPalette]): The color to draw the bounding box,
can be a single color or a color palette
thickness (int): The thickness of the bounding box lines, default is 2
text_color (Color): The color of the text on the bounding box, default is white
text_scale (float): The scale of the text on the bounding box, default is 0.5
text_thickness (int): The thickness of the text on the bounding box,
default is 1
text_padding (int): The padding around the text on the bounding box,
default is 5
"""
def __init__(
self,
color: Union[Color, ColorPalette] = ColorPalette.DEFAULT,
        thickness: int = 3,  # 1 for seeclick, 2 for mind2web, 3 for demo
        text_color: Color = Color.BLACK,
        text_scale: float = 0.5,  # 0.8 for mobile/web, 0.3 for desktop, 0.4 for mind2web
        text_thickness: int = 2,  # 2 for demo, 1 otherwise
        text_padding: int = 10,
avoid_overlap: bool = True,
):
self.color: Union[Color, ColorPalette] = color
self.thickness: int = thickness
self.text_color: Color = text_color
self.text_scale: float = text_scale
self.text_thickness: int = text_thickness
self.text_padding: int = text_padding
self.avoid_overlap: bool = avoid_overlap
def annotate(
self,
scene: np.ndarray,
detections: Detections,
labels: Optional[List[str]] = None,
skip_label: bool = False,
image_size: Optional[Tuple[int, int]] = None,
) -> np.ndarray:
"""
Draws bounding boxes on the frame using the detections provided.
Args:
scene (np.ndarray): The image on which the bounding boxes will be drawn
detections (Detections): The detections for which the
bounding boxes will be drawn
            labels (Optional[List[str]]): An optional list of labels
                corresponding to each detection. If `labels` is not provided,
                the corresponding `class_id` is used as the label.
            skip_label (bool): If set to `True`, skips bounding box label annotation.
            image_size (Optional[Tuple[int, int]]): The (width, height) of the
                image; required to keep labels inside the frame when
                `avoid_overlap` is enabled.
Returns:
np.ndarray: The image with the bounding boxes drawn on it
Example:
```python
import supervision as sv
classes = ['person', ...]
image = ...
detections = sv.Detections(...)
box_annotator = sv.BoxAnnotator()
labels = [
f"{classes[class_id]} {confidence:0.2f}"
for _, _, confidence, class_id, _ in detections
]
annotated_frame = box_annotator.annotate(
scene=image.copy(),
detections=detections,
labels=labels
)
```
"""
font = cv2.FONT_HERSHEY_SIMPLEX
for i in range(len(detections)):
x1, y1, x2, y2 = detections.xyxy[i].astype(int)
class_id = (
detections.class_id[i] if detections.class_id is not None else None
)
idx = class_id if class_id is not None else i
color = (
self.color.by_idx(idx)
if isinstance(self.color, ColorPalette)
else self.color
)
cv2.rectangle(
img=scene,
pt1=(x1, y1),
pt2=(x2, y2),
color=color.as_bgr(),
thickness=self.thickness,
)
if skip_label:
continue
text = (
f"{class_id}"
if (labels is None or len(detections) != len(labels))
else labels[i]
)
text_width, text_height = cv2.getTextSize(
text=text,
fontFace=font,
fontScale=self.text_scale,
thickness=self.text_thickness,
)[0]
            if not self.avoid_overlap:
                # default placement: label anchored to the top-left corner of the box
                text_x = x1 + self.text_padding
                text_y = y1 - self.text_padding
                text_background_x1 = x1
                text_background_y1 = y1 - 2 * self.text_padding - text_height
                text_background_x2 = x1 + 2 * self.text_padding + text_width
                text_background_y2 = y1
            else:
                # try several candidate positions and keep the least-overlapping one
                (text_x, text_y,
                 text_background_x1, text_background_y1,
                 text_background_x2, text_background_y2) = get_optimal_label_pos(
                    self.text_padding, text_width, text_height,
                    x1, y1, x2, y2, detections, image_size)
cv2.rectangle(
img=scene,
pt1=(text_background_x1, text_background_y1),
pt2=(text_background_x2, text_background_y2),
color=color.as_bgr(),
thickness=cv2.FILLED,
)
            # choose black or white text for contrast against the box color
            box_color = color.as_rgb()
            luminance = 0.299 * box_color[0] + 0.587 * box_color[1] + 0.114 * box_color[2]
            text_color = (0, 0, 0) if luminance > 160 else (255, 255, 255)
cv2.putText(
img=scene,
text=text,
org=(text_x, text_y),
fontFace=font,
fontScale=self.text_scale,
                color=text_color,
thickness=self.text_thickness,
lineType=cv2.LINE_AA,
)
return scene
def box_area(box):
    # box: (x1, y1, x2, y2)
    return (box[2] - box[0]) * (box[3] - box[1])
def intersection_area(box1, box2):
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    return max(0, x2 - x1) * max(0, y2 - y1)
def IoU(box1, box2, return_max=True):
    """Intersection over union of two (x1, y1, x2, y2) boxes.
    With return_max=True, also considers the intersection relative to each
    individual box, so a small box contained in a large one scores high.
    """
    intersection = intersection_area(box1, box2)
    union = box_area(box1) + box_area(box2) - intersection
    if union == 0:
        return 0
    if box_area(box1) > 0 and box_area(box2) > 0:
        ratio1 = intersection / box_area(box1)
        ratio2 = intersection / box_area(box2)
    else:
        ratio1, ratio2 = 0, 0
    if return_max:
        return max(intersection / union, ratio1, ratio2)
    else:
        return intersection / union
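# A quick worked check of the metrics above (illustrative values):
# box A = [0, 0, 10, 10] (area 100), box B = [5, 5, 15, 15] (area 100):
# intersection = 5 * 5 = 25, union = 100 + 100 - 25 = 175, so
#   IoU(A, B, return_max=False) -> 25 / 175 ~= 0.143
#   IoU(A, B, return_max=True)  -> max(0.143, 25/100, 25/100) = 0.25
# i.e. return_max also surfaces partial containment that plain IoU understates.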
def get_optimal_label_pos(text_padding, text_width, text_height, x1, y1, x2, y2, detections, image_size):
    """Pick a label position that does not overlap other detection boxes.
    Candidate positions are tried in order: 'top left', 'outer left',
    'outer right', 'top right'. The first candidate whose background
    rectangle stays inside the image and overlaps every detection box by an
    IoU of at most 0.3 is returned; if all candidates overlap, the last one
    ('top right') is used.
    """
def get_is_overlap(detections, text_background_x1, text_background_y1, text_background_x2, text_background_y2, image_size):
is_overlap = False
for i in range(len(detections)):
detection = detections.xyxy[i].astype(int)
if IoU([text_background_x1, text_background_y1, text_background_x2, text_background_y2], detection) > 0.3:
is_overlap = True
break
# check if the text is out of the image
if text_background_x1 < 0 or text_background_x2 > image_size[0] or text_background_y1 < 0 or text_background_y2 > image_size[1]:
is_overlap = True
return is_overlap
    # candidate: top left
text_x = x1 + text_padding
text_y = y1 - text_padding
text_background_x1 = x1
text_background_y1 = y1 - 2 * text_padding - text_height
text_background_x2 = x1 + 2 * text_padding + text_width
text_background_y2 = y1
is_overlap = get_is_overlap(detections, text_background_x1, text_background_y1, text_background_x2, text_background_y2, image_size)
if not is_overlap:
return text_x, text_y, text_background_x1, text_background_y1, text_background_x2, text_background_y2
    # candidate: outer left
text_x = x1 - text_padding - text_width
text_y = y1 + text_padding + text_height
text_background_x1 = x1 - 2 * text_padding - text_width
text_background_y1 = y1
text_background_x2 = x1
text_background_y2 = y1 + 2 * text_padding + text_height
is_overlap = get_is_overlap(detections, text_background_x1, text_background_y1, text_background_x2, text_background_y2, image_size)
if not is_overlap:
return text_x, text_y, text_background_x1, text_background_y1, text_background_x2, text_background_y2
    # candidate: outer right
text_x = x2 + text_padding
text_y = y1 + text_padding + text_height
text_background_x1 = x2
text_background_y1 = y1
text_background_x2 = x2 + 2 * text_padding + text_width
text_background_y2 = y1 + 2 * text_padding + text_height
is_overlap = get_is_overlap(detections, text_background_x1, text_background_y1, text_background_x2, text_background_y2, image_size)
if not is_overlap:
return text_x, text_y, text_background_x1, text_background_y1, text_background_x2, text_background_y2
    # candidate: top right
text_x = x2 - text_padding - text_width
text_y = y1 - text_padding
text_background_x1 = x2 - 2 * text_padding - text_width
text_background_y1 = y1 - 2 * text_padding - text_height
text_background_x2 = x2
text_background_y2 = y1
is_overlap = get_is_overlap(detections, text_background_x1, text_background_y1, text_background_x2, text_background_y2, image_size)
if not is_overlap:
return text_x, text_y, text_background_x1, text_background_y1, text_background_x2, text_background_y2
    # every candidate overlapped; fall back to the last one ('top right')
    return text_x, text_y, text_background_x1, text_background_y1, text_background_x2, text_background_y2
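# A minimal usage sketch for BoxAnnotator (illustrative values; assumes the
# supervision package's Detections API imported at the top of this file):
if __name__ == "__main__":
    scene = np.zeros((480, 640, 3), dtype=np.uint8)  # blank BGR canvas
    detections = Detections(
        xyxy=np.array([[50, 50, 200, 150], [220, 60, 400, 200]], dtype=float),
        class_id=np.array([0, 1]),
    )
    annotator = BoxAnnotator()
    annotated = annotator.annotate(
        scene=scene,
        detections=detections,
        labels=["button", "text field"],
        image_size=(640, 480),  # (width, height), needed when avoid_overlap=True
    )
    cv2.imwrite("annotated_demo.png", annotated)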
from util.utils import get_som_labeled_img, get_caption_model_processor, get_yolo_model, check_ocr_box
import torch
from PIL import Image
import io
import base64
from typing import Dict
class Omniparser(object):
def __init__(self, config: Dict):
self.config = config
device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.som_model = get_yolo_model(model_path=config['som_model_path'])
self.caption_model_processor = get_caption_model_processor(model_name=config['caption_model_name'], model_name_or_path=config['caption_model_path'], device=device)
print('Omniparser initialized!!!')
def parse(self, image_base64: str):
image_bytes = base64.b64decode(image_base64)
image = Image.open(io.BytesIO(image_bytes))
print('image size:', image.size)
box_overlay_ratio = max(image.size) / 3200
draw_bbox_config = {
'text_scale': 0.8 * box_overlay_ratio,
'text_thickness': max(int(2 * box_overlay_ratio), 1),
'text_padding': max(int(3 * box_overlay_ratio), 1),
'thickness': max(int(3 * box_overlay_ratio), 1),
}
        (text, ocr_bbox), _ = check_ocr_box(
            image, display_img=False, output_bb_format='xyxy',
            easyocr_args={'text_threshold': 0.8}, use_paddleocr=False)
        dino_labeled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
            image, self.som_model, BOX_TRESHOLD=self.config['BOX_TRESHOLD'],
            output_coord_in_ratio=True, ocr_bbox=ocr_bbox,
            draw_bbox_config=draw_bbox_config,
            caption_model_processor=self.caption_model_processor,
            ocr_text=text, use_local_semantics=True, iou_threshold=0.7,
            scale_img=False, batch_size=128)
        return dino_labeled_img, parsed_content_list
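# Usage sketch (the paths and model name below are placeholders, not the
# repository's actual weight locations):
if __name__ == "__main__":
    config = {
        'som_model_path': 'weights/icon_detect/model.pt',        # hypothetical path
        'caption_model_name': 'florence2',                       # hypothetical name
        'caption_model_path': 'weights/icon_caption_florence',   # hypothetical path
        'BOX_TRESHOLD': 0.05,
    }
    parser = Omniparser(config)
    with open('screenshot.png', 'rb') as f:
        image_base64 = base64.b64encode(f.read()).decode('utf-8')
    labeled_img_b64, parsed_content_list = parser.parse(image_base64)
    for item in parsed_content_list:
        print(item)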
import re
# check whether the text is pure ASCII (a simple proxy for English)
def is_english_simple(text):
try:
text.encode(encoding='utf-8').decode('ascii')
except UnicodeDecodeError:
return False
else:
return True
# bbox -> point (str)
def bbox_2_point(bbox, dig=2):
    # bbox [left, top, right, bottom] -> center point string with `dig` decimals
    point = [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]
    point = [f"{item:.{dig}f}" for item in point]
    point_str = "({},{})".format(point[0], point[1])
    return point_str
# bbox -> bbox (str)
def bbox_2_bbox(bbox, dig=2):
    bbox = [f"{item:.{dig}f}" for item in bbox]
    bbox_str = "({},{},{},{})".format(bbox[0], bbox[1], bbox[2], bbox[3])
    return bbox_str
# point (str) -> point
def pred_2_point(s):
    floats = re.findall(r'-?\d+\.?\d*', s)
    floats = [float(num) for num in floats]
    if len(floats) == 2:
        click_point = floats
    elif len(floats) == 4:
        # a bbox was predicted; use its center as the click point
        click_point = [(floats[0] + floats[2]) / 2, (floats[1] + floats[3]) / 2]
    else:
        raise ValueError(f"cannot parse a point or bbox from: {s!r}")
    return click_point
# bbox (qwen str) -> bbox
def extract_bbox(s):
# Regular expression to find the content inside <box> and </box>
pattern = r"<box>\((\d+,\d+)\),\((\d+,\d+)\)</box>"
matches = re.findall(pattern, s)
# Convert the tuples of strings into tuples of integers
return [(int(x.split(',')[0]), int(x.split(',')[1])) for x in sum(matches, ())]
def extract_mark_id(s):
match = re.search(r'Mark: (\d+)', s)
if match:
return int(match.group(1))
return None
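# Worked examples for the helpers above (illustrative inputs):
#   bbox_2_point([0.1, 0.2, 0.3, 0.4])         -> "(0.20,0.30)"
#   bbox_2_bbox([0.1, 0.2, 0.3, 0.4])          -> "(0.10,0.20,0.30,0.40)"
#   pred_2_point("(0.20,0.30)")                -> [0.2, 0.3]
#   pred_2_point("(10,20,30,40)")              -> [20.0, 30.0]  (bbox center)
#   extract_bbox("<box>(12,34),(56,78)</box>") -> [(12, 34), (56, 78)]
#   extract_mark_id("Mark: 7")                 -> 7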
import torch
from ultralytics import YOLO
from PIL import Image, ImageDraw, ImageFont
import io
import base64
import numpy as np
import networkx as nx
device = 'cuda'
font_path = "agents/ui_agent/util/arial.ttf"
class MarkHelper:
def __init__(self):
self.markSize_dict = {}
self.font_dict = {}
self.min_font_size = 20 # 1 in v1
self.max_font_size = 30
self.max_font_proportion = 0.04 # 0.032 in v1
def __get_markSize(self, text, image_height, image_width, font):
im = Image.new('RGB', (image_width, image_height))
draw = ImageDraw.Draw(im)
_, _, width, height = draw.textbbox((0, 0), text=text, font=font)
return height, width
    def _setup_new_font(self, image_height, image_width):
        key = f"{image_height}_{image_width}"
        # grow the font until the mark is just larger than the size criterion
        fontsize = self.min_font_size
        font = ImageFont.truetype(font_path, fontsize)
        while min(self.__get_markSize("555", image_height, image_width, font)) < min(self.max_font_size, self.max_font_proportion * min(image_height, image_width)):
            fontsize += 1
            font = ImageFont.truetype(font_path, fontsize)
        self.font_dict[key] = font
# setup the markSize dict
markSize_3digits = self.__get_markSize('555', image_height, image_width, font)
markSize_2digits = self.__get_markSize('55', image_height, image_width, font)
markSize_1digit = self.__get_markSize('5', image_height, image_width, font)
self.markSize_dict[key] = {
1: markSize_1digit,
2: markSize_2digits,
3: markSize_3digits
}
def get_font(self, image_height, image_width):
key = f"{image_height}_{image_width}"
if key not in self.font_dict:
self._setup_new_font(image_height, image_width)
return self.font_dict[key]
    def get_mark_size(self, text_str, image_height, image_width):
        """Return (height, width) of the rendered mark text for this image size."""
        key = f"{image_height}_{image_width}"
        if key not in self.markSize_dict:
            self._setup_new_font(image_height, image_width)
        largest_size = self.markSize_dict[key].get(3, None)
        # default to the largest cached size if the text is longer than 3 digits
        text_h, text_w = self.markSize_dict[key].get(len(text_str), largest_size)
        return text_h, text_w
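# Usage note: fonts and mark sizes are cached per image resolution, so repeated
# lookups for the same (height, width) are cheap. A sketch (assumes the TTF at
# font_path above exists):
#   helper = MarkHelper()
#   font = helper.get_font(1080, 1920)
#   text_h, text_w = helper.get_mark_size("12", 1080, 1920)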
def __calculate_iou(box1, box2, return_area=False):
    """
    Overlap ratio of two bounding boxes.
    Note: despite the name, this is intersection over the *smaller* box area,
    not classic IoU, so full containment of a small box scores ~1.0.
    :param box1: Tuple of (y, x, h, w) for the first bounding box
    :param box2: Tuple of (y, x, h, w) for the second bounding box
    :return: overlap ratio, and optionally the intersection area
    """
    y1, x1, h1, w1 = box1
    y2, x2, h2, w2 = box2
    # Calculate the intersection area
    y_min = max(y1, y2)
    x_min = max(x1, x2)
    y_max = min(y1 + h1, y2 + h2)
    x_max = min(x1 + w1, x2 + w2)
    intersection_area = max(0, y_max - y_min) * max(0, x_max - x_min)
    # Compute the area of both bounding boxes
    box1_area = h1 * w1
    box2_area = h2 * w2
    # Normalize by the smaller box area (epsilon avoids division by zero)
    iou = intersection_area / (min(box1_area, box2_area) + 0.0001)
    if return_area:
        return iou, intersection_area
    return iou
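# Worked example (boxes are (y, x, h, w)): box1 = (0, 0, 10, 10) has area 100,
# box2 = (0, 0, 5, 5) has area 25 and lies fully inside box1. The intersection
# is 25, so the value is 25 / (min(100, 25) + 0.0001) ~= 1.0, flagging
# containment; classic IoU would give only 25 / 100 = 0.25.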
def __calculate_nearest_corner_distance(box1, box2):
"""Calculate the distance between the nearest edge or corner of two bounding boxes."""
y1, x1, h1, w1 = box1
y2, x2, h2, w2 = box2
corners1 = np.array([
[y1, x1],
[y1, x1 + w1],
[y1 + h1, x1],
[y1 + h1, x1 + w1]
])
corners2 = np.array([
[y2, x2],
[y2, x2 + w2],
[y2 + h2, x2],
[y2 + h2, x2 + w2]
])
# Calculate pairwise distances between corners
distances = np.linalg.norm(corners1[:, np.newaxis] - corners2, axis=2)
# Find the minimum distance
min_distance = np.min(distances)
return min_distance
def _find_least_overlapping_corner(bbox, bboxes, drawn_boxes, text_size, image_size):
"""Find the corner with the least overlap with other bboxes.
Args:
bbox: (y, x, h, w) The bounding box to place the text on.
bboxes: [(y, x, h, w)] The list of bounding boxes to compare against.
drawn_boxes: [(y, x, h, w)] The list of bounding boxes that have already been drawn on.
text_size: (height, width) The size of the text to be drawn.
image_size: (height, width) The size of the image.
"""
y, x, h, w = bbox
h_text, w_text = text_size
image_height, image_width = image_size
corners = [
# top-left
(y - h_text, x),
# top-right
(y - h_text, x + w - w_text),
# right-top
(y, x + w),
# right-bottom
(y + h - h_text, x + w),
# bottom-right
(y + h, x + w - w_text),
# bottom-left
(y + h, x),
# left-bottom
(y + h - h_text, x - w_text),
# left-top
(y, x - w_text),
]
best_corner = corners[0]
max_flag = float('inf')
for corner in corners:
corner_bbox = (corner[0], corner[1], h_text, w_text)
# if the corner is out of the image, skip
if corner[0] < 0 or corner[1] < 0 or corner[0] + h_text > image_height or corner[1] + w_text > image_width:
continue
        max_iou = -(image_width + image_height)
        # for this corner, find the worst case: the largest overlap (intersection
        # area) with any other box, with a small tie-break favoring corners
        # farther from neighboring boxes
        for other_bbox in bboxes + drawn_boxes:
            if np.array_equal(bbox, other_bbox):
                continue
            # note: [1] takes the raw intersection area, not the ratio
            iou = __calculate_iou(corner_bbox, other_bbox, return_area=True)[1]
            max_iou = max(max_iou, iou - 0.0001 * __calculate_nearest_corner_distance(corner_bbox, other_bbox))
        # keep the corner whose worst-case overlap is smallest
        if max_iou < max_flag:
            max_flag = max_iou
            best_corner = corner
return best_corner
def plot_boxes_with_marks(
image: Image.Image,
bboxes, # (y, x, h, w)
mark_helper: MarkHelper,
linewidth=2,
alpha=0,
edgecolor=None,
fn_save=None,
normalized_to_pixel=True,
add_mark=True
) -> Image.Image:
    """Plots bounding boxes on an image, attaching a numeric mark to the edge of
    each box where it least overlaps other boxes.
    Args:
        image: The image to plot the bounding boxes on.
        bboxes: A 2D array of shape (num_boxes, 4), where each row is a box
            (y_top_left, x_top_left, box_height, box_width). If
            normalized_to_pixel is True, values are floats normalized by the
            image size; otherwise they are ints in pixels.
    """
draw = ImageDraw.Draw(image)
# draw boxes on the image
image_width, image_height = image.size
if normalized_to_pixel:
bboxes = [(int(y * image_height), int(x * image_width), int(h * image_height), int(w * image_width)) for y, x, h, w in bboxes]
for box in bboxes:
y, x, h, w = box
draw.rectangle([x, y, x + w, y + h], outline=edgecolor, width=linewidth)
# Draw the bounding boxes with index at the least overlapping corner
drawn_boxes = []
for idx, bbox in enumerate(bboxes):
text = str(idx)
text_h, text_w = mark_helper.get_mark_size(text, image_height, image_width)
corner_y, corner_x = _find_least_overlapping_corner(
bbox, bboxes, drawn_boxes, (text_h, text_w), (image_height, image_width))
        # The mark box in (y, x, h, w) format
text_box = (corner_y, corner_x, text_h, text_w)
if add_mark:
# Draw the filled index box and text
draw.rectangle([corner_x, corner_y, corner_x + text_w, corner_y + text_h], # (x, y, x + w, y + h)
fill="red")
font = mark_helper.get_font(image_height, image_width)
draw.text((corner_x, corner_y), text, fill='white', font=font)
# Update the list of drawn boxes
drawn_boxes.append(np.array(text_box))
if fn_save is not None: # PIL image
image.save(fn_save)
return image
def plot_circles_with_marks(
    image: Image.Image,
    points,  # (x, y)
    mark_helper: MarkHelper,  # kept for symmetry with plot_boxes_with_marks (unused here)
    linewidth=2,
    edgecolor=None,
    fn_save=None,
    normalized_to_pixel=True,
    add_mark=True  # kept for symmetry (unused here)
) -> Image.Image:
    """Plots small circles at the given points.
    Args:
        image: The image to plot the circles on.
        points: A list of (x, y) points. If normalized_to_pixel is True, the
            values are floats normalized by the image size; otherwise they are
            in pixels.
    """
    image_width, image_height = image.size
    if normalized_to_pixel:
        points = [(int(x * image_width), int(y * image_height)) for x, y in points]
    draw = ImageDraw.Draw(image)
    for point in points:
        x, y = point
        # ImageDraw.circle requires Pillow >= 10.4
        draw.circle((x, y), radius=5, outline=edgecolor, width=linewidth)
    if fn_save is not None:  # save the PIL image
        image.save(fn_save)
    return image
markhelper = MarkHelper()
BBOX_DEDUPLICATION_IOU_PROPORTION = 0.5
BBOX_GROUPING_VERTICAL_THRESHOLD = 20    # pixels
BBOX_GROUPING_HORIZONTAL_THRESHOLD = 20  # pixels
BBOX_AUG_TARGET = 2.0
def _is_boxes_same_line_or_near(bbox1, bbox2, vertical_threshold, horizontal_threshold):
"""check if two boxes are in the same line or close enough to be considered together"""
y1, x1, h1, w1 = bbox1
y2, x2, h2, w2 = bbox2
# Check if the boxes are close horizontally (consider the edge case where the boxes are touching)
horizontally_close = (x1 <= x2 and x2 - x1 <= w1 + horizontal_threshold) or (x2 <= x1 and x1 - x2 <= w2 + horizontal_threshold)
# Check if the boxes are close vertically (consider the edge case where the boxes are touching)
vertically_close = (y1 <= y2 and y2 - y1 <= h1 + vertical_threshold) or (y2 <= y1 and y1 - y2 <= h2 + vertical_threshold)
# Consider the boxes to be in the same line if they are vertically close and either overlap or are close horizontally
return vertically_close and horizontally_close
def _build_adjacency_matrix(bboxes, vertical_threshold, horizontal_threshold):
"""Build the adjacency matrix based on the merging criteria."""
num_boxes = len(bboxes)
A = np.zeros((num_boxes, num_boxes), dtype=int)
for i in range(num_boxes):
for j in range(i + 1, num_boxes):
if _is_boxes_same_line_or_near(bboxes[i], bboxes[j], vertical_threshold, horizontal_threshold):
A[i, j] = 1
A[j, i] = 1 # Symmetric matrix
return A
def merge_connected_bboxes(bboxes, text_details,
vertical_threshold=BBOX_GROUPING_VERTICAL_THRESHOLD,
horizontal_threshold=BBOX_GROUPING_HORIZONTAL_THRESHOLD
):
"""Merge bboxes based on the adjacency matrix and return merged bboxes.
Args:
bboxes: A 2D array of shape (num_boxes, 4), where each row represents a bounding box: (y, x, height, width).
text_details: A list of text details for each bounding box.
vertical_threshold: The maximum vertical distance between two boxes to be considered in the same line.
horizontal_threshold: The maximum horizontal distance between two boxes to be considered close.
"""
    # nothing to merge when there are zero or one boxes
    if len(bboxes) <= 1:
        return bboxes, text_details
# Convert bboxes (x1, y1, x2, y2) to (y, x, height, width) format
bboxes = np.array(bboxes)
bboxes = np.array([bboxes[:, 1], bboxes[:, 0], bboxes[:, 3] - bboxes[:, 1], bboxes[:, 2] - bboxes[:, 0]]).T
# Build adjacency matrix
A = _build_adjacency_matrix(bboxes, vertical_threshold, horizontal_threshold)
# Create graph from adjacency matrix
G = nx.from_numpy_array(A)
# Find connected components
components = list(nx.connected_components(G))
# Convert bboxes to (y_min, x_min, y_max, x_max) format
corners = np.copy(bboxes)
corners_y, corners_x, corners_h, corners_w = corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]
corners_y_max = corners_y + corners_h
corners_x_max = corners_x + corners_w
# Merge bboxes for each connected component
merged_bboxes = []
merged_text_details = []
for component in components:
indices = list(component) # e.g., [32, 33, 34, 30, 31]
indices = sorted(indices)
# merge the text details
merged_text_details.append(' '.join([text_details[i] for i in indices]))
# merge the bboxes
y_min = min(corners_y[i] for i in indices)
x_min = min(corners_x[i] for i in indices)
y_max = max(corners_y_max[i] for i in indices)
x_max = max(corners_x_max[i] for i in indices)
merged_bboxes.append((y_min, x_min, y_max - y_min, x_max - x_min)) # Convert merged_bbox back to (y, x, height, width) format
# convert (y, x, height, width) to (x1, y1, x2, y2) format without np.array
merged_bboxes = [(bbox[1], bbox[0], bbox[1] + bbox[3], bbox[0] + bbox[2]) for bbox in merged_bboxes]
return merged_bboxes, merged_text_details
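# A minimal end-to-end sketch of the grouping + plotting helpers in this file
# (illustrative pixel values; running it requires the TTF at font_path to exist):
if __name__ == "__main__":
    # two OCR boxes on the same line merge; the distant one stays separate
    boxes_xyxy = [(10, 10, 60, 30), (70, 12, 120, 30), (10, 200, 60, 220)]
    texts = ["Hello", "world", "Footer"]
    merged, merged_texts = merge_connected_bboxes(boxes_xyxy, texts)
    print(merged_texts)  # expected: ['Hello world', 'Footer']
    # draw the merged boxes with numeric marks; inputs are pixel (y, x, h, w)
    demo = Image.new("RGB", (320, 240), "white")
    yxhw = [(y1, x1, y2 - y1, x2 - x1) for (x1, y1, x2, y2) in merged]
    plot_boxes_with_marks(demo, yxhw, markhelper, edgecolor="red",
                          normalized_to_pixel=False, fn_save="som_grouping_demo.png")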
# datasets
from .epic import epic
from .ego4d import ego4d
from .openx import openx
from .openx_magma import openx_magma
from .magma import magma
from .llava import llava
from .seeclick import seeclick
# (joint) datasets
from .dataset import build_joint_dataset
# data collators
from .data_collator import DataCollatorForSupervisedDataset
from .data_collator import DataCollatorForHFDataset