"megatron/data/gpt_dataset.py" did not exist on "25c07e1467838525ce1a750fc3c43e665d2ad82a"
Commit e9cee049 authored by luopl's avatar luopl
Browse files

Initial commit

parents
Pipeline #1056 canceled with stages
import argparse
import os
import sys
import warnings
from io import BytesIO
from pathlib import Path
import onnx
import torch
from mmdet.apis import init_detector
from mmengine.config import ConfigDict
from mmengine.logging import print_log
from mmengine.utils.path import mkdir_or_exist
# Add MMYOLO ROOT to sys.path
sys.path.append(str(Path(__file__).resolve().parents[3]))
from projects.easydeploy.model import DeployModel, MMYOLOBackend # noqa E402
warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning)
warnings.filterwarnings(action='ignore', category=torch.jit.ScriptWarning)
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=ResourceWarning)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument(
'--model-only', action='store_true', help='Export model only')
parser.add_argument(
'--work-dir', default='./work_dir', help='Path to save export model')
parser.add_argument(
'--img-size',
nargs='+',
type=int,
default=[640, 640],
help='Image size of height and width')
parser.add_argument('--batch-size', type=int, default=1, help='Batch size')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--simplify',
action='store_true',
help='Simplify onnx model by onnx-sim')
parser.add_argument(
'--opset', type=int, default=11, help='ONNX opset version')
parser.add_argument(
'--backend',
type=str,
default='onnxruntime',
help='Backend for export onnx')
parser.add_argument(
'--pre-topk',
type=int,
default=1000,
help='Postprocess pre topk bboxes feed into NMS')
parser.add_argument(
'--keep-topk',
type=int,
default=100,
help='Postprocess keep topk bboxes out of NMS')
parser.add_argument(
'--iou-threshold',
type=float,
default=0.65,
help='IoU threshold for NMS')
parser.add_argument(
'--score-threshold',
type=float,
default=0.25,
help='Score threshold for NMS')
args = parser.parse_args()
args.img_size *= 2 if len(args.img_size) == 1 else 1
return args
def build_model_from_cfg(config_path, checkpoint_path, device):
model = init_detector(config_path, checkpoint_path, device=device)
model.eval()
return model
def main():
args = parse_args()
mkdir_or_exist(args.work_dir)
backend = MMYOLOBackend(args.backend.lower())
if backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO,
MMYOLOBackend.TENSORRT8, MMYOLOBackend.TENSORRT7):
if not args.model_only:
print_log('Export ONNX with bbox decoder and NMS ...')
else:
args.model_only = True
        print_log(f'Cannot export the postprocess for {args.backend.lower()}.\n'
                  f'Setting "args.model_only=True" by default.')
if args.model_only:
postprocess_cfg = None
output_names = None
else:
postprocess_cfg = ConfigDict(
pre_top_k=args.pre_topk,
keep_top_k=args.keep_topk,
iou_threshold=args.iou_threshold,
score_threshold=args.score_threshold)
output_names = ['num_dets', 'boxes', 'scores', 'labels']
baseModel = build_model_from_cfg(args.config, args.checkpoint, args.device)
deploy_model = DeployModel(
baseModel=baseModel, backend=backend, postprocess_cfg=postprocess_cfg)
deploy_model.eval()
fake_input = torch.randn(args.batch_size, 3,
*args.img_size).to(args.device)
# dry run
deploy_model(fake_input)
save_onnx_path = os.path.join(
args.work_dir,
os.path.basename(args.checkpoint).replace('pth', 'onnx'))
# export onnx
with BytesIO() as f:
torch.onnx.export(
deploy_model,
fake_input,
f,
input_names=['images'],
output_names=output_names,
opset_version=args.opset)
f.seek(0)
onnx_model = onnx.load(f)
onnx.checker.check_model(onnx_model)
# Fix tensorrt onnx output shape, just for view
if not args.model_only and backend in (MMYOLOBackend.TENSORRT8,
MMYOLOBackend.TENSORRT7):
shapes = [
args.batch_size, 1, args.batch_size, args.keep_topk, 4,
args.batch_size, args.keep_topk, args.batch_size,
args.keep_topk
]
for i in onnx_model.graph.output:
for j in i.type.tensor_type.shape.dim:
j.dim_param = str(shapes.pop(0))
if args.simplify:
try:
import onnxsim
onnx_model, check = onnxsim.simplify(onnx_model)
assert check, 'assert check failed'
except Exception as e:
print_log(f'Simplify failure: {e}')
onnx.save(onnx_model, save_onnx_path)
print_log(f'ONNX export success, save into {save_onnx_path}')
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
from easydeploy.model import ORTWrapper, TRTWrapper # isort:skip
import os
import random
from argparse import ArgumentParser
import cv2
import mmcv
import numpy as np
import torch
from mmcv.transforms import Compose
from mmdet.utils import get_test_pipeline_cfg
from mmengine.config import Config, ConfigDict
from mmengine.utils import ProgressBar, path
from mmyolo.utils import register_all_modules
from mmyolo.utils.misc import get_file_list
def parse_args():
parser = ArgumentParser()
parser.add_argument(
'img', help='Image path, include image file, dir and URL.')
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument(
'--out-dir', default='./output', help='Path to output file')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--show', action='store_true', help='Show the detection results')
args = parser.parse_args()
return args
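# Build a small normalization module from the config's `data_preprocessor`
# (mean/std), so the raw image tensor can be normalized on the target device.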
def preprocess(config):
data_preprocess = config.get('model', {}).get('data_preprocessor', {})
mean = data_preprocess.get('mean', [0., 0., 0.])
std = data_preprocess.get('std', [1., 1., 1.])
mean = torch.tensor(mean, dtype=torch.float32).reshape(1, 3, 1, 1)
std = torch.tensor(std, dtype=torch.float32).reshape(1, 3, 1, 1)
class PreProcess(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
x = x[None].float()
x -= mean.to(x.device)
x /= std.to(x.device)
return x
return PreProcess().eval()
def main():
args = parse_args()
# register all modules in mmdet into the registries
register_all_modules()
colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(1000)]
# build the model from a config file and a checkpoint file
if args.checkpoint.endswith('.onnx'):
model = ORTWrapper(args.checkpoint, args.device)
elif args.checkpoint.endswith('.engine') or args.checkpoint.endswith(
'.plan'):
model = TRTWrapper(args.checkpoint, args.device)
else:
raise NotImplementedError
model.to(args.device)
cfg = Config.fromfile(args.config)
class_names = cfg.get('class_name')
test_pipeline = get_test_pipeline_cfg(cfg)
test_pipeline[0] = ConfigDict({'type': 'mmdet.LoadImageFromNDArray'})
test_pipeline = Compose(test_pipeline)
pre_pipeline = preprocess(cfg)
if not args.show:
path.mkdir_or_exist(args.out_dir)
# get file list
files, source_type = get_file_list(args.img)
# start detector inference
progress_bar = ProgressBar(len(files))
for i, file in enumerate(files):
bgr = mmcv.imread(file)
rgb = mmcv.imconvert(bgr, 'bgr', 'rgb')
data, samples = test_pipeline(dict(img=rgb, img_id=i)).values()
pad_param = samples.get('pad_param',
np.array([0, 0, 0, 0], dtype=np.float32))
h, w = samples.get('ori_shape', rgb.shape[:2])
pad_param = torch.asarray(
[pad_param[2], pad_param[0], pad_param[2], pad_param[0]],
device=args.device)
scale_factor = samples.get('scale_factor', [1., 1])
scale_factor = torch.asarray(scale_factor * 2, device=args.device)
data = pre_pipeline(data).to(args.device)
result = model(data)
if source_type['is_dir']:
filename = os.path.relpath(file, args.img).replace('/', '_')
else:
filename = os.path.basename(file)
out_file = None if args.show else os.path.join(args.out_dir, filename)
# Get candidate predict info by num_dets
num_dets, bboxes, scores, labels = result
scores = scores[0, :num_dets]
bboxes = bboxes[0, :num_dets]
labels = labels[0, :num_dets]
bboxes -= pad_param
bboxes /= scale_factor
bboxes[:, 0::2].clamp_(0, w)
bboxes[:, 1::2].clamp_(0, h)
bboxes = bboxes.round().int()
for (bbox, score, label) in zip(bboxes, scores, labels):
bbox = bbox.tolist()
color = colors[label]
if class_names is not None:
label_name = class_names[label]
name = f'cls:{label_name}_score:{score:0.4f}'
else:
name = f'cls:{label}_score:{score:0.4f}'
cv2.rectangle(bgr, bbox[:2], bbox[2:], color, 2)
cv2.putText(
bgr,
name, (bbox[0], bbox[1] - 2),
cv2.FONT_HERSHEY_SIMPLEX,
2.0, [225, 255, 255],
thickness=3)
if args.show:
mmcv.imshow(bgr, 'result', 0)
else:
mmcv.imwrite(bgr, out_file)
progress_bar.update()
if __name__ == '__main__':
main()
# # Copyright (c) OpenMMLab. All rights reserved.
import os
import json
import warnings
import argparse
from io import BytesIO
import onnx
import torch
from mmdet.apis import init_detector
from mmengine.config import ConfigDict
from mmengine.logging import print_log
from mmengine.utils.path import mkdir_or_exist
from easydeploy.model import DeployModel, MMYOLOBackend # noqa E402
warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning)
warnings.filterwarnings(action='ignore', category=torch.jit.ScriptWarning)
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=ResourceWarning)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument('--custom-text',
type=str,
help='custom text inputs (text json) for YOLO-World.')
parser.add_argument('--add-padding',
action="store_true",
help="add an empty padding to texts.")
parser.add_argument('--model-only',
action='store_true',
help='Export model only')
parser.add_argument('--without-nms',
action='store_true',
help='Export model without NMS')
parser.add_argument('--without-bbox-decoder',
action='store_true',
help='Export model without Bbox Decoder (for INT8 Quantization)')
parser.add_argument('--work-dir',
default='./work_dirs',
help='Path to save export model')
parser.add_argument('--img-size',
nargs='+',
type=int,
default=[640, 640],
help='Image size of height and width')
parser.add_argument('--batch-size', type=int, default=1, help='Batch size')
parser.add_argument('--device',
default='cuda:0',
help='Device used for inference')
parser.add_argument('--simplify',
action='store_true',
help='Simplify onnx model by onnx-sim')
parser.add_argument('--opset',
type=int,
default=11,
help='ONNX opset version')
parser.add_argument('--backend',
type=str,
default='onnxruntime',
help='Backend for export onnx')
parser.add_argument('--pre-topk',
type=int,
default=1000,
help='Postprocess pre topk bboxes feed into NMS')
parser.add_argument('--keep-topk',
type=int,
default=100,
help='Postprocess keep topk bboxes out of NMS')
parser.add_argument('--iou-threshold',
type=float,
default=0.65,
help='IoU threshold for NMS')
parser.add_argument('--score-threshold',
type=float,
default=0.25,
help='Score threshold for NMS')
args = parser.parse_args()
args.img_size *= 2 if len(args.img_size) == 1 else 1
return args
def build_model_from_cfg(config_path, checkpoint_path, device):
model = init_detector(config_path, checkpoint_path, device=device)
model.eval()
return model
def main():
args = parse_args()
mkdir_or_exist(args.work_dir)
backend = MMYOLOBackend(args.backend.lower())
if backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO,
MMYOLOBackend.TENSORRT8, MMYOLOBackend.TENSORRT7):
if not args.model_only:
print_log('Export ONNX with bbox decoder and NMS ...')
else:
args.model_only = True
        print_log(f'Cannot export the postprocess for {args.backend.lower()}.\n'
                  f'Setting "args.model_only=True" by default.')
if args.model_only:
postprocess_cfg = None
output_names = None
else:
postprocess_cfg = ConfigDict(pre_top_k=args.pre_topk,
keep_top_k=args.keep_topk,
iou_threshold=args.iou_threshold,
score_threshold=args.score_threshold)
output_names = ['num_dets', 'boxes', 'scores', 'labels']
if args.without_bbox_decoder or args.without_nms:
output_names = ['scores', 'boxes']
if args.custom_text is not None and len(args.custom_text) > 0:
with open(args.custom_text) as f:
texts = json.load(f)
texts = [x[0] for x in texts]
else:
from mmdet.datasets import CocoDataset
texts = CocoDataset.METAINFO['classes']
if args.add_padding:
texts = texts + [' ']
baseModel = build_model_from_cfg(args.config, args.checkpoint, args.device)
if hasattr(baseModel, 'reparameterize'):
# reparameterize text into YOLO-World
baseModel.reparameterize([texts])
deploy_model = DeployModel(baseModel=baseModel,
backend=backend,
postprocess_cfg=postprocess_cfg,
with_nms=not args.without_nms,
without_bbox_decoder=args.without_bbox_decoder)
deploy_model.eval()
fake_input = torch.randn(args.batch_size, 3,
*args.img_size).to(args.device)
# dry run
deploy_model(fake_input)
save_onnx_path = os.path.join(
args.work_dir,
os.path.basename(args.checkpoint).replace('pth', 'onnx'))
# export onnx
with BytesIO() as f:
torch.onnx.export(deploy_model,
fake_input,
f,
input_names=['images'],
output_names=output_names,
opset_version=args.opset)
f.seek(0)
onnx_model = onnx.load(f)
onnx.checker.check_model(onnx_model)
# Fix tensorrt onnx output shape, just for view
if not args.model_only and not args.without_nms and backend in (
MMYOLOBackend.TENSORRT8, MMYOLOBackend.TENSORRT7):
shapes = [
args.batch_size, 1, args.batch_size, args.keep_topk, 4,
args.batch_size, args.keep_topk, args.batch_size,
args.keep_topk
]
for i in onnx_model.graph.output:
for j in i.type.tensor_type.shape.dim:
j.dim_param = str(shapes.pop(0))
if args.simplify:
try:
import onnxsim
onnx_model, check = onnxsim.simplify(onnx_model)
assert check, 'assert check failed'
except Exception as e:
print_log(f'Simplify failure: {e}')
onnx.save(onnx_model, save_onnx_path)
print_log(f'ONNX export success, save into {save_onnx_path}')
if __name__ == '__main__':
main()
import os
import json
import argparse
import os.path as osp
import cv2
import numpy as np
import supervision as sv
import onnxruntime as ort
from mmengine.utils import ProgressBar
try:
import torch
from torchvision.ops import nms
except Exception as e:
print(e)
BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=1)
MASK_ANNOTATOR = sv.MaskAnnotator()
class LabelAnnotator(sv.LabelAnnotator):
@staticmethod
def resolve_text_background_xyxy(
center_coordinates,
text_wh,
position,
):
center_x, center_y = center_coordinates
text_w, text_h = text_wh
return center_x, center_y, center_x + text_w, center_y + text_h
LABEL_ANNOTATOR = LabelAnnotator(text_padding=4,
text_scale=0.5,
text_thickness=1)
def parse_args():
parser = argparse.ArgumentParser('YOLO-World ONNX Demo')
parser.add_argument('onnx', help='onnx file')
parser.add_argument('image', help='image path, include image file or dir.')
parser.add_argument(
'text',
help=
'detecting texts (str or json), should be consistent with the ONNX model'
)
parser.add_argument('--output-dir',
default='./output',
help='directory to save output files')
parser.add_argument('--device',
default='cuda:0',
help='device used for inference')
parser.add_argument(
'--onnx-nms',
action='store_false',
help='whether ONNX model contains NMS and postprocessing')
args = parser.parse_args()
return args
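# Letterbox-style preprocessing: pad the image to a square with zeros, resize it
# to `size`, and scale pixel values to [0, 1]; returns the image (NHWC, batched),
# the resize scale factor, and the (pad_h, pad_w) offsets for mapping boxes back.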
def preprocess(image, size=(640, 640)):
h, w = image.shape[:2]
max_size = max(h, w)
scale_factor = size[0] / max_size
pad_h = (max_size - h) // 2
pad_w = (max_size - w) // 2
pad_image = np.zeros((max_size, max_size, 3), dtype=image.dtype)
pad_image[pad_h:h + pad_h, pad_w:w + pad_w] = image
image = cv2.resize(pad_image, size,
interpolation=cv2.INTER_LINEAR).astype('float32')
image /= 255.0
image = image[None]
return image, scale_factor, (pad_h, pad_w)
def visualize(image, bboxes, labels, scores, texts):
detections = sv.Detections(xyxy=bboxes, class_id=labels, confidence=scores)
labels = [
f"{texts[class_id][0]} {confidence:0.2f}" for class_id, confidence in
zip(detections.class_id, detections.confidence)
]
image = BOUNDING_BOX_ANNOTATOR.annotate(image, detections)
image = LABEL_ANNOTATOR.annotate(image, detections, labels=labels)
return image
def inference(ort_session,
image_path,
texts,
output_dir,
size=(640, 640),
**kwargs):
# normal export
# with NMS and postprocessing
ori_image = cv2.imread(image_path)
h, w = ori_image.shape[:2]
image, scale_factor, pad_param = preprocess(ori_image[:, :, [2, 1, 0]],
size)
input_ort = ort.OrtValue.ortvalue_from_numpy(image.transpose((0, 3, 1, 2)))
results = ort_session.run(["num_dets", "labels", "scores", "boxes"],
{"images": input_ort})
num_dets, labels, scores, bboxes = results
num_dets = num_dets[0][0]
labels = labels[0, :num_dets]
scores = scores[0, :num_dets]
bboxes = bboxes[0, :num_dets]
bboxes -= np.array(
[pad_param[1], pad_param[0], pad_param[1], pad_param[0]])
bboxes /= scale_factor
bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, w)
bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, h)
bboxes = bboxes.round().astype('int')
image_out = visualize(ori_image, bboxes, labels, scores, texts)
cv2.imwrite(osp.join(output_dir, osp.basename(image_path)), image_out)
return image_out
def inference_with_postprocessing(ort_session,
image_path,
texts,
output_dir,
size=(640, 640),
nms_thr=0.7,
score_thr=0.3,
max_dets=300):
# export with `--without-nms`
ori_image = cv2.imread(image_path)
h, w = ori_image.shape[:2]
image, scale_factor, pad_param = preprocess(ori_image[:, :, [2, 1, 0]],
size)
input_ort = ort.OrtValue.ortvalue_from_numpy(image.transpose((0, 3, 1, 2)))
results = ort_session.run(["scores", "boxes"], {"images": input_ort})
scores, bboxes = results
# move numpy array to torch
ori_scores = torch.from_numpy(scores[0]).to('cuda:0')
ori_bboxes = torch.from_numpy(bboxes[0]).to('cuda:0')
scores_list = []
labels_list = []
bboxes_list = []
# class-specific NMS
for cls_id in range(len(texts)):
cls_scores = ori_scores[:, cls_id]
labels = torch.ones(cls_scores.shape[0], dtype=torch.long) * cls_id
keep_idxs = nms(ori_bboxes, cls_scores, iou_threshold=nms_thr)
cur_bboxes = ori_bboxes[keep_idxs]
cls_scores = cls_scores[keep_idxs]
labels = labels[keep_idxs]
scores_list.append(cls_scores)
labels_list.append(labels)
bboxes_list.append(cur_bboxes)
scores = torch.cat(scores_list, dim=0)
labels = torch.cat(labels_list, dim=0)
bboxes = torch.cat(bboxes_list, dim=0)
keep_idxs = scores > score_thr
scores = scores[keep_idxs]
labels = labels[keep_idxs]
bboxes = bboxes[keep_idxs]
    if scores.shape[0] > max_dets:
_, sorted_idx = torch.sort(scores, descending=True)
keep_idxs = sorted_idx[:max_dets]
bboxes = bboxes[keep_idxs]
scores = scores[keep_idxs]
labels = labels[keep_idxs]
# Get candidate predict info by num_dets
scores = scores.cpu().numpy()
bboxes = bboxes.cpu().numpy()
labels = labels.cpu().numpy()
bboxes -= np.array(
[pad_param[1], pad_param[0], pad_param[1], pad_param[0]])
bboxes /= scale_factor
bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, w)
bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, h)
bboxes = bboxes.round().astype('int')
image_out = visualize(ori_image, bboxes, labels, scores, texts)
cv2.imwrite(osp.join(output_dir, osp.basename(image_path)), image_out)
return image_out
def main():
args = parse_args()
onnx_file = args.onnx
# init ONNX session
ort_session = ort.InferenceSession(
onnx_file, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
print("Init ONNX Runtime session")
output_dir = "onnx_outputs"
if not osp.exists(output_dir):
os.mkdir(output_dir)
# load images
if not osp.isfile(args.image):
images = [
osp.join(args.image, img) for img in os.listdir(args.image)
if img.endswith('.png') or img.endswith('.jpg')
]
else:
images = [args.image]
if args.text.endswith('.txt'):
with open(args.text) as f:
lines = f.readlines()
texts = [[t.rstrip('\r\n')] for t in lines]
elif args.text.endswith('.json'):
texts = json.load(open(args.text))
else:
texts = [[t.strip()] for t in args.text.split(',')]
print("Start to inference.")
progress_bar = ProgressBar(len(images))
if args.onnx_nms:
inference_func = inference
else:
inference_func = inference_with_postprocessing
for img in images:
inference_func(ort_session, img, texts, output_dir=output_dir)
progress_bar.update()
print("Finish inference")
if __name__ == "__main__":
main()
import os
import json
import argparse
import os.path as osp
import cv2
import tqdm
import torch
import numpy as np
import tensorflow as tf
import supervision as sv
from torchvision.ops import nms
BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=1)
MASK_ANNOTATOR = sv.MaskAnnotator()
class LabelAnnotator(sv.LabelAnnotator):
@staticmethod
def resolve_text_background_xyxy(
center_coordinates,
text_wh,
position,
):
center_x, center_y = center_coordinates
text_w, text_h = text_wh
return center_x, center_y, center_x + text_w, center_y + text_h
LABEL_ANNOTATOR = LabelAnnotator(text_padding=4,
text_scale=0.5,
text_thickness=1)
def parse_args():
parser = argparse.ArgumentParser('YOLO-World TFLite (INT8) Demo')
parser.add_argument('path', help='TFLite Model `.tflite`')
parser.add_argument('image', help='image path, include image file or dir.')
parser.add_argument(
'text',
help=
'detecting texts (str, txt, or json), should be consistent with the ONNX model'
)
parser.add_argument('--output-dir',
default='./output',
help='directory to save output files')
args = parser.parse_args()
return args
def preprocess(image, size=(640, 640)):
h, w = image.shape[:2]
max_size = max(h, w)
scale_factor = size[0] / max_size
pad_h = (max_size - h) // 2
pad_w = (max_size - w) // 2
pad_image = np.zeros((max_size, max_size, 3), dtype=image.dtype)
pad_image[pad_h:h + pad_h, pad_w:w + pad_w] = image
image = cv2.resize(pad_image, size,
interpolation=cv2.INTER_LINEAR).astype('float32')
image /= 255.0
image = image[None]
return image, scale_factor, (pad_h, pad_w)
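# Generate the (x, y) anchor points for one feature level: grid-cell centers
# shifted by `offset` and scaled by the stride.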
def generate_anchors_per_level(feat_size, stride, offset=0.5):
h, w = feat_size
shift_x = (torch.arange(0, w) + offset) * stride
shift_y = (torch.arange(0, h) + offset) * stride
yy, xx = torch.meshgrid(shift_y, shift_x)
anchors = torch.stack([xx, yy]).reshape(2, -1).transpose(0, 1)
return anchors
def generate_anchors(feat_sizes=[(80, 80), (40, 40), (20, 20)],
strides=[8, 16, 32],
offset=0.5):
anchors = [
generate_anchors_per_level(fs, s, offset)
for fs, s in zip(feat_sizes, strides)
]
anchors = torch.cat(anchors)
return anchors
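# Decode (left, top, right, bottom) distance predictions, scaled by the
# per-point stride, into (x1, y1, x2, y2) boxes around the anchor points.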
def simple_bbox_decode(points, pred_bboxes, stride):
pred_bboxes = pred_bboxes * stride[None, :, None]
x1 = points[..., 0] - pred_bboxes[..., 0]
y1 = points[..., 1] - pred_bboxes[..., 1]
x2 = points[..., 0] + pred_bboxes[..., 2]
y2 = points[..., 1] + pred_bboxes[..., 3]
bboxes = torch.stack([x1, y1, x2, y2], -1)
return bboxes
def visualize(image, bboxes, labels, scores, texts):
detections = sv.Detections(xyxy=bboxes, class_id=labels, confidence=scores)
labels = [
f"{texts[class_id][0]} {confidence:0.2f}" for class_id, confidence in
zip(detections.class_id, detections.confidence)
]
image = BOUNDING_BOX_ANNOTATOR.annotate(image, detections)
image = LABEL_ANNOTATOR.annotate(image, detections, labels=labels)
return image
def inference_per_sample(interp,
image_path,
texts,
priors,
strides,
output_dir,
size=(640, 640),
vis=False,
score_thr=0.05,
nms_thr=0.3,
max_dets=300):
# input / output details from TFLite
input_details = interp.get_input_details()
output_details = interp.get_output_details()
# load image from path
ori_image = cv2.imread(image_path)
h, w = ori_image.shape[:2]
image, scale_factor, pad_param = preprocess(ori_image[:, :, [2, 1, 0]],
size)
# inference
interp.set_tensor(input_details[0]['index'], image)
interp.invoke()
scores = interp.get_tensor(output_details[1]['index'])
bboxes = interp.get_tensor(output_details[0]['index'])
    # can be converted to numpy for other devices;
    # using torch here is only for reference.
ori_scores = torch.from_numpy(scores[0])
ori_bboxes = torch.from_numpy(bboxes)
    # decode bbox coordinates with priors
decoded_bboxes = simple_bbox_decode(priors, ori_bboxes, strides)[0]
scores_list = []
labels_list = []
bboxes_list = []
for cls_id in range(len(texts)):
cls_scores = ori_scores[:, cls_id]
labels = torch.ones(cls_scores.shape[0], dtype=torch.long) * cls_id
keep_idxs = nms(decoded_bboxes, cls_scores, iou_threshold=0.5)
cur_bboxes = decoded_bboxes[keep_idxs]
cls_scores = cls_scores[keep_idxs]
labels = labels[keep_idxs]
scores_list.append(cls_scores)
labels_list.append(labels)
bboxes_list.append(cur_bboxes)
scores = torch.cat(scores_list, dim=0)
labels = torch.cat(labels_list, dim=0)
bboxes = torch.cat(bboxes_list, dim=0)
keep_idxs = scores > score_thr
scores = scores[keep_idxs]
labels = labels[keep_idxs]
bboxes = bboxes[keep_idxs]
# only for visualization, add an extra NMS
keep_idxs = nms(bboxes, scores, iou_threshold=nms_thr)
num_dets = min(len(keep_idxs), max_dets)
bboxes = bboxes[keep_idxs].unsqueeze(0)
scores = scores[keep_idxs].unsqueeze(0)
labels = labels[keep_idxs].unsqueeze(0)
scores = scores[0, :num_dets].numpy()
bboxes = bboxes[0, :num_dets].numpy()
labels = labels[0, :num_dets].numpy()
bboxes -= np.array(
[pad_param[1], pad_param[0], pad_param[1], pad_param[0]])
bboxes /= scale_factor
bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, w)
bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, h)
if vis:
image_out = visualize(ori_image, bboxes, labels, scores, texts)
cv2.imwrite(osp.join(output_dir, osp.basename(image_path)), image_out)
print(f"detecting {num_dets} objects.")
return image_out, ori_scores, ori_bboxes[0]
else:
return bboxes, labels, scores
def main():
args = parse_args()
    tflite_file = args.path
    # init TFLite interpreter
interpreter = tf.lite.Interpreter(model_path=tflite_file,
experimental_preserve_all_tensors=True)
interpreter.allocate_tensors()
print("Init TFLite Interpter")
output_dir = "onnx_outputs"
if not osp.exists(output_dir):
os.mkdir(output_dir)
# load images
if not osp.isfile(args.image):
images = [
osp.join(args.image, img) for img in os.listdir(args.image)
if img.endswith('.png') or img.endswith('.jpg')
]
else:
images = [args.image]
if args.text.endswith('.txt'):
with open(args.text) as f:
lines = f.readlines()
texts = [[t.rstrip('\r\n')] for t in lines]
elif args.text.endswith('.json'):
texts = json.load(open(args.text))
else:
texts = [[t.strip()] for t in args.text.split(',')]
size = (640, 640)
strides = [8, 16, 32]
    # prepare anchors: the TFLite model does not contain anchors due to INT8 quantization.
featmap_sizes = [(size[0] // s, size[1] // s) for s in strides]
flatten_priors = generate_anchors(featmap_sizes, strides=strides)
mlvl_strides = [
flatten_priors.new_full((featmap_size[0] * featmap_size[1] * 1, ),
stride)
for featmap_size, stride in zip(featmap_sizes, strides)
]
flatten_strides = torch.cat(mlvl_strides)
print("Start to inference.")
for img in tqdm.tqdm(images):
inference_per_sample(interpreter,
img,
texts,
flatten_priors[None],
flatten_strides,
output_dir=output_dir,
vis=True,
score_thr=0.3,
nms_thr=0.5)
print("Finish inference")
if __name__ == "__main__":
main()
## Preparing Data for YOLO-World
### Overview
For pre-training YOLO-World, we adopt several datasets, as listed in the table below:
| Data | Samples | Type | Boxes |
| :-- | :-----: | :---:| :---: |
| Objects365v1 | 609k | detection | 9,621k |
| GQA | 621k | grounding | 3,681k |
| Flickr | 149k | grounding | 641k |
| CC3M-Lite | 245k | image-text | 821k |
### Dataset Directory
We put all data into the `data` directory, such as:
```bash
├── coco
│ ├── annotations
│ ├── lvis
│ ├── train2017
│ ├── val2017
├── flickr
│ ├── annotations
│ └── images
├── mixed_grounding
│ ├── annotations
│ ├── images
├── objects365v1
│ ├── annotations
│ ├── train
│ ├── val
```
**NOTE**: We strongly suggest that you check the directories or paths in the dataset part of the config file, especially for the values `ann_file`, `data_root`, and `data_prefix`.
We provide the annotations of the pre-training data in the table below:
| Data | images | Annotation File |
| :--- | :------| :-------------- |
| Objects365v1 | [`Objects365 train`](https://opendatalab.com/OpenDataLab/Objects365_v1) | [`objects365_train.json`](https://opendatalab.com/OpenDataLab/Objects365_v1) |
| MixedGrounding | [`GQA`](https://nlp.stanford.edu/data/gqa/images.zip) | [`final_mixed_train_no_coco.json`](https://huggingface.co/GLIPModel/GLIP/tree/main/mdetr_annotations/final_mixed_train_no_coco.json) |
| Flickr30k | [`Flickr30k`](https://shannon.cs.illinois.edu/DenotationGraph/) |[`final_flickr_separateGT_train.json`](https://huggingface.co/GLIPModel/GLIP/tree/main/mdetr_annotations/final_flickr_separateGT_train.json) |
| LVIS-minival | [`COCO val2017`](https://cocodataset.org/) | [`lvis_v1_minival_inserted_image_name.json`](https://huggingface.co/GLIPModel/GLIP/blob/main/lvis_v1_minival_inserted_image_name.json) |
**Acknowledgement:** We sincerely thank [GLIP](https://github.com/microsoft/GLIP) and [mdetr](https://github.com/ashkamath/mdetr) for providing the annotation files for pre-training.
### Dataset Class
> For fine-tuning YOLO-World on Close-set Object Detection, using `MultiModalDataset` is recommended.
#### Setting CLASSES/Categories
If you use custom datasets in `COCO-format`, you **do not** need to define a dataset class for custom vocabularies/categories.
Simply set the classes explicitly in the config file through `metainfo=dict(classes=your_classes)`:
```python
coco_train_dataset = dict(
_delete_=True,
type='MultiModalDataset',
dataset=dict(
type='YOLOv5CocoDataset',
metainfo=dict(classes=your_classes),
data_root='data/your_data',
ann_file='annotations/your_annotation.json',
data_prefix=dict(img='images/'),
filter_cfg=dict(filter_empty_gt=False, min_size=32)),
class_text_path='data/texts/your_class_texts.json',
pipeline=train_pipeline)
```
For training YOLO-World, we mainly adopt two kinds of dataset classes:
#### 1. `MultiModalDataset`
`MultiModalDataset` is a simple wrapper for a pre-defined dataset class, such as `Objects365` or `COCO`, which adds the category texts to the dataset instance for formatting the input texts.
**Text JSON**
The json file is formatted as follows:
```json
[
['A_1','A_2'],
['B'],
['C_1', 'C_2', 'C_3'],
...
]
```
We have provided the text json for [`LVIS`](./../data/texts/lvis_v1_class_texts.json), [`COCO`](../data/texts/coco_class_texts.json), and [`Objects365`](../data/texts/obj365v1_class_texts.json).
#### 2. `YOLOv5MixedGroundingDataset`
The `YOLOv5MixedGroundingDataset` extends the `COCO` dataset by supporting loading texts/captions from the json file. It is designed for `MixedGrounding` or `Flickr30K`, which provide text tokens for each object.
### 🔥 Custom Datasets
For custom datasets, we suggest converting the annotation files according to your usage. Note that converting the annotations to the **standard COCO format** is required in most cases.
1. **Large vocabulary, grounding, referring:** you can follow the annotation format of the `MixedGrounding` dataset, which adds `caption` and `tokens_positive` to assign a text to each object. The texts can be a category name or a noun phrase.
2. **Custom vocabulary (fixed):** you can adopt the `MultiModalDataset` wrapper, as for `Objects365`, and create a **text json** for your custom categories (see the sketch after this list).
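As a minimal illustration, the **text json** for a fixed custom vocabulary can be generated with a few lines of Python (the category names and output path below are placeholders):
```python
import json
import os

# Hypothetical custom categories; each entry is a list of one or more
# synonymous prompts for a single class.
your_classes = ['helmet', 'safety vest', 'person']
texts = [[name] for name in your_classes]

os.makedirs('data/texts', exist_ok=True)
with open('data/texts/your_class_texts.json', 'w') as f:
    json.dump(texts, f)
```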
### CC3M Pseudo Annotations
The following annotations are generated according to the automatic labeling process described in our paper, and we report the results based on these annotations.
To use CC3M annotations, you need to prepare the `CC3M` images first.
| Data | Images | Boxes | File |
| :--: | :----: | :---: | :---: |
| CC3M-246K | 246,363 | 820,629 | [Download 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/cc3m_pseudo_annotations.json) |
| CC3M-500K | 536,405 | 1,784,405| [Download 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/cc3m_pseudo_500k_annotations.json) |
| CC3M-750K | 750,000 | 4,504,805 | [Download 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/cc3m_pseudo_750k_annotations.json) |
## Deploy YOLO-World
- [x] ONNX export
- [x] ONNX demo
- [ ] TensorRT
- [ ] TFLite
We provide several ways to deploy YOLO-World with ONNX or TensorRT.
### Preliminaries
```bash
pip install supervision onnx onnxruntime onnxsim
```
### Export ONNX on Gradio Demo
Start `demo.py`; you can then modify the texts in the demo and export the ONNX model.
```bash
python demo.py path/to/config path/to/weights
```
### Export YOLO-World to ONNX models
You can also use [`export_onnx.py`](../deploy/export_onnx.py) to obtain the ONNX model. You can specify `--custom-text` with your own `Text JSON` for custom prompts. The format of the `Text JSON` can be found in [`docs/data`](/data.md).
```bash
PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11
```
If you don't want to include `NMS` or post-processing in the ONNX model, you can add `--without-nms`:
```bash
PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 --without-nms
```
If you want to quantize the YOLO-World ONNX model, you should remove `NMS` and the `bbox_decoder` by adding `--without-bbox-decoder`:
```bash
PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 --without-bbox-decoder
```
**Running ONNX demo**
```bash
python deploy/onnx_demo.py path/to/model.onnx path/to/images path/to/texts
```
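To quickly sanity-check an exported model before running the demo, a minimal `onnxruntime` sketch (the file path is a placeholder; this assumes the default export with NMS, i.e. four outputs) could look like:
```python
import numpy as np
import onnxruntime as ort

# Load the exported model and feed a dummy batch through it.
session = ort.InferenceSession('path/to/model.onnx',
                               providers=['CPUExecutionProvider'])
dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)
outputs = session.run(None, {'images': dummy})
for meta, out in zip(session.get_outputs(), outputs):
    print(meta.name, out.shape)  # expect num_dets, boxes, scores, labels
```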
### Export YOLO-World to TensorRT models
coming soon.
### FAQ
**Q1**. `RuntimeError: Exporting the operator einsum to ONNX opset version 11 is not supported. Support for this operator was added in version 12, try exporting with this version.`
**A:** This error arises because YOLO-World uses `einsum` for matrix multiplication, which is not supported by `opset 11`. You can raise `--opset` from `11` to `12` if your runtime supports it, or replace the `einsum` with plain `permute/reshape/multiplication` by setting `use_einsum=False` in the `MaxSigmoidCSPLayerWithTwoConv` and `YOLOWorldHeadModule`. You can refer to the [sample config](../configs/pretrain/yolo_world_v2_m_vlpan_bn_noeinsum_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) without einsum; a sketch of the override follows below.
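A minimal sketch of the corresponding config override (a sketch only; check the linked no-einsum config for the exact structure):
```python
# Disable einsum in the neck blocks and the head so the exported graph only
# uses permute/reshape/multiplication, which opset 11 supports.
model = dict(
    neck=dict(block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv',
                             use_einsum=False)),
    bbox_head=dict(head_module=dict(type='YOLOWorldHeadModule',
                                    use_einsum=False)))
```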
## Frequently Asked Questions (FAQ)
1. `Incorrect path_or_model_id`
```bash
OSError: class `YOLOWorldDetector` in yolo_world/models/detectors/yolo_world.py: class `MultiModalYOLOBackbone` in yolo_world/models/backbones/mm_backbone.py: class `HuggingCLIPLanguageBackbone` in yolo_world/models/backbones/mm_backbone.py: Incorrect path_or_model_id: '../pretrained_models/clip-vit-base-patch32-projection'. Please provide either the path to a local folder or the repo_id of a model on the Hub.
```
**Solution:** the config points `HuggingCLIPLanguageBackbone` to a local path (`../pretrained_models/clip-vit-base-patch32-projection`) that does not exist on your machine. Either download the CLIP model (e.g., `openai/clip-vit-base-patch32`) into that local folder, or change the `model_name` in the config to a valid local path or HuggingFace repo id.
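For reference, the relevant part of the config looks roughly like the sketch below (the Hub id is the usual public CLIP checkpoint; adapt it to your local path if you work offline):
```python
# Inside the backbone config: point the text model to a valid location.
text_model = dict(
    type='HuggingCLIPLanguageBackbone',
    model_name='openai/clip-vit-base-patch32',  # or a local folder you downloaded
    frozen_modules=['all'])
```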
## Fine-tuning YOLO-World
Fine-tuning YOLO-World is easy, and we provide a sample for COCO object detection as simple guidance.
### Fine-tuning Requirements
Fine-tuning YOLO-World is cheap:
* it does not require 32 GPUs or multi-node distributed training; **8 GPUs or even 1 GPU** is enough.
* it does not require a long schedule, *e.g.,* the 300 or 500 epochs used to train YOLOv5 or YOLOv8; **80 epochs or fewer** are enough, considering that we provide good pre-trained weights.
### Data Preparation
The fine-tuning dataset should have a similar format to that of the pre-training dataset.
We suggest you refer to [`docs/data`](./data.md) for more details about how to build the datasets:
* if you fine-tune YOLO-World for close-set / custom vocabulary object detection, using `MultiModalDataset` with a `text json` is preferred.
* if you fine-tune YOLO-World for open-vocabulary detection with rich texts or grounding tasks, using `MixedGroundingDataset` is preferred.
### Hyper-parameters and Config
Please refer to the [config for fine-tuning YOLO-World-L on COCO](../configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_finetune_coco.py) for more details.
1. Basic config file:
If the fine-tuning dataset **contains mask annotations**:
```python
_base_ = ('../../third_party/mmyolo/configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py')
```
If the fine-tuning dataset **doesn't contain mask annotations**:
```python
_base_ = ('../../third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py')
```
2. Training Schemes:
Reduce the number of epochs and adjust the learning rate (a sketch of the corresponding optimizer settings follows this list):
```python
max_epochs = 80
base_lr = 2e-4
weight_decay = 0.05
train_batch_size_per_gpu = 16
close_mosaic_epochs=10
train_cfg = dict(
max_epochs=max_epochs,
val_interval=5,
dynamic_intervals=[((max_epochs - close_mosaic_epochs),
_base_.val_interval_stage2)])
```
3. Datasets:
```python
coco_train_dataset = dict(
_delete_=True,
type='MultiModalDataset',
dataset=dict(
type='YOLOv5CocoDataset',
data_root='data/coco',
ann_file='annotations/instances_train2017.json',
data_prefix=dict(img='train2017/'),
filter_cfg=dict(filter_empty_gt=False, min_size=32)),
class_text_path='data/texts/coco_class_texts.json',
pipeline=train_pipeline)
```
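A hedged sketch of how the hyper-parameters from step 2 feed the optimizer wrapper in MMYOLO-style configs (the reference config may add param-wise settings, e.g., a lower learning rate for the text encoder; check it before copying):
```python
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay))
```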
#### Finetuning without RepVL-PAN or Text Encoder 🚀
For further efficiency and simplicity, we can fine-tune an efficient version of YOLO-World without RepVL-PAN and the text encoder.
The efficient version of YOLO-World has a similar architecture and layers to the original YOLOv8, but we provide weights pre-trained on large-scale datasets.
The pre-trained YOLO-World has strong generalization capabilities and is more robust than YOLOv8 trained only on the COCO dataset.
You can refer to the [config for Efficient YOLO-World](./../configs/finetune_coco/yolo_world_l_efficient_neck_2e-4_80e_8gpus_finetune_coco.py) for more details.
The efficient YOLO-World adopts `EfficientCSPLayerWithTwoConv` and the text encoder can be removed during inference or exporting models.
```python
model = dict(
type='YOLOWorldDetector',
mm_neck=True,
neck=dict(type='YOLOWorldPAFPN',
guide_channels=text_channels,
embed_channels=neck_embed_channels,
num_heads=neck_num_heads,
block_cfg=dict(type='EfficientCSPLayerWithTwoConv')))
```
### Launch Fine-tuning!
It's easy:
```bash
./dist_train.sh <path/to/config> <NUM_GPUS> --amp
```
## Installation Guide
We provide the `requirements` files in [./requirements](./../requirements/):
* `basic_requirements`: training, finetuning, evaluation.
* `demo_requirements`: running YOLO-World [demos](./../demo/).
* `onnx_requirements`: converting YOLO-World to ONNX or TFLite models (TFLite is coming soon).
#### Install `MMCV`
YOLO-World adopts `mmcv>=2.0.0`. There are several ways to install `mmcv`:
**1. using `openmim`**:
See more in the [official guide](https://github.com/open-mmlab/mmcv/tree/master?tab=readme-ov-file#install-mmcv-full).
```bash
pip install openmim
mim install mmcv==2.0.0
```
**2. using `pip`**:
Go to [install-with-pip](https://mmcv.readthedocs.io/en/latest/get_started/installation.html#install-with-pip) to select the appropriate pip index.
```bash
# cuda=11.3, torch=1.11
pip install mmcv==2.0.0 -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11/index.html
# cuda=11.7, torch=1.13
pip install mmcv==2.2.0 -f https://download.openmmlab.com/mmcv/dist/cu117/torch1.13/index.html
# cuda=12.1, torch=2.1
pip install mmcv==2.1.0 -f https://download.openmmlab.com/mmcv/dist/cu121/torch2.1/index.html
```
**3. using `whl`**:
Go to the [index of packages](https://download.openmmlab.com/mmcv/dist/cu117/torch1.13/index.html) to find a suitable version and download it.
```bash
pip install mmcv-2.0.1-cp38-cp38-manylinux1_x86_64.whl
```
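After installation, a quick import check confirms the installed version:
```python
# Verify that mmcv is importable and report the installed version.
import mmcv
print(mmcv.__version__)
```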
## Prompt YOLO-World
### 1. Simple YOLO-World with Embeddings
To simplify YOLO-World and get rid of the language model, we define a new basic detector, `YOLOWorldPromptDetector`:
The `YOLOWorldPromptDetector` takes prompt embeddings as input and no longer contains a language model!
Now, YOLO-World adopts `embeddings` as the language input, and the embeddings can be of several kinds: (1) text embeddings from a language model, e.g., the CLIP language encoder, (2) image embeddings from a vision model, e.g., the CLIP vision encoder, (3) image-text fused embeddings, and (4) random embeddings.
Kinds (1), (2), and (3) support zero-shot inference, while (4), together with (1), (2), and (3), is designed for prompt tuning on your custom data.
The basic detector is defined as follows:
```python
class YOLOWorldPromptDetector(YOLODetector):
"""Implementation of YOLO World Series"""
def __init__(self,
*args,
mm_neck: bool = False,
num_train_classes=80,
num_test_classes=80,
prompt_dim=512,
num_prompts=80,
embedding_path='',
freeze_prompt=False,
use_mlp_adapter=False,
**kwargs)
```
To use it in a zero-shot manner, you need to pre-compute the text embeddings (or image embeddings) and save them as a `numpy array (*.npy)` with shape `NxD` (N is the number of prompts and D is the dimension of the embeddings). Currently, we only support one prompt per class. You can use several prompts for one class, but then you need to merge the results in the post-processing step.
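A minimal sketch for pre-computing text embeddings with the HuggingFace CLIP model (the prompts and output filename are placeholders; the repo's own tooling, e.g., `tools/generate_text_prompts.py`, may differ in details such as normalization):
```python
import numpy as np
import torch
from transformers import CLIPModel, CLIPTokenizer

prompts = ['person', 'bicycle', 'car']  # one prompt per class -> N prompts
model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
tokenizer = CLIPTokenizer.from_pretrained('openai/clip-vit-base-patch32')

with torch.no_grad():
    inputs = tokenizer(prompts, padding=True, return_tensors='pt')
    feats = model.get_text_features(**inputs)         # shape N x D (D = 512)
    feats = feats / feats.norm(dim=-1, keepdim=True)  # L2-normalize

np.save('custom_prompts.npy', feats.numpy())          # load via `embedding_path`
```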
### 2. Prompt Tuning YOLO-World
We introduce prompt tuning for YOLO-World to maintain the zero-shot ability while improving the performance on your custom datasets.
For more details about writing configs for prompt tuning, you can refer to [`prompt tuning for COCO data`](./../configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.py).
1. Use random prompts
```python
dict(type='YOLOWorldPromptDetector',
mm_neck=True,
num_train_classes=num_training_classes,
num_test_classes=num_classes,
prompt_dim=text_channels,
num_prompts=80,
...)
```
2. Use CLIP embeddings (text, image, or text-image embeddings)
The `clip_vit_b32_coco_80_embeddings.npy` file can be downloaded from [HuggingFace](https://huggingface.co/wondervictor/YOLO-World/blob/main/clip_vit_b32_coco_80_embeddings.npy).
```python
dict(type='YOLOWorldPromptDetector',
mm_neck=True,
num_train_classes=num_training_classes,
num_test_classes=num_classes,
embedding_path='embeddings/clip_vit_b32_coco_80_embeddings.npy',
prompt_dim=text_channels,
num_prompts=80,
...)
```
Using the CLIP model to obtain the image and text embeddings maintains the zero-shot performance.
| Model | Config | AP | AP50 | AP75 | APS | APM | APL |
| :---- | :----: | :--: | :--: | :---: | :-: | :-: | :-: |
| YOLO-World-v2-L | Zero-shot | 45.7 | 61.6 | 49.8 | 29.9 | 50.0 | 60.8 |
| [YOLO-World-v2-L](./../configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.py) | Prompt tuning | 47.9 | 64.3 | 52.5 | 31.9 | 52.6 | 61.3 |
## Reparameterize YOLO-World
The reparameterization incorporates text embeddings as parameters into the model. For example, in the final classification layer, text embeddings are reparameterized into a simple 1x1 convolutional layer.
<div align="center">
<img width="600" src="../assets/reparameterize.png">
</div>
### Key Advantages from Reparameterization
> Reparameterized YOLO-World still has zero-shot ability!
* **Efficiency:** the reparameterized YOLO-World has a simple and efficient architecture, e.g., a `conv1x1` is faster than `transpose & matmul`. In addition, it enables further optimization for deployment.
* **Accuracy:** the reparameterized YOLO-World supports fine-tuning. Compared to normal `fine-tuning` or `prompt tuning`, **the reparameterized version can optimize the `neck` and `head` independently**, since the `neck` and `head` have different parameters and no longer depend on the `text embeddings`!
For example, fine-tuning the **reparameterized YOLO-World** obtains *46.3 AP* on COCO *val2017* while fine-tuning the normal version obtains *46.1 AP*, with all hyper-parameters kept the same.
### Getting Started
#### 1. Prepare custom text embeddings
You need to generate the text embeddings with [`tools/generate_text_prompts.py`](../tools/generate_text_prompts.py) and save them as a `numpy.array` with shape `NxD`.
#### 2. Reparameterizing
Reparameterizing will generate a new checkpoint with text embeddings!
Check those files first:
* model checkpoint
* text embeddings
We mainly reparameterize two groups of modules:
* head (`YOLOWorldHeadModule`)
* neck (`MaxSigmoidCSPLayerWithTwoConv`)
```bash
python tools/reparameterize_yoloworld.py \
--model path/to/checkpoint \
--out-dir path/to/save/re-parameterized/ \
--text-embed path/to/text/embeddings \
--conv-neck
```
#### 3. Prepare the model config
Please see the sample config: [`finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py`](../configs/finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) for reparameterized training.
* `RepConvMaxSigmoidCSPLayerWithTwoConv`:
```python
neck=dict(type='YOLOWorldPAFPN',
guide_channels=num_classes,
embed_channels=neck_embed_channels,
num_heads=neck_num_heads,
block_cfg=dict(type='RepConvMaxSigmoidCSPLayerWithTwoConv',
guide_channels=num_classes)),
```
* `RepYOLOWorldHeadModule`:
```python
bbox_head=dict(head_module=dict(type='RepYOLOWorldHeadModule',
embed_dims=text_channels,
num_guide=num_classes,
num_classes=num_classes)),
```
#### 4. Reparameterized Training
**Reparameterized YOLO-World** is easier to fine-tune and can be treated as an enhanced and pre-trained YOLOv8!
You can check [`finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py`](../configs/finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) for more details.
## Run YOLO-World (Quantized) on TF-Lite
- [x] Export YOLO-World to TFLite with INT8 Quantization.
- [x] TFLite demo
### Preliminaries
```bash
pip install onnxruntime onnx onnx-simplifier
pip install tensorflow==2.15.1
```
See [onnx2tf](https://github.com/PINTO0309/onnx2tf) for more details about exporting TFLite models.
Many thanks to the contributor of `onnx2tf`!
### Export TFLite INT8 Quantization models
Please use **Reparameterized YOLO-World** for TFLite!!
1. Prepare the ONNX model
Please export the ONNX model without the `postprocessing` and the `bbox_decoder`: just add `--without-bbox-decoder`!
The `bbox_decoder` is not supported for INT8 quantization, so please take care!
```bash
PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 --without-bbox-decoder
```
2. Generate the calibration samples
We suggest using 100 COCO images to create a simple calibration dataset for quantization.
```python
import os
import random
from PIL import Image, ImageOps
import cv2
import glob
import numpy as np
root = "data/coco/val2017/"
image_list = os.listdir(root)
image_list = [os.path.join(root, f) for f in image_list]
random.shuffle(image_list)
img_datas = []
for idx, file in enumerate(image_list[:100]):
image = Image.open(file).convert('RGB')
# Get sample input data as a numpy array in a method of your choosing.
img_width, img_height = image.size
size = max(img_width, img_height)
image = ImageOps.pad(image, (size, size), method=Image.BILINEAR)
image = image.resize((640, 640), Image.BILINEAR)
tensor_image = np.asarray(image).astype(np.float32)
tensor_image /= 255.0
tensor_image = np.expand_dims(tensor_image, axis=0)
img_datas.append(tensor_image)
calib_datas = np.vstack(img_datas)
print(f'calib_datas.shape: {calib_datas.shape}')
np.save(file='tflite_calibration_data_100_images_640.npy', arr=calib_datas)
```
3. Export ONNX to TFLite using `onnx2tf`
```bash
onnx2tf -i [ONNX] -o [OUTPUT] -oiqt -cind "images" "tflite_calibration_data_100_images_640.npy" "[[[[0.,0.,0.]]]]" "[[[[1.,1.,1.]]]]" -onimc "scores" "bboxes" --verbosity debug
```
We provide a sample TFLite INT8 model: [yolo_world_x_coco_zeroshot_rep_integer_quant.tflite](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_x_coco_zeroshot_rep_integer_quant.tflite)
### Inference using TFLite
```bash
python deploy/tflite_demo.py path/to/tflite path/to/images path/to/texts
```
## Update Notes
We provide the details for important updates of YOLO-World in this note.
### Model Architecture
**[2024-2-29]:** YOLO-World-v2:
1. We removed the `I-PoolingAttention`: though it improves the performance for zero-shot LVIS evaluation, it affects the inference speed after exporting YOLO-World to ONNX or TensorRT. Considering this trade-off, we removed the `I-PoolingAttention` in the newest version.
2. We replaced the `L2-Norm` in the contrastive head with `BatchNorm`. The `L2-Norm` involves complex operations, such as `reduce`, which are time-consuming for deployment, whereas `BatchNorm` can be fused into the convolution, which is much more efficient and also improves the zero-shot performance (a toy sketch follows below).
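For intuition only, a toy sketch of a BatchNorm-based contrastive head (not the repo's actual implementation): image features pass through a `1x1` conv and `BatchNorm2d` (fusable at deploy time), and similarity with the normalized text embeddings gives the per-class logits.
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyBNContrastiveHead(nn.Module):
    """Toy contrastive head: BatchNorm on image features instead of L2-Norm."""

    def __init__(self, feat_channels: int, embed_dims: int):
        super().__init__()
        self.proj = nn.Conv2d(feat_channels, embed_dims, kernel_size=1)
        self.norm = nn.BatchNorm2d(embed_dims)  # fusable into the conv for deployment
        self.logit_scale = nn.Parameter(torch.tensor(1.0))

    def forward(self, x: torch.Tensor, text_embeds: torch.Tensor) -> torch.Tensor:
        # x: (B, C, H, W) image features; text_embeds: (K, D) class embeddings
        x = self.norm(self.proj(x))
        w = F.normalize(text_embeds, dim=-1, p=2)
        return torch.einsum('bchw,kc->bkhw', x, w) * self.logit_scale

head = ToyBNContrastiveHead(feat_channels=256, embed_dims=512)
logits = head(torch.randn(2, 256, 20, 20), torch.randn(80, 512))
print(logits.shape)  # torch.Size([2, 80, 20, 20])
```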
# Unique model identifier
modelCode=673
# Model name
modelName=yolo_world_pytorch
# Model description
modelDescription=Training and inference for YOLO-World, a real-time open-vocabulary object detection model
# Application scenarios
appScenario=training,inference,research,manufacturing,healthcare,home,education
# Framework type
frameType=Pytorch