"megatron/data/gpt_dataset.py" did not exist on "25c07e1467838525ce1a750fc3c43e665d2ad82a"
Commit e9cee049 authored by luopl's avatar luopl
Browse files

Initial commit

parents
Pipeline #1056 canceled with stages
import argparse
import os
import sys
import warnings
from io import BytesIO
from pathlib import Path
import onnx
import torch
from mmdet.apis import init_detector
from mmengine.config import ConfigDict
from mmengine.logging import print_log
from mmengine.utils.path import mkdir_or_exist
# Add MMYOLO ROOT to sys.path
sys.path.append(str(Path(__file__).resolve().parents[3]))
from projects.easydeploy.model import DeployModel, MMYOLOBackend # noqa E402
warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning)
warnings.filterwarnings(action='ignore', category=torch.jit.ScriptWarning)
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=ResourceWarning)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument(
'--model-only', action='store_true', help='Export model only')
parser.add_argument(
'--work-dir', default='./work_dir', help='Path to save export model')
parser.add_argument(
'--img-size',
nargs='+',
type=int,
default=[640, 640],
help='Image size of height and width')
parser.add_argument('--batch-size', type=int, default=1, help='Batch size')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--simplify',
action='store_true',
help='Simplify onnx model by onnx-sim')
parser.add_argument(
'--opset', type=int, default=11, help='ONNX opset version')
parser.add_argument(
'--backend',
type=str,
default='onnxruntime',
help='Backend for export onnx')
parser.add_argument(
'--pre-topk',
type=int,
default=1000,
help='Postprocess pre topk bboxes feed into NMS')
parser.add_argument(
'--keep-topk',
type=int,
default=100,
help='Postprocess keep topk bboxes out of NMS')
parser.add_argument(
'--iou-threshold',
type=float,
default=0.65,
help='IoU threshold for NMS')
parser.add_argument(
'--score-threshold',
type=float,
default=0.25,
help='Score threshold for NMS')
args = parser.parse_args()
args.img_size *= 2 if len(args.img_size) == 1 else 1
return args
def build_model_from_cfg(config_path, checkpoint_path, device):
model = init_detector(config_path, checkpoint_path, device=device)
model.eval()
return model
def main():
args = parse_args()
mkdir_or_exist(args.work_dir)
backend = MMYOLOBackend(args.backend.lower())
if backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO,
MMYOLOBackend.TENSORRT8, MMYOLOBackend.TENSORRT7):
if not args.model_only:
print_log('Export ONNX with bbox decoder and NMS ...')
else:
args.model_only = True
        print_log(f'Cannot export the postprocess for {args.backend.lower()}.\n'
                  f'Setting "args.model_only=True" by default.')
if args.model_only:
postprocess_cfg = None
output_names = None
else:
postprocess_cfg = ConfigDict(
pre_top_k=args.pre_topk,
keep_top_k=args.keep_topk,
iou_threshold=args.iou_threshold,
score_threshold=args.score_threshold)
output_names = ['num_dets', 'boxes', 'scores', 'labels']
baseModel = build_model_from_cfg(args.config, args.checkpoint, args.device)
deploy_model = DeployModel(
baseModel=baseModel, backend=backend, postprocess_cfg=postprocess_cfg)
deploy_model.eval()
fake_input = torch.randn(args.batch_size, 3,
*args.img_size).to(args.device)
# dry run
deploy_model(fake_input)
save_onnx_path = os.path.join(
args.work_dir,
os.path.basename(args.checkpoint).replace('pth', 'onnx'))
# export onnx
with BytesIO() as f:
torch.onnx.export(
deploy_model,
fake_input,
f,
input_names=['images'],
output_names=output_names,
opset_version=args.opset)
f.seek(0)
onnx_model = onnx.load(f)
onnx.checker.check_model(onnx_model)
# Fix tensorrt onnx output shape, just for view
if not args.model_only and backend in (MMYOLOBackend.TENSORRT8,
MMYOLOBackend.TENSORRT7):
shapes = [
args.batch_size, 1, args.batch_size, args.keep_topk, 4,
args.batch_size, args.keep_topk, args.batch_size,
args.keep_topk
]
for i in onnx_model.graph.output:
for j in i.type.tensor_type.shape.dim:
j.dim_param = str(shapes.pop(0))
if args.simplify:
try:
import onnxsim
onnx_model, check = onnxsim.simplify(onnx_model)
assert check, 'assert check failed'
except Exception as e:
print_log(f'Simplify failure: {e}')
onnx.save(onnx_model, save_onnx_path)
print_log(f'ONNX export success, save into {save_onnx_path}')
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
from easydeploy.model import ORTWrapper, TRTWrapper # isort:skip
import os
import random
from argparse import ArgumentParser
import cv2
import mmcv
import numpy as np
import torch
from mmcv.transforms import Compose
from mmdet.utils import get_test_pipeline_cfg
from mmengine.config import Config, ConfigDict
from mmengine.utils import ProgressBar, path
from mmyolo.utils import register_all_modules
from mmyolo.utils.misc import get_file_list
def parse_args():
parser = ArgumentParser()
parser.add_argument(
'img', help='Image path, include image file, dir and URL.')
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument(
'--out-dir', default='./output', help='Path to output file')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--show', action='store_true', help='Show the detection results')
args = parser.parse_args()
return args
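# Build a small normalization module from the config's `data_preprocessor`
# (mean/std), so the raw image tensor can be normalized on the target device.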
def preprocess(config):
data_preprocess = config.get('model', {}).get('data_preprocessor', {})
mean = data_preprocess.get('mean', [0., 0., 0.])
std = data_preprocess.get('std', [1., 1., 1.])
mean = torch.tensor(mean, dtype=torch.float32).reshape(1, 3, 1, 1)
std = torch.tensor(std, dtype=torch.float32).reshape(1, 3, 1, 1)
class PreProcess(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, x):
x = x[None].float()
x -= mean.to(x.device)
x /= std.to(x.device)
return x
return PreProcess().eval()
def main():
args = parse_args()
# register all modules in mmdet into the registries
register_all_modules()
colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(1000)]
# build the model from a config file and a checkpoint file
if args.checkpoint.endswith('.onnx'):
model = ORTWrapper(args.checkpoint, args.device)
elif args.checkpoint.endswith('.engine') or args.checkpoint.endswith(
'.plan'):
model = TRTWrapper(args.checkpoint, args.device)
else:
raise NotImplementedError
model.to(args.device)
cfg = Config.fromfile(args.config)
class_names = cfg.get('class_name')
test_pipeline = get_test_pipeline_cfg(cfg)
test_pipeline[0] = ConfigDict({'type': 'mmdet.LoadImageFromNDArray'})
test_pipeline = Compose(test_pipeline)
pre_pipeline = preprocess(cfg)
if not args.show:
path.mkdir_or_exist(args.out_dir)
# get file list
files, source_type = get_file_list(args.img)
# start detector inference
progress_bar = ProgressBar(len(files))
for i, file in enumerate(files):
bgr = mmcv.imread(file)
rgb = mmcv.imconvert(bgr, 'bgr', 'rgb')
data, samples = test_pipeline(dict(img=rgb, img_id=i)).values()
pad_param = samples.get('pad_param',
np.array([0, 0, 0, 0], dtype=np.float32))
h, w = samples.get('ori_shape', rgb.shape[:2])
pad_param = torch.asarray(
[pad_param[2], pad_param[0], pad_param[2], pad_param[0]],
device=args.device)
scale_factor = samples.get('scale_factor', [1., 1])
scale_factor = torch.asarray(scale_factor * 2, device=args.device)
data = pre_pipeline(data).to(args.device)
result = model(data)
if source_type['is_dir']:
filename = os.path.relpath(file, args.img).replace('/', '_')
else:
filename = os.path.basename(file)
out_file = None if args.show else os.path.join(args.out_dir, filename)
# Get candidate predict info by num_dets
num_dets, bboxes, scores, labels = result
scores = scores[0, :num_dets]
bboxes = bboxes[0, :num_dets]
labels = labels[0, :num_dets]
bboxes -= pad_param
bboxes /= scale_factor
bboxes[:, 0::2].clamp_(0, w)
bboxes[:, 1::2].clamp_(0, h)
bboxes = bboxes.round().int()
for (bbox, score, label) in zip(bboxes, scores, labels):
bbox = bbox.tolist()
color = colors[label]
if class_names is not None:
label_name = class_names[label]
name = f'cls:{label_name}_score:{score:0.4f}'
else:
name = f'cls:{label}_score:{score:0.4f}'
cv2.rectangle(bgr, bbox[:2], bbox[2:], color, 2)
cv2.putText(
bgr,
name, (bbox[0], bbox[1] - 2),
cv2.FONT_HERSHEY_SIMPLEX,
2.0, [225, 255, 255],
thickness=3)
if args.show:
mmcv.imshow(bgr, 'result', 0)
else:
mmcv.imwrite(bgr, out_file)
progress_bar.update()
if __name__ == '__main__':
main()
# # Copyright (c) OpenMMLab. All rights reserved.
import os
import json
import warnings
import argparse
from io import BytesIO
import onnx
import torch
from mmdet.apis import init_detector
from mmengine.config import ConfigDict
from mmengine.logging import print_log
from mmengine.utils.path import mkdir_or_exist
from easydeploy.model import DeployModel, MMYOLOBackend # noqa E402
warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning)
warnings.filterwarnings(action='ignore', category=torch.jit.ScriptWarning)
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=ResourceWarning)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument('--custom-text',
type=str,
help='custom text inputs (text json) for YOLO-World.')
parser.add_argument('--add-padding',
action="store_true",
help="add an empty padding to texts.")
parser.add_argument('--model-only',
action='store_true',
help='Export model only')
parser.add_argument('--without-nms',
action='store_true',
help='Export model without NMS')
parser.add_argument('--without-bbox-decoder',
action='store_true',
help='Export model without Bbox Decoder (for INT8 Quantization)')
parser.add_argument('--work-dir',
default='./work_dirs',
help='Path to save export model')
parser.add_argument('--img-size',
nargs='+',
type=int,
default=[640, 640],
help='Image size of height and width')
parser.add_argument('--batch-size', type=int, default=1, help='Batch size')
parser.add_argument('--device',
default='cuda:0',
help='Device used for inference')
parser.add_argument('--simplify',
action='store_true',
help='Simplify onnx model by onnx-sim')
parser.add_argument('--opset',
type=int,
default=11,
help='ONNX opset version')
parser.add_argument('--backend',
type=str,
default='onnxruntime',
help='Backend for export onnx')
parser.add_argument('--pre-topk',
type=int,
default=1000,
help='Postprocess pre topk bboxes feed into NMS')
parser.add_argument('--keep-topk',
type=int,
default=100,
help='Postprocess keep topk bboxes out of NMS')
parser.add_argument('--iou-threshold',
type=float,
default=0.65,
help='IoU threshold for NMS')
parser.add_argument('--score-threshold',
type=float,
default=0.25,
help='Score threshold for NMS')
args = parser.parse_args()
args.img_size *= 2 if len(args.img_size) == 1 else 1
return args
def build_model_from_cfg(config_path, checkpoint_path, device):
model = init_detector(config_path, checkpoint_path, device=device)
model.eval()
return model
def main():
args = parse_args()
mkdir_or_exist(args.work_dir)
backend = MMYOLOBackend(args.backend.lower())
if backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO,
MMYOLOBackend.TENSORRT8, MMYOLOBackend.TENSORRT7):
if not args.model_only:
print_log('Export ONNX with bbox decoder and NMS ...')
else:
args.model_only = True
        print_log(f'Cannot export the postprocess for {args.backend.lower()}.\n'
                  f'Setting "args.model_only=True" by default.')
if args.model_only:
postprocess_cfg = None
output_names = None
else:
postprocess_cfg = ConfigDict(pre_top_k=args.pre_topk,
keep_top_k=args.keep_topk,
iou_threshold=args.iou_threshold,
score_threshold=args.score_threshold)
output_names = ['num_dets', 'boxes', 'scores', 'labels']
if args.without_bbox_decoder or args.without_nms:
output_names = ['scores', 'boxes']
if args.custom_text is not None and len(args.custom_text) > 0:
with open(args.custom_text) as f:
texts = json.load(f)
texts = [x[0] for x in texts]
else:
from mmdet.datasets import CocoDataset
texts = CocoDataset.METAINFO['classes']
if args.add_padding:
texts = texts + [' ']
baseModel = build_model_from_cfg(args.config, args.checkpoint, args.device)
if hasattr(baseModel, 'reparameterize'):
# reparameterize text into YOLO-World
baseModel.reparameterize([texts])
deploy_model = DeployModel(baseModel=baseModel,
backend=backend,
postprocess_cfg=postprocess_cfg,
with_nms=not args.without_nms,
without_bbox_decoder=args.without_bbox_decoder)
deploy_model.eval()
fake_input = torch.randn(args.batch_size, 3,
*args.img_size).to(args.device)
# dry run
deploy_model(fake_input)
save_onnx_path = os.path.join(
args.work_dir,
os.path.basename(args.checkpoint).replace('pth', 'onnx'))
# export onnx
with BytesIO() as f:
torch.onnx.export(deploy_model,
fake_input,
f,
input_names=['images'],
output_names=output_names,
opset_version=args.opset)
f.seek(0)
onnx_model = onnx.load(f)
onnx.checker.check_model(onnx_model)
# Fix tensorrt onnx output shape, just for view
if not args.model_only and not args.without_nms and backend in (
MMYOLOBackend.TENSORRT8, MMYOLOBackend.TENSORRT7):
shapes = [
args.batch_size, 1, args.batch_size, args.keep_topk, 4,
args.batch_size, args.keep_topk, args.batch_size,
args.keep_topk
]
for i in onnx_model.graph.output:
for j in i.type.tensor_type.shape.dim:
j.dim_param = str(shapes.pop(0))
if args.simplify:
try:
import onnxsim
onnx_model, check = onnxsim.simplify(onnx_model)
assert check, 'assert check failed'
except Exception as e:
print_log(f'Simplify failure: {e}')
onnx.save(onnx_model, save_onnx_path)
print_log(f'ONNX export success, save into {save_onnx_path}')
if __name__ == '__main__':
main()
import os
import json
import argparse
import os.path as osp
import cv2
import numpy as np
import supervision as sv
import onnxruntime as ort
from mmengine.utils import ProgressBar
try:
import torch
from torchvision.ops import nms
except Exception as e:
print(e)
BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=1)
MASK_ANNOTATOR = sv.MaskAnnotator()
class LabelAnnotator(sv.LabelAnnotator):
@staticmethod
def resolve_text_background_xyxy(
center_coordinates,
text_wh,
position,
):
center_x, center_y = center_coordinates
text_w, text_h = text_wh
return center_x, center_y, center_x + text_w, center_y + text_h
LABEL_ANNOTATOR = LabelAnnotator(text_padding=4,
text_scale=0.5,
text_thickness=1)
def parse_args():
parser = argparse.ArgumentParser('YOLO-World ONNX Demo')
parser.add_argument('onnx', help='onnx file')
parser.add_argument('image', help='image path, include image file or dir.')
parser.add_argument(
'text',
help=
'detecting texts (str or json), should be consistent with the ONNX model'
)
parser.add_argument('--output-dir',
default='./output',
help='directory to save output files')
parser.add_argument('--device',
default='cuda:0',
help='device used for inference')
parser.add_argument(
'--onnx-nms',
action='store_false',
help='whether ONNX model contains NMS and postprocessing')
args = parser.parse_args()
return args
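# Letterbox-style preprocessing: pad the image to a square with zeros, resize it
# to `size`, and scale pixel values to [0, 1]; returns the image (NHWC, batched),
# the resize scale factor, and the (pad_h, pad_w) offsets for mapping boxes back.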
def preprocess(image, size=(640, 640)):
h, w = image.shape[:2]
max_size = max(h, w)
scale_factor = size[0] / max_size
pad_h = (max_size - h) // 2
pad_w = (max_size - w) // 2
pad_image = np.zeros((max_size, max_size, 3), dtype=image.dtype)
pad_image[pad_h:h + pad_h, pad_w:w + pad_w] = image
image = cv2.resize(pad_image, size,
interpolation=cv2.INTER_LINEAR).astype('float32')
image /= 255.0
image = image[None]
return image, scale_factor, (pad_h, pad_w)
def visualize(image, bboxes, labels, scores, texts):
detections = sv.Detections(xyxy=bboxes, class_id=labels, confidence=scores)
labels = [
f"{texts[class_id][0]} {confidence:0.2f}" for class_id, confidence in
zip(detections.class_id, detections.confidence)
]
image = BOUNDING_BOX_ANNOTATOR.annotate(image, detections)
image = LABEL_ANNOTATOR.annotate(image, detections, labels=labels)
return image
def inference(ort_session,
image_path,
texts,
output_dir,
size=(640, 640),
**kwargs):
# normal export
# with NMS and postprocessing
ori_image = cv2.imread(image_path)
h, w = ori_image.shape[:2]
image, scale_factor, pad_param = preprocess(ori_image[:, :, [2, 1, 0]],
size)
input_ort = ort.OrtValue.ortvalue_from_numpy(image.transpose((0, 3, 1, 2)))
results = ort_session.run(["num_dets", "labels", "scores", "boxes"],
{"images": input_ort})
num_dets, labels, scores, bboxes = results
num_dets = num_dets[0][0]
labels = labels[0, :num_dets]
scores = scores[0, :num_dets]
bboxes = bboxes[0, :num_dets]
bboxes -= np.array(
[pad_param[1], pad_param[0], pad_param[1], pad_param[0]])
bboxes /= scale_factor
bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, w)
bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, h)
bboxes = bboxes.round().astype('int')
image_out = visualize(ori_image, bboxes, labels, scores, texts)
cv2.imwrite(osp.join(output_dir, osp.basename(image_path)), image_out)
return image_out
def inference_with_postprocessing(ort_session,
image_path,
texts,
output_dir,
size=(640, 640),
nms_thr=0.7,
score_thr=0.3,
max_dets=300):
# export with `--without-nms`
ori_image = cv2.imread(image_path)
h, w = ori_image.shape[:2]
image, scale_factor, pad_param = preprocess(ori_image[:, :, [2, 1, 0]],
size)
input_ort = ort.OrtValue.ortvalue_from_numpy(image.transpose((0, 3, 1, 2)))
results = ort_session.run(["scores", "boxes"], {"images": input_ort})
scores, bboxes = results
# move numpy array to torch
ori_scores = torch.from_numpy(scores[0]).to('cuda:0')
ori_bboxes = torch.from_numpy(bboxes[0]).to('cuda:0')
scores_list = []
labels_list = []
bboxes_list = []
# class-specific NMS
for cls_id in range(len(texts)):
cls_scores = ori_scores[:, cls_id]
labels = torch.ones(cls_scores.shape[0], dtype=torch.long) * cls_id
keep_idxs = nms(ori_bboxes, cls_scores, iou_threshold=nms_thr)
cur_bboxes = ori_bboxes[keep_idxs]
cls_scores = cls_scores[keep_idxs]
labels = labels[keep_idxs]
scores_list.append(cls_scores)
labels_list.append(labels)
bboxes_list.append(cur_bboxes)
scores = torch.cat(scores_list, dim=0)
labels = torch.cat(labels_list, dim=0)
bboxes = torch.cat(bboxes_list, dim=0)
keep_idxs = scores > score_thr
scores = scores[keep_idxs]
labels = labels[keep_idxs]
bboxes = bboxes[keep_idxs]
    if scores.shape[0] > max_dets:
_, sorted_idx = torch.sort(scores, descending=True)
keep_idxs = sorted_idx[:max_dets]
bboxes = bboxes[keep_idxs]
scores = scores[keep_idxs]
labels = labels[keep_idxs]
# Get candidate predict info by num_dets
scores = scores.cpu().numpy()
bboxes = bboxes.cpu().numpy()
labels = labels.cpu().numpy()
bboxes -= np.array(
[pad_param[1], pad_param[0], pad_param[1], pad_param[0]])
bboxes /= scale_factor
bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, w)
bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, h)
bboxes = bboxes.round().astype('int')
image_out = visualize(ori_image, bboxes, labels, scores, texts)
cv2.imwrite(osp.join(output_dir, osp.basename(image_path)), image_out)
return image_out
def main():
args = parse_args()
onnx_file = args.onnx
# init ONNX session
ort_session = ort.InferenceSession(
onnx_file, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
print("Init ONNX Runtime session")
output_dir = "onnx_outputs"
if not osp.exists(output_dir):
os.mkdir(output_dir)
# load images
if not osp.isfile(args.image):
images = [
osp.join(args.image, img) for img in os.listdir(args.image)
if img.endswith('.png') or img.endswith('.jpg')
]
else:
images = [args.image]
if args.text.endswith('.txt'):
with open(args.text) as f:
lines = f.readlines()
texts = [[t.rstrip('\r\n')] for t in lines]
elif args.text.endswith('.json'):
texts = json.load(open(args.text))
else:
texts = [[t.strip()] for t in args.text.split(',')]
print("Start to inference.")
progress_bar = ProgressBar(len(images))
if args.onnx_nms:
inference_func = inference
else:
inference_func = inference_with_postprocessing
for img in images:
inference_func(ort_session, img, texts, output_dir=output_dir)
progress_bar.update()
print("Finish inference")
if __name__ == "__main__":
main()
import os
import json
import argparse
import os.path as osp
import cv2
import tqdm
import torch
import numpy as np
import tensorflow as tf
import supervision as sv
from torchvision.ops import nms
BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=1)
MASK_ANNOTATOR = sv.MaskAnnotator()
class LabelAnnotator(sv.LabelAnnotator):
@staticmethod
def resolve_text_background_xyxy(
center_coordinates,
text_wh,
position,
):
center_x, center_y = center_coordinates
text_w, text_h = text_wh
return center_x, center_y, center_x + text_w, center_y + text_h
LABEL_ANNOTATOR = LabelAnnotator(text_padding=4,
text_scale=0.5,
text_thickness=1)
def parse_args():
parser = argparse.ArgumentParser('YOLO-World TFLite (INT8) Demo')
parser.add_argument('path', help='TFLite Model `.tflite`')
parser.add_argument('image', help='image path, include image file or dir.')
parser.add_argument(
'text',
help=
'detecting texts (str, txt, or json), should be consistent with the ONNX model'
)
parser.add_argument('--output-dir',
default='./output',
help='directory to save output files')
args = parser.parse_args()
return args
def preprocess(image, size=(640, 640)):
h, w = image.shape[:2]
max_size = max(h, w)
scale_factor = size[0] / max_size
pad_h = (max_size - h) // 2
pad_w = (max_size - w) // 2
pad_image = np.zeros((max_size, max_size, 3), dtype=image.dtype)
pad_image[pad_h:h + pad_h, pad_w:w + pad_w] = image
image = cv2.resize(pad_image, size,
interpolation=cv2.INTER_LINEAR).astype('float32')
image /= 255.0
image = image[None]
return image, scale_factor, (pad_h, pad_w)
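# Generate the (x, y) anchor points for one feature level: grid-cell centers
# shifted by `offset` and scaled by the stride.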
def generate_anchors_per_level(feat_size, stride, offset=0.5):
h, w = feat_size
shift_x = (torch.arange(0, w) + offset) * stride
shift_y = (torch.arange(0, h) + offset) * stride
yy, xx = torch.meshgrid(shift_y, shift_x)
anchors = torch.stack([xx, yy]).reshape(2, -1).transpose(0, 1)
return anchors
def generate_anchors(feat_sizes=[(80, 80), (40, 40), (20, 20)],
strides=[8, 16, 32],
offset=0.5):
anchors = [
generate_anchors_per_level(fs, s, offset)
for fs, s in zip(feat_sizes, strides)
]
anchors = torch.cat(anchors)
return anchors
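# Decode (left, top, right, bottom) distance predictions, scaled by the
# per-point stride, into (x1, y1, x2, y2) boxes around the anchor points.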
def simple_bbox_decode(points, pred_bboxes, stride):
pred_bboxes = pred_bboxes * stride[None, :, None]
x1 = points[..., 0] - pred_bboxes[..., 0]
y1 = points[..., 1] - pred_bboxes[..., 1]
x2 = points[..., 0] + pred_bboxes[..., 2]
y2 = points[..., 1] + pred_bboxes[..., 3]
bboxes = torch.stack([x1, y1, x2, y2], -1)
return bboxes
def visualize(image, bboxes, labels, scores, texts):
detections = sv.Detections(xyxy=bboxes, class_id=labels, confidence=scores)
labels = [
f"{texts[class_id][0]} {confidence:0.2f}" for class_id, confidence in
zip(detections.class_id, detections.confidence)
]
image = BOUNDING_BOX_ANNOTATOR.annotate(image, detections)
image = LABEL_ANNOTATOR.annotate(image, detections, labels=labels)
return image
def inference_per_sample(interp,
image_path,
texts,
priors,
strides,
output_dir,
size=(640, 640),
vis=False,
score_thr=0.05,
nms_thr=0.3,
max_dets=300):
# input / output details from TFLite
input_details = interp.get_input_details()
output_details = interp.get_output_details()
# load image from path
ori_image = cv2.imread(image_path)
h, w = ori_image.shape[:2]
image, scale_factor, pad_param = preprocess(ori_image[:, :, [2, 1, 0]],
size)
# inference
interp.set_tensor(input_details[0]['index'], image)
interp.invoke()
scores = interp.get_tensor(output_details[1]['index'])
bboxes = interp.get_tensor(output_details[0]['index'])
    # can be converted to numpy for other devices;
    # using torch here is only for reference.
ori_scores = torch.from_numpy(scores[0])
ori_bboxes = torch.from_numpy(bboxes)
    # decode bbox coordinates with priors
decoded_bboxes = simple_bbox_decode(priors, ori_bboxes, strides)[0]
scores_list = []
labels_list = []
bboxes_list = []
for cls_id in range(len(texts)):
cls_scores = ori_scores[:, cls_id]
labels = torch.ones(cls_scores.shape[0], dtype=torch.long) * cls_id
keep_idxs = nms(decoded_bboxes, cls_scores, iou_threshold=0.5)
cur_bboxes = decoded_bboxes[keep_idxs]
cls_scores = cls_scores[keep_idxs]
labels = labels[keep_idxs]
scores_list.append(cls_scores)
labels_list.append(labels)
bboxes_list.append(cur_bboxes)
scores = torch.cat(scores_list, dim=0)
labels = torch.cat(labels_list, dim=0)
bboxes = torch.cat(bboxes_list, dim=0)
keep_idxs = scores > score_thr
scores = scores[keep_idxs]
labels = labels[keep_idxs]
bboxes = bboxes[keep_idxs]
# only for visualization, add an extra NMS
keep_idxs = nms(bboxes, scores, iou_threshold=nms_thr)
num_dets = min(len(keep_idxs), max_dets)
bboxes = bboxes[keep_idxs].unsqueeze(0)
scores = scores[keep_idxs].unsqueeze(0)
labels = labels[keep_idxs].unsqueeze(0)
scores = scores[0, :num_dets].numpy()
bboxes = bboxes[0, :num_dets].numpy()
labels = labels[0, :num_dets].numpy()
bboxes -= np.array(
[pad_param[1], pad_param[0], pad_param[1], pad_param[0]])
bboxes /= scale_factor
bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, w)
bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, h)
if vis:
image_out = visualize(ori_image, bboxes, labels, scores, texts)
cv2.imwrite(osp.join(output_dir, osp.basename(image_path)), image_out)
print(f"detecting {num_dets} objects.")
return image_out, ori_scores, ori_bboxes[0]
else:
return bboxes, labels, scores
def main():
args = parse_args()
    tflite_file = args.path
    # init TFLite interpreter
interpreter = tf.lite.Interpreter(model_path=tflite_file,
experimental_preserve_all_tensors=True)
interpreter.allocate_tensors()
print("Init TFLite Interpter")
output_dir = "onnx_outputs"
if not osp.exists(output_dir):
os.mkdir(output_dir)
# load images
if not osp.isfile(args.image):
images = [
osp.join(args.image, img) for img in os.listdir(args.image)
if img.endswith('.png') or img.endswith('.jpg')
]
else:
images = [args.image]
if args.text.endswith('.txt'):
with open(args.text) as f:
lines = f.readlines()
texts = [[t.rstrip('\r\n')] for t in lines]
elif args.text.endswith('.json'):
texts = json.load(open(args.text))
else:
texts = [[t.strip()] for t in args.text.split(',')]
size = (640, 640)
strides = [8, 16, 32]
    # prepare anchors: the TFLite model does not contain anchors due to INT8 quantization.
featmap_sizes = [(size[0] // s, size[1] // s) for s in strides]
flatten_priors = generate_anchors(featmap_sizes, strides=strides)
mlvl_strides = [
flatten_priors.new_full((featmap_size[0] * featmap_size[1] * 1, ),
stride)
for featmap_size, stride in zip(featmap_sizes, strides)
]
flatten_strides = torch.cat(mlvl_strides)
print("Start to inference.")
for img in tqdm.tqdm(images):
inference_per_sample(interpreter,
img,
texts,
flatten_priors[None],
flatten_strides,
output_dir=output_dir,
vis=True,
score_thr=0.3,
nms_thr=0.5)
print("Finish inference")
if __name__ == "__main__":
main()
## Preparing Data for YOLO-World
### Overview
For pre-training YOLO-World, we adopt several datasets, as listed in the table below:
| Data | Samples | Type | Boxes |
| :-- | :-----: | :---:| :---: |
| Objects365v1 | 609k | detection | 9,621k |
| GQA | 621k | grounding | 3,681k |
| Flickr | 149k | grounding | 641k |
| CC3M-Lite | 245k | image-text | 821k |
### Dataset Directory
We put all data into the `data` directory, such as:
```bash
├── coco
│ ├── annotations
│ ├── lvis
│ ├── train2017
│ ├── val2017
├── flickr
│ ├── annotations
│ └── images
├── mixed_grounding
│ ├── annotations
│ ├── images
├── objects365v1
│ ├── annotations
│ ├── train
│ ├── val
```
**NOTE**: We strongly suggest that you check the directories or paths in the dataset part of the config file, especially for the values `ann_file`, `data_root`, and `data_prefix`.
We provide the annotations of the pre-training data in the table below:
| Data | images | Annotation File |
| :--- | :------| :-------------- |
| Objects365v1 | [`Objects365 train`](https://opendatalab.com/OpenDataLab/Objects365_v1) | [`objects365_train.json`](https://opendatalab.com/OpenDataLab/Objects365_v1) |
| MixedGrounding | [`GQA`](https://nlp.stanford.edu/data/gqa/images.zip) | [`final_mixed_train_no_coco.json`](https://huggingface.co/GLIPModel/GLIP/tree/main/mdetr_annotations/final_mixed_train_no_coco.json) |
| Flickr30k | [`Flickr30k`](https://shannon.cs.illinois.edu/DenotationGraph/) |[`final_flickr_separateGT_train.json`](https://huggingface.co/GLIPModel/GLIP/tree/main/mdetr_annotations/final_flickr_separateGT_train.json) |
| LVIS-minival | [`COCO val2017`](https://cocodataset.org/) | [`lvis_v1_minival_inserted_image_name.json`](https://huggingface.co/GLIPModel/GLIP/blob/main/lvis_v1_minival_inserted_image_name.json) |
**Acknowledgement:** We sincerely thank [GLIP](https://github.com/microsoft/GLIP) and [mdetr](https://github.com/ashkamath/mdetr) for providing the annotation files for pre-training.
### Dataset Class
> For fine-tuning YOLO-World on Close-set Object Detection, using `MultiModalDataset` is recommended.
#### Setting CLASSES/Categories
If you use custom datasets in `COCO-format`, you **do not** need to define a dataset class for custom vocabularies/categories.
Simply set the classes explicitly in the config file through `metainfo=dict(classes=your_classes)`:
```python
coco_train_dataset = dict(
_delete_=True,
type='MultiModalDataset',
dataset=dict(
type='YOLOv5CocoDataset',
metainfo=dict(classes=your_classes),
data_root='data/your_data',
ann_file='annotations/your_annotation.json',
data_prefix=dict(img='images/'),
filter_cfg=dict(filter_empty_gt=False, min_size=32)),
class_text_path='data/texts/your_class_texts.json',
pipeline=train_pipeline)
```
For training YOLO-World, we mainly adopt two kinds of dataset classes:
#### 1. `MultiModalDataset`
`MultiModalDataset` is a simple wrapper for a pre-defined dataset class, such as `Objects365` or `COCO`, which adds the category texts to the dataset instance for formatting the input texts.
**Text JSON**
The json file is formatted as follows:
```json
[
['A_1','A_2'],
['B'],
['C_1', 'C_2', 'C_3'],
...
]
```
We have provided the text json for [`LVIS`](./../data/texts/lvis_v1_class_texts.json), [`COCO`](../data/texts/coco_class_texts.json), and [`Objects365`](../data/texts/obj365v1_class_texts.json).
#### 2. `YOLOv5MixedGroundingDataset`
The `YOLOv5MixedGroundingDataset` extends the `COCO` dataset by supporting loading texts/captions from the json file. It is designed for `MixedGrounding` or `Flickr30K`, which provide text tokens for each object.
### 🔥 Custom Datasets
For custom datasets, we suggest converting the annotation files according to your usage. Note that converting the annotations to the **standard COCO format** is required in most cases.
1. **Large vocabulary, grounding, referring:** you can follow the annotation format of the `MixedGrounding` dataset, which adds `caption` and `tokens_positive` to assign a text to each object. The texts can be a category name or a noun phrase.
2. **Custom vocabulary (fixed):** you can adopt the `MultiModalDataset` wrapper, as for `Objects365`, and create a **text json** for your custom categories (see the sketch after this list).
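As a minimal illustration, the **text json** for a fixed custom vocabulary can be generated with a few lines of Python (the category names and output path below are placeholders):
```python
import json
import os

# Hypothetical custom categories; each entry is a list of one or more
# synonymous prompts for a single class.
your_classes = ['helmet', 'safety vest', 'person']
texts = [[name] for name in your_classes]

os.makedirs('data/texts', exist_ok=True)
with open('data/texts/your_class_texts.json', 'w') as f:
    json.dump(texts, f)
```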
### CC3M Pseudo Annotations
The following annotations are generated according to the automatic labeling process described in our paper, and we report the results based on these annotations.
To use CC3M annotations, you need to prepare the `CC3M` images first.
| Data | Images | Boxes | File |
| :--: | :----: | :---: | :---: |
| CC3M-246K | 246,363 | 820,629 | [Download 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/cc3m_pseudo_annotations.json) |
| CC3M-500K | 536,405 | 1,784,405| [Download 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/cc3m_pseudo_500k_annotations.json) |
| CC3M-750K | 750,000 | 4,504,805 | [Download 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/cc3m_pseudo_750k_annotations.json) |
## Deploy YOLO-World
- [x] ONNX export
- [x] ONNX demo
- [ ] TensorRT
- [ ] TFLite
We provide several ways to deploy YOLO-World with ONNX or TensorRT.
### Preliminaries
```bash
pip install supervision onnx onnxruntime onnxsim
```
### Export ONNX on Gradio Demo
Start `demo.py`; you can then modify the texts in the demo and export the ONNX model.
```bash
python demo.py path/to/config path/to/weights
```
### Export YOLO-World to ONNX models
You can also use [`export_onnx.py`](../deploy/export_onnx.py) to obtain the ONNX model. You can specify `--custom-text` with your own `Text JSON` for custom prompts. The format of the `Text JSON` can be found in [`docs/data`](/data.md).
```bash
PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11
```
If you don't want to include `NMS` or post-processing in the ONNX model, you can add `--without-nms`:
```bash
PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 --without-nms
```
If you want to quantize the YOLO-World ONNX model, you should remove `NMS` and the `bbox_decoder` by adding `--without-bbox-decoder`:
```bash
PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 --without-bbox-decoder
```
**Running ONNX demo**
```bash
python deploy/onnx_demo.py path/to/model.onnx path/to/images path/to/texts
```
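To quickly sanity-check an exported model before running the demo, a minimal `onnxruntime` sketch (the file path is a placeholder; this assumes the default export with NMS, i.e. four outputs) could look like:
```python
import numpy as np
import onnxruntime as ort

# Load the exported model and feed a dummy batch through it.
session = ort.InferenceSession('path/to/model.onnx',
                               providers=['CPUExecutionProvider'])
dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)
outputs = session.run(None, {'images': dummy})
for meta, out in zip(session.get_outputs(), outputs):
    print(meta.name, out.shape)  # expect num_dets, boxes, scores, labels
```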
### Export YOLO-World to TensorRT models
coming soon.
### FAQ
**Q1**. `RuntimeError: Exporting the operator einsum to ONNX opset version 11 is not supported. Support for this operator was added in version 12, try exporting with this version.`
**A:** This error arises because YOLO-World uses `einsum` for matrix multiplication, which is not supported by `opset 11`. You can raise `--opset` from `11` to `12` if your runtime supports it, or replace the `einsum` with plain `permute/reshape/multiplication` by setting `use_einsum=False` in the `MaxSigmoidCSPLayerWithTwoConv` and `YOLOWorldHeadModule`. You can refer to the [sample config](../configs/pretrain/yolo_world_v2_m_vlpan_bn_noeinsum_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) without einsum; a sketch of the override follows below.
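A minimal sketch of the corresponding config override (a sketch only; check the linked no-einsum config for the exact structure):
```python
# Disable einsum in the neck blocks and the head so the exported graph only
# uses permute/reshape/multiplication, which opset 11 supports.
model = dict(
    neck=dict(block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv',
                             use_einsum=False)),
    bbox_head=dict(head_module=dict(type='YOLOWorldHeadModule',
                                    use_einsum=False)))
```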
## Frequently Asked Questions (FAQ)
1. `Incorrect path_or_model_id`
```bash
OSError: class `YOLOWorldDetector` in yolo_world/models/detectors/yolo_world.py: class `MultiModalYOLOBackbone` in yolo_world/models/backbones/mm_backbone.py: class `HuggingCLIPLanguageBackbone` in yolo_world/models/backbones/mm_backbone.py: Incorrect path_or_model_id: '../pretrained_models/clip-vit-base-patch32-projection'. Please provide either the path to a local folder or the repo_id of a model on the Hub.
```
**Solution:** the config points `HuggingCLIPLanguageBackbone` to a local path (`../pretrained_models/clip-vit-base-patch32-projection`) that does not exist on your machine. Either download the CLIP model (e.g., `openai/clip-vit-base-patch32`) into that local folder, or change the `model_name` in the config to a valid local path or HuggingFace repo id.
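For reference, the relevant part of the config looks roughly like the sketch below (the Hub id is the usual public CLIP checkpoint; adapt it to your local path if you work offline):
```python
# Inside the backbone config: point the text model to a valid location.
text_model = dict(
    type='HuggingCLIPLanguageBackbone',
    model_name='openai/clip-vit-base-patch32',  # or a local folder you downloaded
    frozen_modules=['all'])
```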
## Fine-tuning YOLO-World
Fine-tuning YOLO-World is easy, and we provide a sample for COCO object detection as simple guidance.
### Fine-tuning Requirements
Fine-tuning YOLO-World is cheap:
* it does not require 32 GPUs or multi-node distributed training; **8 GPUs or even 1 GPU** is enough.
* it does not require a long schedule, *e.g.,* the 300 or 500 epochs used to train YOLOv5 or YOLOv8; **80 epochs or fewer** are enough, considering that we provide good pre-trained weights.
### Data Preparation
The fine-tuning dataset should have a similar format to that of the pre-training dataset.
We suggest you refer to [`docs/data`](./data.md) for more details about how to build the datasets:
* if you fine-tune YOLO-World for close-set / custom vocabulary object detection, using `MultiModalDataset` with a `text json` is preferred.
* if you fine-tune YOLO-World for open-vocabulary detection with rich texts or grounding tasks, using `MixedGroundingDataset` is preferred.
### Hyper-parameters and Config
Please refer to the [config for fine-tuning YOLO-World-L on COCO](../configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_finetune_coco.py) for more details.
1. Basic config file:
If the fine-tuning dataset **contains mask annotations**:
```python
_base_ = ('../../third_party/mmyolo/configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py')
```
If the fine-tuning dataset **doesn't contain mask annotations**:
```python
_base_ = ('../../third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py')
```
2. Training Schemes:
Reduce the number of epochs and adjust the learning rate (a sketch of the corresponding optimizer settings follows this list):
```python
max_epochs = 80
base_lr = 2e-4
weight_decay = 0.05
train_batch_size_per_gpu = 16
close_mosaic_epochs=10
train_cfg = dict(
max_epochs=max_epochs,
val_interval=5,
dynamic_intervals=[((max_epochs - close_mosaic_epochs),
_base_.val_interval_stage2)])
```
3. Datasets:
```python
coco_train_dataset = dict(
_delete_=True,
type='MultiModalDataset',
dataset=dict(
type='YOLOv5CocoDataset',
data_root='data/coco',
ann_file='annotations/instances_train2017.json',
data_prefix=dict(img='train2017/'),
filter_cfg=dict(filter_empty_gt=False, min_size=32)),
class_text_path='data/texts/coco_class_texts.json',
pipeline=train_pipeline)
```
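A hedged sketch of how the hyper-parameters from step 2 feed the optimizer wrapper in MMYOLO-style configs (the reference config may add param-wise settings, e.g., a lower learning rate for the text encoder; check it before copying):
```python
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay))
```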
#### Finetuning without RepVL-PAN or Text Encoder 🚀
For further efficiency and simplicity, we can fine-tune an efficient version of YOLO-World without RepVL-PAN and the text encoder.
The efficient version of YOLO-World has a similar architecture and layers to the original YOLOv8, but we provide weights pre-trained on large-scale datasets.
The pre-trained YOLO-World has strong generalization capabilities and is more robust than YOLOv8 trained only on the COCO dataset.
You can refer to the [config for Efficient YOLO-World](./../configs/finetune_coco/yolo_world_l_efficient_neck_2e-4_80e_8gpus_finetune_coco.py) for more details.
The efficient YOLO-World adopts `EfficientCSPLayerWithTwoConv` and the text encoder can be removed during inference or exporting models.
```python
model = dict(
type='YOLOWorldDetector',
mm_neck=True,
neck=dict(type='YOLOWorldPAFPN',
guide_channels=text_channels,
embed_channels=neck_embed_channels,
num_heads=neck_num_heads,
block_cfg=dict(type='EfficientCSPLayerWithTwoConv')))
```
### Launch Fine-tuning!
It's easy:
```bash
./dist_train.sh <path/to/config> <NUM_GPUS> --amp
```
## Installation Guide
We provide the `requirements` files in [./requirements](./../requirements/):
* `basic_requirements`: training, finetuning, evaluation.
* `demo_requirements`: running YOLO-World [demos](./../demo/).
* `onnx_requirements`: converting YOLO-World to ONNX or TFLite models (TFLite is coming soon).
#### Install `MMCV`
YOLO-World adopts `mmcv>=2.0.0`. There are several ways to install `mmcv`:
**1. using `openmim`**:
See more in the [official guide](https://github.com/open-mmlab/mmcv/tree/master?tab=readme-ov-file#install-mmcv-full).
```bash
pip install openmim
mim install mmcv==2.0.0
```
**2. using `pip`**:
Go to [install-with-pip](https://mmcv.readthedocs.io/en/latest/get_started/installation.html#install-with-pip) to select the appropriate pip index.
```bash
# cuda=11.3, torch=1.11
pip install mmcv==2.0.0 -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11/index.html
# cuda=11.7, torch=1.13
pip install mmcv==2.2.0 -f https://download.openmmlab.com/mmcv/dist/cu117/torch1.13/index.html
# cuda=12.1, torch=2.1
pip install mmcv==2.1.0 -f https://download.openmmlab.com/mmcv/dist/cu121/torch2.1/index.html
```
**3. using `whl`**:
Go to the [index of packages](https://download.openmmlab.com/mmcv/dist/cu117/torch1.13/index.html) to find a suitable version and download it.
```bash
pip install mmcv-2.0.1-cp38-cp38-manylinux1_x86_64.whl
```
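After installation, a quick import check confirms the installed version:
```python
# Verify that mmcv is importable and report the installed version.
import mmcv
print(mmcv.__version__)
```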
## Prompt YOLO-World
### 1. Simple YOLO-World with Embeddings
To simplify YOLO-World and get rid of the language model, we define a new basic detector, `YOLOWorldPromptDetector`:
The `YOLOWorldPromptDetector` takes prompt embeddings as input and no longer contains a language model!
Now, YOLO-World adopts `embeddings` as the language input, and the embeddings can be of several kinds: (1) text embeddings from a language model, e.g., the CLIP language encoder, (2) image embeddings from a vision model, e.g., the CLIP vision encoder, (3) image-text fused embeddings, and (4) random embeddings.
Kinds (1), (2), and (3) support zero-shot inference, while (4), together with (1), (2), and (3), is designed for prompt tuning on your custom data.
The basic detector is defined as follows:
```python
class YOLOWorldPromptDetector(YOLODetector):
"""Implementation of YOLO World Series"""
def __init__(self,
*args,
mm_neck: bool = False,
num_train_classes=80,
num_test_classes=80,
prompt_dim=512,
num_prompts=80,
embedding_path='',
freeze_prompt=False,
use_mlp_adapter=False,
**kwargs)
```
To use it in a zero-shot manner, you need to pre-compute the text embeddings (or image embeddings) and save them as a `numpy array (*.npy)` with shape `NxD` (N is the number of prompts and D is the dimension of the embeddings). Currently, we only support one prompt per class. You can use several prompts for one class, but then you need to merge the results in the post-processing step.
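A minimal sketch for pre-computing text embeddings with the HuggingFace CLIP model (the prompts and output filename are placeholders; the repo's own tooling, e.g., `tools/generate_text_prompts.py`, may differ in details such as normalization):
```python
import numpy as np
import torch
from transformers import CLIPModel, CLIPTokenizer

prompts = ['person', 'bicycle', 'car']  # one prompt per class -> N prompts
model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
tokenizer = CLIPTokenizer.from_pretrained('openai/clip-vit-base-patch32')

with torch.no_grad():
    inputs = tokenizer(prompts, padding=True, return_tensors='pt')
    feats = model.get_text_features(**inputs)         # shape N x D (D = 512)
    feats = feats / feats.norm(dim=-1, keepdim=True)  # L2-normalize

np.save('custom_prompts.npy', feats.numpy())          # load via `embedding_path`
```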
### 2. Prompt Tuning YOLO-World
We introduce prompt tuning for YOLO-World to maintain the zero-shot ability while improving the performance on your custom datasets.
For more details about writing configs for prompt tuning, you can refer to [`prompt tuning for COCO data`](./../configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.py).
1. Use random prompts
```python
dict(type='YOLOWorldPromptDetector',
mm_neck=True,
num_train_classes=num_training_classes,
num_test_classes=num_classes,
prompt_dim=text_channels,
num_prompts=80,
...)
```
2. Use CLIP embeddings (text, image, or text-image embeddings)
The `clip_vit_b32_coco_80_embeddings.npy` file can be downloaded from [HuggingFace](https://huggingface.co/wondervictor/YOLO-World/blob/main/clip_vit_b32_coco_80_embeddings.npy).
```python
dict(type='YOLOWorldPromptDetector',
mm_neck=True,
num_train_classes=num_training_classes,
num_test_classes=num_classes,
embedding_path='embeddings/clip_vit_b32_coco_80_embeddings.npy',
prompt_dim=text_channels,
num_prompts=80,
...)
```
Using the CLIP model to obtain the image and text embeddings maintains the zero-shot performance.
| Model | Config | AP | AP50 | AP75 | APS | APM | APL |
| :---- | :----: | :--: | :--: | :---: | :-: | :-: | :-: |
| YOLO-World-v2-L | Zero-shot | 45.7 | 61.6 | 49.8 | 29.9 | 50.0 | 60.8 |
| [YOLO-World-v2-L](./../configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.py) | Prompt tuning | 47.9 | 64.3 | 52.5 | 31.9 | 52.6 | 61.3 |
## Reparameterize YOLO-World
The reparameterization incorporates text embeddings as parameters into the model. For example, in the final classification layer, text embeddings are reparameterized into a simple 1x1 convolutional layer.
<div align="center">
<img width="600" src="../assets/reparameterize.png">
</div>
### Key Advantages from Reparameterization
> Reparameterized YOLO-World still has zero-shot ability!
* **Efficiency:** the reparameterized YOLO-World has a simple and efficient architecture, e.g., a `conv1x1` is faster than `transpose & matmul`. In addition, it enables further optimization for deployment.
* **Accuracy:** the reparameterized YOLO-World supports fine-tuning. Compared to normal `fine-tuning` or `prompt tuning`, **the reparameterized version can optimize the `neck` and `head` independently**, since the `neck` and `head` have different parameters and no longer depend on the `text embeddings`!
For example, fine-tuning the **reparameterized YOLO-World** obtains *46.3 AP* on COCO *val2017* while fine-tuning the normal version obtains *46.1 AP*, with all hyper-parameters kept the same.
### Getting Started
#### 1. Prepare custom text embeddings
You need to generate the text embeddings with [`tools/generate_text_prompts.py`](../tools/generate_text_prompts.py) and save them as a `numpy.array` with shape `NxD`.
#### 2. Reparameterizing
Reparameterizing will generate a new checkpoint with text embeddings!
Check those files first:
* model checkpoint
* text embeddings
We mainly reparameterize two groups of modules:
* head (`YOLOWorldHeadModule`)
* neck (`MaxSigmoidCSPLayerWithTwoConv`)
```bash
python tools/reparameterize_yoloworld.py \
--model path/to/checkpoint \
--out-dir path/to/save/re-parameterized/ \
--text-embed path/to/text/embeddings \
--conv-neck
```
#### 3. Prepare the model config
Please see the sample config: [`finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py`](../configs/finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) for reparameterized training.
* `RepConvMaxSigmoidCSPLayerWithTwoConv`:
```python
neck=dict(type='YOLOWorldPAFPN',
guide_channels=num_classes,
embed_channels=neck_embed_channels,
num_heads=neck_num_heads,
block_cfg=dict(type='RepConvMaxSigmoidCSPLayerWithTwoConv',
guide_channels=num_classes)),
```
* `RepYOLOWorldHeadModule`:
```python
bbox_head=dict(head_module=dict(type='RepYOLOWorldHeadModule',
embed_dims=text_channels,
num_guide=num_classes,
num_classes=num_classes)),
```
#### 4. Reparameterized Training
**Reparameterized YOLO-World** is easier to fine-tune and can be treated as an enhanced and pre-trained YOLOv8!
You can check [`finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py`](../configs/finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) for more details.
## Run YOLO-World (Quantized) on TF-Lite
- [x] Export YOLO-World to TFLite with INT8 Quantization.
- [x] TFLite demo
### Preliminaries
```bash
pip install onnxruntime onnx onnx-simplifier
pip install tensorflow==2.15.1
```
See [onnx2tf](https://github.com/PINTO0309/onnx2tf) for more details about exporting TFLite models.
Many thanks to the contributor of `onnx2tf`!
### Export TFLite INT8 Quantization models
Please use **Reparameterized YOLO-World** for TFLite!!
1. Prepare the ONNX model
Please export the ONNX model without the `postprocessing` and the `bbox_decoder`: just add `--without-bbox-decoder`!
The `bbox_decoder` is not supported for INT8 quantization, so please take care!
```bash
PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 --without-bbox-decoder
```
2. Generate the calibration samples
We suggest using 100 COCO images to create a simple calibration dataset for quantization.
```python
import os
import random
from PIL import Image, ImageOps
import cv2
import glob
import numpy as np
root = "data/coco/val2017/"
image_list = os.listdir(root)
image_list = [os.path.join(root, f) for f in image_list]
random.shuffle(image_list)
img_datas = []
for idx, file in enumerate(image_list[:100]):
image = Image.open(file).convert('RGB')
# Get sample input data as a numpy array in a method of your choosing.
img_width, img_height = image.size
size = max(img_width, img_height)
image = ImageOps.pad(image, (size, size), method=Image.BILINEAR)
image = image.resize((640, 640), Image.BILINEAR)
tensor_image = np.asarray(image).astype(np.float32)
tensor_image /= 255.0
tensor_image = np.expand_dims(tensor_image, axis=0)
img_datas.append(tensor_image)
calib_datas = np.vstack(img_datas)
print(f'calib_datas.shape: {calib_datas.shape}')
np.save(file='tflite_calibration_data_100_images_640.npy', arr=calib_datas)
```
3. Export ONNX to TFLite using `onnx2tf`
```bash
onnx2tf -i [ONNX] -o [OUTPUT] -oiqt -cind "images" "tflite_calibration_data_100_images_640.npy" "[[[[0.,0.,0.]]]]" "[[[[1.,1.,1.]]]]" -onimc "scores" "bboxes" --verbosity debug
```
We provide a sample TFLite INT8 model: [yolo_world_x_coco_zeroshot_rep_integer_quant.tflite](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_x_coco_zeroshot_rep_integer_quant.tflite)
### Inference using TFLite
```bash
python deploy/tflite_demo.py path/to/tflite path/to/images path/to/texts
```
## Update Notes
We provide the details for important updates of YOLO-World in this note.
### Model Architecture
**[2024-2-29]:** YOLO-World-v2:
1. We removed the `I-PoolingAttention`: though it improves the performance for zero-shot LVIS evaluation, it affects the inference speed after exporting YOLO-World to ONNX or TensorRT. Considering this trade-off, we removed the `I-PoolingAttention` in the newest version.
2. We replaced the `L2-Norm` in the contrastive head with `BatchNorm`. The `L2-Norm` involves complex operations, such as `reduce`, which are time-consuming for deployment, whereas `BatchNorm` can be fused into the convolution, which is much more efficient and also improves the zero-shot performance (a toy sketch follows below).
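For intuition only, a toy sketch of a BatchNorm-based contrastive head (not the repo's actual implementation): image features pass through a `1x1` conv and `BatchNorm2d` (fusable at deploy time), and similarity with the normalized text embeddings gives the per-class logits.
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyBNContrastiveHead(nn.Module):
    """Toy contrastive head: BatchNorm on image features instead of L2-Norm."""

    def __init__(self, feat_channels: int, embed_dims: int):
        super().__init__()
        self.proj = nn.Conv2d(feat_channels, embed_dims, kernel_size=1)
        self.norm = nn.BatchNorm2d(embed_dims)  # fusable into the conv for deployment
        self.logit_scale = nn.Parameter(torch.tensor(1.0))

    def forward(self, x: torch.Tensor, text_embeds: torch.Tensor) -> torch.Tensor:
        # x: (B, C, H, W) image features; text_embeds: (K, D) class embeddings
        x = self.norm(self.proj(x))
        w = F.normalize(text_embeds, dim=-1, p=2)
        return torch.einsum('bchw,kc->bkhw', x, w) * self.logit_scale

head = ToyBNContrastiveHead(feat_channels=256, embed_dims=512)
logits = head(torch.randn(2, 256, 20, 20), torch.randn(80, 512))
print(logits.shape)  # torch.Size([2, 80, 20, 20])
```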
# Unique model identifier
modelCode=673
# Model name
modelName=yolo_world_pytorch
# Model description
modelDescription=Training and inference for YOLO-World, a real-time open-vocabulary object detection model
# Application scenarios
appScenario=training,inference,research,manufacturing,healthcare,home,education
# Framework type
frameType=Pytorch