Commit e9cee049 authored by luopl

Initial commit
# Copyright (c) Tencent Inc. All rights reserved.
import os
import sys
import argparse
import os.path as osp
from io import BytesIO
from functools import partial
import cv2
import onnx
import torch
import onnxsim
import numpy as np
import gradio as gr
from PIL import Image
import supervision as sv
from torchvision.ops import nms
from mmengine.runner import Runner
from mmengine.dataset import Compose
from mmengine.runner.amp import autocast
from mmengine.config import Config, DictAction, ConfigDict
from mmdet.datasets import CocoDataset
from mmyolo.registry import RUNNERS
sys.path.append('./deploy')
from easydeploy import model as EM
BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=1)
MASK_ANNOTATOR = sv.MaskAnnotator()
class LabelAnnotator(sv.LabelAnnotator):
@staticmethod
def resolve_text_background_xyxy(
center_coordinates,
text_wh,
position,
):
center_x, center_y = center_coordinates
text_w, text_h = text_wh
return center_x, center_y, center_x + text_w, center_y + text_h
LABEL_ANNOTATOR = LabelAnnotator(text_padding=4,
text_scale=0.5,
text_thickness=1)
def parse_args():
parser = argparse.ArgumentParser(description='YOLO-World Demo')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
parser.add_argument(
        '--work-dir',
        help='the directory to save outputs such as the exported ONNX model',
        default='output')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
args = parser.parse_args()
return args
def run_image(runner,
image,
text,
max_num_boxes,
score_thr,
nms_thr,
image_path='./work_dirs/demo.png'):
# image.save(image_path)
texts = [[t.strip()] for t in text.split(',')] + [[' ']]
data_info = dict(img_id=0, img=np.array(image), texts=texts)
data_info = runner.pipeline(data_info)
data_batch = dict(inputs=data_info['inputs'].unsqueeze(0),
data_samples=[data_info['data_samples']])
with autocast(enabled=False), torch.no_grad():
output = runner.model.test_step(data_batch)[0]
pred_instances = output.pred_instances
keep = nms(pred_instances.bboxes,
pred_instances.scores,
iou_threshold=nms_thr)
pred_instances = pred_instances[keep]
pred_instances = pred_instances[pred_instances.scores.float() > score_thr]
if len(pred_instances.scores) > max_num_boxes:
indices = pred_instances.scores.float().topk(max_num_boxes)[1]
pred_instances = pred_instances[indices]
pred_instances = pred_instances.cpu().numpy()
if 'masks' in pred_instances:
masks = pred_instances['masks']
else:
masks = None
detections = sv.Detections(xyxy=pred_instances['bboxes'],
class_id=pred_instances['labels'],
confidence=pred_instances['scores'],
mask=masks)
labels = [
f"{texts[class_id][0]} {confidence:0.2f}" for class_id, confidence in
zip(detections.class_id, detections.confidence)
]
image = np.array(image)
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # Convert RGB to BGR
image = BOUNDING_BOX_ANNOTATOR.annotate(image, detections)
image = LABEL_ANNOTATOR.annotate(image, detections, labels=labels)
if masks is not None:
image = MASK_ANNOTATOR.annotate(image, detections)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Convert BGR to RGB
image = Image.fromarray(image)
return image
def export_model(runner, text, max_num_boxes, score_thr, nms_thr):
backend = EM.MMYOLOBackend.ONNXRUNTIME
postprocess_cfg = ConfigDict(pre_top_k=10 * max_num_boxes,
keep_top_k=max_num_boxes,
iou_threshold=nms_thr,
score_threshold=score_thr)
base_model = runner.model
    texts = [[t.strip()] for t in text.split(',')] + [[' ']]
base_model.reparameterize(texts)
deploy_model = EM.DeployModel(baseModel=base_model,
backend=backend,
postprocess_cfg=postprocess_cfg)
deploy_model.eval()
device = (next(iter(base_model.parameters()))).device
fake_input = torch.ones([1, 3, 640, 640], device=device)
deploy_model(fake_input)
save_onnx_path = os.path.join(
args.work_dir,
os.path.basename(args.checkpoint).replace('pth', 'onnx'))
# export onnx
with BytesIO() as f:
output_names = ['num_dets', 'boxes', 'scores', 'labels']
torch.onnx.export(deploy_model,
fake_input,
f,
input_names=['images'],
output_names=output_names,
opset_version=12)
f.seek(0)
onnx_model = onnx.load(f)
onnx.checker.check_model(onnx_model)
onnx_model, check = onnxsim.simplify(onnx_model)
onnx.save(onnx_model, save_onnx_path)
return gr.update(visible=True), save_onnx_path
def demo(runner, args):
with gr.Blocks(title="YOLO-World") as demo:
with gr.Row():
gr.Markdown('<h1><center>YOLO-World: Real-Time Open-Vocabulary '
'Object Detector</center></h1>')
with gr.Row():
with gr.Column(scale=0.3):
with gr.Row():
image = gr.Image(type='pil', label='input image')
input_text = gr.Textbox(
lines=7,
label='Enter the classes to be detected, '
'separated by comma',
value=', '.join(CocoDataset.METAINFO['classes']),
elem_id='textbox')
with gr.Row():
submit = gr.Button('Submit')
clear = gr.Button('Clear')
with gr.Row():
export = gr.Button('Deploy and Export ONNX Model')
with gr.Row():
gr.Markdown(
"It takes a few seconds to generate the ONNX file! YOLO-World-Seg (segmentation) is not supported now"
)
out_download = gr.File(visible=False)
max_num_boxes = gr.Slider(minimum=1,
maximum=300,
value=100,
step=1,
interactive=True,
label='Maximum Number Boxes')
score_thr = gr.Slider(minimum=0,
maximum=1,
value=0.05,
step=0.001,
interactive=True,
label='Score Threshold')
nms_thr = gr.Slider(minimum=0,
maximum=1,
value=0.7,
step=0.001,
interactive=True,
label='NMS Threshold')
with gr.Column(scale=0.7):
output_image = gr.Image(type='pil', label='output image')
submit.click(partial(run_image, runner),
[image, input_text, max_num_boxes, score_thr, nms_thr],
[output_image])
clear.click(lambda: [None, '', None], None,
[image, input_text, output_image])
export.click(partial(export_model, runner),
[input_text, max_num_boxes, score_thr, nms_thr],
[out_download, out_download])
demo.launch(server_name='0.0.0.0',
server_port=8080) # port 80 does not work for me
if __name__ == '__main__':
args = parse_args()
# load config
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
if args.work_dir is not None:
cfg.work_dir = args.work_dir
elif cfg.get('work_dir', None) is None:
cfg.work_dir = osp.join('./work_dirs',
osp.splitext(osp.basename(args.config))[0])
cfg.load_from = args.checkpoint
if 'runner_type' not in cfg:
runner = Runner.from_cfg(cfg)
else:
runner = RUNNERS.build(cfg)
runner.call_hook('before_run')
runner.load_or_resume()
pipeline = cfg.test_dataloader.dataset.pipeline
pipeline[0].type = 'mmdet.LoadImageFromNDArray'
runner.pipeline = Compose(pipeline)
runner.model.eval()
demo(runner, args)
# Copyright (c) Tencent Inc. All rights reserved.
import os
import cv2
import argparse
import os.path as osp
import torch
from mmengine.config import Config, DictAction
from mmengine.runner.amp import autocast
from mmengine.dataset import Compose
from mmengine.utils import ProgressBar
from mmdet.apis import init_detector
from mmdet.utils import get_test_pipeline_cfg
import supervision as sv
BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=1)
MASK_ANNOTATOR = sv.MaskAnnotator()
class LabelAnnotator(sv.LabelAnnotator):
@staticmethod
def resolve_text_background_xyxy(
center_coordinates,
text_wh,
position,
):
center_x, center_y = center_coordinates
text_w, text_h = text_wh
return center_x, center_y, center_x + text_w, center_y + text_h
LABEL_ANNOTATOR = LabelAnnotator(text_padding=4,
text_scale=0.5,
text_thickness=1)
def parse_args():
parser = argparse.ArgumentParser(description='YOLO-World Demo')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
parser.add_argument('image', help='image path, include image file or dir.')
parser.add_argument(
        'text',
        help='text prompts: category names separated by commas, '
        'or a .txt file with one prompt per line.')
parser.add_argument('--topk',
default=100,
type=int,
help='keep topk predictions.')
parser.add_argument('--threshold',
default=0.1,
type=float,
help='confidence score threshold for predictions.')
parser.add_argument('--device',
default='cuda:0',
help='device used for inference.')
parser.add_argument('--show',
action='store_true',
help='show the detection results.')
parser.add_argument(
'--annotation',
action='store_true',
help='save the annotated detection results as yolo text format.')
parser.add_argument('--amp',
action='store_true',
help='use mixed precision for inference.')
parser.add_argument('--output-dir',
default='demo_outputs',
help='the directory to save outputs')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
args = parser.parse_args()
return args
def inference_detector(model,
image,
texts,
test_pipeline,
max_dets=100,
score_thr=0.3,
output_dir='./work_dir',
use_amp=False,
show=False,
annotation=False):
data_info = dict(img_id=0, img_path=image, texts=texts)
data_info = test_pipeline(data_info)
data_batch = dict(inputs=data_info['inputs'].unsqueeze(0),
data_samples=[data_info['data_samples']])
with autocast(enabled=use_amp), torch.no_grad():
output = model.test_step(data_batch)[0]
pred_instances = output.pred_instances
pred_instances = pred_instances[pred_instances.scores.float() >
score_thr]
if len(pred_instances.scores) > max_dets:
indices = pred_instances.scores.float().topk(max_dets)[1]
pred_instances = pred_instances[indices]
pred_instances = pred_instances.cpu().numpy()
if 'masks' in pred_instances:
masks = pred_instances['masks']
else:
masks = None
detections = sv.Detections(xyxy=pred_instances['bboxes'],
class_id=pred_instances['labels'],
confidence=pred_instances['scores'],
mask=masks)
labels = [
f"{texts[class_id][0]} {confidence:0.2f}" for class_id, confidence in
zip(detections.class_id, detections.confidence)
]
# label images
    image_path = image
    image = cv2.imread(image_path)
anno_image = image.copy()
image = BOUNDING_BOX_ANNOTATOR.annotate(image, detections)
image = LABEL_ANNOTATOR.annotate(image, detections, labels=labels)
if masks is not None:
image = MASK_ANNOTATOR.annotate(image, detections)
cv2.imwrite(osp.join(output_dir, osp.basename(image_path)), image)
if annotation:
images_dict = {}
annotations_dict = {}
images_dict[osp.basename(image_path)] = anno_image
annotations_dict[osp.basename(image_path)] = detections
        ANNOTATIONS_DIRECTORY = "./annotations"
        os.makedirs(ANNOTATIONS_DIRECTORY, exist_ok=True)
MIN_IMAGE_AREA_PERCENTAGE = 0.002
MAX_IMAGE_AREA_PERCENTAGE = 0.80
APPROXIMATION_PERCENTAGE = 0.75
sv.DetectionDataset(
classes=texts, images=images_dict,
annotations=annotations_dict).as_yolo(
annotations_directory_path=ANNOTATIONS_DIRECTORY,
min_image_area_percentage=MIN_IMAGE_AREA_PERCENTAGE,
max_image_area_percentage=MAX_IMAGE_AREA_PERCENTAGE,
approximation_percentage=APPROXIMATION_PERCENTAGE)
if show:
cv2.imshow('Image', image) # Provide window name
k = cv2.waitKey(0)
if k == 27:
# wait for ESC key to exit
cv2.destroyAllWindows()
if __name__ == '__main__':
args = parse_args()
# load config
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
cfg.work_dir = osp.join('./work_dirs',
osp.splitext(osp.basename(args.config))[0])
# init model
cfg.load_from = args.checkpoint
model = init_detector(cfg, checkpoint=args.checkpoint, device=args.device)
# init test pipeline
test_pipeline_cfg = get_test_pipeline_cfg(cfg=cfg)
# test_pipeline[0].type = 'mmdet.LoadImageFromNDArray'
test_pipeline = Compose(test_pipeline_cfg)
if args.text.endswith('.txt'):
with open(args.text) as f:
lines = f.readlines()
texts = [[t.rstrip('\r\n')] for t in lines] + [[' ']]
else:
texts = [[t.strip()] for t in args.text.split(',')] + [[' ']]
output_dir = args.output_dir
if not osp.exists(output_dir):
os.mkdir(output_dir)
# load images
if not osp.isfile(args.image):
images = [
osp.join(args.image, img) for img in os.listdir(args.image)
if img.endswith('.png') or img.endswith('.jpg')
]
else:
images = [args.image]
# reparameterize texts
model.reparameterize(texts)
progress_bar = ProgressBar(len(images))
for image_path in images:
inference_detector(model,
image_path,
texts,
test_pipeline,
args.topk,
args.threshold,
output_dir=output_dir,
use_amp=args.amp,
show=args.show,
annotation=args.annotation)
progress_bar.update()
# Copyright (c) Tencent Inc. All rights reserved.
import os.path as osp
import cv2
import torch
from mmengine.config import Config
from mmengine.dataset import Compose
from mmdet.apis import init_detector
from mmdet.utils import get_test_pipeline_cfg
def inference(model, image, texts, test_pipeline, score_thr=0.3, max_dets=100):
image = cv2.imread(image)
image = image[:, :, [2, 1, 0]]
data_info = dict(img=image, img_id=0, texts=texts)
data_info = test_pipeline(data_info)
data_batch = dict(inputs=data_info['inputs'].unsqueeze(0),
data_samples=[data_info['data_samples']])
with torch.no_grad():
output = model.test_step(data_batch)[0]
pred_instances = output.pred_instances
# score thresholding
pred_instances = pred_instances[pred_instances.scores.float() > score_thr]
# max detections
if len(pred_instances.scores) > max_dets:
indices = pred_instances.scores.float().topk(max_dets)[1]
pred_instances = pred_instances[indices]
pred_instances = pred_instances.cpu().numpy()
boxes = pred_instances['bboxes']
labels = pred_instances['labels']
scores = pred_instances['scores']
label_texts = [texts[x][0] for x in labels]
return boxes, labels, label_texts, scores
if __name__ == "__main__":
config_file = "configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py"
checkpoint = "weights/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth"
cfg = Config.fromfile(config_file)
cfg.work_dir = osp.join('./work_dirs')
# init model
cfg.load_from = checkpoint
model = init_detector(cfg, checkpoint=checkpoint, device='cuda:0')
test_pipeline_cfg = get_test_pipeline_cfg(cfg=cfg)
test_pipeline_cfg[0].type = 'mmdet.LoadImageFromNDArray'
test_pipeline = Compose(test_pipeline_cfg)
texts = [['person'], ['bus'], [' ']]
image = "demo/sample_images/bus.jpg"
print(f"starting to detect: {image}")
results = inference(model, image, texts, test_pipeline)
format_str = [
f"obj-{idx}: {box}, label-{lbl}, class-{lbl_text}, score-{score}"
for idx, (box, lbl, lbl_text, score) in enumerate(zip(*results))
]
print("detecting results:")
for q in format_str:
print(q)
# Copyright (c) Tencent Inc. All rights reserved.
# This file is modified from mmyolo/demo/video_demo.py
import argparse
import cv2
import mmcv
import torch
from mmengine.dataset import Compose
from mmdet.apis import init_detector
from mmengine.utils import track_iter_progress
from mmyolo.registry import VISUALIZERS
def parse_args():
parser = argparse.ArgumentParser(description='YOLO-World video demo')
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument('video', help='video file path')
parser.add_argument(
        'text',
        help='text prompts: category names separated by commas, '
        'or a .txt file with one prompt per line.')
parser.add_argument('--device',
default='cuda:0',
help='device used for inference')
parser.add_argument('--score-thr',
default=0.1,
type=float,
help='confidence score threshold for predictions.')
parser.add_argument('--out', type=str, help='output video file')
args = parser.parse_args()
return args
def inference_detector(model, image, texts, test_pipeline, score_thr=0.3):
data_info = dict(img_id=0, img=image, texts=texts)
data_info = test_pipeline(data_info)
data_batch = dict(inputs=data_info['inputs'].unsqueeze(0),
data_samples=[data_info['data_samples']])
with torch.no_grad():
output = model.test_step(data_batch)[0]
pred_instances = output.pred_instances
pred_instances = pred_instances[pred_instances.scores.float() >
score_thr]
output.pred_instances = pred_instances
return output
def main():
args = parse_args()
model = init_detector(args.config, args.checkpoint, device=args.device)
# build test pipeline
model.cfg.test_dataloader.dataset.pipeline[
0].type = 'mmdet.LoadImageFromNDArray'
test_pipeline = Compose(model.cfg.test_dataloader.dataset.pipeline)
if args.text.endswith('.txt'):
with open(args.text) as f:
lines = f.readlines()
texts = [[t.rstrip('\r\n')] for t in lines] + [[' ']]
else:
texts = [[t.strip()] for t in args.text.split(',')] + [[' ']]
# reparameterize texts
model.reparameterize(texts)
# init visualizer
visualizer = VISUALIZERS.build(model.cfg.visualizer)
    # the dataset_meta is loaded from the checkpoint and
    # then passed to the model in init_detector
visualizer.dataset_meta = model.dataset_meta
video_reader = mmcv.VideoReader(args.video)
video_writer = None
if args.out:
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(
args.out, fourcc, video_reader.fps,
(video_reader.width, video_reader.height))
for frame in track_iter_progress(video_reader):
result = inference_detector(model,
frame,
texts,
test_pipeline,
score_thr=args.score_thr)
visualizer.add_datasample(name='video',
image=frame,
data_sample=result,
draw_gt=False,
show=False,
pred_score_thr=args.score_thr)
frame = visualizer.get_image()
if args.out:
video_writer.write(frame)
if video_writer:
video_writer.release()
if __name__ == '__main__':
main()
# MMYOLO Model Easy-Deployment
## Introduction
This project is developed for easily converting your MMYOLO models to other inference backends without the need for MMDeploy, which saves the time and effort otherwise spent on getting familiar with MMDeploy.
Currently we support converting to the `ONNX` and `TensorRT` formats; other inference backends such as `ncnn` will be added to this project as well. A quick sanity check for an exported `ONNX` model is sketched below.
## Supported Backends
- [Model Convert](docs/model_convert.md)
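As a quick sanity check after conversion, an exported `ONNX` model can be run directly with ONNX Runtime. The snippet below is only a minimal sketch: it assumes a model exported like the Gradio demo in this repository (input `images` of shape `1x3x640x640`, outputs `num_dets`, `boxes`, `scores`, `labels`), and the file paths and preprocessing shown here are illustrative assumptions, not the project's exact pipeline.

```python
# Minimal sketch: run an exported MMYOLO / YOLO-World ONNX model with ONNX Runtime.
# Assumed I/O: input 'images' (1x3x640x640 float32), outputs
# ['num_dets', 'boxes', 'scores', 'labels'] as produced by the export demo above.
import cv2
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession('output/model.onnx',  # hypothetical path to the exported file
                               providers=['CPUExecutionProvider'])

image = cv2.imread('demo/sample_images/bus.jpg')     # any test image
image = cv2.cvtColor(cv2.resize(image, (640, 640)), cv2.COLOR_BGR2RGB)
blob = image.transpose(2, 0, 1)[None].astype(np.float32) / 255.0  # simplified preprocessing

num_dets, boxes, scores, labels = session.run(None, {'images': blob})
print(f'kept {int(num_dets.reshape(-1)[0])} detections')
for box, score, label in zip(boxes[0], scores[0], labels[0]):
    if score >= 0.3:  # simple confidence filter
        print(int(label), float(score), box.tolist())
```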
# MMYOLO Model Conversion
## Introduction
This project exists as a standalone deployment project for MMYOLO. It is meant to be decoupled from the current MMDeploy ecosystem and to let users convert and deploy their models on their own after training, lowering the learning and engineering cost.
Conversion to the ONNX and TensorRT formats is currently supported; other inference platforms will be supported later.
## Conversion Tutorial
- [Model Convert](docs/model_convert.md)
# Copyright (c) OpenMMLab. All rights reserved.
from .common import DeployC2f
from .focus import DeployFocus, GConvFocus, NcnnFocus
__all__ = ['DeployFocus', 'NcnnFocus', 'GConvFocus', 'DeployC2f']
import torch
import torch.nn as nn
from torch import Tensor
class DeployC2f(nn.Module):
def __init__(self, *args, **kwargs):
super().__init__()
def forward(self, x: Tensor) -> Tensor:
x_main = self.main_conv(x)
x_main = [x_main, x_main[:, self.mid_channels:, ...]]
x_main.extend(blocks(x_main[-1]) for blocks in self.blocks)
x_main.pop(1)
return self.final_conv(torch.cat(x_main, 1))
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
class DeployFocus(nn.Module):
def __init__(self, orin_Focus: nn.Module):
super().__init__()
self.__dict__.update(orin_Focus.__dict__)
def forward(self, x: Tensor) -> Tensor:
batch_size, channel, height, width = x.shape
x = x.reshape(batch_size, channel, -1, 2, width)
x = x.reshape(batch_size, channel, x.shape[2], 2, -1, 2)
half_h = x.shape[2]
half_w = x.shape[4]
x = x.permute(0, 5, 3, 1, 2, 4)
x = x.reshape(batch_size, channel * 4, half_h, half_w)
return self.conv(x)
class NcnnFocus(nn.Module):
def __init__(self, orin_Focus: nn.Module):
super().__init__()
self.__dict__.update(orin_Focus.__dict__)
def forward(self, x: Tensor) -> Tensor:
batch_size, c, h, w = x.shape
assert h % 2 == 0 and w % 2 == 0, f'focus for yolox needs even feature\
height and width, got {(h, w)}.'
x = x.reshape(batch_size, c * h, 1, w)
_b, _c, _h, _w = x.shape
g = _c // 2
# fuse to ncnn's shufflechannel
x = x.view(_b, g, 2, _h, _w)
x = torch.transpose(x, 1, 2).contiguous()
x = x.view(_b, -1, _h, _w)
x = x.reshape(_b, c * h * w, 1, 1)
_b, _c, _h, _w = x.shape
g = _c // 2
# fuse to ncnn's shufflechannel
x = x.view(_b, g, 2, _h, _w)
x = torch.transpose(x, 1, 2).contiguous()
x = x.view(_b, -1, _h, _w)
x = x.reshape(_b, c * 4, h // 2, w // 2)
return self.conv(x)
class GConvFocus(nn.Module):
def __init__(self, orin_Focus: nn.Module):
super().__init__()
device = next(orin_Focus.parameters()).device
self.weight1 = torch.tensor([[1., 0], [0, 0]]).expand(3, 1, 2,
2).to(device)
self.weight2 = torch.tensor([[0, 0], [1., 0]]).expand(3, 1, 2,
2).to(device)
self.weight3 = torch.tensor([[0, 1.], [0, 0]]).expand(3, 1, 2,
2).to(device)
self.weight4 = torch.tensor([[0, 0], [0, 1.]]).expand(3, 1, 2,
2).to(device)
self.__dict__.update(orin_Focus.__dict__)
def forward(self, x: Tensor) -> Tensor:
conv1 = F.conv2d(x, self.weight1, stride=2, groups=3)
conv2 = F.conv2d(x, self.weight2, stride=2, groups=3)
conv3 = F.conv2d(x, self.weight3, stride=2, groups=3)
conv4 = F.conv2d(x, self.weight4, stride=2, groups=3)
return self.conv(torch.cat([conv1, conv2, conv3, conv4], dim=1))
# Copyright (c) OpenMMLab. All rights reserved.
from .bbox_coder import (rtmdet_bbox_decoder, yolov5_bbox_decoder,
yolox_bbox_decoder)
__all__ = ['yolov5_bbox_decoder', 'rtmdet_bbox_decoder', 'yolox_bbox_decoder']
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional
import torch
from torch import Tensor
def yolov5_bbox_decoder(priors: Tensor, bbox_preds: Tensor,
stride: Tensor) -> Tensor:
bbox_preds = bbox_preds.sigmoid()
x_center = (priors[..., 0] + priors[..., 2]) * 0.5
y_center = (priors[..., 1] + priors[..., 3]) * 0.5
w = priors[..., 2] - priors[..., 0]
h = priors[..., 3] - priors[..., 1]
x_center_pred = (bbox_preds[..., 0] - 0.5) * 2 * stride + x_center
y_center_pred = (bbox_preds[..., 1] - 0.5) * 2 * stride + y_center
w_pred = (bbox_preds[..., 2] * 2)**2 * w
h_pred = (bbox_preds[..., 3] * 2)**2 * h
decoded_bboxes = torch.stack(
[x_center_pred, y_center_pred, w_pred, h_pred], dim=-1)
return decoded_bboxes
def rtmdet_bbox_decoder(priors: Tensor, bbox_preds: Tensor,
stride: Optional[Tensor]) -> Tensor:
stride = stride[None, :, None]
bbox_preds *= stride
tl_x = (priors[..., 0] - bbox_preds[..., 0])
tl_y = (priors[..., 1] - bbox_preds[..., 1])
br_x = (priors[..., 0] + bbox_preds[..., 2])
br_y = (priors[..., 1] + bbox_preds[..., 3])
decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1)
return decoded_bboxes
def yolox_bbox_decoder(priors: Tensor, bbox_preds: Tensor,
stride: Optional[Tensor]) -> Tensor:
stride = stride[None, :, None]
xys = (bbox_preds[..., :2] * stride) + priors
whs = bbox_preds[..., 2:].exp() * stride
decoded_bboxes = torch.cat([xys, whs], -1)
return decoded_bboxes
cmake_minimum_required(VERSION 2.8.12)
set(CMAKE_CUDA_ARCHITECTURES 60 61 62 70 72 75 86)
set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
project(nvdsparsebbox_mmyolo LANGUAGES CXX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -O3 -g -Wall -Werror -shared -fPIC")
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_BUILD_TYPE Release)
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
# CUDA
find_package(CUDA REQUIRED)
# TensorRT
set(TensorRT_INCLUDE_DIRS "/usr/include/x86_64-linux-gnu" CACHE STRING "TensorRT headers path")
set(TensorRT_LIBRARIES "/usr/lib/x86_64-linux-gnu" CACHE STRING "TensorRT libs path")
# DeepStream
set(DEEPSTREAM "/opt/nvidia/deepstream/deepstream" CACHE STRING "DeepStream root path")
set(DS_LIBRARIES ${DEEPSTREAM}/lib)
set(DS_INCLUDE_DIRS ${DEEPSTREAM}/sources/includes)
include_directories(
${CUDA_INCLUDE_DIRS}
${TensorRT_INCLUDE_DIRS}
${DS_INCLUDE_DIRS})
add_library(
${PROJECT_NAME}
SHARED
custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp)
target_link_libraries(${PROJECT_NAME} PRIVATE nvinfer nvinfer_plugin)
# Inference MMYOLO Models with DeepStream
This project demonstrates how to run inference with MMYOLO models using customized parsers in the [DeepStream SDK](https://developer.nvidia.com/deepstream-sdk).
## Pre-requisites
### 1. Install Nvidia Driver and CUDA
First, please follow the official documentation to install the dedicated NVIDIA graphics driver and a CUDA version matched to your GPU and target NVIDIA AIoT devices.
### 2. Install DeepStream SDK
Second, please follow the official instructions to download and install the DeepStream SDK. The current stable version of DeepStream is v6.2.
### 3. Generate TensorRT Engine
As DeepStream builds on top of several NVIDIA libraries, you first need to convert your trained MMYOLO models to TensorRT engine files. We strongly recommend trying the supported TensorRT deployment solution in [EasyDeploy](../../easydeploy/).
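If the EasyDeploy tooling is not convenient in your environment, an engine can also be built directly from an exported ONNX file with the TensorRT Python API. The sketch below is only an illustration under stated assumptions: TensorRT 8.x Python bindings installed, `model.onnx` as a hypothetical input name, `end2end.engine` chosen to match the config in this folder, and FP16 treated as optional. If the exported graph relies on TensorRT NMS plugins, the plugin registry must be initialized first.

```python
# Sketch (assumptions noted above): build 'end2end.engine' from an exported ONNX
# file using the TensorRT 8.x Python API. Paths are illustrative only.
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(logger, '')          # required if the graph uses NMS plugins

builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

with open('model.onnx', 'rb') as f:              # hypothetical ONNX path
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError('failed to parse the ONNX file')

config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.FP16)            # optional; drop for FP32

serialized_engine = builder.build_serialized_network(network, config)
if serialized_engine is None:
    raise RuntimeError('engine build failed')
with open('end2end.engine', 'wb') as f:          # name expected by the config here
    f.write(serialized_engine)
```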
## Build and Run
Please make sure that your converted TensorRT engine is located in the `deepstream` folder, as expected by the config. Create your own model config file and point the `config-file` parameter in [deepstream_app_config.txt](deepstream_app_config.txt) to the model you want to run.
```bash
mkdir build && cd build
cmake ..
make -j$(nproc) && make install
```
Then you can run the inference with this command.
```bash
deepstream-app -c deepstream_app_config.txt
```
## Code Structure
```bash
├── deepstream
│ ├── configs # config file for MMYOLO models
│ │ └── config_infer_rtmdet.txt
│ ├── custom_mmyolo_bbox_parser # customized parser for MMYOLO models to DeepStream formats
│ │ └── nvdsparsebbox_mmyolo.cpp
| ├── CMakeLists.txt
│ ├── coco_labels.txt # labels for coco detection
│   ├── deepstream_app_config.txt # DeepStream reference app config for MMYOLO models
│ ├── README_zh-CN.md
│ └── README.md
```
# Inference MMYOLO Models with the DeepStream SDK
This project demonstrates how to use the [DeepStream SDK](https://developer.nvidia.com/deepstream-sdk) together with a customized parser to run inference with MMYOLO models.
## Pre-requisites
### 1. Install Nvidia Driver and CUDA
First, install the graphics driver and CUDA according to your current GPU and the target device.
### 2. Install DeepStream SDK
The stable version of the DeepStream SDK is currently v6.2, which is the officially recommended version.
### 3. Convert the MMYOLO Model to a TensorRT Engine
We recommend using the TensorRT solution in EasyDeploy to convert and deploy the target model; see [this document](../../easydeploy/docs/model_convert.md) for details.
## Build and Run
This project currently uses MMYOLO's rtmdet model; to use another model, adapt the config files in this directory accordingly. Then place the converted TensorRT engine in the current directory and run the following commands:
```bash
mkdir build && cd build
cmake ..
make -j$(nproc) && make install
```
After building, run inference with the following command:
```bash
deepstream-app -c deepstream_app_config.txt
```
## Code Structure
```bash
├── deepstream
│   ├── configs                   # DeepStream configs for MMYOLO models
│   │   └── config_infer_rtmdet.txt
│   ├── custom_mmyolo_bbox_parser # parser adapting MMYOLO outputs to DeepStream formats
│   │   └── nvdsparsebbox_mmyolo.cpp
│   ├── CMakeLists.txt
│   ├── coco_labels.txt           # coco labels
│   ├── deepstream_app_config.txt # DeepStream app config
│   ├── README_zh-CN.md
│   └── README.md
```
person
bicycle
car
motorbike
aeroplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
sofa
pottedplant
bed
diningtable
toilet
tvmonitor
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
[property]
gpu-id=0
net-scale-factor=0.01735207357279195
offsets=57.375;57.12;58.395
model-color-format=1
model-engine-file=../end2end.engine
labelfile-path=../coco_labels.txt
batch-size=1
network-mode=0
num-detected-classes=80
interval=0
gie-unique-id=1
process-mode=1
network-type=0
cluster-mode=2
maintain-aspect-ratio=1
parse-bbox-func-name=NvDsInferParseCustomMMYOLO
custom-lib-path=../build/libnvdsparsebbox_mmyolo.so
[class-attrs-all]
pre-cluster-threshold=0.45
topk=100