This example demonstrates how to perform inference using YOLOv8 in C++ with ONNX Runtime and OpenCV's API.
## Benefits ✨
- Well suited for deployment in industrial applications.
- Faster than OpenCV's DNN inference on both CPU and GPU.
- Supports FP32 and FP16 CUDA acceleration.
## Note ☕
1. Thanks to Ultralytics' latest release, a `Transpose` op has been added to the YOLOv8 model, giving YOLOv8 the same output shape as YOLOv5. As a result, you can run inference with YOLOv5/v7/v8 models via this project.
## Exporting YOLOv8 Models 📦
To export YOLOv8 models, use the following Python script:
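A minimal export sketch using the `ultralytics` Python package (the model name `yolov8n.pt` and the export options below are example choices; adjust them to your setup):
```
# Export a YOLOv8 PyTorch model to ONNX with the ultralytics package
# (requires: pip install ultralytics).
from ultralytics import YOLO

# Load a pretrained YOLOv8 model; yolov8n.pt is just an example.
model = YOLO("yolov8n.pt")

# Export to ONNX. opset/simplify are optional; set dynamic=True if you
# need dynamic input shapes (e.g. for dynamic-batch inference).
model.export(format="onnx", opset=12, simplify=True)
```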
To run the example, you also need to download `coco.yaml`. You can download the file manually from [here](https://raw.githubusercontent.com/ultralytics/ultralytics/main/ultralytics/cfg/datasets/coco.yaml).
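For example, with `curl` (this saves the file into the current directory):
```
curl -O https://raw.githubusercontent.com/ultralytics/ultralytics/main/ultralytics/cfg/datasets/coco.yaml
```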
# YOLOv8-ONNXRuntime-Rust for All the Key YOLO Tasks
This repository provides a Rust demo for performing YOLOv8 tasks like `Classification`, `Segmentation`, `Detection` and `Pose Detection` using ONNXRuntime.
## Features
- Supports `Classification`, `Segmentation`, `Detection`, and `Pose (Keypoints) Detection` tasks.
- Supports `FP16` & `FP32` ONNX models.
- Supports the `CPU`, `CUDA`, and `TensorRT` execution providers to accelerate computation.
- Supports dynamic input shapes (`batch`, `width`, `height`).
## Installation
### 1. Install Rust
Please follow the [official Rust installation guide](https://www.rust-lang.org/tools/install).
### 2. Install ONNXRuntime
This repository uses the `ort` crate, an ONNXRuntime wrapper for Rust (https://docs.rs/ort/latest/ort/).
You can follow the instructions in the `ort` documentation, or simply add the crate as a dependency as shown below.
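For example, with Cargo (a minimal sketch; depending on your `ort` version, GPU execution providers such as CUDA and TensorRT may require enabling additional crate features, so check the `ort` docs for details):
```
cargo add ort
```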
## Usage
Run the demo with a model and a source image; it will perform inference with the ONNX model on the source image:
```
cargo run --release -- --model <MODEL> --source <SOURCE>
```
Set `--cuda` to use the CUDA execution provider to speed up inference.
```
cargo run --release -- --cuda --model <MODEL> --source <SOURCE>
```
Set `--trt` to use the TensorRT execution provider; you can also set `--fp16` at the same time to use the TensorRT FP16 engine.
```
cargo run --release -- --trt --fp16 --model <MODEL> --source <SOURCE>
```
Set `--device_id` to select which device to run on. If you have only one GPU and set `device_id` to 1, the program will not panic; `ort` will automatically fall back to the `CPU` execution provider.
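For example, a run targeting GPU 0 (the device index is just a placeholder):
```
cargo run --release -- --cuda --device_id 0 --model <MODEL> --source <SOURCE>
```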
If you're using `--trt`, you can also set `--batch-min` and `--batch-max` to explicitly specify the min/max/opt batch sizes for dynamic batch input (see https://onnxruntime.ai/docs/execution-providers/TensorRT-ExecutionProvider.html#explicit-shape-range-for-dynamic-shape-input). Note that the ONNX model must be exported with dynamic shapes.
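For example (the batch range values here are placeholders; pick ones that match your workload):
```
cargo run --release -- --trt --batch-min 1 --batch-max 8 --model <MODEL> --source <SOURCE>
```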
Set `--profile` to check the time consumed in each stage. (Note that the model usually needs 1~3 dry runs to warm up; make sure to run enough iterations to evaluate the result.)
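For example:
```
cargo run --release -- --profile --model <MODEL> --source <SOURCE>
```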
Make sure to replace `yolov8n.onnx` with the path to your YOLOv8 ONNX model file and `image.jpg` with the path to your input image, and adjust the confidence threshold (`conf-thres`) and IoU threshold (`iou-thres`) values as needed.
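A hypothetical invocation with explicit thresholds (the flag spellings follow the option names above, and the `0.3`/`0.45` values are placeholders):
```
cargo run --release -- --model yolov8n.onnx --source image.jpg --conf-thres 0.3 --iou-thres 0.45
```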