# YOLOv8 LibTorch Inference C++
This example demonstrates how to perform inference with YOLOv8 models in C++ using the LibTorch API.
## Dependencies
| Dependency | Version |
| ------------ | -------- |
| OpenCV | >=4.0.0 |
| C++ Standard | >=17 |
| CMake | >=3.18 |
| LibTorch | >=1.12.1 |
## Usage
```bash
git clone https://github.com/ultralytics/ultralytics
cd ultralytics
pip install .
cd examples/YOLOv8-LibTorch-CPP-Inference
mkdir build
cd build
cmake ..
make
./yolov8_libtorch_inference
```
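Note: the model and image paths are hard-coded in `main.cc` below (`/path/to/yolov8s.torchscript` and `/path/to/bus.jpg`); update them before building.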
## Exporting YOLOv8
To export YOLOv8 models:
```bash
yolo export model=yolov8s.pt imgsz=640 format=torchscript
```
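As a minimal sketch (assuming an exported `yolov8s.torchscript` in the working directory; the full `main.cc` below adds letterboxing, NMS, and box rescaling), loading and running the model looks like this:
```c++
#include <iostream>
#include <torch/script.h>

int main() {
    // Load the TorchScript module exported by `yolo export ... format=torchscript`.
    torch::jit::script::Module model = torch::jit::load("yolov8s.torchscript");
    model.eval();
    // Dummy NCHW float input in [0, 1]; a real pipeline feeds a letterboxed image.
    torch::Tensor input = torch::rand({1, 3, 640, 640});
    torch::Tensor output = model.forward({input}).toTensor();
    std::cout << output.sizes() << std::endl;  // e.g. [1, 84, 8400] for the detect head
    return 0;
}
```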
#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/imgcodecs.hpp>
#include <torch/torch.h>
#include <torch/script.h>
using torch::indexing::Slice;
using torch::indexing::None;
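// Compute the scale that fits an image inside target_size {h, w} while preserving aspect ratio.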
float generate_scale(cv::Mat& image, const std::vector<int>& target_size) {
int origin_w = image.cols;
int origin_h = image.rows;
int target_h = target_size[0];
int target_w = target_size[1];
float ratio_h = static_cast<float>(target_h) / static_cast<float>(origin_h);
float ratio_w = static_cast<float>(target_w) / static_cast<float>(origin_w);
float resize_scale = std::min(ratio_h, ratio_w);
return resize_scale;
}
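// Letterbox: resize with unchanged aspect ratio, then pad with gray (114) to target_size; returns the resize scale.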
float letterbox(cv::Mat &input_image, cv::Mat &output_image, const std::vector<int> &target_size) {
if (input_image.cols == target_size[1] && input_image.rows == target_size[0]) {
if (input_image.data == output_image.data) {
return 1.;
} else {
output_image = input_image.clone();
return 1.;
}
}
float resize_scale = generate_scale(input_image, target_size);
int new_shape_w = std::round(input_image.cols * resize_scale);
int new_shape_h = std::round(input_image.rows * resize_scale);
float padw = (target_size[1] - new_shape_w) / 2.;
float padh = (target_size[0] - new_shape_h) / 2.;
int top = std::round(padh - 0.1);
int bottom = std::round(padh + 0.1);
int left = std::round(padw - 0.1);
int right = std::round(padw + 0.1);
cv::resize(input_image, output_image,
cv::Size(new_shape_w, new_shape_h),
0, 0, cv::INTER_AREA);
cv::copyMakeBorder(output_image, output_image, top, bottom, left, right,
cv::BORDER_CONSTANT, cv::Scalar(114.));
return resize_scale;
}
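// Convert boxes from [x1, y1, x2, y2] corners to [cx, cy, w, h] center format.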
torch::Tensor xyxy2xywh(const torch::Tensor& x) {
auto y = torch::empty_like(x);
y.index_put_({"...", 0}, (x.index({"...", 0}) + x.index({"...", 2})).div(2));
y.index_put_({"...", 1}, (x.index({"...", 1}) + x.index({"...", 3})).div(2));
y.index_put_({"...", 2}, x.index({"...", 2}) - x.index({"...", 0}));
y.index_put_({"...", 3}, x.index({"...", 3}) - x.index({"...", 1}));
return y;
}
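// Convert boxes from [cx, cy, w, h] center format back to [x1, y1, x2, y2] corners.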
torch::Tensor xywh2xyxy(const torch::Tensor& x) {
auto y = torch::empty_like(x);
auto dw = x.index({"...", 2}).div(2);
auto dh = x.index({"...", 3}).div(2);
y.index_put_({"...", 0}, x.index({"...", 0}) - dw);
y.index_put_({"...", 1}, x.index({"...", 1}) - dh);
y.index_put_({"...", 2}, x.index({"...", 0}) + dw);
y.index_put_({"...", 3}, x.index({"...", 1}) + dh);
return y;
}
// Reference: https://github.com/pytorch/vision/blob/main/torchvision/csrc/ops/cpu/nms_kernel.cpp
torch::Tensor nms(const torch::Tensor& bboxes, const torch::Tensor& scores, float iou_threshold) {
if (bboxes.numel() == 0)
return torch::empty({0}, bboxes.options().dtype(torch::kLong));
auto x1_t = bboxes.select(1, 0).contiguous();
auto y1_t = bboxes.select(1, 1).contiguous();
auto x2_t = bboxes.select(1, 2).contiguous();
auto y2_t = bboxes.select(1, 3).contiguous();
torch::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);
auto order_t = std::get<1>(
scores.sort(/*stable=*/true, /*dim=*/0, /* descending=*/true));
auto ndets = bboxes.size(0);
torch::Tensor suppressed_t = torch::zeros({ndets}, bboxes.options().dtype(torch::kByte));
torch::Tensor keep_t = torch::zeros({ndets}, bboxes.options().dtype(torch::kLong));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto keep = keep_t.data_ptr<int64_t>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
int64_t num_to_keep = 0;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1)
continue;
keep[num_to_keep++] = i;
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1)
continue;
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast<float>(0), xx2 - xx1);
auto h = std::max(static_cast<float>(0), yy2 - yy1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr > iou_threshold)
suppressed[j] = 1;
}
}
return keep_t.narrow(0, 0, num_to_keep);
}
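// Confidence-filter raw YOLOv8 predictions (shape [bs, 4+nc, anchors]) and run class-offset NMS, keeping at most max_det boxes per image.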
torch::Tensor non_max_suppression(torch::Tensor& prediction, float conf_thres = 0.25, float iou_thres = 0.45, int max_det = 300) {
auto bs = prediction.size(0);
auto nc = prediction.size(1) - 4;
auto nm = prediction.size(1) - nc - 4;
auto mi = 4 + nc;
auto xc = prediction.index({Slice(), Slice(4, mi)}).amax(1) > conf_thres;
prediction = prediction.transpose(-1, -2);
prediction.index_put_({"...", Slice(None, 4)}, xywh2xyxy(prediction.index({"...", Slice(None, 4)})));
std::vector<torch::Tensor> output;
for (int i = 0; i < bs; i++) {
output.push_back(torch::zeros({0, 6 + nm}, prediction.device()));
}
for (int xi = 0; xi < prediction.size(0); xi++) {
auto x = prediction[xi];
x = x.index({xc[xi]});
auto x_split = x.split({4, nc, nm}, 1);
auto box = x_split[0], cls = x_split[1], mask = x_split[2];
auto [conf, j] = cls.max(1, true);
x = torch::cat({box, conf, j.toType(torch::kFloat), mask}, 1);
x = x.index({conf.view(-1) > conf_thres});
int n = x.size(0);
if (!n) { continue; }
// NMS
auto c = x.index({Slice(), Slice(5, 6)}) * 7680;
auto boxes = x.index({Slice(), Slice(None, 4)}) + c;
auto scores = x.index({Slice(), 4});
auto i = nms(boxes, scores, iou_thres);
i = i.index({Slice(None, max_det)});
output[xi] = x.index({i});
}
return torch::stack(output);
}
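// Clamp box coordinates to the image bounds (shape = {h, w}).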
torch::Tensor clip_boxes(torch::Tensor& boxes, const std::vector<int>& shape) {
boxes.index_put_({"...", 0}, boxes.index({"...", 0}).clamp(0, shape[1]));
boxes.index_put_({"...", 1}, boxes.index({"...", 1}).clamp(0, shape[0]));
boxes.index_put_({"...", 2}, boxes.index({"...", 2}).clamp(0, shape[1]));
boxes.index_put_({"...", 3}, boxes.index({"...", 3}).clamp(0, shape[0]));
return boxes;
}
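// Map boxes from the letterboxed image (img1_shape = {h, w}) back to the original image (img0_shape).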
torch::Tensor scale_boxes(const std::vector<int>& img1_shape, torch::Tensor& boxes, const std::vector<int>& img0_shape) {
auto gain = (std::min)((float)img1_shape[0] / img0_shape[0], (float)img1_shape[1] / img0_shape[1]);
auto pad0 = std::round((float)(img1_shape[1] - img0_shape[1] * gain) / 2. - 0.1);
auto pad1 = std::round((float)(img1_shape[0] - img0_shape[0] * gain) / 2. - 0.1);
boxes.index_put_({"...", 0}, boxes.index({"...", 0}) - pad0);
boxes.index_put_({"...", 2}, boxes.index({"...", 2}) - pad0);
boxes.index_put_({"...", 1}, boxes.index({"...", 1}) - pad1);
boxes.index_put_({"...", 3}, boxes.index({"...", 3}) - pad1);
boxes.index_put_({"...", Slice(None, 4)}, boxes.index({"...", Slice(None, 4)}).div(gain));
return boxes;
}
int main() {
// Device
torch::Device device(torch::cuda::is_available() ? torch::kCUDA : torch::kCPU);
// Note that in this example the classes are hard-coded
std::vector<std::string> classes {"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant",
"stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
"giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
"baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife",
"spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
"couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"};
try {
// Load the model (e.g. yolov8s.torchscript)
std::string model_path = "/path/to/yolov8s.torchscript";
torch::jit::script::Module yolo_model;
yolo_model = torch::jit::load(model_path);
yolo_model.eval();
yolo_model.to(device, torch::kFloat32);
// Load image and preprocess
cv::Mat image = cv::imread("/path/to/bus.jpg");
cv::Mat input_image;
letterbox(image, input_image, {640, 640});
cv::cvtColor(input_image, input_image, cv::COLOR_BGR2RGB);
torch::Tensor image_tensor = torch::from_blob(input_image.data, {input_image.rows, input_image.cols, 3}, torch::kByte).to(device);
image_tensor = image_tensor.toType(torch::kFloat32).div(255);
image_tensor = image_tensor.permute({2, 0, 1});
image_tensor = image_tensor.unsqueeze(0);
std::vector<torch::jit::IValue> inputs {image_tensor};
// Inference
torch::Tensor output = yolo_model.forward(inputs).toTensor().cpu();
// NMS
auto keep = non_max_suppression(output)[0];
auto boxes = keep.index({Slice(), Slice(None, 4)});
keep.index_put_({Slice(), Slice(None, 4)}, scale_boxes({input_image.rows, input_image.cols}, boxes, {image.rows, image.cols}));
// Show the results
for (int i = 0; i < keep.size(0); i++) {
int x1 = keep[i][0].item().toFloat();
int y1 = keep[i][1].item().toFloat();
int x2 = keep[i][2].item().toFloat();
int y2 = keep[i][3].item().toFloat();
float conf = keep[i][4].item().toFloat();
int cls = keep[i][5].item().toInt();
std::cout << "Rect: [" << x1 << "," << y1 << "," << x2 << "," << y2 << "] Conf: " << conf << " Class: " << classes[cls] << std::endl;
}
} catch (const c10::Error& e) {
std::cout << e.msg() << std::endl;
}
return 0;
}
cmake_minimum_required(VERSION 3.5)
set(PROJECT_NAME Yolov8OnnxRuntimeCPPInference)
project(${PROJECT_NAME} VERSION 0.0.1 LANGUAGES CXX)
# -------------- Support C++17 for using filesystem ------------------#
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_INCLUDE_CURRENT_DIR ON)
# -------------- OpenCV ------------------#
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})
# -------------- Compile CUDA for FP16 inference if needed ------------------#
option(USE_CUDA "Enable CUDA support" ON)
if (NOT APPLE AND USE_CUDA)
find_package(CUDA REQUIRED)
include_directories(${CUDA_INCLUDE_DIRS})
add_definitions(-DUSE_CUDA)
else ()
set(USE_CUDA OFF)
endif ()
# -------------- ONNXRUNTIME ------------------#
# Set ONNXRUNTIME_VERSION
set(ONNXRUNTIME_VERSION 1.15.1)
if (WIN32)
if (USE_CUDA)
set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-win-x64-gpu-${ONNXRUNTIME_VERSION}")
else ()
set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-win-x64-${ONNXRUNTIME_VERSION}")
endif ()
elseif (LINUX)
if (USE_CUDA)
set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-linux-x64-gpu-${ONNXRUNTIME_VERSION}")
else ()
set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}")
endif ()
elseif (APPLE)
set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-osx-arm64-${ONNXRUNTIME_VERSION}")
# Apple X64 binary
# set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-osx-x64-${ONNXRUNTIME_VERSION}")
# Apple Universal binary
# set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-osx-universal2-${ONNXRUNTIME_VERSION}")
else ()
message(SEND_ERROR "Variable ONNXRUNTIME_ROOT is not set properly. Please make sure your CMake project \
is configured with `-D WIN32=TRUE`, `-D LINUX=TRUE`, or `-D APPLE=TRUE`!")
endif ()
include_directories(${ONNXRUNTIME_ROOT}/include)
set(PROJECT_SOURCES
main.cpp
inference.h
inference.cpp
)
add_executable(${PROJECT_NAME} ${PROJECT_SOURCES})
if (WIN32)
target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS} ${ONNXRUNTIME_ROOT}/lib/onnxruntime.lib)
if (USE_CUDA)
target_link_libraries(${PROJECT_NAME} ${CUDA_LIBRARIES})
endif ()
elseif (LINUX)
target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS} ${ONNXRUNTIME_ROOT}/lib/libonnxruntime.so)
if (USE_CUDA)
target_link_libraries(${PROJECT_NAME} ${CUDA_LIBRARIES})
endif ()
elseif (APPLE)
target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS} ${ONNXRUNTIME_ROOT}/lib/libonnxruntime.dylib)
endif ()
# On Windows, copy onnxruntime.dll to the same folder as the executable
if (WIN32)
add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${ONNXRUNTIME_ROOT}/lib/onnxruntime.dll"
$<TARGET_FILE_DIR:${PROJECT_NAME}>)
endif ()
# Download https://raw.githubusercontent.com/ultralytics/ultralytics/main/ultralytics/cfg/datasets/coco.yaml
# and place it in the same folder as the executable
configure_file(coco.yaml ${CMAKE_CURRENT_BINARY_DIR}/coco.yaml COPYONLY)
# Copy the yolov8n.onnx file to the same folder as the executable
configure_file(yolov8n.onnx ${CMAKE_CURRENT_BINARY_DIR}/yolov8n.onnx COPYONLY)
# Create a folder named images in the same folder as the executable
add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/images
)
# YOLOv8 OnnxRuntime C++
<img alt="C++" src="https://img.shields.io/badge/C++-17-blue.svg?style=flat&logo=c%2B%2B"> <img alt="Onnx-runtime" src="https://img.shields.io/badge/OnnxRuntime-717272.svg?logo=Onnx&logoColor=white">
This example demonstrates how to perform inference using YOLOv8 in C++ with ONNX Runtime and OpenCV's API.
## Benefits ✨
- Well suited to deployment in industrial settings.
- Faster than OpenCV's DNN inference on both CPU and GPU.
- Supports FP32 and FP16 CUDA acceleration.
## Note ☕
1. Thanks to a recent Ultralytics release, a `Transpose` op is appended to the YOLOv8 model, giving YOLOv8 the same output shape as YOLOv5. As a result, you can run inference on YOLOv5/v7/v8 models with this project.
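For a rough idea of the post-processing this enables, here is a minimal sketch that mirrors `inference.cpp` in this example. It assumes a `[1, 84, 8400]` detect-head layout for an 80-class model, and uses a dummy buffer in place of the raw `float*` returned by ONNX Runtime:
```c++
#include <opencv2/core.hpp>
#include <vector>

int main() {
    // Dummy stand-in for the raw float* output returned by ONNX Runtime.
    std::vector<float> output(84 * 8400, 0.f);
    cv::Mat rawData(84, 8400, CV_32F, output.data()); // 84 attributes x 8400 candidates
    rawData = rawData.t();                            // 8400 x 84: [cx, cy, w, h, 80 class scores]
    for (int i = 0; i < rawData.rows; ++i) {
        const float* row = rawData.ptr<float>(i);
        // row[0..3] = box center/size; row[4..83] = per-class confidences.
        // A real pipeline would confidence-filter here, then run NMS.
        (void)row;
    }
    return 0;
}
```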
## Exporting YOLOv8 Models 📦
To export YOLOv8 models, use the following Python script:
```python
from ultralytics import YOLO
# Load a YOLOv8 model
model = YOLO("yolov8n.pt")
# Export the model
model.export(format="onnx", opset=12, simplify=True, dynamic=False, imgsz=640)
```
Alternatively, you can export the model from the terminal:
```bash
yolo export model=yolov8n.pt opset=12 simplify=True dynamic=False format=onnx imgsz=640,640
```
## Exporting YOLOv8 FP16 Models 📦
```python
import onnx
from onnxconverter_common import float16
model = onnx.load(R"YOUR_ONNX_PATH")
model_fp16 = float16.convert_float_to_float16(model)
onnx.save(model_fp16, R"YOUR_FP16_ONNX_PATH")
```
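An FP16 model exported this way corresponds to the `*_HALF` entries of `MODEL_TYPE` in `inference.h` (e.g. `YOLO_DETECT_V8_HALF`) and requires a CUDA-enabled build.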
## Download COCO.yaml file 📂
To run the example, you also need to download `coco.yaml`. You can download the file manually from [here](https://raw.githubusercontent.com/ultralytics/ultralytics/main/ultralytics/cfg/datasets/coco.yaml) and place it next to the executable; the `ReadCocoYaml()` helper in `main.cpp` parses its `names:` section at startup.
## Dependencies ⚙️
| Dependency | Version |
| ------------------------------------ | ------------- |
| ONNX Runtime (Linux, Windows, macOS) | >=1.14.1 |
| OpenCV | >=4.0.0 |
| C++ Standard | >=17 |
| CMake | >=3.5 |
| CUDA (optional) | >=11.4 \<12.0 |
| cuDNN (CUDA required) | =8 |
Note: The C++17 requirement comes from the use of the C++17 filesystem feature.
Note (2): ONNX Runtime currently requires CUDA 11 and cuDNN 8. Keep in mind that this requirement might change in the future.
## Build 🛠️
1. Clone the repository to your local machine.
2. Navigate to the root directory of the repository.
3. Create a build directory and navigate to it:
```console
mkdir build && cd build
```
4. Run CMake to generate the build files:
```console
cmake ..
```
**Notice**:
If you encounter an error indicating that the `ONNXRUNTIME_ROOT` variable is not set correctly, you can resolve this by building the project using the appropriate command tailored to your system.
```console
# configure on Windows
cmake -D WIN32=TRUE ..
# configure on Linux
cmake -D LINUX=TRUE ..
# configure on macOS
cmake -D APPLE=TRUE ..
```
5. Build the project:
```console
make
```
6. The built executable should now be located in the `build` directory.
## Usage 🚀
```c++
// Change the parameters as needed
// Make sure the model type (FP32 or FP16) matches your device and ONNX file
DL_INIT_PARAM params;
params.rectConfidenceThreshold = 0.1;
params.iouThreshold = 0.5;
params.modelPath = "yolov8n.onnx";
params.imgSize = { 640, 640 };
params.cudaEnable = true;
params.modelType = YOLO_DETECT_V8;
yoloDetector->CreateSession(params);
Detector(yoloDetector);
```
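Here `yoloDetector` is a `YOLO_V8*`, and `Detector()` is the helper in `main.cpp` that runs the session on every image in the `images` folder next to the executable; see `DetectTest()` for the complete setup.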
#include "inference.h"
#include <regex>
#define benchmark
#define min(a,b) (((a) < (b)) ? (a) : (b))
YOLO_V8::YOLO_V8() {
}
YOLO_V8::~YOLO_V8() {
delete session;
}
#ifdef USE_CUDA
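// Map the CUDA `half` type to ONNX Runtime's FP16 tensor element type so CreateTensor<half> works.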
namespace Ort
{
template<>
struct TypeToTensorType<half> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; };
}
#endif
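// Convert an 8-bit HWC image into a planar CHW float/half blob normalized to [0, 1].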
template<typename T>
char* BlobFromImage(cv::Mat& iImg, T& iBlob) {
int channels = iImg.channels();
int imgHeight = iImg.rows;
int imgWidth = iImg.cols;
for (int c = 0; c < channels; c++)
{
for (int h = 0; h < imgHeight; h++)
{
for (int w = 0; w < imgWidth; w++)
{
iBlob[c * imgWidth * imgHeight + h * imgWidth + w] = typename std::remove_pointer<T>::type(
(iImg.at<cv::Vec3b>(h, w)[c]) / 255.0f);
}
}
}
return RET_OK;
}
char* YOLO_V8::PreProcess(cv::Mat& iImg, std::vector<int> iImgSize, cv::Mat& oImg)
{
if (iImg.channels() == 3)
{
oImg = iImg.clone();
cv::cvtColor(oImg, oImg, cv::COLOR_BGR2RGB);
}
else
{
cv::cvtColor(iImg, oImg, cv::COLOR_GRAY2RGB);
}
switch (modelType)
{
case YOLO_DETECT_V8:
case YOLO_POSE:
case YOLO_DETECT_V8_HALF:
case YOLO_POSE_V8_HALF://LetterBox
{
if (iImg.cols >= iImg.rows)
{
resizeScales = iImg.cols / (float)iImgSize.at(0);
cv::resize(oImg, oImg, cv::Size(iImgSize.at(0), int(iImg.rows / resizeScales)));
}
else
{
resizeScales = iImg.rows / (float)iImgSize.at(0);
cv::resize(oImg, oImg, cv::Size(int(iImg.cols / resizeScales), iImgSize.at(1)));
}
cv::Mat tempImg = cv::Mat::zeros(iImgSize.at(0), iImgSize.at(1), CV_8UC3);
oImg.copyTo(tempImg(cv::Rect(0, 0, oImg.cols, oImg.rows)));
oImg = tempImg;
break;
}
case YOLO_CLS://CenterCrop
{
int h = iImg.rows;
int w = iImg.cols;
int m = min(h, w);
int top = (h - m) / 2;
int left = (w - m) / 2;
cv::resize(oImg(cv::Rect(left, top, m, m)), oImg, cv::Size(iImgSize.at(0), iImgSize.at(1)));
break;
}
}
return RET_OK;
}
char* YOLO_V8::CreateSession(DL_INIT_PARAM& iParams) {
char* Ret = RET_OK;
std::regex pattern("[\u4e00-\u9fa5]");
bool result = std::regex_search(iParams.modelPath, pattern);
if (result)
{
Ret = "[YOLO_V8]:Your model path is error.Change your model path without chinese characters.";
std::cout << Ret << std::endl;
return Ret;
}
try
{
rectConfidenceThreshold = iParams.rectConfidenceThreshold;
iouThreshold = iParams.iouThreshold;
imgSize = iParams.imgSize;
modelType = iParams.modelType;
env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "Yolo");
Ort::SessionOptions sessionOption;
if (iParams.cudaEnable)
{
cudaEnable = iParams.cudaEnable;
OrtCUDAProviderOptions cudaOption;
cudaOption.device_id = 0;
sessionOption.AppendExecutionProvider_CUDA(cudaOption);
}
sessionOption.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
sessionOption.SetIntraOpNumThreads(iParams.intraOpNumThreads);
sessionOption.SetLogSeverityLevel(iParams.logSeverityLevel);
#ifdef _WIN32
int ModelPathSize = MultiByteToWideChar(CP_UTF8, 0, iParams.modelPath.c_str(), static_cast<int>(iParams.modelPath.length()), nullptr, 0);
wchar_t* wide_cstr = new wchar_t[ModelPathSize + 1];
MultiByteToWideChar(CP_UTF8, 0, iParams.modelPath.c_str(), static_cast<int>(iParams.modelPath.length()), wide_cstr, ModelPathSize);
wide_cstr[ModelPathSize] = L'\0';
const wchar_t* modelPath = wide_cstr;
#else
const char* modelPath = iParams.modelPath.c_str();
#endif // _WIN32
session = new Ort::Session(env, modelPath, sessionOption);
Ort::AllocatorWithDefaultOptions allocator;
size_t inputNodesNum = session->GetInputCount();
for (size_t i = 0; i < inputNodesNum; i++)
{
Ort::AllocatedStringPtr input_node_name = session->GetInputNameAllocated(i, allocator);
char* temp_buf = new char[strlen(input_node_name.get()) + 1];
strcpy(temp_buf, input_node_name.get());
inputNodeNames.push_back(temp_buf);
}
size_t OutputNodesNum = session->GetOutputCount();
for (size_t i = 0; i < OutputNodesNum; i++)
{
Ort::AllocatedStringPtr output_node_name = session->GetOutputNameAllocated(i, allocator);
char* temp_buf = new char[strlen(output_node_name.get()) + 1];
strcpy(temp_buf, output_node_name.get());
outputNodeNames.push_back(temp_buf);
}
options = Ort::RunOptions{ nullptr };
WarmUpSession();
return RET_OK;
}
catch (const std::exception& e)
{
const char* str1 = "[YOLO_V8]:";
const char* str2 = e.what();
std::string result = std::string(str1) + std::string(str2);
char* merged = new char[result.length() + 1];
std::strcpy(merged, result.c_str());
std::cout << merged << std::endl;
delete[] merged;
return "[YOLO_V8]:Create session failed.";
}
}
char* YOLO_V8::RunSession(cv::Mat& iImg, std::vector<DL_RESULT>& oResult) {
#ifdef benchmark
clock_t starttime_1 = clock();
#endif // benchmark
char* Ret = RET_OK;
cv::Mat processedImg;
PreProcess(iImg, imgSize, processedImg);
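// modelType < 4: FP32 models (see MODEL_TYPE in inference.h); the FP16 path below requires CUDA.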
if (modelType < 4)
{
float* blob = new float[processedImg.total() * 3];
BlobFromImage(processedImg, blob);
std::vector<int64_t> inputNodeDims = { 1, 3, imgSize.at(0), imgSize.at(1) };
TensorProcess(starttime_1, iImg, blob, inputNodeDims, oResult);
}
else
{
#ifdef USE_CUDA
half* blob = new half[processedImg.total() * 3];
BlobFromImage(processedImg, blob);
std::vector<int64_t> inputNodeDims = { 1,3,imgSize.at(0),imgSize.at(1) };
TensorProcess(starttime_1, iImg, blob, inputNodeDims, oResult);
#endif
}
return Ret;
}
template<typename N>
char* YOLO_V8::TensorProcess(clock_t& starttime_1, cv::Mat& iImg, N& blob, std::vector<int64_t>& inputNodeDims,
std::vector<DL_RESULT>& oResult) {
Ort::Value inputTensor = Ort::Value::CreateTensor<typename std::remove_pointer<N>::type>(
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), blob, 3 * imgSize.at(0) * imgSize.at(1),
inputNodeDims.data(), inputNodeDims.size());
#ifdef benchmark
clock_t starttime_2 = clock();
#endif // benchmark
auto outputTensor = session->Run(options, inputNodeNames.data(), &inputTensor, 1, outputNodeNames.data(),
outputNodeNames.size());
#ifdef benchmark
clock_t starttime_3 = clock();
#endif // benchmark
Ort::TypeInfo typeInfo = outputTensor.front().GetTypeInfo();
auto tensor_info = typeInfo.GetTensorTypeAndShapeInfo();
std::vector<int64_t> outputNodeDims = tensor_info.GetShape();
auto output = outputTensor.front().GetTensorMutableData<typename std::remove_pointer<N>::type>();
delete[] blob;
switch (modelType)
{
case YOLO_DETECT_V8:
case YOLO_DETECT_V8_HALF:
{
int signalResultNum = outputNodeDims[1];//84
int strideNum = outputNodeDims[2];//8400
std::vector<int> class_ids;
std::vector<float> confidences;
std::vector<cv::Rect> boxes;
cv::Mat rawData;
if (modelType == YOLO_DETECT_V8)
{
// FP32
rawData = cv::Mat(signalResultNum, strideNum, CV_32F, output);
}
else
{
// FP16
rawData = cv::Mat(signalResultNum, strideNum, CV_16F, output);
rawData.convertTo(rawData, CV_32F);
}
// Note:
// Ultralytics adds a Transpose op to the YOLOv8 output, which gives YOLOv8/v5/v7 the same output shape.
// https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt
rawData = rawData.t();
float* data = (float*)rawData.data;
for (int i = 0; i < strideNum; ++i)
{
float* classesScores = data + 4;
cv::Mat scores(1, this->classes.size(), CV_32FC1, classesScores);
cv::Point class_id;
double maxClassScore;
cv::minMaxLoc(scores, 0, &maxClassScore, 0, &class_id);
if (maxClassScore > rectConfidenceThreshold)
{
confidences.push_back(maxClassScore);
class_ids.push_back(class_id.x);
float x = data[0];
float y = data[1];
float w = data[2];
float h = data[3];
int left = int((x - 0.5 * w) * resizeScales);
int top = int((y - 0.5 * h) * resizeScales);
int width = int(w * resizeScales);
int height = int(h * resizeScales);
boxes.push_back(cv::Rect(left, top, width, height));
}
data += signalResultNum;
}
std::vector<int> nmsResult;
cv::dnn::NMSBoxes(boxes, confidences, rectConfidenceThreshold, iouThreshold, nmsResult);
for (int i = 0; i < nmsResult.size(); ++i)
{
int idx = nmsResult[i];
DL_RESULT result;
result.classId = class_ids[idx];
result.confidence = confidences[idx];
result.box = boxes[idx];
oResult.push_back(result);
}
#ifdef benchmark
clock_t starttime_4 = clock();
double pre_process_time = (double)(starttime_2 - starttime_1) / CLOCKS_PER_SEC * 1000;
double process_time = (double)(starttime_3 - starttime_2) / CLOCKS_PER_SEC * 1000;
double post_process_time = (double)(starttime_4 - starttime_3) / CLOCKS_PER_SEC * 1000;
if (cudaEnable)
{
std::cout << "[YOLO_V8(CUDA)]: " << pre_process_time << "ms pre-process, " << process_time << "ms inference, " << post_process_time << "ms post-process." << std::endl;
}
else
{
std::cout << "[YOLO_V8(CPU)]: " << pre_process_time << "ms pre-process, " << process_time << "ms inference, " << post_process_time << "ms post-process." << std::endl;
}
#endif // benchmark
break;
}
case YOLO_CLS:
case YOLO_CLS_HALF:
{
cv::Mat rawData;
if (modelType == YOLO_CLS) {
// FP32
rawData = cv::Mat(1, this->classes.size(), CV_32F, output);
} else {
// FP16
rawData = cv::Mat(1, this->classes.size(), CV_16F, output);
rawData.convertTo(rawData, CV_32F);
}
float *data = (float *) rawData.data;
DL_RESULT result;
for (int i = 0; i < this->classes.size(); i++)
{
result.classId = i;
result.confidence = data[i];
oResult.push_back(result);
}
break;
}
default:
std::cout << "[YOLO_V8]: " << "Not support model type." << std::endl;
}
return RET_OK;
}
char* YOLO_V8::WarmUpSession() {
clock_t starttime_1 = clock();
cv::Mat iImg = cv::Mat(cv::Size(imgSize.at(0), imgSize.at(1)), CV_8UC3);
cv::Mat processedImg;
PreProcess(iImg, imgSize, processedImg);
if (modelType < 4)
{
float* blob = new float[iImg.total() * 3];
BlobFromImage(processedImg, blob);
std::vector<int64_t> YOLO_input_node_dims = { 1, 3, imgSize.at(0), imgSize.at(1) };
Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), blob, 3 * imgSize.at(0) * imgSize.at(1),
YOLO_input_node_dims.data(), YOLO_input_node_dims.size());
auto output_tensors = session->Run(options, inputNodeNames.data(), &input_tensor, 1, outputNodeNames.data(),
outputNodeNames.size());
delete[] blob;
clock_t starttime_4 = clock();
double post_process_time = (double)(starttime_4 - starttime_1) / CLOCKS_PER_SEC * 1000;
if (cudaEnable)
{
std::cout << "[YOLO_V8(CUDA)]: " << "Cuda warm-up cost " << post_process_time << " ms. " << std::endl;
}
}
else
{
#ifdef USE_CUDA
half* blob = new half[iImg.total() * 3];
BlobFromImage(processedImg, blob);
std::vector<int64_t> YOLO_input_node_dims = { 1,3,imgSize.at(0),imgSize.at(1) };
Ort::Value input_tensor = Ort::Value::CreateTensor<half>(Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), blob, 3 * imgSize.at(0) * imgSize.at(1), YOLO_input_node_dims.data(), YOLO_input_node_dims.size());
auto output_tensors = session->Run(options, inputNodeNames.data(), &input_tensor, 1, outputNodeNames.data(), outputNodeNames.size());
delete[] blob;
clock_t starttime_4 = clock();
double post_process_time = (double)(starttime_4 - starttime_1) / CLOCKS_PER_SEC * 1000;
if (cudaEnable)
{
std::cout << "[YOLO_V8(CUDA)]: " << "Cuda warm-up cost " << post_process_time << " ms. " << std::endl;
}
#endif
}
return RET_OK;
}
#pragma once
#define RET_OK nullptr
#ifdef _WIN32
#include <Windows.h>
#include <direct.h>
#include <io.h>
#endif
#include <string>
#include <vector>
#include <cstdio>
#include <opencv2/opencv.hpp>
#include "onnxruntime_cxx_api.h"
#ifdef USE_CUDA
#include <cuda_fp16.h>
#endif
enum MODEL_TYPE
{
//FLOAT32 MODEL
YOLO_DETECT_V8 = 1,
YOLO_POSE = 2,
YOLO_CLS = 3,
//FLOAT16 MODEL
YOLO_DETECT_V8_HALF = 4,
YOLO_POSE_V8_HALF = 5,
YOLO_CLS_HALF = 6
};
typedef struct _DL_INIT_PARAM
{
std::string modelPath;
MODEL_TYPE modelType = YOLO_DETECT_V8;
std::vector<int> imgSize = { 640, 640 };
float rectConfidenceThreshold = 0.6;
float iouThreshold = 0.5;
int keyPointsNum = 2; // Note: keypoint count for pose models
bool cudaEnable = false;
int logSeverityLevel = 3;
int intraOpNumThreads = 1;
} DL_INIT_PARAM;
typedef struct _DL_RESULT
{
int classId;
float confidence;
cv::Rect box;
std::vector<cv::Point2f> keyPoints;
} DL_RESULT;
class YOLO_V8
{
public:
YOLO_V8();
~YOLO_V8();
public:
char* CreateSession(DL_INIT_PARAM& iParams);
char* RunSession(cv::Mat& iImg, std::vector<DL_RESULT>& oResult);
char* WarmUpSession();
template<typename N>
char* TensorProcess(clock_t& starttime_1, cv::Mat& iImg, N& blob, std::vector<int64_t>& inputNodeDims,
std::vector<DL_RESULT>& oResult);
char* PreProcess(cv::Mat& iImg, std::vector<int> iImgSize, cv::Mat& oImg);
std::vector<std::string> classes{};
private:
Ort::Env env;
Ort::Session* session;
bool cudaEnable;
Ort::RunOptions options;
std::vector<const char*> inputNodeNames;
std::vector<const char*> outputNodeNames;
MODEL_TYPE modelType;
std::vector<int> imgSize;
float rectConfidenceThreshold;
float iouThreshold;
float resizeScales;//letterbox scale
};
#include <iostream>
#include <iomanip>
#include "inference.h"
#include <filesystem>
#include <fstream>
#include <random>
void Detector(YOLO_V8*& p) {
std::filesystem::path current_path = std::filesystem::current_path();
std::filesystem::path imgs_path = current_path / "images";
for (auto& i : std::filesystem::directory_iterator(imgs_path))
{
if (i.path().extension() == ".jpg" || i.path().extension() == ".png" || i.path().extension() == ".jpeg")
{
std::string img_path = i.path().string();
cv::Mat img = cv::imread(img_path);
std::vector<DL_RESULT> res;
p->RunSession(img, res);
for (auto& re : res)
{
cv::RNG rng(cv::getTickCount());
cv::Scalar color(rng.uniform(0, 256), rng.uniform(0, 256), rng.uniform(0, 256));
cv::rectangle(img, re.box, color, 3);
float confidence = floor(100 * re.confidence) / 100;
std::cout << std::fixed << std::setprecision(2);
std::string label = p->classes[re.classId] + " " +
std::to_string(confidence).substr(0, std::to_string(confidence).size() - 4);
cv::rectangle(
img,
cv::Point(re.box.x, re.box.y - 25),
cv::Point(re.box.x + label.length() * 15, re.box.y),
color,
cv::FILLED
);
cv::putText(
img,
label,
cv::Point(re.box.x, re.box.y - 5),
cv::FONT_HERSHEY_SIMPLEX,
0.75,
cv::Scalar(0, 0, 0),
2
);
}
std::cout << "Press any key to exit" << std::endl;
cv::imshow("Result of Detection", img);
cv::waitKey(0);
cv::destroyAllWindows();
}
}
}
void Classifier(YOLO_V8*& p)
{
std::filesystem::path current_path = std::filesystem::current_path();
std::filesystem::path imgs_path = current_path;// / "images"
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<int> dis(0, 255);
for (auto& i : std::filesystem::directory_iterator(imgs_path))
{
if (i.path().extension() == ".jpg" || i.path().extension() == ".png")
{
std::string img_path = i.path().string();
//std::cout << img_path << std::endl;
cv::Mat img = cv::imread(img_path);
std::vector<DL_RESULT> res;
char* ret = p->RunSession(img, res);
float positionY = 50;
for (int i = 0; i < res.size(); i++)
{
int r = dis(gen);
int g = dis(gen);
int b = dis(gen);
cv::putText(img, std::to_string(i) + ":", cv::Point(10, positionY), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(b, g, r), 2);
cv::putText(img, std::to_string(res.at(i).confidence), cv::Point(70, positionY), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(b, g, r), 2);
positionY += 50;
}
cv::imshow("TEST_CLS", img);
cv::waitKey(0);
cv::destroyAllWindows();
//cv::imwrite("E:\\output\\" + std::to_string(k) + ".png", img);
}
}
}
int ReadCocoYaml(YOLO_V8*& p) {
// Open the YAML file
std::ifstream file("coco.yaml");
if (!file.is_open())
{
std::cerr << "Failed to open file" << std::endl;
return 1;
}
// Read the file line by line
std::string line;
std::vector<std::string> lines;
while (std::getline(file, line))
{
lines.push_back(line);
}
// Find the start and end of the names section
std::size_t start = 0;
std::size_t end = 0;
for (std::size_t i = 0; i < lines.size(); i++)
{
if (lines[i].find("names:") != std::string::npos)
{
start = i + 1;
}
else if (start > 0 && lines[i].find(':') == std::string::npos)
{
end = i;
break;
}
}
// Extract the names
std::vector<std::string> names;
for (std::size_t i = start; i < end; i++)
{
std::stringstream ss(lines[i]);
std::string name;
std::getline(ss, name, ':'); // Extract the number before the delimiter
std::getline(ss, name); // Extract the string after the delimiter
names.push_back(name);
}
p->classes = names;
return 0;
}
void DetectTest()
{
YOLO_V8* yoloDetector = new YOLO_V8;
ReadCocoYaml(yoloDetector);
DL_INIT_PARAM params;
params.rectConfidenceThreshold = 0.1;
params.iouThreshold = 0.5;
params.modelPath = "yolov8n.onnx";
params.imgSize = { 640, 640 };
#ifdef USE_CUDA
params.cudaEnable = true;
// GPU FP32 inference
params.modelType = YOLO_DETECT_V8;
// GPU FP16 inference
//Note: change fp16 onnx model
//params.modelType = YOLO_DETECT_V8_HALF;
#else
// CPU inference
params.modelType = YOLO_DETECT_V8;
params.cudaEnable = false;
#endif
yoloDetector->CreateSession(params);
Detector(yoloDetector);
}
void ClsTest()
{
YOLO_V8* yoloDetector = new YOLO_V8;
std::string model_path = "cls.onnx";
ReadCocoYaml(yoloDetector);
DL_INIT_PARAM params{ model_path, YOLO_CLS, {224, 224} };
yoloDetector->CreateSession(params);
Classifier(yoloDetector);
}
int main()
{
//DetectTest();
ClsTest();
}
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
[package]
name = "yolov8-rs"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
clap = { version = "4.2.4", features = ["derive"] }
image = { version = "0.25.2"}
imageproc = { version = "0.25.0"}
ndarray = { version = "0.16" }
ort = { version = "2.0.0-rc.5", features = ["cuda", "tensorrt", "load-dynamic", "copy-dylibs", "half"]}
rusttype = { version = "0.9.3" }
anyhow = { version = "1.0.75" }
regex = { version = "1.5.4" }
rand = { version = "0.8.5" }
chrono = { version = "0.4.30" }
half = { version = "2.3.1" }
dirs = { version = "5.0.1" }
ureq = { version = "2.9.1" }
ab_glyph = "0.2.29"
use clap::Parser;
use yolov8_rs::{Args, YOLOv8};
fn main() -> Result<(), Box<dyn std::error::Error>> {
let args = Args::parse();
// 1. load image
let x = image::ImageReader::open(&args.source)?
.with_guessed_format()?
.decode()?;
// 2. The model supports dynamic batch inference, so the input should be a Vec
let xs = vec![x];
// You can test `--batch 2` with this
// let xs = vec![x.clone(), x];
// 3. build yolov8 model
let mut model = YOLOv8::new(args)?;
model.summary(); // model info
// 4. run
let ys = model.run(&xs)?;
println!("{:?}", ys);
Ok(())
}
# YOLOv8 - OpenCV
Implementation of YOLOv8 inference with OpenCV using the ONNX format.
Just clone and run:
```bash
pip install -r requirements.txt
python main.py --model yolov8n.onnx --img image.jpg
```
If you start from scratch:
```bash
pip install ultralytics
yolo export model=yolov8n.pt imgsz=640 format=onnx opset=12
```
_\*Make sure to include "opset=12"_