# YOLOv8 LibTorch Inference C++
This example demonstrates how to perform inference with YOLOv8 models in C++ using the LibTorch API.
## Dependencies
| Dependency | Version |
| ------------ | -------- |
| OpenCV | >=4.0.0 |
| C++ Standard | >=17 |
| CMake | >=3.18 |
| LibTorch | >=1.12.1 |
## Usage
```bash
git clone https://github.com/ultralytics/ultralytics
cd ultralytics
pip install .
cd examples/YOLOv8-LibTorch-CPP-Inference
mkdir build
cd build
cmake ..
make
./yolov8_libtorch_inference
```
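Note: the model and image paths are hard-coded in `main.cc` below (`/path/to/yolov8s.torchscript` and `/path/to/bus.jpg`); update them before building.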
## Exporting YOLOv8
To export YOLOv8 models:
```bash
yolo export model=yolov8s.pt imgsz=640 format=torchscript
```
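As a minimal sketch (assuming an exported `yolov8s.torchscript` in the working directory; the full `main.cc` below adds letterboxing, NMS, and box rescaling), loading and running the model looks like this:
```c++
#include <iostream>
#include <torch/script.h>

int main() {
    // Load the TorchScript module exported by `yolo export ... format=torchscript`.
    torch::jit::script::Module model = torch::jit::load("yolov8s.torchscript");
    model.eval();
    // Dummy NCHW float input in [0, 1]; a real pipeline feeds a letterboxed image.
    torch::Tensor input = torch::rand({1, 3, 640, 640});
    torch::Tensor output = model.forward({input}).toTensor();
    std::cout << output.sizes() << std::endl;  // e.g. [1, 84, 8400] for the detect head
    return 0;
}
```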
#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/imgcodecs.hpp>
#include <torch/torch.h>
#include <torch/script.h>
using torch::indexing::Slice;
using torch::indexing::None;
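// Compute the scale that fits an image inside target_size {h, w} while preserving aspect ratio.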
float generate_scale(cv::Mat& image, const std::vector<int>& target_size) {
int origin_w = image.cols;
int origin_h = image.rows;
int target_h = target_size[0];
int target_w = target_size[1];
float ratio_h = static_cast<float>(target_h) / static_cast<float>(origin_h);
float ratio_w = static_cast<float>(target_w) / static_cast<float>(origin_w);
float resize_scale = std::min(ratio_h, ratio_w);
return resize_scale;
}
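// Letterbox: resize with unchanged aspect ratio, then pad with gray (114) to target_size; returns the resize scale.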
float letterbox(cv::Mat &input_image, cv::Mat &output_image, const std::vector<int> &target_size) {
if (input_image.cols == target_size[1] && input_image.rows == target_size[0]) {
if (input_image.data == output_image.data) {
return 1.;
} else {
output_image = input_image.clone();
return 1.;
}
}
float resize_scale = generate_scale(input_image, target_size);
int new_shape_w = std::round(input_image.cols * resize_scale);
int new_shape_h = std::round(input_image.rows * resize_scale);
float padw = (target_size[1] - new_shape_w) / 2.;
float padh = (target_size[0] - new_shape_h) / 2.;
int top = std::round(padh - 0.1);
int bottom = std::round(padh + 0.1);
int left = std::round(padw - 0.1);
int right = std::round(padw + 0.1);
cv::resize(input_image, output_image,
cv::Size(new_shape_w, new_shape_h),
0, 0, cv::INTER_AREA);
cv::copyMakeBorder(output_image, output_image, top, bottom, left, right,
cv::BORDER_CONSTANT, cv::Scalar(114.));
return resize_scale;
}
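// Convert boxes from [x1, y1, x2, y2] corners to [cx, cy, w, h] center format.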
torch::Tensor xyxy2xywh(const torch::Tensor& x) {
auto y = torch::empty_like(x);
y.index_put_({"...", 0}, (x.index({"...", 0}) + x.index({"...", 2})).div(2));
y.index_put_({"...", 1}, (x.index({"...", 1}) + x.index({"...", 3})).div(2));
y.index_put_({"...", 2}, x.index({"...", 2}) - x.index({"...", 0}));
y.index_put_({"...", 3}, x.index({"...", 3}) - x.index({"...", 1}));
return y;
}
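// Convert boxes from [cx, cy, w, h] center format back to [x1, y1, x2, y2] corners.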
torch::Tensor xywh2xyxy(const torch::Tensor& x) {
auto y = torch::empty_like(x);
auto dw = x.index({"...", 2}).div(2);
auto dh = x.index({"...", 3}).div(2);
y.index_put_({"...", 0}, x.index({"...", 0}) - dw);
y.index_put_({"...", 1}, x.index({"...", 1}) - dh);
y.index_put_({"...", 2}, x.index({"...", 0}) + dw);
y.index_put_({"...", 3}, x.index({"...", 1}) + dh);
return y;
}
// Reference: https://github.com/pytorch/vision/blob/main/torchvision/csrc/ops/cpu/nms_kernel.cpp
torch::Tensor nms(const torch::Tensor& bboxes, const torch::Tensor& scores, float iou_threshold) {
if (bboxes.numel() == 0)
return torch::empty({0}, bboxes.options().dtype(torch::kLong));
auto x1_t = bboxes.select(1, 0).contiguous();
auto y1_t = bboxes.select(1, 1).contiguous();
auto x2_t = bboxes.select(1, 2).contiguous();
auto y2_t = bboxes.select(1, 3).contiguous();
torch::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);
auto order_t = std::get<1>(
scores.sort(/*stable=*/true, /*dim=*/0, /* descending=*/true));
auto ndets = bboxes.size(0);
torch::Tensor suppressed_t = torch::zeros({ndets}, bboxes.options().dtype(torch::kByte));
torch::Tensor keep_t = torch::zeros({ndets}, bboxes.options().dtype(torch::kLong));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto keep = keep_t.data_ptr<int64_t>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
int64_t num_to_keep = 0;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1)
continue;
keep[num_to_keep++] = i;
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1)
continue;
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast<float>(0), xx2 - xx1);
auto h = std::max(static_cast<float>(0), yy2 - yy1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr > iou_threshold)
suppressed[j] = 1;
}
}
return keep_t.narrow(0, 0, num_to_keep);
}
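// Confidence-filter raw YOLOv8 predictions (shape [bs, 4+nc, anchors]) and run class-offset NMS, keeping at most max_det boxes per image.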
torch::Tensor non_max_suppression(torch::Tensor& prediction, float conf_thres = 0.25, float iou_thres = 0.45, int max_det = 300) {
auto bs = prediction.size(0);
auto nc = prediction.size(1) - 4;
auto nm = prediction.size(1) - nc - 4;
auto mi = 4 + nc;
auto xc = prediction.index({Slice(), Slice(4, mi)}).amax(1) > conf_thres;
prediction = prediction.transpose(-1, -2);
prediction.index_put_({"...", Slice(None, 4)}, xywh2xyxy(prediction.index({"...", Slice(None, 4)})));
std::vector<torch::Tensor> output;
for (int i = 0; i < bs; i++) {
output.push_back(torch::zeros({0, 6 + nm}, prediction.device()));
}
for (int xi = 0; xi < prediction.size(0); xi++) {
auto x = prediction[xi];
x = x.index({xc[xi]});
auto x_split = x.split({4, nc, nm}, 1);
auto box = x_split[0], cls = x_split[1], mask = x_split[2];
auto [conf, j] = cls.max(1, true);
x = torch::cat({box, conf, j.toType(torch::kFloat), mask}, 1);
x = x.index({conf.view(-1) > conf_thres});
int n = x.size(0);
if (!n) { continue; }
// NMS
auto c = x.index({Slice(), Slice(5, 6)}) * 7680;
auto boxes = x.index({Slice(), Slice(None, 4)}) + c;
auto scores = x.index({Slice(), 4});
auto i = nms(boxes, scores, iou_thres);
i = i.index({Slice(None, max_det)});
output[xi] = x.index({i});
}
return torch::stack(output);
}
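// Clamp box coordinates to the image bounds (shape = {h, w}).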
torch::Tensor clip_boxes(torch::Tensor& boxes, const std::vector<int>& shape) {
boxes.index_put_({"...", 0}, boxes.index({"...", 0}).clamp(0, shape[1]));
boxes.index_put_({"...", 1}, boxes.index({"...", 1}).clamp(0, shape[0]));
boxes.index_put_({"...", 2}, boxes.index({"...", 2}).clamp(0, shape[1]));
boxes.index_put_({"...", 3}, boxes.index({"...", 3}).clamp(0, shape[0]));
return boxes;
}
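// Map boxes from the letterboxed image (img1_shape = {h, w}) back to the original image (img0_shape).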
torch::Tensor scale_boxes(const std::vector<int>& img1_shape, torch::Tensor& boxes, const std::vector<int>& img0_shape) {
auto gain = (std::min)((float)img1_shape[0] / img0_shape[0], (float)img1_shape[1] / img0_shape[1]);
auto pad0 = std::round((float)(img1_shape[1] - img0_shape[1] * gain) / 2. - 0.1);
auto pad1 = std::round((float)(img1_shape[0] - img0_shape[0] * gain) / 2. - 0.1);
boxes.index_put_({"...", 0}, boxes.index({"...", 0}) - pad0);
boxes.index_put_({"...", 2}, boxes.index({"...", 2}) - pad0);
boxes.index_put_({"...", 1}, boxes.index({"...", 1}) - pad1);
boxes.index_put_({"...", 3}, boxes.index({"...", 3}) - pad1);
boxes.index_put_({"...", Slice(None, 4)}, boxes.index({"...", Slice(None, 4)}).div(gain));
return boxes;
}
int main() {
// Device
torch::Device device(torch::cuda::is_available() ? torch::kCUDA : torch::kCPU);
// Note that in this example the classes are hard-coded
std::vector<std::string> classes {"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant",
"stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
"giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
"baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife",
"spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
"couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"};
try {
// Load the model (e.g. yolov8s.torchscript)
std::string model_path = "/path/to/yolov8s.torchscript";
torch::jit::script::Module yolo_model;
yolo_model = torch::jit::load(model_path);
yolo_model.eval();
yolo_model.to(device, torch::kFloat32);
// Load image and preprocess
cv::Mat image = cv::imread("/path/to/bus.jpg");
cv::Mat input_image;
letterbox(image, input_image, {640, 640});
cv::cvtColor(input_image, input_image, cv::COLOR_BGR2RGB);
torch::Tensor image_tensor = torch::from_blob(input_image.data, {input_image.rows, input_image.cols, 3}, torch::kByte).to(device);
image_tensor = image_tensor.toType(torch::kFloat32).div(255);
image_tensor = image_tensor.permute({2, 0, 1});
image_tensor = image_tensor.unsqueeze(0);
std::vector<torch::jit::IValue> inputs {image_tensor};
// Inference
torch::Tensor output = yolo_model.forward(inputs).toTensor().cpu();
// NMS
auto keep = non_max_suppression(output)[0];
auto boxes = keep.index({Slice(), Slice(None, 4)});
keep.index_put_({Slice(), Slice(None, 4)}, scale_boxes({input_image.rows, input_image.cols}, boxes, {image.rows, image.cols}));
// Show the results
for (int i = 0; i < keep.size(0); i++) {
int x1 = keep[i][0].item().toFloat();
int y1 = keep[i][1].item().toFloat();
int x2 = keep[i][2].item().toFloat();
int y2 = keep[i][3].item().toFloat();
float conf = keep[i][4].item().toFloat();
int cls = keep[i][5].item().toInt();
std::cout << "Rect: [" << x1 << "," << y1 << "," << x2 << "," << y2 << "] Conf: " << conf << " Class: " << classes[cls] << std::endl;
}
} catch (const c10::Error& e) {
std::cout << e.msg() << std::endl;
}
return 0;
}
cmake_minimum_required(VERSION 3.5)
set(PROJECT_NAME Yolov8OnnxRuntimeCPPInference)
project(${PROJECT_NAME} VERSION 0.0.1 LANGUAGES CXX)
# -------------- Support C++17 for using filesystem ------------------#
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_INCLUDE_CURRENT_DIR ON)
# -------------- OpenCV ------------------#
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})
# -------------- Compile CUDA for FP16 inference if needed ------------------#
option(USE_CUDA "Enable CUDA support" ON)
if (NOT APPLE AND USE_CUDA)
find_package(CUDA REQUIRED)
include_directories(${CUDA_INCLUDE_DIRS})
add_definitions(-DUSE_CUDA)
else ()
set(USE_CUDA OFF)
endif ()
# -------------- ONNXRUNTIME ------------------#
# Set ONNXRUNTIME_VERSION
set(ONNXRUNTIME_VERSION 1.15.1)
if (WIN32)
if (USE_CUDA)
set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-win-x64-gpu-${ONNXRUNTIME_VERSION}")
else ()
set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-win-x64-${ONNXRUNTIME_VERSION}")
endif ()
elseif (LINUX)
if (USE_CUDA)
set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-linux-x64-gpu-${ONNXRUNTIME_VERSION}")
else ()
set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}")
endif ()
elseif (APPLE)
set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-osx-arm64-${ONNXRUNTIME_VERSION}")
# Apple X64 binary
# set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-osx-x64-${ONNXRUNTIME_VERSION}")
# Apple Universal binary
# set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-osx-universal2-${ONNXRUNTIME_VERSION}")
else ()
message(SEND_ERROR "Variable ONNXRUNTIME_ROOT is not set properly. Please make sure your CMake project \
is configured with `-D WIN32=TRUE`, `-D LINUX=TRUE`, or `-D APPLE=TRUE`!")
endif ()
include_directories(${ONNXRUNTIME_ROOT}/include)
set(PROJECT_SOURCES
main.cpp
inference.h
inference.cpp
)
add_executable(${PROJECT_NAME} ${PROJECT_SOURCES})
if (WIN32)
target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS} ${ONNXRUNTIME_ROOT}/lib/onnxruntime.lib)
if (USE_CUDA)
target_link_libraries(${PROJECT_NAME} ${CUDA_LIBRARIES})
endif ()
elseif (LINUX)
target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS} ${ONNXRUNTIME_ROOT}/lib/libonnxruntime.so)
if (USE_CUDA)
target_link_libraries(${PROJECT_NAME} ${CUDA_LIBRARIES})
endif ()
elseif (APPLE)
target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS} ${ONNXRUNTIME_ROOT}/lib/libonnxruntime.dylib)
endif ()
# On Windows, copy onnxruntime.dll to the same folder as the executable
if (WIN32)
add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${ONNXRUNTIME_ROOT}/lib/onnxruntime.dll"
$<TARGET_FILE_DIR:${PROJECT_NAME}>)
endif ()
# Download https://raw.githubusercontent.com/ultralytics/ultralytics/main/ultralytics/cfg/datasets/coco.yaml
# and place it in the same folder as the executable
configure_file(coco.yaml ${CMAKE_CURRENT_BINARY_DIR}/coco.yaml COPYONLY)
# Copy the yolov8n.onnx file to the same folder as the executable
configure_file(yolov8n.onnx ${CMAKE_CURRENT_BINARY_DIR}/yolov8n.onnx COPYONLY)
# Create a folder named images in the same folder as the executable
add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/images
)
# YOLOv8 OnnxRuntime C++
<img alt="C++" src="https://img.shields.io/badge/C++-17-blue.svg?style=flat&logo=c%2B%2B"> <img alt="Onnx-runtime" src="https://img.shields.io/badge/OnnxRuntime-717272.svg?logo=Onnx&logoColor=white">
This example demonstrates how to perform inference using YOLOv8 in C++ with ONNX Runtime and OpenCV's API.
## Benefits ✨
- Well suited to deployment in industrial settings.
- Faster than OpenCV's DNN inference on both CPU and GPU.
- Supports FP32 and FP16 CUDA acceleration.
## Note ☕
1. Thanks to a recent Ultralytics release, a `Transpose` op is appended to the YOLOv8 model, giving YOLOv8 the same output shape as YOLOv5. As a result, you can run inference on YOLOv5/v7/v8 models with this project.
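For a rough idea of the post-processing this enables, here is a minimal sketch that mirrors `inference.cpp` in this example. It assumes a `[1, 84, 8400]` detect-head layout for an 80-class model, and uses a dummy buffer in place of the raw `float*` returned by ONNX Runtime:
```c++
#include <opencv2/core.hpp>
#include <vector>

int main() {
    // Dummy stand-in for the raw float* output returned by ONNX Runtime.
    std::vector<float> output(84 * 8400, 0.f);
    cv::Mat rawData(84, 8400, CV_32F, output.data()); // 84 attributes x 8400 candidates
    rawData = rawData.t();                            // 8400 x 84: [cx, cy, w, h, 80 class scores]
    for (int i = 0; i < rawData.rows; ++i) {
        const float* row = rawData.ptr<float>(i);
        // row[0..3] = box center/size; row[4..83] = per-class confidences.
        // A real pipeline would confidence-filter here, then run NMS.
        (void)row;
    }
    return 0;
}
```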
## Exporting YOLOv8 Models 📦
To export YOLOv8 models, use the following Python script:
```python
from ultralytics import YOLO
# Load a YOLOv8 model
model = YOLO("yolov8n.pt")
# Export the model
model.export(format="onnx", opset=12, simplify=True, dynamic=False, imgsz=640)
```
Alternatively, you can export the model from the terminal:
```bash
yolo export model=yolov8n.pt opset=12 simplify=True dynamic=False format=onnx imgsz=640,640
```
## Exporting YOLOv8 FP16 Models 📦
```python
import onnx
from onnxconverter_common import float16
model = onnx.load(R"YOUR_ONNX_PATH")
model_fp16 = float16.convert_float_to_float16(model)
onnx.save(model_fp16, R"YOUR_FP16_ONNX_PATH")
```
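An FP16 model exported this way corresponds to the `*_HALF` entries of `MODEL_TYPE` in `inference.h` (e.g. `YOLO_DETECT_V8_HALF`) and requires a CUDA-enabled build.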
## Download COCO.yaml file 📂
To run the example, you also need to download `coco.yaml`. You can download the file manually from [here](https://raw.githubusercontent.com/ultralytics/ultralytics/main/ultralytics/cfg/datasets/coco.yaml) and place it next to the executable; the `ReadCocoYaml()` helper in `main.cpp` parses its `names:` section at startup.
## Dependencies ⚙️
| Dependency | Version |
| ------------------------------------ | ------------- |
| ONNX Runtime (Linux, Windows, macOS) | >=1.14.1 |
| OpenCV | >=4.0.0 |
| C++ Standard | >=17 |
| CMake | >=3.5 |
| CUDA (optional) | >=11.4 \<12.0 |
| cuDNN (CUDA required) | =8 |
Note: The C++17 requirement comes from the use of the C++17 filesystem feature.
Note (2): ONNX Runtime currently requires CUDA 11 and cuDNN 8. Keep in mind that this requirement might change in the future.
## Build 🛠️
1. Clone the repository to your local machine.
2. Navigate to the root directory of the repository.
3. Create a build directory and navigate to it:
```console
mkdir build && cd build
```
4. Run CMake to generate the build files:
```console
cmake ..
```
**Notice**:
If you encounter an error indicating that the `ONNXRUNTIME_ROOT` variable is not set correctly, you can resolve this by building the project using the appropriate command tailored to your system.
```console
# configure on Windows
cmake -D WIN32=TRUE ..
# configure on Linux
cmake -D LINUX=TRUE ..
# configure on macOS
cmake -D APPLE=TRUE ..
```
5. Build the project:
```console
make
```
6. The built executable should now be located in the `build` directory.
## Usage 🚀
```c++
// Change the parameters as needed
// Make sure the model type (FP32 or FP16) matches your device and ONNX file
DL_INIT_PARAM params;
params.rectConfidenceThreshold = 0.1;
params.iouThreshold = 0.5;
params.modelPath = "yolov8n.onnx";
params.imgSize = { 640, 640 };
params.cudaEnable = true;
params.modelType = YOLO_DETECT_V8;
yoloDetector->CreateSession(params);
Detector(yoloDetector);
```
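Here `yoloDetector` is a `YOLO_V8*`, and `Detector()` is the helper in `main.cpp` that runs the session on every image in the `images` folder next to the executable; see `DetectTest()` for the complete setup.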
#include "inference.h"
#include <regex>
#define benchmark
#define min(a,b) (((a) < (b)) ? (a) : (b))
YOLO_V8::YOLO_V8() {
}
YOLO_V8::~YOLO_V8() {
delete session;
}
#ifdef USE_CUDA
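// Map the CUDA `half` type to ONNX Runtime's FP16 tensor element type so CreateTensor<half> works.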
namespace Ort
{
template<>
struct TypeToTensorType<half> { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; };
}
#endif
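// Convert an 8-bit HWC image into a planar CHW float/half blob normalized to [0, 1].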
template<typename T>
char* BlobFromImage(cv::Mat& iImg, T& iBlob) {
int channels = iImg.channels();
int imgHeight = iImg.rows;
int imgWidth = iImg.cols;
for (int c = 0; c < channels; c++)
{
for (int h = 0; h < imgHeight; h++)
{
for (int w = 0; w < imgWidth; w++)
{
iBlob[c * imgWidth * imgHeight + h * imgWidth + w] = typename std::remove_pointer<T>::type(
(iImg.at<cv::Vec3b>(h, w)[c]) / 255.0f);
}
}
}
return RET_OK;
}
char* YOLO_V8::PreProcess(cv::Mat& iImg, std::vector<int> iImgSize, cv::Mat& oImg)
{
if (iImg.channels() == 3)
{
oImg = iImg.clone();
cv::cvtColor(oImg, oImg, cv::COLOR_BGR2RGB);
}
else
{
cv::cvtColor(iImg, oImg, cv::COLOR_GRAY2RGB);
}
switch (modelType)
{
case YOLO_DETECT_V8:
case YOLO_POSE:
case YOLO_DETECT_V8_HALF:
case YOLO_POSE_V8_HALF://LetterBox
{
if (iImg.cols >= iImg.rows)
{
resizeScales = iImg.cols / (float)iImgSize.at(0);
cv::resize(oImg, oImg, cv::Size(iImgSize.at(0), int(iImg.rows / resizeScales)));
}
else
{
resizeScales = iImg.rows / (float)iImgSize.at(0);
cv::resize(oImg, oImg, cv::Size(int(iImg.cols / resizeScales), iImgSize.at(1)));
}
cv::Mat tempImg = cv::Mat::zeros(iImgSize.at(0), iImgSize.at(1), CV_8UC3);
oImg.copyTo(tempImg(cv::Rect(0, 0, oImg.cols, oImg.rows)));
oImg = tempImg;
break;
}
case YOLO_CLS://CenterCrop
{
int h = iImg.rows;
int w = iImg.cols;
int m = min(h, w);
int top = (h - m) / 2;
int left = (w - m) / 2;
cv::resize(oImg(cv::Rect(left, top, m, m)), oImg, cv::Size(iImgSize.at(0), iImgSize.at(1)));
break;
}
}
return RET_OK;
}
char* YOLO_V8::CreateSession(DL_INIT_PARAM& iParams) {
char* Ret = RET_OK;
std::regex pattern("[\u4e00-\u9fa5]");
bool result = std::regex_search(iParams.modelPath, pattern);
if (result)
{
Ret = "[YOLO_V8]:Your model path is error.Change your model path without chinese characters.";
std::cout << Ret << std::endl;
return Ret;
}
try
{
rectConfidenceThreshold = iParams.rectConfidenceThreshold;
iouThreshold = iParams.iouThreshold;
imgSize = iParams.imgSize;
modelType = iParams.modelType;
env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "Yolo");
Ort::SessionOptions sessionOption;
if (iParams.cudaEnable)
{
cudaEnable = iParams.cudaEnable;
OrtCUDAProviderOptions cudaOption;
cudaOption.device_id = 0;
sessionOption.AppendExecutionProvider_CUDA(cudaOption);
}
sessionOption.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
sessionOption.SetIntraOpNumThreads(iParams.intraOpNumThreads);
sessionOption.SetLogSeverityLevel(iParams.logSeverityLevel);
#ifdef _WIN32
int ModelPathSize = MultiByteToWideChar(CP_UTF8, 0, iParams.modelPath.c_str(), static_cast<int>(iParams.modelPath.length()), nullptr, 0);
wchar_t* wide_cstr = new wchar_t[ModelPathSize + 1];
MultiByteToWideChar(CP_UTF8, 0, iParams.modelPath.c_str(), static_cast<int>(iParams.modelPath.length()), wide_cstr, ModelPathSize);
wide_cstr[ModelPathSize] = L'\0';
const wchar_t* modelPath = wide_cstr;
#else
const char* modelPath = iParams.modelPath.c_str();
#endif // _WIN32
session = new Ort::Session(env, modelPath, sessionOption);
Ort::AllocatorWithDefaultOptions allocator;
size_t inputNodesNum = session->GetInputCount();
for (size_t i = 0; i < inputNodesNum; i++)
{
Ort::AllocatedStringPtr input_node_name = session->GetInputNameAllocated(i, allocator);
char* temp_buf = new char[strlen(input_node_name.get()) + 1];
strcpy(temp_buf, input_node_name.get());
inputNodeNames.push_back(temp_buf);
}
size_t OutputNodesNum = session->GetOutputCount();
for (size_t i = 0; i < OutputNodesNum; i++)
{
Ort::AllocatedStringPtr output_node_name = session->GetOutputNameAllocated(i, allocator);
char* temp_buf = new char[strlen(output_node_name.get()) + 1];
strcpy(temp_buf, output_node_name.get());
outputNodeNames.push_back(temp_buf);
}
options = Ort::RunOptions{ nullptr };
WarmUpSession();
return RET_OK;
}
catch (const std::exception& e)
{
const char* str1 = "[YOLO_V8]:";
const char* str2 = e.what();
std::string result = std::string(str1) + std::string(str2);
char* merged = new char[result.length() + 1];
std::strcpy(merged, result.c_str());
std::cout << merged << std::endl;
delete[] merged;
return "[YOLO_V8]:Create session failed.";
}
}
char* YOLO_V8::RunSession(cv::Mat& iImg, std::vector<DL_RESULT>& oResult) {
#ifdef benchmark
clock_t starttime_1 = clock();
#endif // benchmark
char* Ret = RET_OK;
cv::Mat processedImg;
PreProcess(iImg, imgSize, processedImg);
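// modelType < 4: FP32 models (see MODEL_TYPE in inference.h); the FP16 path below requires CUDA.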
if (modelType < 4)
{
float* blob = new float[processedImg.total() * 3];
BlobFromImage(processedImg, blob);
std::vector<int64_t> inputNodeDims = { 1, 3, imgSize.at(0), imgSize.at(1) };
TensorProcess(starttime_1, iImg, blob, inputNodeDims, oResult);
}
else
{
#ifdef USE_CUDA
half* blob = new half[processedImg.total() * 3];
BlobFromImage(processedImg, blob);
std::vector<int64_t> inputNodeDims = { 1,3,imgSize.at(0),imgSize.at(1) };
TensorProcess(starttime_1, iImg, blob, inputNodeDims, oResult);
#endif
}
return Ret;
}
template<typename N>
char* YOLO_V8::TensorProcess(clock_t& starttime_1, cv::Mat& iImg, N& blob, std::vector<int64_t>& inputNodeDims,
std::vector<DL_RESULT>& oResult) {
Ort::Value inputTensor = Ort::Value::CreateTensor<typename std::remove_pointer<N>::type>(
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), blob, 3 * imgSize.at(0) * imgSize.at(1),
inputNodeDims.data(), inputNodeDims.size());
#ifdef benchmark
clock_t starttime_2 = clock();
#endif // benchmark
auto outputTensor = session->Run(options, inputNodeNames.data(), &inputTensor, 1, outputNodeNames.data(),
outputNodeNames.size());
#ifdef benchmark
clock_t starttime_3 = clock();
#endif // benchmark
Ort::TypeInfo typeInfo = outputTensor.front().GetTypeInfo();
auto tensor_info = typeInfo.GetTensorTypeAndShapeInfo();
std::vector<int64_t> outputNodeDims = tensor_info.GetShape();
auto output = outputTensor.front().GetTensorMutableData<typename std::remove_pointer<N>::type>();
delete[] blob;
switch (modelType)
{
case YOLO_DETECT_V8:
case YOLO_DETECT_V8_HALF:
{
int signalResultNum = outputNodeDims[1];//84
int strideNum = outputNodeDims[2];//8400
std::vector<int> class_ids;
std::vector<float> confidences;
std::vector<cv::Rect> boxes;
cv::Mat rawData;
if (modelType == YOLO_DETECT_V8)
{
// FP32
rawData = cv::Mat(signalResultNum, strideNum, CV_32F, output);
}
else
{
// FP16
rawData = cv::Mat(signalResultNum, strideNum, CV_16F, output);
rawData.convertTo(rawData, CV_32F);
}
// Note:
// Ultralytics adds a Transpose op to the YOLOv8 output, which gives YOLOv8/v5/v7 the same output shape.
// https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt
rawData = rawData.t();
float* data = (float*)rawData.data;
for (int i = 0; i < strideNum; ++i)
{
float* classesScores = data + 4;
cv::Mat scores(1, this->classes.size(), CV_32FC1, classesScores);
cv::Point class_id;
double maxClassScore;
cv::minMaxLoc(scores, 0, &maxClassScore, 0, &class_id);
if (maxClassScore > rectConfidenceThreshold)
{
confidences.push_back(maxClassScore);
class_ids.push_back(class_id.x);
float x = data[0];
float y = data[1];
float w = data[2];
float h = data[3];
int left = int((x - 0.5 * w) * resizeScales);
int top = int((y - 0.5 * h) * resizeScales);
int width = int(w * resizeScales);
int height = int(h * resizeScales);
boxes.push_back(cv::Rect(left, top, width, height));
}
data += signalResultNum;
}
std::vector<int> nmsResult;
cv::dnn::NMSBoxes(boxes, confidences, rectConfidenceThreshold, iouThreshold, nmsResult);
for (int i = 0; i < nmsResult.size(); ++i)
{
int idx = nmsResult[i];
DL_RESULT result;
result.classId = class_ids[idx];
result.confidence = confidences[idx];
result.box = boxes[idx];
oResult.push_back(result);
}
#ifdef benchmark
clock_t starttime_4 = clock();
double pre_process_time = (double)(starttime_2 - starttime_1) / CLOCKS_PER_SEC * 1000;
double process_time = (double)(starttime_3 - starttime_2) / CLOCKS_PER_SEC * 1000;
double post_process_time = (double)(starttime_4 - starttime_3) / CLOCKS_PER_SEC * 1000;
if (cudaEnable)
{
std::cout << "[YOLO_V8(CUDA)]: " << pre_process_time << "ms pre-process, " << process_time << "ms inference, " << post_process_time << "ms post-process." << std::endl;
}
else
{
std::cout << "[YOLO_V8(CPU)]: " << pre_process_time << "ms pre-process, " << process_time << "ms inference, " << post_process_time << "ms post-process." << std::endl;
}
#endif // benchmark
break;
}
case YOLO_CLS:
case YOLO_CLS_HALF:
{
cv::Mat rawData;
if (modelType == YOLO_CLS) {
// FP32
rawData = cv::Mat(1, this->classes.size(), CV_32F, output);
} else {
// FP16
rawData = cv::Mat(1, this->classes.size(), CV_16F, output);
rawData.convertTo(rawData, CV_32F);
}
float *data = (float *) rawData.data;
DL_RESULT result;
for (int i = 0; i < this->classes.size(); i++)
{
result.classId = i;
result.confidence = data[i];
oResult.push_back(result);
}
break;
}
default:
std::cout << "[YOLO_V8]: " << "Not support model type." << std::endl;
}
return RET_OK;
}
char* YOLO_V8::WarmUpSession() {
clock_t starttime_1 = clock();
cv::Mat iImg = cv::Mat(cv::Size(imgSize.at(0), imgSize.at(1)), CV_8UC3);
cv::Mat processedImg;
PreProcess(iImg, imgSize, processedImg);
if (modelType < 4)
{
float* blob = new float[iImg.total() * 3];
BlobFromImage(processedImg, blob);
std::vector<int64_t> YOLO_input_node_dims = { 1, 3, imgSize.at(0), imgSize.at(1) };
Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), blob, 3 * imgSize.at(0) * imgSize.at(1),
YOLO_input_node_dims.data(), YOLO_input_node_dims.size());
auto output_tensors = session->Run(options, inputNodeNames.data(), &input_tensor, 1, outputNodeNames.data(),
outputNodeNames.size());
delete[] blob;
clock_t starttime_4 = clock();
double post_process_time = (double)(starttime_4 - starttime_1) / CLOCKS_PER_SEC * 1000;
if (cudaEnable)
{
std::cout << "[YOLO_V8(CUDA)]: " << "Cuda warm-up cost " << post_process_time << " ms. " << std::endl;
}
}
else
{
#ifdef USE_CUDA
half* blob = new half[iImg.total() * 3];
BlobFromImage(processedImg, blob);
std::vector<int64_t> YOLO_input_node_dims = { 1,3,imgSize.at(0),imgSize.at(1) };
Ort::Value input_tensor = Ort::Value::CreateTensor<half>(Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), blob, 3 * imgSize.at(0) * imgSize.at(1), YOLO_input_node_dims.data(), YOLO_input_node_dims.size());
auto output_tensors = session->Run(options, inputNodeNames.data(), &input_tensor, 1, outputNodeNames.data(), outputNodeNames.size());
delete[] blob;
clock_t starttime_4 = clock();
double post_process_time = (double)(starttime_4 - starttime_1) / CLOCKS_PER_SEC * 1000;
if (cudaEnable)
{
std::cout << "[YOLO_V8(CUDA)]: " << "Cuda warm-up cost " << post_process_time << " ms. " << std::endl;
}
#endif
}
return RET_OK;
}
#pragma once
#define RET_OK nullptr
#ifdef _WIN32
#include <Windows.h>
#include <direct.h>
#include <io.h>
#endif
#include <string>
#include <vector>
#include <cstdio>
#include <opencv2/opencv.hpp>
#include "onnxruntime_cxx_api.h"
#ifdef USE_CUDA
#include <cuda_fp16.h>
#endif
enum MODEL_TYPE
{
//FLOAT32 MODEL
YOLO_DETECT_V8 = 1,
YOLO_POSE = 2,
YOLO_CLS = 3,
//FLOAT16 MODEL
YOLO_DETECT_V8_HALF = 4,
YOLO_POSE_V8_HALF = 5,
YOLO_CLS_HALF = 6
};
typedef struct _DL_INIT_PARAM
{
std::string modelPath;
MODEL_TYPE modelType = YOLO_DETECT_V8;
std::vector<int> imgSize = { 640, 640 };
float rectConfidenceThreshold = 0.6;
float iouThreshold = 0.5;
int keyPointsNum = 2; // Note: keypoint count for pose models
bool cudaEnable = false;
int logSeverityLevel = 3;
int intraOpNumThreads = 1;
} DL_INIT_PARAM;
typedef struct _DL_RESULT
{
int classId;
float confidence;
cv::Rect box;
std::vector<cv::Point2f> keyPoints;
} DL_RESULT;
class YOLO_V8
{
public:
YOLO_V8();
~YOLO_V8();
public:
char* CreateSession(DL_INIT_PARAM& iParams);
char* RunSession(cv::Mat& iImg, std::vector<DL_RESULT>& oResult);
char* WarmUpSession();
template<typename N>
char* TensorProcess(clock_t& starttime_1, cv::Mat& iImg, N& blob, std::vector<int64_t>& inputNodeDims,
std::vector<DL_RESULT>& oResult);
char* PreProcess(cv::Mat& iImg, std::vector<int> iImgSize, cv::Mat& oImg);
std::vector<std::string> classes{};
private:
Ort::Env env;
Ort::Session* session;
bool cudaEnable;
Ort::RunOptions options;
std::vector<const char*> inputNodeNames;
std::vector<const char*> outputNodeNames;
MODEL_TYPE modelType;
std::vector<int> imgSize;
float rectConfidenceThreshold;
float iouThreshold;
float resizeScales;//letterbox scale
};
#include <iostream>
#include <iomanip>
#include "inference.h"
#include <filesystem>
#include <fstream>
#include <random>
void Detector(YOLO_V8*& p) {
std::filesystem::path current_path = std::filesystem::current_path();
std::filesystem::path imgs_path = current_path / "images";
for (auto& i : std::filesystem::directory_iterator(imgs_path))
{
if (i.path().extension() == ".jpg" || i.path().extension() == ".png" || i.path().extension() == ".jpeg")
{
std::string img_path = i.path().string();
cv::Mat img = cv::imread(img_path);
std::vector<DL_RESULT> res;
p->RunSession(img, res);
for (auto& re : res)
{
cv::RNG rng(cv::getTickCount());
cv::Scalar color(rng.uniform(0, 256), rng.uniform(0, 256), rng.uniform(0, 256));
cv::rectangle(img, re.box, color, 3);
float confidence = floor(100 * re.confidence) / 100;
std::cout << std::fixed << std::setprecision(2);
std::string label = p->classes[re.classId] + " " +
std::to_string(confidence).substr(0, std::to_string(confidence).size() - 4);
cv::rectangle(
img,
cv::Point(re.box.x, re.box.y - 25),
cv::Point(re.box.x + label.length() * 15, re.box.y),
color,
cv::FILLED
);
cv::putText(
img,
label,
cv::Point(re.box.x, re.box.y - 5),
cv::FONT_HERSHEY_SIMPLEX,
0.75,
cv::Scalar(0, 0, 0),
2
);
}
std::cout << "Press any key to exit" << std::endl;
cv::imshow("Result of Detection", img);
cv::waitKey(0);
cv::destroyAllWindows();
}
}
}
void Classifier(YOLO_V8*& p)
{
std::filesystem::path current_path = std::filesystem::current_path();
std::filesystem::path imgs_path = current_path;// / "images"
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<int> dis(0, 255);
for (auto& i : std::filesystem::directory_iterator(imgs_path))
{
if (i.path().extension() == ".jpg" || i.path().extension() == ".png")
{
std::string img_path = i.path().string();
//std::cout << img_path << std::endl;
cv::Mat img = cv::imread(img_path);
std::vector<DL_RESULT> res;
char* ret = p->RunSession(img, res);
float positionY = 50;
for (int i = 0; i < res.size(); i++)
{
int r = dis(gen);
int g = dis(gen);
int b = dis(gen);
cv::putText(img, std::to_string(i) + ":", cv::Point(10, positionY), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(b, g, r), 2);
cv::putText(img, std::to_string(res.at(i).confidence), cv::Point(70, positionY), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(b, g, r), 2);
positionY += 50;
}
cv::imshow("TEST_CLS", img);
cv::waitKey(0);
cv::destroyAllWindows();
//cv::imwrite("E:\\output\\" + std::to_string(k) + ".png", img);
}
}
}
int ReadCocoYaml(YOLO_V8*& p) {
// Open the YAML file
std::ifstream file("coco.yaml");
if (!file.is_open())
{
std::cerr << "Failed to open file" << std::endl;
return 1;
}
// Read the file line by line
std::string line;
std::vector<std::string> lines;
while (std::getline(file, line))
{
lines.push_back(line);
}
// Find the start and end of the names section
std::size_t start = 0;
std::size_t end = 0;
for (std::size_t i = 0; i < lines.size(); i++)
{
if (lines[i].find("names:") != std::string::npos)
{
start = i + 1;
}
else if (start > 0 && lines[i].find(':') == std::string::npos)
{
end = i;
break;
}
}
// Extract the names
std::vector<std::string> names;
for (std::size_t i = start; i < end; i++)
{
std::stringstream ss(lines[i]);
std::string name;
std::getline(ss, name, ':'); // Extract the number before the delimiter
std::getline(ss, name); // Extract the string after the delimiter
names.push_back(name);
}
p->classes = names;
return 0;
}
void DetectTest()
{
YOLO_V8* yoloDetector = new YOLO_V8;
ReadCocoYaml(yoloDetector);
DL_INIT_PARAM params;
params.rectConfidenceThreshold = 0.1;
params.iouThreshold = 0.5;
params.modelPath = "yolov8n.onnx";
params.imgSize = { 640, 640 };
#ifdef USE_CUDA
params.cudaEnable = true;
// GPU FP32 inference
params.modelType = YOLO_DETECT_V8;
// GPU FP16 inference
//Note: change fp16 onnx model
//params.modelType = YOLO_DETECT_V8_HALF;
#else
// CPU inference
params.modelType = YOLO_DETECT_V8;
params.cudaEnable = false;
#endif
yoloDetector->CreateSession(params);
Detector(yoloDetector);
}
void ClsTest()
{
YOLO_V8* yoloDetector = new YOLO_V8;
std::string model_path = "cls.onnx";
ReadCocoYaml(yoloDetector);
DL_INIT_PARAM params{ model_path, YOLO_CLS, {224, 224} };
yoloDetector->CreateSession(params);
Classifier(yoloDetector);
}
int main()
{
//DetectTest();
ClsTest();
}
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
[package]
name = "yolov8-rs"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
clap = { version = "4.2.4", features = ["derive"] }
image = { version = "0.25.2"}
imageproc = { version = "0.25.0"}
ndarray = { version = "0.16" }
ort = { version = "2.0.0-rc.5", features = ["cuda", "tensorrt", "load-dynamic", "copy-dylibs", "half"]}
rusttype = { version = "0.9.3" }
anyhow = { version = "1.0.75" }
regex = { version = "1.5.4" }
rand = { version = "0.8.5" }
chrono = { version = "0.4.30" }
half = { version = "2.3.1" }
dirs = { version = "5.0.1" }
ureq = { version = "2.9.1" }
ab_glyph = "0.2.29"
use clap::Parser;
use yolov8_rs::{Args, YOLOv8};
fn main() -> Result<(), Box<dyn std::error::Error>> {
let args = Args::parse();
// 1. load image
let x = image::ImageReader::open(&args.source)?
.with_guessed_format()?
.decode()?;
// 2. The model supports dynamic batch inference, so the input should be a Vec
let xs = vec![x];
// You can test `--batch 2` with this
// let xs = vec![x.clone(), x];
// 3. build yolov8 model
let mut model = YOLOv8::new(args)?;
model.summary(); // model info
// 4. run
let ys = model.run(&xs)?;
println!("{:?}", ys);
Ok(())
}
# YOLOv8 - OpenCV
Implementation of YOLOv8 inference with OpenCV using the ONNX format.
Just clone and run:
```bash
pip install -r requirements.txt
python main.py --model yolov8n.onnx --img image.jpg
```
If you start from scratch:
```bash
pip install ultralytics
yolo export model=yolov8n.pt imgsz=640 format=onnx opset=12
```
_\*Make sure to include "opset=12"_