#include "yolov8Predictor.h" YOLOPredictor::YOLOPredictor(const std::string &modelPath, const bool &isGPU, float confThreshold, float iouThreshold, float maskThreshold) { this->confThreshold = confThreshold; this->iouThreshold = iouThreshold; this->maskThreshold = maskThreshold; env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, "YOLOV8"); sessionOptions = Ort::SessionOptions(); // std::vector availableProviders = Ort::GetAvailableProviders(); // auto cudaAvailable = std::find(availableProviders.begin(), availableProviders.end(), "CUDAExecutionProvider"); // OrtCUDAProviderOptions cudaOption; OrtROCMProviderOptions rocmOption; rocmOption.device_id = 0; std::cout << "Inference device: DCU" << std::endl; // sessionOptions.AppendExecutionProvider_CUDA(cudaOption); sessionOptions.AppendExecutionProvider_ROCM(rocmOption); session = Ort::Session(env, modelPath.c_str(), sessionOptions); const size_t num_input_nodes = session.GetInputCount(); //==1 const size_t num_output_nodes = session.GetOutputCount(); //==1,2 if (num_output_nodes > 1) { this->hasMask = true; std::cout << "Instance Segmentation" << std::endl; } else std::cout << "Object Detection" << std::endl; Ort::AllocatorWithDefaultOptions allocator; for (int i = 0; i < num_input_nodes; i++) { auto input_name = session.GetInputNameAllocated(i, allocator); this->inputNames.push_back(input_name.get()); input_names_ptr.push_back(std::move(input_name)); Ort::TypeInfo inputTypeInfo = session.GetInputTypeInfo(i); std::vector inputTensorShape = inputTypeInfo.GetTensorTypeAndShapeInfo().GetShape(); this->inputShapes.push_back(inputTensorShape); this->isDynamicInputShape = false; // checking if width and height are dynamic if (inputTensorShape[2] == -1 && inputTensorShape[3] == -1) { std::cout << "Dynamic input shape" << std::endl; this->isDynamicInputShape = true; } } for (int i = 0; i < num_output_nodes; i++) { auto output_name = session.GetOutputNameAllocated(i, allocator); this->outputNames.push_back(output_name.get()); output_names_ptr.push_back(std::move(output_name)); Ort::TypeInfo outputTypeInfo = session.GetOutputTypeInfo(i); std::vector outputTensorShape = outputTypeInfo.GetTensorTypeAndShapeInfo().GetShape(); this->outputShapes.push_back(outputTensorShape); if (i == 0) { if (!this->hasMask) classNums = outputTensorShape[1] - 4; else classNums = outputTensorShape[1] - 4 - 32; } } } void YOLOPredictor::getBestClassInfo(std::vector::iterator it, float &bestConf, int &bestClassId, const int _classNums) { // first 4 element are box bestClassId = 4; bestConf = 0; for (int i = 4; i < _classNums + 4; i++) { if (it[i] > bestConf) { bestConf = it[i]; bestClassId = i - 4; } } } cv::Mat YOLOPredictor::getMask(const cv::Mat &maskProposals, const cv::Mat &maskProtos) { cv::Mat protos = maskProtos.reshape(0, {(int)this->outputShapes[1][1], (int)this->outputShapes[1][2] * (int)this->outputShapes[1][3]}); cv::Mat matmul_res = (maskProposals * protos).t(); cv::Mat masks = matmul_res.reshape(1, {(int)this->outputShapes[1][2], (int)this->outputShapes[1][3]}); cv::Mat dest; // sigmoid cv::exp(-masks, dest); dest = 1.0 / (1.0 + dest); cv::resize(dest, dest, cv::Size((int)this->inputShapes[0][2], (int)this->inputShapes[0][3]), cv::INTER_LINEAR); return dest; } void YOLOPredictor::preprocessing(std::vector &images, float *&blob, std::vector &inputTensorShape) { int batchSize = images.size(); if(batchSize == 0) return; int targetWidth = (int)this->inputShapes[0][2]; int targetHeight = (int)this->inputShapes[0][3]; size_t totalElements = batchSize * 3 * targetWidth * targetHeight; blob = new float[totalElements]; inputTensorShape = {batchSize, 3, targetHeight, targetWidth}; for (int b = 0; b < batchSize; ++b) { cv::Mat resizedImage, floatImage; cv::cvtColor(images[b], resizedImage, cv::COLOR_BGR2RGB); utils::letterbox(resizedImage, resizedImage, cv::Size((int)this->inputShapes[0][2], (int)this->inputShapes[0][3]), cv::Scalar(114, 114, 114), this->isDynamicInputShape, false, true, 32); resizedImage.convertTo(floatImage, CV_32FC3, 1 / 255.0); cv::Size floatImageSize{floatImage.cols, floatImage.rows}; // hwc -> chw std::vector chw(floatImage.channels()); for (int i = 0; i < floatImage.channels(); ++i) { chw[i] = cv::Mat(floatImageSize, CV_32FC1, blob + b * 3 * floatImageSize.width * floatImageSize.height + i * floatImageSize.width * floatImageSize.height); } cv::split(floatImage, chw); } } std::vector> YOLOPredictor::postprocessing(const cv::Size &resizedImageShape, const std::vector &originalImageShapes, std::vector &outputTensors) { int batchsize = originalImageShapes.size(); if(batchsize == 0) return {}; // for box std::vector> boxes(batchsize); std::vector> confs(batchsize); std::vector> classIds(batchsize); float *boxOutput = outputTensors[0].GetTensorMutableData(); int rows = (int)this->outputShapes[0][2]; int cols = (int)this->outputShapes[0][1]; // if hasMask std::vector>> picked_proposals(batchsize); std::vector mask_protos(batchsize); //[1,4+n,8400]=>[1,8400,4+n] or [1,4+n+32,8400]=>[1,8400,4+n+32] cv::Mat output0 = cv::Mat(cv::Size((int)this->outputShapes[0][2], (int)this->outputShapes[0][1]), CV_32F, boxOutput).t(); float *output0ptr = (float *)output0.data; // std::cout << rows << cols << std::endl; for (int b = 0; b < batchsize; ++b) { //[1,4+n,8400]=>[1,8400,4+n] or [1,4+n+32,8400]=>[1,8400,4+n+32] cv::Mat output0 = cv::Mat(cv::Size((int)this->outputShapes[0][2], (int)this->outputShapes[0][1]), CV_32F, boxOutput + b * rows * cols).t(); float *output0ptr = (float *)output0.data; for (int i = 0; i < rows; i++) { std::vector it(output0ptr + i * cols, output0ptr + (i + 1) * cols); float confidence; int classId; this->getBestClassInfo(it.begin(), confidence, classId, classNums); if (confidence > this->confThreshold) { if (this->hasMask) { std::vector temp(it.begin() + 4 + classNums, it.end()); picked_proposals[b].push_back(temp); } int centerX = (int)(it[0]); int centerY = (int)(it[1]); int width = (int)(it[2]); int height = (int)(it[3]); int left = centerX - width / 2; int top = centerY - height / 2; boxes[b].emplace_back(left, top, width, height); confs[b].emplace_back(confidence); classIds[b].emplace_back(classId); } } if (this->hasMask) { float* maskOutput = outputTensors[1].GetTensorMutableData() + b * this->outputShapes[1][1] * this->outputShapes[1][2] * this->outputShapes[1][3]; std::vector mask_protos_shape = {1, (int)this->outputShapes[1][1], (int)this->outputShapes[1][2], (int)this->outputShapes[1][3]}; mask_protos[b] = cv::Mat(mask_protos_shape, CV_32F, maskOutput); } } std::vector> results(batchsize); for (int b = 0; b < batchsize; ++b) { std::vector indices; cv::dnn::NMSBoxes(boxes[b], confs[b], this->confThreshold, this->iouThreshold, indices); for (int idx : indices) { Yolov8Result res; res.box = cv::Rect(boxes[b][idx]); if (this->hasMask) res.boxMask = this->getMask(cv::Mat(picked_proposals[b][idx]).t(), mask_protos[b]); else res.boxMask = cv::Mat::zeros((int)this->inputShapes[0][2], (int)this->inputShapes[0][3], CV_8U); utils::scaleCoords(res.box, res.boxMask, this->maskThreshold, resizedImageShape, originalImageShapes[b]); res.conf = confs[b][idx]; res.classId = classIds[b][idx]; results[b].emplace_back(res); } } return results; } std::vector> YOLOPredictor::predict(std::vector& images) { if (images.empty()) return {}; float* blob = nullptr; std::vector inputTensorShape{1, 3, -1, -1}; this->preprocessing(images, blob, inputTensorShape); size_t inputTensorSize = utils::vectorProduct(inputTensorShape); std::vector inputTensorValues(blob, blob + inputTensorSize); std::vector inputTensors; Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); inputTensors.push_back(Ort::Value::CreateTensor( memoryInfo, inputTensorValues.data(), inputTensorSize, inputTensorShape.data(), inputTensorShape.size())); std::vector outputTensors = this->session.Run(Ort::RunOptions{nullptr}, this->inputNames.data(), inputTensors.data(), 1, this->outputNames.data(), this->outputNames.size()); cv::Size resizedShapes(cv::Size((int)inputTensorShape[3], (int)inputTensorShape[2])); std::vector originalShapes; for (const auto& image : images) { originalShapes.push_back(image.size()); } std::vector> results = this->postprocessing(resizedShapes, originalShapes, outputTensors); delete[] blob; return results; }