Commit 6af474a2 authored by Ted Themistokleous's avatar Ted Themistokleous
Browse files

Initial idea of NMS to parallize this

Messing up my templates as its late right now.

The idea is to split box creation/calculations and then IOU operations into seperate
threads.

Another approach is if we know certain boxes have their area calculated, that we can
calculate things once and reuse it for a storrage trade off instead of CPU load.

Currently things chug along single threaded, and on a large input with a single
huge batch we stall. Need to add checks here for additional error cases too
if the error say, leads to a zero area but points are overlapping from bad data.
parent d8eb00b4
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include <queue> #include <queue>
#include <cstdint> #include <cstdint>
#include <iterator> #include <iterator>
#include <thread>
#include <migraphx/config.hpp> #include <migraphx/config.hpp>
#include <migraphx/ranges.hpp> #include <migraphx/ranges.hpp>
#include <migraphx/float_equal.hpp> #include <migraphx/float_equal.hpp>
...@@ -58,8 +59,11 @@ struct nonmaxsuppression ...@@ -58,8 +59,11 @@ struct nonmaxsuppression
shape compute_shape(std::vector<shape> inputs) const shape compute_shape(std::vector<shape> inputs) const
{ {
std::cout << "COMP_SHAPE: " << inputs.size() << std::endl;
// requires at least 2 inputs // requires at least 2 inputs
check_shapes{{inputs.at(0), inputs.at(1)}, *this, true}.only_dims(3).same_ndims(); check_shapes{{inputs.at(0), inputs.at(1)}, *this, true}.only_dims(3).same_ndims();
auto boxes_max_lens = inputs.at(0).max_lens(); auto boxes_max_lens = inputs.at(0).max_lens();
// num batches * num boxes // num batches * num boxes
const auto max_num_boxes = boxes_max_lens.at(0) * boxes_max_lens.at(1); const auto max_num_boxes = boxes_max_lens.at(0) * boxes_max_lens.at(1);
...@@ -164,7 +168,7 @@ struct nonmaxsuppression ...@@ -164,7 +168,7 @@ struct nonmaxsuppression
}; };
template <class T> template <class T>
box batch_box(T boxes, std::size_t box_idx) const void batch_box(T boxes, std::size_t box_idx, T & box_result) const
{ {
box result{}; box result{};
auto start = boxes + 4 * box_idx; auto start = boxes + 4 * box_idx;
...@@ -183,10 +187,10 @@ struct nonmaxsuppression ...@@ -183,10 +187,10 @@ struct nonmaxsuppression
result.y = {static_cast<double>(start[0]), static_cast<double>(start[2])}; result.y = {static_cast<double>(start[0]), static_cast<double>(start[2])};
} }
return result; box_result = std::move(result);
} }
inline bool suppress_by_iou(box b1, box b2, double iou_threshold) const void suppress_by_iou(box b1, box b2, double iou_threshold, bool & result) const
{ {
b1.sort(); b1.sort();
b2.sort(); b2.sort();
...@@ -198,7 +202,8 @@ struct nonmaxsuppression ...@@ -198,7 +202,8 @@ struct nonmaxsuppression
intersection[i][1] = std::min(b1[i][1], b2[i][1]); intersection[i][1] = std::min(b1[i][1], b2[i][1]);
if(intersection[i][0] > intersection[i][1]) if(intersection[i][0] > intersection[i][1])
{ {
return false; result = false;
return;
} }
} }
...@@ -209,12 +214,14 @@ struct nonmaxsuppression ...@@ -209,12 +214,14 @@ struct nonmaxsuppression
if(area1 <= .0f or area2 <= .0f or union_area <= .0f) if(area1 <= .0f or area2 <= .0f or union_area <= .0f)
{ {
return false; result = false;
return;
} }
const double intersection_over_union = intersection_area / union_area; const double intersection_over_union = intersection_area / union_area;
return intersection_over_union > iou_threshold; result = intersection_over_union > iou_threshold;
return;
} }
// filter boxes below score_threshold // filter boxes below score_threshold
...@@ -258,7 +265,8 @@ struct nonmaxsuppression ...@@ -258,7 +265,8 @@ struct nonmaxsuppression
selected_boxes_inside_class.reserve(max_output_shape.elements()); selected_boxes_inside_class.reserve(max_output_shape.elements());
// iterate over batches and classes // iterate over batches and classes
shape comp_s{shape::double_type, {num_batches, num_classes}}; shape comp_s{shape::double_type, {num_batches, num_classes}};
shape_for_each(comp_s, [&](auto idx) { shape_for_each(comp_s, [&](auto idx)
{
auto batch_idx = idx[0]; auto batch_idx = idx[0];
auto class_idx = idx[1]; auto class_idx = idx[1];
// index offset for this class // index offset for this class
...@@ -267,6 +275,7 @@ struct nonmaxsuppression ...@@ -267,6 +275,7 @@ struct nonmaxsuppression
auto batch_boxes_start = boxes.begin() + batch_idx * num_boxes * 4; auto batch_boxes_start = boxes.begin() + batch_idx * num_boxes * 4;
auto boxes_heap = filter_boxes_by_score(scores_start, num_boxes, score_threshold); auto boxes_heap = filter_boxes_by_score(scores_start, num_boxes, score_threshold);
selected_boxes_inside_class.clear(); selected_boxes_inside_class.clear();
while(not boxes_heap.empty() && while(not boxes_heap.empty() &&
selected_boxes_inside_class.size() < max_output_boxes_per_class) selected_boxes_inside_class.size() < max_output_boxes_per_class)
{ {
...@@ -279,16 +288,30 @@ struct nonmaxsuppression ...@@ -279,16 +288,30 @@ struct nonmaxsuppression
selected_indices.push_back(class_idx); selected_indices.push_back(class_idx);
selected_indices.push_back(next_top_score.second); selected_indices.push_back(next_top_score.second);
std::priority_queue<std::pair<double, int64_t>> remainder_boxes; std::priority_queue<std::pair<double, int64_t>> remainder_boxes;
while(not boxes_heap.empty()) while(not boxes_heap.empty())
{ {
auto iou_candidate_box = boxes_heap.top(); auto iou_candidate_box = boxes_heap.top();
if(not this->suppress_by_iou(
batch_box(batch_boxes_start, iou_candidate_box.second), box b1{};
batch_box(batch_boxes_start, next_top_score.second), box b2{};
iou_threshold)) bool suppress_box = false;
std::thread box1_th(batch_box<Boxes>, batch_boxes_start, iou_candidate_box.second, std::ref(b1));
std::thread box2_th(batch_box<Boxes>, batch_boxes_start, next_top_score.second, std::ref(b2));
box1_th.join();
box2_th.join();
std::thread suppress_th(suppress_by_iou, std::ref(b1), std::ref(b2), iou_threshold, std::ref(suppress_box));
suppress_th.join();
if(not suppress_box)
{ {
remainder_boxes.push(iou_candidate_box); remainder_boxes.push(iou_candidate_box);
} }
boxes_heap.pop(); boxes_heap.pop();
} }
boxes_heap = remainder_boxes; boxes_heap = remainder_boxes;
...@@ -300,16 +323,26 @@ struct nonmaxsuppression ...@@ -300,16 +323,26 @@ struct nonmaxsuppression
argument compute(const shape& output_shape, std::vector<argument> args) const argument compute(const shape& output_shape, std::vector<argument> args) const
{ {
std::cout << "COMPUTE: " << args.size() << std::endl;
// make buffer of maximum size // make buffer of maximum size
shape max_output_shape = {output_shape.type(), output_shape.max_lens()}; shape max_output_shape = {output_shape.type(), output_shape.max_lens()};
argument result{max_output_shape}; argument result{max_output_shape};
std::size_t max_output_boxes_per_class = std::size_t max_output_boxes_per_class =
(args.size() > 2) ? (args.at(2).at<std::size_t>()) : 0; (args.size() > 2) ? (args.at(2).at<std::size_t>()) : 0;
if(max_output_boxes_per_class == 0) if(max_output_boxes_per_class == 0)
{ {
return result; return result;
} }
if (max_output_boxes_per_class > args.at(1).get_shape().lens().at(2))
{
max_output_boxes_per_class = args.at(1).get_shape().lens().at(2);
std::cout << "Max output box: " << max_output_boxes_per_class << std::endl;
}
double iou_threshold = (args.size() > 3) ? (args.at(3).at<double>()) : 0.0f; double iou_threshold = (args.size() > 3) ? (args.at(3).at<double>()) : 0.0f;
double score_threshold = (args.size() > 4) ? (args.at(4).at<double>()) : 0.0f; double score_threshold = (args.size() > 4) ? (args.at(4).at<double>()) : 0.0f;
std::size_t num_selected = 0; std::size_t num_selected = 0;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment