Unverified Commit 9fcf48c4 authored by bdf's avatar bdf Committed by GitHub
Browse files

[Enhancement] Replace the implementation of deform_roi_pool with mlu-ops (#2598)



* [Feature] Replace the implementation of deform_roi_pool with mlu-ops

* [Feature] Modify code

---------
Co-authored-by: default avatarbudefei <budefei@cambricon.com>
parent 0d1b224f
......@@ -9,254 +9,59 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
// Entry point of the hand-written BANG kernel for the deform_roi_pool
// forward pass (implementation lives in a separate .mlu source file).
// NOTE(review): the launcher below now calls mluOpDeformRoiPoolForward from
// mlu-ops instead — confirm whether this declaration and its kernel source
// are still needed, or remove them together.
void KernelDeformRoIPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t data_type,
const void *input, const void *rois,
const void *offset, void *output,
const int channels, const int height,
const int width, const int num_rois,
const int pooled_height, const int pooled_width,
const float spatial_scale,
const int sampling_ratio, const float gamma);
// Entry point of the hand-written BANG kernel for the deform_roi_pool
// backward pass (implementation lives in a separate .mlu source file).
// NOTE(review): the launcher below now calls mluOpDeformRoiPoolBackward from
// mlu-ops instead — confirm whether this declaration and its kernel source
// are still needed, or remove them together.
void KernelDeformRoIPoolBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
cnrtDataType_t data_type, const void *grad_output, const void *input,
const void *rois, const void *offset, void *grad_input, void *grad_offset,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, const float spatial_scale,
const int sampling_ratio, const float gamma);
// policy function for forward and backward
// Choose the launch dimensions for the legacy BANG kernels: one MLU core per
// output bin, spread over up to all clusters as a UNION1 task.
// @param bin_num  total number of output bins (num_rois * ph * pw).
// @param k_dim    out: task dimensions (x = cores per cluster, y = clusters).
// @param k_type   out: task type, always CNRT_FUNC_TYPE_UNION1.
static void policyFunc(const int bin_num, cnrtDim3_t *k_dim,
                       cnrtFunctionType_t *k_type) {
  const size_t cluster_limit = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  const size_t core_limit = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  // Round the bin count up to a whole multiple of the per-cluster core count.
  const size_t bin_num_align = CEIL_ALIGN(bin_num, core_limit);
  k_dim->x = core_limit;
  // One cluster per group of core_limit bins, capped at the device limit.
  k_dim->y = (bin_num_align / core_limit) > cluster_limit
                 ? cluster_limit
                 : (bin_num_align / core_limit);
  k_dim->z = 1;
  *k_type = CNRT_FUNC_TYPE_UNION1;
}
#include "mlu_common_helper.h"
// Deformable RoI pooling forward on MLU via the mlu-ops library.
// @param input          NCHW feature map, Float or Half.
// @param rois           2-D RoI tensor; rois.size(0) is the number of RoIs.
// @param offset         optional deformation offsets; may be undefined/empty,
//                       in which case non-deformable pooling is performed.
// @param output         pre-allocated result, (num_rois, C, ph, pw).
// @param pooled_height/pooled_width  output bin grid.
// @param spatial_scale  scale from input coords to RoI coords, in (0, 1].
// @param sampling_ratio samples per bin (kernel-defined behavior when <= 0).
// @param gamma          offset scaling factor.
void DeformRoIPoolForwardMLUKernelLauncher(Tensor input, Tensor rois,
                                           Tensor offset, Tensor output,
                                           int pooled_height, int pooled_width,
                                           float spatial_scale,
                                           int sampling_ratio, float gamma) {
  // Check dtype.
  TORCH_CHECK(
      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
      "input type should be Float or Half, got ", input.scalar_type());
  TORCH_CHECK(input.scalar_type() == rois.scalar_type(),
              "rois should have the same type as input");
  // Check shape.
  TORCH_CHECK(input.dim() == 4, "input should be 4d tensor, got ", input.dim(),
              "D.");
  TORCH_CHECK(rois.dim() == 2, "rois should be 2d tensor, got ", rois.dim(),
              "D.");
  if (offset.defined() && offset.numel() > 0) {
    TORCH_CHECK(input.scalar_type() == offset.scalar_type(),
                "offset should have the same type as input");
    TORCH_CHECK(offset.dim() == 4, "offset should be 4d tensor, got ",
                offset.dim(), "D.");
    TORCH_CHECK(
        (offset.size(0) == rois.size(0)), "offset.size(0) = ", offset.size(0),
        "while rois.size(0)) = ", rois.size(0), ". They should be the same.");
    TORCH_CHECK((offset.size(1) == 2), "offset.size(1) should be 2, ",
                "but now offset.size(1) = ", offset.size(1), ".");
    TORCH_CHECK((offset.size(2) == output.size(2)),
                "offset.size(2) = ", offset.size(2),
                "while output.size(2)) = ", output.size(2),
                ". They should be the same.");
    TORCH_CHECK((offset.size(3) == output.size(3)),
                "offset.size(3) = ", offset.size(3),
                "while output.size(3)) = ", output.size(3),
                ". They should be the same.");
  }
  TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,
              "spatial_scale should be within (0, 1], got ", spatial_scale,
              ".");

  auto channels = input.size(1);
  auto num_rois = output.size(0);
  if (output.numel() == 0) {
    // NOTE(review): `output` is received by value, so this rebinding is only
    // visible inside this function — confirm callers never rely on it.
    output = at::zeros({num_rois, channels, pooled_height, pooled_width},
                       input.options());
    return;
  }

  // Zero element check.
  TORCH_CHECK(input.size(0) != 0, "input.size(0) should not be zero, got ",
              input.size(0));
  TORCH_CHECK(rois.numel() != 0, "rois.numel() should not be zero, got ",
              rois.numel());
  if (input.numel() == 0 || output.numel() == 0) {
    return;
  }
  // Large tensor check: element counts must fit the descriptor's 32-bit range.
  const size_t max_input_num = 2147483648;  // 2^31, 2G num
  TORCH_CHECK(input.numel() < max_input_num,
              "input.numel() should be less than 2147483648, got ",
              input.numel());
  TORCH_CHECK(rois.numel() < max_input_num,
              "rois.numel() should be less than 2147483648, got ",
              rois.numel());
  TORCH_CHECK(output.numel() < max_input_num,
              "output.numel() should be less than 2147483648, got ",
              output.numel());
  TORCH_CHECK(!offset.defined() || offset.numel() < max_input_num,
              "offset.numel() should be less than 2147483648, got ",
              offset.numel());

  // mlu-ops consumes NHWC-contiguous buffers for input/output.
  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
  auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
  auto rois_contiguous =
      torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());
  auto output_contiguous =
      torch_mlu::cnnl::ops::cnnl_contiguous(output, memory_format);

  MluOpTensorDescriptor input_desc, rois_desc, offset_desc, output_desc;
  input_desc.set_with_layout(input_, MLUOP_LAYOUT_NHWC);
  rois_desc.set(rois_contiguous);
  output_desc.set_with_layout(output_contiguous, MLUOP_LAYOUT_NHWC);

  // Keep the contiguous offset tensor alive until the op has been enqueued:
  // declaring it inside the `if` would drop the last reference to its storage
  // while offset_ptr still points at it.
  at::Tensor offset_contiguous;
  mluOpTensorDescriptor_t offset_real_desc = NULL;
  void *offset_ptr = NULL;
  if (offset.defined() && offset.numel() > 0) {
    offset_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
        offset, offset.suggest_memory_format());
    offset_desc.set(offset_contiguous);
    offset_real_desc = offset_desc.desc();
    auto offset_impl = torch_mlu::getMluTensorImpl(offset_contiguous);
    offset_ptr = offset_impl->cnnlMalloc();
  }

  // Get device pointers of the (contiguous) tensors.
  auto input_impl = torch_mlu::getMluTensorImpl(input_);
  auto input_ptr = input_impl->cnnlMalloc();
  auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);
  auto output_ptr = output_impl->cnnlMalloc();

  // Launch through the mlu-ops handle bound to the current queue.
  auto handle = mluOpGetCurrentHandle();
  mluOpDeformRoiPoolForward(
      handle, input_desc.desc(), input_ptr, rois_desc.desc(), rois_ptr,
      offset_real_desc, offset_ptr, pooled_height, pooled_width, spatial_scale,
      sampling_ratio, gamma, output_desc.desc(), output_ptr);

  // Propagate the result back into the caller-visible output tensor.
  output.copy_(output_contiguous);
}
// Deformable RoI pooling backward on MLU via the mlu-ops library.
// Computes grad_input (and grad_offset when offsets are used) from
// grad_output. All shape/dtype invariants are validated up front.
// @param grad_output  (num_rois, C, ph, pw) gradient w.r.t. the forward output.
// @param input        NCHW feature map used in the forward pass.
// @param rois         2-D RoI tensor matching grad_output.size(0).
// @param offset       optional deformation offsets (may be undefined/empty).
// @param grad_input   pre-allocated NCHW gradient buffer, written in place.
// @param grad_offset  pre-allocated offset gradient (may be undefined/empty).
void DeformRoIPoolBackwardMLUKernelLauncher(
    Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
    Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
    float spatial_scale, int sampling_ratio, float gamma) {
  // Check dtype.
  TORCH_CHECK(
      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
      "input type should be Float or Half, got ", input.scalar_type());
  TORCH_CHECK(input.scalar_type() == grad_output.scalar_type(),
              "grad_output should have the same type as input");
  TORCH_CHECK(input.scalar_type() == rois.scalar_type(),
              "rois should have the same type as input");
  TORCH_CHECK(input.scalar_type() == grad_input.scalar_type(),
              "grad_input should have the same type as input");
  // Check shape.
  TORCH_CHECK(grad_output.dim() == 4, "grad_output should be 4d tensor, got ",
              grad_output.dim(), "D.");
  TORCH_CHECK(input.dim() == 4, "input should be 4d tensor, got ", input.dim(),
              "D.");
  TORCH_CHECK(rois.dim() == 2, "rois should be 2d tensor, got ", rois.dim(),
              "D.");
  if (offset.defined() && offset.numel() > 0) {
    TORCH_CHECK(input.scalar_type() == offset.scalar_type(),
                "offset should have the same type as input");
    TORCH_CHECK(offset.dim() == 4, "offset should be 4d tensor, got ",
                offset.dim(), "D.");
    TORCH_CHECK(
        (offset.size(0) == rois.size(0)), "offset.size(0) = ", offset.size(0),
        "while rois.size(0)) = ", rois.size(0), ". They should be the same.");
    TORCH_CHECK((offset.size(1) == 2), "offset.size(1) should be 2, ",
                "but now offset.size(1) = ", offset.size(1), ".");
    TORCH_CHECK((offset.size(2) == grad_output.size(2)),
                "offset.size(2) = ", offset.size(2),
                "while grad_output.size(2)) = ", grad_output.size(2),
                ". They should be the same.");
    TORCH_CHECK((offset.size(3) == grad_output.size(3)),
                "offset.size(3) = ", offset.size(3),
                "while grad_output.size(3)) = ", grad_output.size(3),
                ". They should be the same.");
  }
  TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,
              "spatial_scale should be within (0, 1], got ", spatial_scale);
  // Check relationship between tensors.
  TORCH_CHECK((grad_output.size(0) == rois.size(0)),
              "grad_output.size(0) = ", grad_output.size(0),
              "while rois.size(0)) = ", rois.size(0),
              ". They should be the same.");
  TORCH_CHECK((grad_output.size(1) == input.size(1)),
              "grad_output.size(1) = ", grad_output.size(1),
              "while input.size(1)) = ", input.size(1),
              ". They should be the same.");
  TORCH_CHECK((grad_output.size(2) == pooled_height),
              "grad_output.size(2) = ", grad_output.size(2),
              "while pooled_height = ", pooled_height,
              ". They should be the same.");
  TORCH_CHECK((grad_output.size(3) == pooled_width),
              "grad_output.size(3) = ", grad_output.size(3),
              "while pooled_width = ", pooled_width,
              ". They should be the same.");

  // Zero element check.
  TORCH_CHECK(input.size(0) != 0, "input.size(0) should not be zero, got ",
              input.size(0));
  TORCH_CHECK(rois.numel() != 0, "rois.numel() should not be zero, got ",
              rois.numel());
  if (input.numel() == 0 || grad_output.numel() == 0) {
    return;
  }
  // Large tensor check: element counts must fit the descriptor's 32-bit range.
  const size_t max_input_num = 2147483648;  // 2^31, 2G num
  TORCH_CHECK(input.numel() < max_input_num,
              "input.numel() should be less than 2147483648, got ",
              input.numel());
  TORCH_CHECK(rois.numel() < max_input_num,
              "rois.numel() should be less than 2147483648, got ",
              rois.numel());
  TORCH_CHECK(grad_output.numel() < max_input_num,
              "grad_output.numel() should be less than 2147483648, got ",
              grad_output.numel());
  TORCH_CHECK(!offset.defined() || offset.numel() < max_input_num,
              "offset.numel() should be less than 2147483648, got ",
              offset.numel());

  // mlu-ops consumes NHWC-contiguous buffers.
  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_output.dim());
  auto grad_output_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(grad_output, memory_format);
  memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
  auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
  auto rois_contiguous =
      torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());
  auto grad_input_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(grad_input, memory_format);

  // Get device pointers of the (contiguous) tensors.
  auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output_);
  auto grad_output_ptr = grad_output_impl->cnnlMalloc();
  auto input_impl = torch_mlu::getMluTensorImpl(input_);
  auto input_ptr = input_impl->cnnlMalloc();
  auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);
  auto grad_input_ptr = grad_input_impl->cnnlMalloc();

  MluOpTensorDescriptor grad_output_desc, input_desc, rois_desc, offset_desc,
      grad_input_desc, grad_offset_desc;
  grad_output_desc.set_with_layout(grad_output_, MLUOP_LAYOUT_NHWC);
  input_desc.set_with_layout(input_, MLUOP_LAYOUT_NHWC);
  rois_desc.set(rois_contiguous);
  grad_input_desc.set_with_layout(grad_input_, MLUOP_LAYOUT_NHWC);

  // Keep the optional contiguous tensors alive until the op has been
  // enqueued: declaring them inside the `if` blocks would drop the last
  // reference to their storage while the raw pointers are still in use.
  at::Tensor offset_contiguous;
  mluOpTensorDescriptor_t offset_real_desc = NULL;
  void *offset_ptr = NULL;
  if (offset.defined() && offset.numel() > 0) {
    offset_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
        offset, offset.suggest_memory_format());
    offset_desc.set(offset_contiguous);
    offset_real_desc = offset_desc.desc();
    auto offset_impl = torch_mlu::getMluTensorImpl(offset_contiguous);
    offset_ptr = offset_impl->cnnlMalloc();
  }
  at::Tensor grad_offset_contiguous;
  mluOpTensorDescriptor_t grad_offset_real_desc = NULL;
  void *grad_offset_ptr = NULL;
  if (grad_offset.defined() && grad_offset.numel() > 0) {
    grad_offset_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
        grad_offset, grad_offset.suggest_memory_format());
    grad_offset_desc.set(grad_offset_contiguous);
    grad_offset_real_desc = grad_offset_desc.desc();
    auto grad_offset_impl = torch_mlu::getMluTensorImpl(grad_offset_contiguous);
    grad_offset_ptr = grad_offset_impl->cnnlMalloc();
  }

  // Launch through the mlu-ops handle bound to the current queue.
  auto handle = mluOpGetCurrentHandle();
  mluOpDeformRoiPoolBackward(
      handle, grad_output_desc.desc(), grad_output_ptr, input_desc.desc(),
      input_ptr, rois_desc.desc(), rois_ptr, offset_real_desc, offset_ptr,
      pooled_height, pooled_width, spatial_scale, sampling_ratio, gamma,
      grad_input_desc.desc(), grad_input_ptr, grad_offset_real_desc,
      grad_offset_ptr);

  grad_input.copy_(grad_input_);
  // NOTE(review): the offset gradient lands in grad_offset_contiguous, which
  // only aliases the caller's grad_offset when grad_offset was already in its
  // suggested memory format — confirm callers always pass it contiguous, or
  // add a grad_offset.copy_(grad_offset_contiguous) here.
}
......
......@@ -72,6 +72,39 @@ void MluOpTensorDescriptor::set(Tensor t) {
set_desc(t, layout, data_type, dim_array);
}
// Bind tensor `t` to this descriptor under an explicit mlu-ops layout,
// translating the torch NCHW-ordered sizes/strides into the order the
// requested layout expects before calling mluOpSetTensorDescriptorEx.
void MluOpTensorDescriptor::set_with_layout(Tensor t,
                                            mluOpTensorLayout_t layout) {
  const mluOpDataType_t dtype = getMluOpDataType(t.dtype());
  const int ndim = t.dim();
  auto dims = checkUpperBoundAndCastTo<int>(t.sizes().vec());
  auto strides = checkUpperBoundAndCastTo<int>(t.strides().vec());

  const bool channels_last_like = layout == MLUOP_LAYOUT_NHWC ||
                                  layout == MLUOP_LAYOUT_NDHWC ||
                                  layout == MLUOP_LAYOUT_NLC;
  if (channels_last_like) {
    // Rotate the channel dim to the innermost position (e.g. NCHW -> NHWC).
    convertShapeAndStride(dims, strides);
  } else if (layout == MLUOP_LAYOUT_HWCN) {
    // Depth-wise conv filter layout: permute sizes NCHW -> HWCN and rebuild
    // strides as if the tensor were HWCN-contiguous.
    const auto src = t.sizes().vec();
    dims[0] = static_cast<int>(src[2]);
    dims[1] = static_cast<int>(src[3]);
    dims[2] = static_cast<int>(src[1]);
    dims[3] = static_cast<int>(src[0]);
    strides[3] = 1;
    strides[2] = dims[3] * strides[3];
    strides[1] = dims[2] * strides[2];
    strides[0] = dims[1] * strides[1];
  }

  const auto status = mluOpSetTensorDescriptorEx(desc_, layout, dtype, ndim,
                                                 dims.data(), strides.data());
  TORCH_CHECK(status == MLUOP_STATUS_SUCCESS,
              "mluOpSetTensorDescriptorEx execution failed.");
}
void MluOpTensorDescriptor::set_desc(const at::Tensor& t,
mluOpTensorLayout_t layout,
mluOpDataType_t dtype,
......
......@@ -30,6 +30,7 @@ class MluOpTensorDescriptor {
~MluOpTensorDescriptor() { mluOpDestroyTensorDescriptor(desc_); }
void set(at::Tensor);
void set_with_layout(at::Tensor, mluOpTensorLayout_t layout);
mluOpTensorDescriptor_t desc() { return desc_; }
private:
......@@ -52,3 +53,47 @@ class MluOpHandle {
void setQueue(cnrtQueue_t queue) { mluOpSetQueue(handle, queue); }
mluOpHandle_t handle;
};
// Modify tensor size and stride order based on channels_first to
// channels_last or channels_last_3d: keep the batch dim, rotate every other
// dim left by one so the channel dim becomes innermost. This mirrors the
// real storage order of a channels_last tensor, which differs from pytorch's
// logical (NCHW) dim order.
// Example: modify channels_last tensor dims to an NHWC tensor desc.
//   sizes   N C H W      --> N H W C
//   strides C*H*W 1 W C  --> C*H*W W C 1
template <typename T>
void convertShapeAndStride(std::vector<T>& shape_info,
                           std::vector<T>& stride_info) {
  TORCH_MLU_CHECK(shape_info.size() == stride_info.size(),
                  "shape size need equal to stride size.");
  const int dim = static_cast<int>(shape_info.size());
  // dim == 0: nothing to permute, and indexing temp[0] below would be out of
  // bounds. dim == 1: only the batch dim exists and it stays in place.
  if (dim <= 1) {
    return;
  }
  std::vector<T> temp_shape_info(dim);
  std::vector<T> temp_stride_info(dim);
  temp_shape_info[0] = shape_info[0];
  temp_stride_info[0] = stride_info[0];
  for (int i = 0; i < dim - 1; ++i) {
    // Source index cycles through 2, 3, ..., dim-1, then 1 (C moves last).
    const int index = (i + 1) % (dim - 1) + 1;
    temp_shape_info[i + 1] = shape_info[index];
    temp_stride_info[i + 1] = stride_info[index];
  }
  shape_info = std::move(temp_shape_info);
  stride_info = std::move(temp_stride_info);
}
// torch tensors provide int64_t shapes and strides, but mlu-ops descriptors
// require type int32. Use this function to ensure a safe CAST — every value
// must fit in DST_T (both bounds are checked, since strides can in principle
// be negative) — or report an error instead of silently truncating.
template <typename DST_T, typename SRC_T>
std::vector<DST_T> checkUpperBoundAndCastTo(const std::vector<SRC_T>& input) {
  std::vector<DST_T> output;
  output.reserve(input.size());
  for (const auto& val : input) {
    TORCH_MLU_CHECK(val <= std::numeric_limits<DST_T>::max(),
                    "Requires dim size not greater than ",
                    std::numeric_limits<DST_T>::max(), ". But got ", val, ".");
    TORCH_MLU_CHECK(val >= std::numeric_limits<DST_T>::lowest(),
                    "Requires dim size not less than ",
                    std::numeric_limits<DST_T>::lowest(), ". But got ", val,
                    ".");
    output.push_back(static_cast<DST_T>(val));
  }
  return output;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment