OpenDAS / MMCV · Commit fdeee889
Authored May 25, 2025 by limm

release v1.6.1 of mmcv

Parent: df465820
Changes: 457
Showing 20 changed files with 6090 additions and 126 deletions (+6090 -126)
mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh        +53    -63
mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh  +129   -0
mmcv/ops/csrc/common/cuda/spconv/indice.cuh                      +236   -0
mmcv/ops/csrc/common/cuda/spconv/reordering.cuh                  +160   -0
mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh      +21    -21
mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh               +38    -37
mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh           +52    -5
mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu            +322   -0
mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp                   +190   -0
mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu       +888   -0
mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu                      +1161  -0
mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu                  +615   -0
mmcv/ops/csrc/common/mlu/psamask_utils.hpp                       +55    -0
mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu                +493   -0
mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu        +472   -0
mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp             +24    -0
mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu                 +749   -0
mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu                +307   -0
mmcv/ops/csrc/common/mps/MPSDevice.h                             +64    -0
mmcv/ops/csrc/common/mps/MPSLibrary.h                            +61    -0
mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh
@@ -42,23 +42,23 @@ __global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num,
// params boxes3d: (B, M, 7)
// params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means
// background points
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
   int box_idx = blockIdx.y;
   int bs_idx = blockIdx.z;
-  if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size) {
-    return;
-  }
+  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
+    if (box_idx >= boxes_num || bs_idx >= batch_size) return;
     int assign_idx =
         bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
     pts_assign[assign_idx] = 0;
     int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
     int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
     T local_x = 0, local_y = 0;
     int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset,
                                         local_x, local_y);
     pts_assign[assign_idx] = cur_in_flag;
+  }
 }

__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,
@@ -69,35 +69,32 @@ __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,
// params pts_assign: (B, N)
// params pts_idx: (B, M, 512)
// params pooled_empty_flag: (B, M)
-  int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (boxes_idx >= boxes_num) {
-    return;
-  }
+  CUDA_1D_KERNEL_LOOP(boxes_idx, boxes_num) {
     int bs_idx = blockIdx.y;
     int cnt = 0;
     for (int k = 0; k < pts_num; k++) {
       if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num +
                      boxes_idx]) {
         if (cnt < sampled_pts_num) {
           pts_idx[bs_idx * boxes_num * sampled_pts_num +
                   boxes_idx * sampled_pts_num + cnt] = k;
           cnt++;
         } else
           break;
       }
     }
     if (cnt == 0) {
       pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
     } else if (cnt < sampled_pts_num) {
       // duplicate same points for sampling
       for (int k = cnt; k < sampled_pts_num; k++) {
         int duplicate_idx = k % cnt;
         int base_offset =
             bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;
         pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
       }
     }
+  }
 }
@@ -112,33 +109,26 @@ __global__ void roipoint_pool3d_forward(
// params pts_feature: (B, N, C)
// params pooled_features: (B, M, 512, 3+C)
// params pooled_empty_flag: (B, M)
-  int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
   int box_idx = blockIdx.y;
   int bs_idx = blockIdx.z;
-  if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num ||
-      bs_idx >= batch_size) {
-    return;
-  }
-  if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) {
-    return;
-  }
+  CUDA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) {
+    if (box_idx >= boxes_num || bs_idx >= batch_size) return;
+    if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return;
     int temp_idx = bs_idx * boxes_num * sampled_pts_num +
                    box_idx * sampled_pts_num + sample_pt_idx;
     int src_pt_idx = pts_idx[temp_idx];
     int dst_feature_offset = temp_idx * (3 + feature_in_len);
     for (int j = 0; j < 3; j++)
       pooled_features[dst_feature_offset + j] =
           xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
     int src_feature_offset =
         bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
     memcpy(pooled_features + dst_feature_offset + 3,
            pts_feature + src_feature_offset, feature_in_len * sizeof(T));
+  }
 }
#endif // ROIPOINT_POOL3D_CUDA_KERNEL_CUH
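The common thread through the CUDA hunks in this commit is replacing the explicit "blockIdx.x * blockDim.x + threadIdx.x" index plus per-thread bounds check with the CUDA_1D_KERNEL_LOOP macro. As a rough sketch of what such a grid-stride loop does (the real macro lives in mmcv's common CUDA helper header, so treat the name GRID_STRIDE_LOOP_1D and the exact form below as assumptions, not the library definition):

// Hypothetical restatement of a grid-stride loop macro.
#define GRID_STRIDE_LOOP_1D(i, n)                               \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);  \
       i += blockDim.x * gridDim.x)

// Example kernel using the sketch: scales n floats in place.
__global__ void scale_inplace(float *data, int n, float alpha) {
  GRID_STRIDE_LOOP_1D(idx, n) { data[idx] *= alpha; }
}

The loop both bounds-checks the index and lets a fixed-size grid cover arbitrarily large n, which is why the separate "pt_idx >= pts_num"-style checks could be dropped in the hunks above.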
mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh
new file mode 100644
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#ifndef ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH
#define ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
template <typename scalar_t>
__global__ void rotated_feature_align_forward_kernel(
    const int nthreads, const int points, const scalar_t* bottom_data,
    const scalar_t* best_bboxes, const scalar_t spatial_scale,
    const int channels, const int height, const int width,
    scalar_t* top_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int w = index % width;
    int h = (index / width) % height;
    int c = (index / width / height) % channels;
    int n = index / width / height / channels;

    const scalar_t* bbox_offset =
        best_bboxes + ((n * height + h) * width + w) * 5;
    scalar_t roi_y = bbox_offset[0] * spatial_scale;
    scalar_t roi_x = bbox_offset[1] * spatial_scale;

    scalar_t px[5] = {roi_x, 0, 0, 0, 0};
    scalar_t py[5] = {roi_y, 0, 0, 0, 0};

    if (points > 1) {
      scalar_t roi_w = bbox_offset[2] * spatial_scale;
      scalar_t roi_h = bbox_offset[3] * spatial_scale;
      scalar_t roi_a = bbox_offset[4];

      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
      scalar_t wx = cosa * w_2, wy = sina * w_2;
      scalar_t hx = -sina * h_2, hy = cosa * h_2;

      px[1] = roi_x + wx + hx;
      py[1] = roi_y + wy + hy;
      px[2] = roi_x - wx + hx;
      py[2] = roi_y - wy + hy;
      px[3] = roi_x - wx - hx;
      py[3] = roi_y - wy - hy;
      px[4] = roi_x + wx - hx;
      py[4] = roi_y + wy - hy;
    }

    const scalar_t* offset_bottom_data =
        bottom_data + (n * channels + c) * height * width;
    scalar_t output_val = bottom_data[index];
    for (int i = 0; i < points; i++) {
      output_val += bilinear_interpolate<scalar_t>(offset_bottom_data, height,
                                                   width, py[i], px[i], i);
    }
    top_data[index] = output_val;
  }
}

template <typename scalar_t>
__global__ void rotated_feature_align_backward_kernel(
    const int nthreads, const int points, const scalar_t* top_diff,
    const scalar_t* best_bboxes, const scalar_t spatial_scale,
    const int channels, const int height, const int width,
    scalar_t* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int w = index % width;
    int h = (index / width) % height;
    int c = (index / width / height) % channels;
    int n = index / width / height / channels;

    const scalar_t* bbox_offset =
        best_bboxes + ((n * height + h) * width + w) * 5;
    scalar_t roi_y = bbox_offset[0] * spatial_scale;
    scalar_t roi_x = bbox_offset[1] * spatial_scale;

    scalar_t px[5] = {roi_x, 0, 0, 0, 0};
    scalar_t py[5] = {roi_y, 0, 0, 0, 0};

    if (points > 1) {
      scalar_t roi_w = bbox_offset[2] * spatial_scale;
      scalar_t roi_h = bbox_offset[3] * spatial_scale;
      scalar_t roi_a = bbox_offset[4];

      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
      scalar_t wx = cosa * w_2, wy = sina * w_2;
      scalar_t hx = -sina * h_2, hy = cosa * h_2;

      px[1] = roi_x + wx + hx;
      py[1] = roi_y + wy + hy;
      px[2] = roi_x - wx + hx;
      py[2] = roi_y - wy + hy;
      px[3] = roi_x - wx - hx;
      py[3] = roi_y - wy - hy;
      px[4] = roi_x + wx - hx;
      py[4] = roi_y + wy - hy;
    }

    scalar_t* offset_bottom_diff =
        bottom_diff + (n * channels + c) * height * width;
    scalar_t value_top_diff = top_diff[index];

    atomicAdd(bottom_diff + index, value_top_diff);
    for (int i = 0; i < points; i++) {
      scalar_t w1, w2, w3, w4;
      int x_low, x_high, y_low, y_high;
      bilinear_interpolate_gradient<scalar_t>(height, width, py[i], px[i], w1,
                                              w2, w3, w4, x_low, x_high, y_low,
                                              y_high, i);
      scalar_t g1 = value_top_diff * w1;
      scalar_t g2 = value_top_diff * w2;
      scalar_t g3 = value_top_diff * w3;
      scalar_t g4 = value_top_diff * w4;
      if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
        atomicAdd(offset_bottom_diff + y_low * width + x_low, g1);
        atomicAdd(offset_bottom_diff + y_low * width + x_high, g2);
        atomicAdd(offset_bottom_diff + y_high * width + x_low, g3);
        atomicAdd(offset_bottom_diff + y_high * width + x_high, g4);
      }
    }
  }
}
#endif // ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH
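For reference, the px/py setup in both kernels is just the four corners of a (cx, cy, w, h, angle) box, obtained by rotating the half-width and half-height vectors by the box angle and adding them to the center with the four sign combinations. A minimal host-side sketch of the same arithmetic (the struct and function name are illustrative, not part of the kernel):

#include <cmath>

struct Corners { float x[4], y[4]; };

// Rotate the half-extent vectors by 'angle' and add them to the center,
// mirroring the kernel's px[1..4]/py[1..4] setup.
inline Corners rotated_box_corners(float cx, float cy, float w, float h,
                                   float angle) {
  float wx = std::cos(angle) * w / 2, wy = std::sin(angle) * w / 2;
  float hx = -std::sin(angle) * h / 2, hy = std::cos(angle) * h / 2;
  Corners c;
  c.x[0] = cx + wx + hx; c.y[0] = cy + wy + hy;
  c.x[1] = cx - wx + hx; c.y[1] = cy - wy + hy;
  c.x[2] = cx - wx - hx; c.y[2] = cy - wy - hy;
  c.x[3] = cx + wx - hx; c.y[3] = cy + wy - hy;
  return c;
}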
mmcv/ops/csrc/common/cuda/spconv/indice.cuh
new file mode 100644
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef INDICE_CU_H_
#define INDICE_CU_H_
#include <utils/spconv/spconv/geometry.h>
#include <utils/spconv/tensorview/tensorview.h>
#include <utils/spconv/tensorview/helper_kernel.cuh>
template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void prepareIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index kernelVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    kernelVolume *= kernelSize[i];
  }
  Index numValidPoints = 0;
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  auto indicePairsDim2 = indicePairs.dim(2);
  Index index;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    numValidPoints = getValidOutPos<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(offset, 0, oldNum) = ix;
      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      indicePairs(offset, 1, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}

template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void prepareDeConvIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index kernelVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    kernelVolume *= kernelSize[i];
  }
  Index numValidPoints = 0;
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  auto indicePairsDim2 = indicePairs.dim(2);
  Index index;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    numValidPoints = getValidOutPosTranspose<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(offset, 0, oldNum) = ix;
      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      indicePairs(offset, 1, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignGridAndIndiceOutKernel(
    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
    int numAct, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
  Index index;
  auto indicesOutPtr = indicesOut.data();
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    index = indicePairUnique[ix];
    gridsOut[index] = ix;
    index = tv::rowArrayIdxInv<Index, NDim>(
        index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
    indicesOut[ix * (NDim + 1)] = index % batchSize;
  }
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignIndicePairsKernel(
    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
    int numActIn, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  Index index;
  int kernelVolume = indicePairs.dim(0);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    for (int i = 0; i < kernelVolume; ++i) {
      index = indicePairs(i, 1, ix);
      if (index > -1) {
        indicePairs(i, 1, ix) = gridsOut[index];
      }
    }
  }
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void prepareSubMGridKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index index = 0;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,
                                         outSpatialShape.data()) +
            spatialVolume * indicesIn(ix, 0);
    gridsOut[index] = ix;
  }
}

template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void getSubMIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index numValidPoints = 0;
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  Index index = 0;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    numValidPoints = getValidOutPos<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        validPoints);
    for (int i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      if (gridsOut[index] > -1) {
        auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
        indicePairs(offset, 1, oldNum) = gridsOut[index];
        indicePairs(offset, 0, oldNum) = ix;
      }
    }
  }
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridKernel(const Index *indicePairUnique,
                                tv::TensorView<IndexGrid> gridsOut,
                                int numAct) {
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    gridsOut[indicePairUnique[ix]] = -1;
  }
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridSubMKernel(
    const Index *indices, tv::TensorView<IndexGrid> gridsOut,
    const tv::SimpleVector<Index, NDim> outSpatialShape, int numAct) {
  int outSpatialShapeReg[NDim];
  for (int i = 0; i < NDim; ++i) {
    outSpatialShapeReg[i] = outSpatialShape[i];
  }
  Index spatialVolume = 1;
  auto indsPtr = indices;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index index;
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    indsPtr = indices + ix * (NDim + 1);
    index = tv::rowArrayIdx<Index, NDim>(indsPtr + 1, outSpatialShapeReg);
    gridsOut[index + spatialVolume * indsPtr[0]] = -1;
  }
}
#endif
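The index-pair kernels above repeatedly flatten an N-dimensional output coordinate into a single offset via tv::rowArrayIdx and then add spatialVolume * batch before indexing gridsOut. A small host-side sketch of that row-major flattening, for orientation (the function name here is an illustrative stand-in; the real helper lives in the spconv tensorview headers):

#include <cstdint>

// Row-major flattening of an N-D coordinate into a linear offset,
// the same arithmetic the kernels use before indexing gridsOut.
inline int64_t flatten_row_major(const int *coord, const int *shape, int ndim) {
  int64_t offset = 0;
  for (int i = 0; i < ndim; ++i) {
    offset = offset * shape[i] + coord[i];
  }
  return offset;
}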
mmcv/ops/csrc/common/cuda/spconv/reordering.cuh
new file mode 100644
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#include <utils/spconv/tensorview/helper_kernel.cuh>
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void gatherGenericKernel(scalar_t *buffer, const scalar_t *features,
                                    const Index *indices, int size,
                                    int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size)
          buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
              features[inds[ilp] + iy];
      }
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType>
__global__ void gatherVecKernel(scalar_t *buffer, const scalar_t *features,
                                const Index *indices, int size,
                                int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size)
          reinterpret_cast<VecType *>(
              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
              reinterpret_cast<const VecType *>(features)[inds[ilp] + iy];
      }
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void gatherVecBlockKernel(scalar_t *buffer,
                                     const scalar_t *features,
                                     const Index *indices, int size,
                                     int numPlanes) {
  int ILPStrideY[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  features += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;

  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      reinterpret_cast<VecType *>(
          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =
          reinterpret_cast<const VecType *>(
              features)[indices[iy + ILPStrideY[ilp]] * numPlanes +
                        threadIdx.x];
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddGenericKernel(scalar_t *outFeatures,
                                        const scalar_t *buffer,
                                        const Index *indices, int size,
                                        int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size) {
          outFeatures[inds[ilp] + iy] +=
              buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];
        }
      }
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void scatterAddVecBlockKernel(scalar_t *outFeatures,
                                         const scalar_t *buffer,
                                         const Index *indices, int size,
                                         int numPlanes) {
  int ILPStrideY[NumILP];
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  outFeatures += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;
  scalar_t buf[vecloadFactor];
  scalar_t buf2[vecloadFactor];
  Index idx;
  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      reinterpret_cast<VecType *>(buf)[0] =
          reinterpret_cast<VecType *>(outFeatures)[idx];
      reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x];
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        buf[i] += buf2[i];
      }
      reinterpret_cast<VecType *>(outFeatures)[idx] =
          reinterpret_cast<VecType *>(buf)[0];
    }
  }
}
#endif
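Stripped of the ILP and vector-load machinery, the kernels in this file implement a row gather (buffer[i] = features[indices[i]]) and its transpose, a scatter-add back into the selected rows. A plain CPU sketch of the same data movement, for orientation (names are illustrative, not part of the file):

#include <cstddef>

// Gather rows of 'features' selected by 'indices' into a dense buffer.
void gather_rows(float *buffer, const float *features, const int *indices,
                 int size, int num_planes) {
  for (int i = 0; i < size; ++i)
    for (int p = 0; p < num_planes; ++p)
      buffer[i * num_planes + p] = features[indices[i] * num_planes + p];
}

// Scatter-add the dense buffer back into the selected rows of out_features.
void scatter_add_rows(float *out_features, const float *buffer,
                      const int *indices, int size, int num_planes) {
  for (int i = 0; i < size; ++i)
    for (int p = 0; p < num_planes; ++p)
      out_features[indices[i] * num_planes + p] += buffer[i * num_planes + p];
}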
mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh
@@ -20,17 +20,17 @@ __global__ void three_interpolate_forward_cuda_kernel(
   int bs_idx = blockIdx.z;
   int c_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
+  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
+    if (bs_idx >= b || c_idx >= c) return;
     weight += bs_idx * n * 3 + pt_idx * 3;
     points += bs_idx * c * m + c_idx * m;
     idx += bs_idx * n * 3 + pt_idx * 3;
     out += bs_idx * c * n + c_idx * n;
     out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
                   weight[2] * points[idx[2]];
+  }
 }

template <typename T>
@@ -44,18 +44,18 @@ __global__ void three_interpolate_backward_cuda_kernel(
   int bs_idx = blockIdx.z;
   int c_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
+  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
+    if (bs_idx >= b || c_idx >= c) return;
     grad_out += bs_idx * c * n + c_idx * n + pt_idx;
     weight += bs_idx * n * 3 + pt_idx * 3;
     grad_points += bs_idx * c * m + c_idx * m;
     idx += bs_idx * n * 3 + pt_idx * 3;
     atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
     atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
     atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
+  }
 }
#endif // THREE_INTERPOLATE_CUDA_KERNEL_CUH
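The forward kernel is a weighted sum over the three nearest known points, out[pt] = w0*p[idx0] + w1*p[idx1] + w2*p[idx2], and the backward kernel routes the incoming gradient back to those same three points with the same weights (via atomicAdd, since several query points may share a neighbor). A scalar sketch of the forward step for one query point and one channel (illustrative helper):

// Three-point weighted interpolation, mirroring out[pt_idx] above.
inline float interpolate3(const float *points, const int idx[3],
                          const float weight[3]) {
  return weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
         weight[2] * points[idx[2]];
}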
mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh
@@ -19,48 +19,49 @@ __global__ void three_nn_forward_cuda_kernel(int b, int n, int m,
// idx: (B, N, 3)
   int bs_idx = blockIdx.y;
-  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (bs_idx >= b || pt_idx >= n) return;
+  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
+    if (bs_idx >= b) return;
     unknown += bs_idx * n * 3 + pt_idx * 3;
     known += bs_idx * m * 3;
     dist2 += bs_idx * n * 3 + pt_idx * 3;
     idx += bs_idx * n * 3 + pt_idx * 3;
     T ux = unknown[0];
     T uy = unknown[1];
     T uz = unknown[2];
     double best1 = 1e40, best2 = 1e40, best3 = 1e40;
     int besti1 = 0, besti2 = 0, besti3 = 0;
     for (int k = 0; k < m; ++k) {
       T x = known[k * 3 + 0];
       T y = known[k * 3 + 1];
       T z = known[k * 3 + 2];
       T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
       if (d < best1) {
         best3 = best2;
         besti3 = besti2;
         best2 = best1;
         besti2 = besti1;
         best1 = d;
         besti1 = k;
       } else if (d < best2) {
         best3 = best2;
         besti3 = besti2;
         best2 = d;
         besti2 = k;
       } else if (d < best3) {
         best3 = d;
         besti3 = k;
       }
     }
     dist2[0] = best1;
     dist2[1] = best2;
     dist2[2] = best3;
     idx[0] = besti1;
     idx[1] = besti2;
     idx[2] = besti3;
+  }
 }
#endif // THREE_NN_CUDA_KERNEL_CUH
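The kernel keeps the three smallest squared distances (and their indices) with a short cascade of comparisons instead of sorting the whole candidate set. The same bookkeeping in isolation, as a host-side sketch (struct and member names are illustrative):

#include <cfloat>

// Track the three smallest values seen so far; push(d, k) mirrors the
// best1/best2/best3 update cascade in the kernel.
struct Top3 {
  double best1 = DBL_MAX, best2 = DBL_MAX, best3 = DBL_MAX;
  int i1 = 0, i2 = 0, i3 = 0;
  void push(double d, int k) {
    if (d < best1) {
      best3 = best2; i3 = i2;
      best2 = best1; i2 = i1;
      best1 = d;     i1 = k;
    } else if (d < best2) {
      best3 = best2; i3 = i2;
      best2 = d;     i2 = k;
    } else if (d < best3) {
      best3 = d;     i3 = k;
    }
  }
};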
mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh
@@ -23,20 +23,20 @@ __global__ void dynamic_voxelize_kernel(
// To save some computation
   auto points_offset = points + index * num_features;
   auto coors_offset = coors + index * NDim;
-  int c_x = floor((points_offset[0] - coors_x_min) / voxel_x);
+  int c_x = floorf((points_offset[0] - coors_x_min) / voxel_x);
   if (c_x < 0 || c_x >= grid_x) {
     coors_offset[0] = -1;
     continue;
   }
-  int c_y = floor((points_offset[1] - coors_y_min) / voxel_y);
+  int c_y = floorf((points_offset[1] - coors_y_min) / voxel_y);
   if (c_y < 0 || c_y >= grid_y) {
     coors_offset[0] = -1;
     coors_offset[1] = -1;
     continue;
   }
-  int c_z = floor((points_offset[2] - coors_z_min) / voxel_z);
+  int c_z = floorf((points_offset[2] - coors_z_min) / voxel_z);
   if (c_z < 0 || c_z >= grid_z) {
     coors_offset[0] = -1;
     coors_offset[1] = -1;
@@ -101,7 +101,7 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor,
   CUDA_1D_KERNEL_LOOP(index, num_points) {
     auto coor_offset = coor + index * NDim;
     // skip invalid points
-    if ((index >= num_points) || (coor_offset[0] == -1)) return;
+    if (coor_offset[0] == -1) continue;
     int num = 0;
     int coor_x = coor_offset[0];
@@ -122,7 +122,7 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor,
         point_to_pointidx[index] = i;
       } else if (num >= max_points) {
         // out of boundary
-        return;
+        break;
       }
     }
   }
@@ -166,4 +166,51 @@ __global__ void determin_voxel_num(
}
}
__global__ void nondeterministic_get_assign_pos(
    const int nthreads, const int32_t *coors_map, int32_t *pts_id,
    int32_t *coors_count, int32_t *reduce_count, int32_t *coors_order) {
  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    int coors_idx = coors_map[thread_idx];
    if (coors_idx > -1) {
      int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1);
      pts_id[thread_idx] = coors_pts_pos;
      if (coors_pts_pos == 0) {
        coors_order[coors_idx] = atomicAdd(coors_count, 1);
      }
    }
  }
}

template <typename T>
__global__ void nondeterministic_assign_point_voxel(
    const int nthreads, const T *points, const int32_t *coors_map,
    const int32_t *pts_id, const int32_t *coors_in,
    const int32_t *reduce_count, const int32_t *coors_order, T *voxels,
    int32_t *coors, int32_t *pts_count, const int max_voxels,
    const int max_points, const int num_features, const int NDim) {
  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    int coors_idx = coors_map[thread_idx];
    int coors_pts_pos = pts_id[thread_idx];
    if (coors_idx > -1 && coors_pts_pos < max_points) {
      int coors_pos = coors_order[coors_idx];
      if (coors_pos < max_voxels) {
        auto voxels_offset =
            voxels + (coors_pos * max_points + coors_pts_pos) * num_features;
        auto points_offset = points + thread_idx * num_features;
        for (int k = 0; k < num_features; k++) {
          voxels_offset[k] = points_offset[k];
        }
        if (coors_pts_pos == 0) {
          pts_count[coors_pos] = min(reduce_count[coors_idx], max_points);
          auto coors_offset = coors + coors_pos * NDim;
          auto coors_in_offset = coors_in + coors_idx * NDim;
          for (int k = 0; k < NDim; k++) {
            coors_offset[k] = coors_in_offset[k];
          }
        }
      }
    }
  }
}
#endif // VOXELIZATION_CUDA_KERNEL_CUH
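The two new kernels split voxel assignment into a counting pass and a copy pass: every point records its slot inside its voxel via an atomicAdd on a per-voxel counter, and the first point that lands in a voxel claims an output position from a global counter. A serial sketch of the counting pass, for orientation (illustrative names; the vectors are assumed pre-sized and zero-initialized, so no atomics are needed on one thread):

#include <vector>

// coors_map[i] is the voxel id of point i (or -1 for an invalid point).
// pts_id[i] becomes the point's slot inside its voxel; the first point of a
// voxel assigns the voxel an output position in coors_order.
void assign_positions(const std::vector<int> &coors_map,
                      std::vector<int> &pts_id, std::vector<int> &reduce_count,
                      std::vector<int> &coors_order, int &coors_count) {
  for (size_t i = 0; i < coors_map.size(); ++i) {
    int voxel = coors_map[i];
    if (voxel < 0) continue;
    pts_id[i] = reduce_count[voxel]++;  // slot of this point in its voxel
    if (pts_id[i] == 0) coors_order[voxel] = coors_count++;  // claim a voxel
  }
}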
mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu
new file mode 100644
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <float.h>
#include "common_mlu_helper.hpp"
#define COORD_NUM 4
__nram__ char nmem_buf[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void computeDiv(void *nram_dst, void *nram_src0, void *nram_src1,
void *nram_addition, const int32_t deal_num) {
__bang_active_reciphp((T *)nram_dst, (T *)nram_src1, deal_num);
__bang_mul((T *)nram_dst, (T *)nram_src0, (T *)nram_dst, deal_num);
}
template <>
__mlu_func__ void computeDiv<half>(void *nram_dst, void *nram_src0,
void *nram_src1, void *nram_addition,
const int32_t deal_num) {
__bang_half2float((float *)nram_addition, (half *)nram_src1, deal_num);
__bang_active_reciphp((float *)nram_addition, (float *)nram_addition,
deal_num);
__bang_float2half_rd((half *)nram_src1, (float *)nram_addition, deal_num);
__bang_mul((half *)nram_dst, (half *)nram_src0, (half *)nram_src1, deal_num);
}
template <typename T>
__mlu_func__ void bboxOverlapsWorkflow(
T *vec_b1_x1, T *vec_b1_y1, T *vec_b1_x2, T *vec_b1_y2, T *vec_b2_x1,
T *vec_b2_y1, T *vec_b2_x2, T *vec_b2_y2, T *vec_left, T *vec_right,
T *vec_top, T *vec_bottom, const T *bbox1, const T *bbox2, void *ious,
const int32_t offset, const int32_t mode, const int32_t batches_stride,
const int32_t num_bbox1, const int32_t num_bbox2, const bool aligned) {
int32_t task_batch_stride = (num_bbox1 + taskDim - 1) / taskDim;
int32_t batch_start = taskId * task_batch_stride;
int32_t batch_per_task = batch_start + task_batch_stride < num_bbox1
? task_batch_stride
: num_bbox1 - batch_start;
batch_per_task = batch_per_task > 0 ? batch_per_task : (0);
if (aligned) {
int32_t num_loop_cpy = batch_per_task / batches_stride;
int32_t num_rem_cpy_batches = batch_per_task % batches_stride;
num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy;
for (int32_t i = 0; i < num_loop_cpy; i++) {
int32_t index = batch_start + i * batches_stride;
int32_t handle_batches = index + batches_stride > num_bbox1
? num_rem_cpy_batches
: batches_stride;
int32_t b1 = index;
int32_t b2 = index;
int32_t base1 = b1 * COORD_NUM;
__memcpy(vec_b1_x1, &bbox1[base1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b1_y1, &bbox1[base1 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b1_x2, &bbox1[base1 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b1_y2, &bbox1[base1 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
int32_t base2 = b2 * COORD_NUM;
__memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
// get the width and height
__bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride);
__bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride);
__bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride);
__bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride);
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_const(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_const(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
__nramset(vec_bottom, batches_stride, 0.f);
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
T *width = vec_left;
// height --> vec_right
__bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride);
T *height = vec_right;
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_const(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
T *b1_area = vec_top;
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
T *b2_area = vec_b2_x1;
// inter_s = width * height
__bang_mul(height, width, height, batches_stride);
T *inter_s = height;
// offset vector ---> vec_b2_y1
__nramset(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
if (mode == 0) {
__bang_add(b1_area, b1_area, b2_area, batches_stride);
__bang_sub(b1_area, b1_area, inter_s, batches_stride);
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
} else {
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
}
T *base_s = b1_area;
// ious = inter_s / base_s
computeDiv<T>(width, inter_s, base_s, vec_b2_x2, batches_stride);
__memcpy((T *)ious + index, width, handle_batches * sizeof(T),
NRAM2GDRAM);
}
} else {
int32_t num_loop_cpy = num_bbox2 / batches_stride;
int32_t num_rem_cpy_batches = num_bbox2 % batches_stride;
num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy;
for (int32_t i = 0; i < batch_per_task; i++) {
int32_t index1 = batch_start + i;
int32_t b1 = index1;
int32_t base1 = b1 * COORD_NUM;
// set bbox1 and bbox2 to nram
__nramset(vec_b1_x1, batches_stride, bbox1[base1]);
__nramset(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
__nramset(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
__nramset(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
for (int32_t j = 0; j < num_loop_cpy; j++) {
int32_t index2 = j * batches_stride;
int32_t handle_batches = index2 + batches_stride > num_bbox2
? num_rem_cpy_batches
: batches_stride;
int32_t b2 = index2;
int32_t base2 = b2 * COORD_NUM;
// copy bbox2 to nram
__memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
// get the width and height
__bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride);
__bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride);
__bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride);
__bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride);
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_const(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_const(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
__nramset(vec_bottom, batches_stride, (T)0);
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
T *width = vec_left;
// height --> vec_right
__bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride);
T *height = vec_right;
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_const(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
T *b1_area = vec_top;
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
T *b2_area = vec_b2_x1;
// inter_s = width * height
__bang_mul(height, width, height, batches_stride);
T *inter_s = height;
// offset vector ---> vec_b2_y1
__nramset(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
if (mode == 0) {
__bang_add(b1_area, b1_area, b2_area, batches_stride);
__bang_sub(b1_area, b1_area, inter_s, batches_stride);
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
} else {
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
}
T *base_s = b1_area;
// ious = inter_s / base_s
computeDiv<T>(width, inter_s, base_s, vec_b2_x2, batches_stride);
int32_t gdram_offset = index1 * num_bbox2 + index2;
__memcpy((T *)ious + gdram_offset, width, handle_batches * sizeof(T),
NRAM2GDRAM);
}
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelBBoxOverlaps(
const void *bbox1, const void *bbox2, void *ious, const int32_t num_bbox1,
const int32_t num_bbox2, const int32_t mode, const bool aligned,
const int32_t offset) {
/*
* NRAM partition
* |-------------------------------------------------------------|
* | vec_b1_x1 | vec_b1_y1 | vec_b1_x2 | vec_b1_y2 |
* |-------------------------------------------------------------|
* | vec_b2_x1 | vec_b2_y1 | vec_b2_x2 | vec_b2_y2 |
* |-------------------------------------------------------------|
* | vec_left | vec_right | vec_top | vec_bottom |
* |-------------------------------------------------------------|
*
*/
const int32_t align_bytes = PAD_DOWN(MAX_NRAM_SIZE, NFU_ALIGN_SIZE);
const int32_t split_nram_num = 12;
const int32_t nram_stride =
align_bytes / NFU_ALIGN_SIZE / split_nram_num * NFU_ALIGN_SIZE;
void *vec_b1_x1 = nmem_buf;
void *vec_b1_y1 = nmem_buf + nram_stride;
void *vec_b1_x2 = nmem_buf + 2 * nram_stride;
void *vec_b1_y2 = nmem_buf + 3 * nram_stride;
void *vec_b2_x1 = nmem_buf + 4 * nram_stride;
void *vec_b2_y1 = nmem_buf + 5 * nram_stride;
void *vec_b2_x2 = nmem_buf + 6 * nram_stride;
void *vec_b2_y2 = nmem_buf + 7 * nram_stride;
void *vec_left = nmem_buf + 8 * nram_stride;
void *vec_right = nmem_buf + 9 * nram_stride;
void *vec_top = nmem_buf + 10 * nram_stride;
void *vec_bottom = nmem_buf + 11 * nram_stride;
const int32_t vec_length = nram_stride / sizeof(T);
bboxOverlapsWorkflow((T *)vec_b1_x1, (T *)vec_b1_y1, (T *)vec_b1_x2,
(T *)vec_b1_y2, (T *)vec_b2_x1, (T *)vec_b2_y1,
(T *)vec_b2_x2, (T *)vec_b2_y2, (T *)vec_left,
(T *)vec_right, (T *)vec_top, (T *)vec_bottom,
(T *)bbox1, (T *)bbox2, (T *)ious, offset, mode,
vec_length, num_bbox1, num_bbox2, aligned);
}
void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *bbox1, const void *bbox2, void *ious,
const int32_t num_bbox1, const int32_t num_bbox2,
const int32_t mode, const bool aligned,
const int32_t offset) {
if (d_type == CNRT_FLOAT16) {
MLUUnion1KernelBBoxOverlaps<half><<<k_dim, k_type, queue>>>(
bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset);
} else {
MLUUnion1KernelBBoxOverlaps<float><<<k_dim, k_type, queue>>>(
bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset);
}
}
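For readers not used to the BANG vector intrinsics: per box pair, the workflow above computes the clamped intersection width and height, the two box areas, and then divides the intersection by either the union (mode 0, IoU) or the first box's area (IoF), all on NRAM vectors. The same computation for a single pair, as a plain scalar sketch (illustrative helper, not the MLU code path; boxes are [x1, y1, x2, y2]):

#include <algorithm>

// Scalar reference for one box pair, with the same +offset convention and
// the same max(base, offset) clamp used before the division.
inline float bbox_overlap(const float b1[4], const float b2[4], int mode,
                          float offset) {
  float w = std::max(0.f,
                     std::min(b1[2], b2[2]) - std::max(b1[0], b2[0]) + offset);
  float h = std::max(0.f,
                     std::min(b1[3], b2[3]) - std::max(b1[1], b2[1]) + offset);
  float inter = w * h;
  float area1 = (b1[2] - b1[0] + offset) * (b1[3] - b1[1] + offset);
  float area2 = (b2[2] - b2[0] + offset) * (b2[3] - b2[1] + offset);
  float base = (mode == 0) ? std::max(area1 + area2 - inter, offset)
                           : std::max(area1, offset);
  return inter / base;
}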
mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
new file mode 100644
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef COMMON_MLU_HELPER_HPP_
#define COMMON_MLU_HELPER_HPP_
#define NFU_ALIGN_SIZE 128 // Byte
#define REM_FOR_STACK (128 * 1024) // 128KB reserved for cncc
#ifdef __BANG_ARCH__
#define MAX_NRAM_SIZE \
(__MLU_NRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc
#define MAX_SRAM_SIZE \
(__MLU_SRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc
#else
#define MAX_NRAM_SIZE (384 * 1024) // 384KB, initialization value
#define MAX_SRAM_SIZE (1920 * 1024) // 1920KB, initialization value
#endif
#ifndef PAD_UP
#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))
#endif
#ifndef PAD_DOWN
#define PAD_DOWN(x, y) (((x) / (y)) * (y))
#endif
#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y))
/*!
 * @brief Converts int32 to float32 data type.
 *
 * @param[out] dst
 * Pointer to NRAM that stores the float32 output data.
 * @param[in,out] dst_addition
 * Pointer to NRAM as the workspace of dst, which has the same size as dst.
 * It allows empty pointer on MLU300 series.
 * @param[in] src
 * Pointer to NRAM that stores the int32 input data.
 * @param[in,out] src_addition
 * Pointer to NRAM as the workspace of src, which has a size of 128 Bytes.
 * It allows empty pointer on MLU300 series.
 * @param[in] src_count
 * The count of elements in src.
 */
__mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src,
                                   float *src_addition, const int src_count) {
#if __BANG_ARCH__ >= 300
  __bang_int2float((float *)dst, (int32_t *)src, src_count, 0);
#else
  // get sign bit
  const float move_23bit = 8388608.0;
  // 0x80000000 = 1,000000000,0000000000000000000000000000
  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
            0x80000000);
  __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition,
                    src_count * sizeof(float), NFU_ALIGN_SIZE);
  // get 1 or 0 from sign bit
  // judg is Odd
  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
            0x00000001);
  __bang_cycle_bor((char *)dst_addition, (char *)dst_addition,
                   (char *)src_addition, src_count * sizeof(float),
                   NFU_ALIGN_SIZE);
  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
            0x80000001);
  __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count,
                  NFU_ALIGN_SIZE / sizeof(float));
  // minus xor, positive num invariant
  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
            0xffffffff);
  __bang_cycle_mul(dst, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float));
  // convert int32 to float32
  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x7fffff);
  __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition,
                    src_count * sizeof(float), NFU_ALIGN_SIZE);
  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
            0x4b000000);
  __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition,
                   src_count * sizeof(float), NFU_ALIGN_SIZE);
  __bang_sub_const(dst, dst, move_23bit, src_count);
  // add one
  __bang_add(dst, dst, dst_addition, src_count);
  // set sign for float32
  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
            0xffffffff);
  __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
            0x00000001);
  __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
            0x80000000);
  __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
                    (char *)src_addition, src_count * 4, 128);
  __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4);
#endif  // __BANG_ARCH__ >= 300
}
/*!
 * @brief Converts float32 to int32 data type with to_zero round mode.
 *
 * @param[out] dst
 * Pointer to NRAM that stores the int32 output data.
 * @param[in,out] dst_addition
 * Pointer to NRAM as the workspace of dst, which has the same size as dst.
 * It allows empty pointer on MLU300 series.
 * @param[in] src
 * Pointer to NRAM that stores the float32 input data.
 * @param[in,out] src_addition
 * Pointer to NRAM as the workspace of src, which has a size of 128 Bytes.
 * It allows empty pointer on MLU300 series.
 * @param[in] src_count
 * The count of elements in src.
 */
__mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
                                   float *src_addition, const int src_count) {
#if __BANG_ARCH__ >= 300
  __bang_float2int_tz((int32_t *)dst, (float *)src, src_count, 0);
#else
  // sign ===> src_addition
  // dst=-1.0 : when src[i] is a negative number
  // dst=+1.0 : when src[i] is a positive number
  const int floatDchar = sizeof(float) / sizeof(char);
  __bang_active_sign((float *)dst, src, src_count);
  // dst_addition = abs(src)
  __bang_mul(dst_addition, src, (float *)dst, src_count);
  // if dst_addition < 1.0 , then src_addition + 1, to fix add error.
  __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 1.0f);
  __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count,
                  NFU_ALIGN_SIZE / sizeof(float));
  __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count);
  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
            0xbf800000);
  // set negative flag -1.0 = 0xbf80000
  __bang_cycle_eq((float *)dst, (float *)dst, (float *)src_addition, src_count,
                  NFU_ALIGN_SIZE / sizeof(float));
  // to mark all src in [x<-1.0]
  __bang_active_abs(dst_addition, src, src_count);
  __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 8388608.0f);
  // mask shift move 23
  __bang_cycle_add_tz(dst_addition, dst_addition, src_addition, src_count,
                      NFU_ALIGN_SIZE / sizeof(float));
  // right shift move 23bit
  // two`s complement for negatibe
  // dst=1.0 , when src <-1.0
  // dst=0.0 , when src >=-1.0
  __bang_sub(dst_addition, dst_addition, (float *)dst, src_count);
  // to fix max value
  // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0,
  // means max value.
  __bang_mul_const((float *)dst, (float *)dst, 16777215.0, src_count);
  __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst,
              src_count * floatDchar);
  // get low 23bit
  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
            (unsigned)0x007fffff);
  // mask low 23bit is 1
  __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
                    (char *)src_addition, src_count * floatDchar,
                    NFU_ALIGN_SIZE / sizeof(char));
  // set 9 high bit ===> dst
  // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000
  // 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000
  __nramset(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000);
  __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  // src or dst_addition
  __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition,
             src_count * floatDchar);
  __bang_mul_const((float *)dst, (float *)dst, -2.0, src_count);
  __bang_bor((char *)dst, (char *)dst, (char *)dst_addition,
             src_count * floatDchar);
#endif  // __BANG_ARCH__ >= 300
}
#endif // COMMON_MLU_HELPER_HPP_
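As a quick reference for the alignment helpers defined at the top of this header: PAD_UP rounds a size up to the next multiple of the alignment, PAD_DOWN truncates to the previous multiple, and CEIL_ALIGN is the usual ceiling-alignment formula. A few worked values as compile-time checks (the macro definitions are copied from the header only so the snippet is self-contained; the static_asserts are just an illustration):

#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))
#define PAD_DOWN(x, y) (((x) / (y)) * (y))
#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y))

static_assert(PAD_UP(130, 128) == 256, "round 130 up to a 128 multiple");
static_assert(PAD_DOWN(130, 128) == 128, "round 130 down to a 128 multiple");
static_assert(CEIL_ALIGN(130, 128) == 256, "ceiling alignment of 130");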
mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu
new file mode 100644
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <float.h>
#include "common_mlu_helper.hpp"
#define PING 0
#define PONG 1
__nram__ char nram_buffer[MAX_NRAM_SIZE];
namespace forward {
template <typename T>
__mlu_func__ void loadInput(char *nram_input, T *dram_input, const int32_t size,
const int32_t dst_stride = 0,
const int32_t src_stride = 0,
const int32_t count = 1) {
if (dst_stride == src_stride) {
__memcpy_async(nram_input, dram_input, size * count, GDRAM2NRAM);
} else {
__memcpy_async(nram_input, dram_input, size, GDRAM2NRAM, dst_stride,
src_stride, count - 1);
}
}
template <typename T>
__mlu_func__ void loadWeight(char *nram_input, T *dram_input, const int32_t t,
const int32_t c, const int32_t has_weight,
const int32_t partition_nc) {
if (has_weight && partition_nc && t >= 0 && t < c) {
__memcpy_async(nram_input, (T *)dram_input + t, sizeof(T), GDRAM2NRAM);
}
}
template <typename T>
__mlu_func__ void storeOutput(T *dram_output, char *nram_output,
const int32_t size, const int32_t dst_stride = 0,
const int32_t src_stride = 0,
const int32_t count = 1) {
if (dst_stride == src_stride) {
__memcpy_async(dram_output, nram_output, size * count, NRAM2GDRAM);
} else {
__memcpy_async(dram_output, nram_output, size, NRAM2GDRAM, dst_stride,
src_stride, count - 1);
}
}
template <typename T>
__mlu_func__ void compute(T *input, const int32_t *target, const T *weight,
const int32_t has_weight, const int32_t partition_nc,
const int32_t deal_num, const int32_t n_seg,
const int32_t c, const int32_t c_seg,
const int32_t c_start_index, const float alpha,
const float gamma, T *compute_a, T *compute_b,
T *output) {
// set params
const int32_t c_num =
has_weight ? PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T)) : c_seg;
const int32_t c_end_index = c_start_index + c_seg;
const int32_t half_epsilon = 0x0400;
const T epsilon_f =
sizeof(T) == sizeof(float) ? FLT_MIN : *((half *)&half_epsilon);
// 0. alpha_t * p_t^r = alpha * (1 - p) ^ gamma if t == c_i
// = (1 - alpha) * p ^ gamma if t != c_i
__nramset((T *)output, deal_num, (T)(1 - alpha));
__bang_active_sigmoid((T *)compute_b, (T *)input, deal_num);
for (int32_t i = 0; i < n_seg; ++i) {
const int32_t t = *((uint32_t *)target + i);
if (t >= c_start_index && t < c_end_index) {
const uint32_t index = i * c_num + t - c_start_index;
*((T *)input + index) = -1.0 * (*((T *)input + index));
*((T *)compute_b + index) = 1.0 - (*((T *)compute_b + index)) + epsilon_f;
*((T *)output + index) = alpha;
}
}
if (sizeof(T) == sizeof(half)) {
__bang_half2float((float *)compute_a, (half *)compute_b, deal_num);
__bang_active_loghp((float *)compute_a, (float *)compute_a, deal_num);
__bang_mul_const((float *)compute_a, (float *)compute_a, (float)gamma,
deal_num);
__bang_active_exphp((float *)compute_a, (float *)compute_a, deal_num);
__bang_float2half_rd((half *)compute_a, (float *)compute_a, deal_num);
} else {
__bang_active_loghp((T *)compute_a, (T *)compute_b, deal_num);
__bang_mul_const((T *)compute_a, (T *)compute_a, (T)gamma, deal_num);
__bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num);
}
__bang_mul((T *)output, (T *)compute_a, (T *)output, deal_num);
// 1. max = max(0, -x) if t == c_i
// = max(0, x) if t != c_i
__nramset((T *)compute_b, deal_num, (T)0);
__bang_maxequal((T *)compute_b, (T *)compute_b, (T *)input, deal_num);
// 2. -log(p_t) = ln(e^(-max) + e^(-max-x)) + max if t == c_i
//              = ln(e^(-max) + e^(-max+x)) + max if t != c_i
__bang_mul_const((T *)compute_a, (T *)compute_b, (T)-1.0, deal_num);
__bang_add((T *)input, (T *)compute_a, (T *)input, deal_num);
__bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num);
__bang_active_exphp((T *)input, (T *)input, deal_num);
__bang_add((T *)compute_a, (T *)compute_a, (T *)input, deal_num);
__bang_active_loghp((T *)compute_a, (T *)compute_a, deal_num);
__bang_add((T *)input, (T *)compute_a, (T *)compute_b, deal_num);
// 3. output = alpha_t * p_t^r * [-log(p_t)]
__bang_mul((T *)output, (T *)output, (T *)input, deal_num);
// 4. with weight
if (has_weight) {
for (int32_t i = 0; i < n_seg; ++i) {
int32_t t = *((int32_t *)target + i);
if (t >= 0 && t < c) {
t = partition_nc ? 0 : t;
__bang_mul_const((T *)output + i * c_num, (T *)output + i * c_num,
*((T *)weight + t), c_num);
}
}
}
}
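For reference, steps 0-3 above implement the standard per-element sigmoid focal loss. A minimal scalar C++ sketch of the same math (illustrative names, not the MLU kernel):

#include <cmath>

// Focal loss for one (sample, class) logit x. is_target marks whether this
// class equals the sample's label; alpha and gamma follow the kernel's meaning.
float sigmoid_focal_loss_ref(float x, bool is_target, float alpha, float gamma) {
  float p = 1.0f / (1.0f + std::exp(-x));
  float pt = is_target ? p : 1.0f - p;               // probability of the true event
  float alpha_t = is_target ? alpha : 1.0f - alpha;  // step 0
  // numerically stable -log(pt), matching steps 1-2: with z = is_target ? -x : x,
  // -log(pt) = max(0, z) + log(exp(-max) + exp(z - max))
  float z = is_target ? -x : x;
  float m = std::fmax(0.0f, z);
  float neg_log_pt = m + std::log(std::exp(-m) + std::exp(z - m));
  return alpha_t * std::pow(1.0f - pt, gamma) * neg_log_pt;  // step 3
}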
template <typename T>
__mlu_func__ void startPipeline(
const T *input, const int32_t *target, const T *weight,
char *nram_compute_a, char *nram_compute_b, char *nram_input,
char *nram_target, char *nram_weight, char *nram_output,
const int32_t has_weight, const int32_t partition_nc,
const int32_t pingpong_offset, const int32_t pingpong_weight_offset,
const int32_t c_offset_num, const int32_t n, const int32_t n_seg,
const int32_t c, const int32_t c_seg, const float alpha, const float gamma,
T *output) {
// with offset
input = (T *)((char *)input + c_offset_num * sizeof(T));
output = (T *)((char *)output + c_offset_num * sizeof(T));
const int32_t c_seg_align_num = PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T));
const int32_t c_num = has_weight ? c_seg_align_num : c_seg;
const int32_t deal_num = PAD_UP(n_seg * c_num, NFU_ALIGN_SIZE / sizeof(T));
const int32_t load_size = c_seg * sizeof(T);
const int32_t dram_stride = c * sizeof(T);
const int32_t nram_stride = c_num * sizeof(T);
if (has_weight && !partition_nc) {
loadInput<T>(nram_weight, (T *)weight, load_size, nram_stride, dram_stride,
1);
__asm__ volatile("sync;\n\t");
}
const int32_t repeat = n / n_seg;
const int32_t remain = n % n_seg;
/*
* Pipeline: The pipeline is processed in three stages: Load, Compute, Store.
* The allocated memory space of NRAM is divided into two parts:
 * PING and PONG. In a single time slice, PING is used to process
 * the IO stream and PONG is used for computation. Both of them are
* processed synchronously until finished.
*
* diagram of PINGPONG:
* |------|-----------------------------------------------------------------|
* | | space |
* |------|-----------------------------------------------------------------|
* | time | Ping | Pong | Ping | Pong | Ping | Pong |
* |------|-----------------------------------------------------------------|
* | 0 | L0 | | | | | |
* | 1 | C0 | L1 | | | | |
* | 2 | S0 | C1 | L2 | | | |
* | 3 | | S1 | C2 | L3 | | |
* | 4 | | | S2 | C3 | L4 | |
* | 5 | | | | S3 | C4 | L5 |
* | 6 | | | | | S4 | C5 |
* | 7 | | | | | | S5 |
* |------|-----------------------------------------------------------------|
*/
// diagram of PINGPONG: L0
if (repeat > 0) {
loadInput<T>(nram_input, (T *)input, load_size, nram_stride, dram_stride,
n_seg);
loadInput<int32_t>(nram_target, (int32_t *)target, n_seg * sizeof(int32_t));
loadWeight<T>(nram_weight, (T *)weight, *((int32_t *)target), c, has_weight,
partition_nc);
__asm__ volatile("sync;\n\t");
}
// diagram of PINGPONG: C0 and L1
if (repeat > 1) {
compute((T *)nram_input, (int32_t *)nram_target, (T *)nram_weight,
has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)nram_output);
loadInput<T>((char *)nram_input + pingpong_offset, (T *)input + c * n_seg,
load_size, nram_stride, dram_stride, n_seg);
loadInput<int32_t>((char *)nram_target + pingpong_offset,
(int32_t *)target + n_seg, n_seg * sizeof(int32_t));
loadWeight<T>((char *)nram_weight + pingpong_weight_offset, (T *)weight,
*((int32_t *)target + n_seg), c, has_weight, partition_nc);
__asm__ volatile("sync;\n\t");
}
for (int32_t i = 0; i < repeat - 2; ++i) {
storeOutput<T>((T *)output + i * c * n_seg,
nram_output + (i % 2) * pingpong_offset, load_size,
dram_stride, nram_stride, n_seg);
loadInput<T>((char *)nram_input + (i % 2) * pingpong_offset,
(T *)(input) + (i + 2) * c * n_seg, load_size, nram_stride,
dram_stride, n_seg);
loadInput<int32_t>((char *)nram_target + (i % 2) * pingpong_offset,
(int32_t *)target + (i + 2) * n_seg,
n_seg * sizeof(int32_t));
loadWeight<T>((char *)nram_weight + (i % 2) * pingpong_weight_offset,
(T *)weight, *((int32_t *)target + (i + 2) * n_seg), c,
has_weight, partition_nc);
compute((T *)(nram_input + ((i + 1) % 2) * pingpong_offset),
(int32_t *)(nram_target + ((i + 1) % 2) * pingpong_offset),
(T *)(nram_weight +
partition_nc * ((i + 1) % 2) * pingpong_weight_offset),
has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_output + ((i + 1) % 2) * pingpong_offset));
__asm__ volatile("sync;\n\t");
}
if (repeat > 1) {
storeOutput<T>((T *)output + (repeat - 2) * c * n_seg,
(char *)nram_output + (repeat % 2) * pingpong_offset,
load_size, dram_stride, nram_stride, n_seg);
}
if (remain > 0) {
loadInput<T>((char *)nram_input + (repeat % 2) * pingpong_offset,
(T *)input + repeat * c * n_seg, load_size, nram_stride,
dram_stride, remain);
loadInput<int32_t>((char *)nram_target + (repeat % 2) * pingpong_offset,
(int32_t *)target + repeat * n_seg,
remain * sizeof(int32_t));
loadWeight<T>((char *)nram_weight + (repeat % 2) * pingpong_weight_offset,
(T *)weight, *((int32_t *)target + repeat * n_seg), c,
has_weight, partition_nc);
}
if (repeat > 0) {
compute((T *)(nram_input + ((repeat - 1) % 2) * pingpong_offset),
(int32_t *)(nram_target + ((repeat - 1) % 2) * pingpong_offset),
(T *)(nram_weight +
partition_nc * ((repeat - 1) % 2) * pingpong_weight_offset),
has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_output + ((repeat - 1) % 2) * pingpong_offset));
}
__asm__ volatile("sync;\n\t");
if (repeat > 0) {
storeOutput<T>((T *)output + (repeat - 1) * c * n_seg,
(char *)nram_output + ((repeat - 1) % 2) * pingpong_offset,
load_size, dram_stride, nram_stride, n_seg);
}
if (remain > 0) {
int32_t rem_num = PAD_UP(remain * c_num, NFU_ALIGN_SIZE / sizeof(T));
compute((T *)(nram_input + (repeat % 2) * pingpong_offset),
(int32_t *)(nram_target + (repeat % 2) * pingpong_offset),
(T *)(nram_weight +
partition_nc * (repeat % 2) * pingpong_weight_offset),
has_weight, partition_nc, rem_num, remain, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_output + (repeat % 2) * pingpong_offset));
__asm__ volatile("sync;\n\t");
storeOutput<T>((T *)output + repeat * c * n_seg,
(char *)nram_output + (repeat % 2) * pingpong_offset,
load_size, dram_stride, nram_stride, remain);
}
__asm__ volatile("sync;\n\t");
}
template <typename T>
__mlu_func__ void focalLossSigmoidForwardBlock(
const T *input, const int32_t *target, const T *weight, const int32_t n,
const int32_t c, const float alpha, const float gamma, T *output) {
/*
* NRAM partition
* |-----------------------------------------------------------------------|
* | weight |
* |------------------------------- COMPUTE -------------------------------|
* | | |
* | computeA | computeB |
* | | |
* |------------- PING ------------------------------- PONG ---------------|
* | | |
* | input | input |
* | | |
* |-----------------------------------|-----------------------------------|
* | | |
* | output | output |
* | | |
* |-----------------------------------|-----------------------------------|
* | target | target |
* |-----------------------------------|-----------------------------------|
*
* split_pipeline_num is 6: COMPUTE(computeA,computeB), PING(input,output),
* PONG(input,output).
* split_target_num is 2: PING(target), PONG(target).
* weight is not NULL:
* The nram-size of weight is equal to c_align_size when partition input-N.
* The nram-size of weight is equal to NFU_ALIGN_SIZE when partition
* input-NC.
*/
// calculate threshold of c
const int32_t split_pipeline_num = 6;
const int32_t split_target_num = 2;
const int32_t has_weight = weight != NULL;
const int32_t threshold_c =
PAD_DOWN((MAX_NRAM_SIZE - split_target_num * sizeof(int32_t)) /
(split_pipeline_num + has_weight),
NFU_ALIGN_SIZE) /
sizeof(T);
const int32_t c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T));
const int32_t c_align_size = c_align * sizeof(T);
if (c <= threshold_c) {
// partition inputN
int32_t c_num = c;
int32_t reservered_align_size =
(split_target_num + split_pipeline_num) * NFU_ALIGN_SIZE;
int32_t weight_size = 0;
if (has_weight) {
c_num = c_align;
reservered_align_size = split_target_num * NFU_ALIGN_SIZE;
weight_size = c_align_size;
}
const int32_t remain_size =
MAX_NRAM_SIZE - weight_size - reservered_align_size;
const int32_t n_seg =
remain_size / (split_pipeline_num * c_num * sizeof(T) +
split_target_num * sizeof(int32_t));
const int32_t split_pipeline_size =
PAD_UP(c_num * n_seg * sizeof(T), NFU_ALIGN_SIZE);
const int32_t compute_size = 2 * split_pipeline_size;
const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2;
char *nram_weight = (char *)nram_buffer;
char *nram_compute_a = nram_weight + has_weight * c_align_size;
char *nram_compute_b = nram_compute_a + split_pipeline_size;
char *nram_input = nram_compute_b + split_pipeline_size;
char *nram_output = nram_input + split_pipeline_size;
char *nram_target = nram_output + split_pipeline_size;
startPipeline<T>(input, target, weight, nram_compute_a, nram_compute_b,
nram_input, nram_target, nram_weight, nram_output,
has_weight, 0, pingpong_offset, 0, 0, n, n_seg, c, c,
alpha, gamma, output);
} else {
// partition inputNC
const int32_t weight_size = has_weight * NFU_ALIGN_SIZE;
const int32_t remain_size = MAX_NRAM_SIZE - weight_size;
const int32_t split_pipeline_size = PAD_DOWN(
(remain_size - split_target_num * NFU_ALIGN_SIZE) / split_pipeline_num,
NFU_ALIGN_SIZE);
const int32_t c_seg = split_pipeline_size / sizeof(T);
const int32_t n_seg = 1;
const int32_t compute_size = 2 * split_pipeline_size;
const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2;
const int32_t pingpong_weight_offset = weight_size / 2;
char *nram_weight = (char *)nram_buffer;
char *nram_compute_a = nram_weight + weight_size;
char *nram_compute_b = nram_compute_a + split_pipeline_size;
char *nram_input = nram_compute_b + split_pipeline_size;
char *nram_output = nram_input + split_pipeline_size;
char *nram_target = nram_output + split_pipeline_size;
const int32_t loop_num = (c + c_seg - 1) / c_seg;
const int32_t partition_nc = 1;
for (int32_t i = 0; i < loop_num; ++i) {
const int32_t c_index = i * c_seg;
const int32_t c_seg_curr = i == (loop_num - 1) ? c - c_index : c_seg;
startPipeline<T>(input, target, weight, nram_compute_a, nram_compute_b,
nram_input, nram_target, nram_weight, nram_output,
has_weight, partition_nc, pingpong_offset,
pingpong_weight_offset, c_index, n, n_seg, c, c_seg_curr,
alpha, gamma, output);
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelFocalLossSigmoidForward(
const void *input, const void *target, const void *weight, const int32_t N,
const int32_t C, const float alpha, const float gamma, void *output) {
const int32_t n_seg = N / taskDim + (taskId == taskDim - 1) * (N % taskDim);
const T *input_offset = (T *)input + N / taskDim * taskId * C;
const int32_t *target_offset = (int32_t *)target + N / taskDim * taskId;
T *output_offset = (T *)output + N / taskDim * taskId * C;
focalLossSigmoidForwardBlock((T *)input_offset, (int32_t *)target_offset,
(T *)weight, n_seg, C, alpha, gamma,
(T *)output_offset);
}
} // namespace forward
namespace backward {
template <typename T>
__mlu_func__ void loadInput(char *nram_input, char *nram_target,
const T *gdram_input, const int32_t *gdram_target,
const int32_t deal_n, const int32_t total_c,
                            const bool pingpong_flag, const bool has_weight,
const int32_t nram_offset,
const int32_t gdram_offset) {
  if (pingpong_flag == PONG) {
nram_input += nram_offset;
nram_target += nram_offset;
}
__memcpy_async(nram_target, gdram_target + gdram_offset / total_c,
deal_n * sizeof(int32_t), GDRAM2NRAM);
char *nram_input_load = nram_input;
int32_t compute_align_size = 2 * NFU_ALIGN_SIZE;
if (has_weight) {
if (sizeof(T) == sizeof(half)) {
int32_t compute_align_num = compute_align_size / sizeof(float);
int32_t align_c = PAD_UP(total_c, compute_align_num);
int32_t compute_size = deal_n * align_c * sizeof(float);
nram_input_load += compute_size / 2;
}
int32_t align_c = PAD_UP(total_c, NFU_ALIGN_SIZE / sizeof(T));
int32_t total_c_size = total_c * sizeof(T);
int32_t align_c_size = align_c * sizeof(T);
__memcpy_async(nram_input_load, gdram_input + gdram_offset, total_c_size,
GDRAM2NRAM, align_c_size, total_c_size, deal_n - 1);
} else {
if (sizeof(T) == sizeof(half)) {
int32_t compute_size =
PAD_UP(deal_n * total_c * sizeof(float), compute_align_size);
nram_input_load += compute_size / 2;
}
int32_t load_size = deal_n * total_c * sizeof(T);
__memcpy_async(nram_input_load, gdram_input + gdram_offset, load_size,
GDRAM2NRAM);
}
}
template <typename T>
__mlu_func__ void sigmoid(T *dst_data, const T *src_data,
const int32_t elem_count) {
__bang_mul_const(dst_data, (T *)src_data, T(-1), elem_count);
__bang_active_exphp(dst_data, dst_data, elem_count);
__bang_add_const(dst_data, dst_data, T(1), elem_count);
__bang_active_reciphp(dst_data, dst_data, elem_count);
}
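The helper above evaluates sigmoid(x) = 1 / (1 + e^(-x)) via the negate / exp / add-one / reciprocal sequence. A small identity worth keeping in mind when reading coreCompute below, which builds pt = 1 - sigmoid(x) the same way:

  sigmoid(x) = 1 / (1 + e^(-x)),    1 - sigmoid(x) = sigmoid(-x)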
template <typename T>
__mlu_func__ void coreCompute(char *nram_input, const T *nram_weight,
const float *nram_flt_min, char *nram_pt,
char *nram_alpha_t, char *nram_temp,
char *nram_target, const float *nram_gamma,
char *nram_output, const float alpha,
const int32_t compute_num, const int32_t deal_n,
const int32_t total_c, const bool pingpong_flag,
const int32_t nram_offset,
const bool has_weight) {
if (pingpong_flag == PONG) {
nram_input += nram_offset;
nram_pt += nram_offset;
nram_alpha_t += nram_offset;
nram_temp += nram_offset;
nram_output += nram_offset;
nram_target += nram_offset;
}
if (sizeof(T) == sizeof(half)) {
const int32_t compute_size = compute_num * sizeof(float);
char *nram_input_load = nram_input + compute_size / 2;
__bang_half2float((float *)nram_input, (half *)nram_input_load,
compute_num);
}
// 0. alpha_t = alpha - 1
__nramset((float *)nram_alpha_t, compute_num, (float)(alpha - 1.0));
// 1. pt = 1 - sigmoid(x)
sigmoid((float *)nram_pt, (float *)nram_input, compute_num);
__bang_mul_const((float *)nram_pt, (float *)nram_pt, (float)(-1),
compute_num);
__bang_add_const((float *)nram_pt, (float *)nram_pt, (float)1, compute_num);
// 2. pt = target[n] == c ? sigmoid(x) : 1 - sigmoid(x)
// alpha_t = target[n] == c ? alpha : alpha - 1
const int32_t nfu_align_num = NFU_ALIGN_SIZE / sizeof(float);
for (int n = 0; n < deal_n; n++) {
const int32_t target_value = ((int32_t *)nram_target)[n];
if (target_value >= total_c || target_value < 0) continue;
int32_t c_offset = 0;
if (has_weight) {
int32_t c_align_num = nfu_align_num;
if (sizeof(T) == sizeof(half)) {
c_align_num += nfu_align_num;
}
c_offset = PAD_UP(total_c, c_align_num);
} else {
c_offset = total_c;
}
int32_t idx = n * c_offset + target_value;
*((float *)nram_pt + idx) = 1.0 - *((float *)nram_pt + idx);
*((float *)nram_alpha_t + idx) = alpha;
}
// 3. temp = -alpha_t * e^(gamma * log(max(1 - pt, FLT_MIN))
__bang_mul_const((float *)nram_temp, (float *)nram_pt, (float)(-1),
compute_num);
__bang_add_const((float *)nram_temp, (float *)nram_temp, (float)(1),
compute_num);
__bang_cycle_maxequal((float *)nram_temp, (float *)nram_temp,
(float *)nram_flt_min, compute_num, nfu_align_num);
__bang_active_loghp((float *)nram_temp, (float *)nram_temp, compute_num);
__bang_cycle_mul((float *)nram_temp, (float *)nram_temp, (float *)nram_gamma,
compute_num, nfu_align_num);
__bang_active_exphp((float *)nram_temp, (float *)nram_temp, compute_num);
__bang_mul((float *)nram_temp, (float *)nram_temp, (float *)nram_alpha_t,
compute_num);
__bang_mul_const((float *)nram_temp, (float *)nram_temp, (float)(-1),
compute_num);
// 4. output = 1 - pt - gamma * pt * log(max(pt, FLT_MIN))
__bang_cycle_maxequal((float *)nram_output, (float *)nram_pt,
(float *)nram_flt_min, compute_num, nfu_align_num);
__bang_active_loghp((float *)nram_output, (float *)nram_output, compute_num);
__bang_mul((float *)nram_output, (float *)nram_output, (float *)nram_pt,
compute_num);
__bang_cycle_mul((float *)nram_output, (float *)nram_output,
(float *)nram_gamma, compute_num, nfu_align_num);
__bang_add((float *)nram_output, (float *)nram_output, (float *)nram_pt,
compute_num);
__bang_mul_const((float *)nram_output, (float *)nram_output, (float)(-1),
compute_num);
__bang_add_const((float *)nram_output, (float *)nram_output, (float)(1),
compute_num);
// 5. output = output * temp
__bang_mul((float *)nram_output, (float *)nram_output, (float *)nram_temp,
compute_num);
if (sizeof(T) == sizeof(half)) {
__bang_float2half_rd((half *)nram_output, (float *)nram_output,
compute_num);
}
if (has_weight) {
// with weight
for (int n = 0; n < deal_n; n++) {
int32_t c_align_num = nfu_align_num;
if (sizeof(T) == sizeof(half)) {
c_align_num += nfu_align_num;
}
int32_t align_c = PAD_UP(total_c, c_align_num);
int32_t target_value = ((int32_t *)nram_target)[n];
T weight_value = nram_weight[target_value];
__bang_mul_const((T *)nram_output + n * align_c,
(T *)nram_output + n * align_c, weight_value, align_c);
}
}
}
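Steps 0-5 above compute the gradient of the sigmoid focal loss with respect to the input logit. A minimal scalar C++ sketch mirroring the same decomposition (illustrative names only; note the kernel's alpha_t convention of alpha for the target class and alpha - 1 otherwise, which folds the sign of d(pt)/dx into the result):

#include <cfloat>
#include <cmath>

float sigmoid_focal_loss_grad_ref(float x, bool is_target, float alpha,
                                  float gamma) {
  float p = 1.0f / (1.0f + std::exp(-x));
  float pt = is_target ? p : 1.0f - p;                 // steps 1-2
  float alpha_t = is_target ? alpha : alpha - 1.0f;    // steps 0, 2
  float temp = -alpha_t * std::pow(1.0f - pt, gamma);  // step 3
  float out =
      1.0f - pt - gamma * pt * std::log(std::fmax(pt, FLT_MIN));  // step 4
  return temp * out;  // step 5: d(focal loss)/d(logit)
}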
template <typename T>
__mlu_func__ void storeOutput(T *gdram_output, const char *nram_output,
const int32_t deal_n, const int32_t total_c,
const bool pingpong_flag, const bool has_weight,
const int32_t nram_offset,
const int32_t gdram_offset) {
if (pingpong_flag == PONG) {
nram_output += nram_offset;
}
const int32_t store_size = deal_n * total_c * sizeof(T);
if (has_weight) {
int32_t align_c = PAD_UP(total_c, NFU_ALIGN_SIZE / sizeof(T));
int32_t total_c_size = total_c * sizeof(T);
int32_t align_c_size = align_c * sizeof(T);
__memcpy_async(gdram_output + gdram_offset, nram_output, total_c_size,
NRAM2GDRAM, total_c_size, align_c_size, deal_n - 1);
} else {
__memcpy_async(gdram_output + gdram_offset, nram_output, store_size,
NRAM2GDRAM);
}
}
template <typename T>
__mlu_func__ void focalLossSigmoidBackwardBlock(
const T *input, const int32_t *target, const T *weight, const float gamma,
const float alpha, const int32_t total_n, const int32_t deal_n,
const int32_t total_c, T *output) {
// params per time slice
int32_t deal_num = deal_n * total_c;
int32_t deal_size = deal_num * sizeof(float);
int32_t compute_num = 0;
int32_t compute_size = 0;
int32_t compute_align_size = NFU_ALIGN_SIZE;
const int32_t nfu_align_num = NFU_ALIGN_SIZE / sizeof(T);
if (sizeof(T) == sizeof(half)) {
compute_align_size += NFU_ALIGN_SIZE;
}
const int32_t compute_align_num = compute_align_size / sizeof(float);
bool has_weight = false;
if (weight != NULL) {
has_weight = true;
int32_t align_c = PAD_UP(total_c, compute_align_num);
compute_num = deal_n * align_c;
compute_size = compute_num * sizeof(float);
} else {
compute_size = PAD_UP(deal_size, compute_align_size);
compute_num = compute_size / sizeof(float);
}
// params per core
int32_t total_num = total_n * total_c;
int32_t num_per_core = PAD_DOWN(total_num / taskDim, deal_num);
int32_t loop_per_core = num_per_core / deal_num;
/* NRAM partition:
*
* |-----------------ping pong--------------------|
* |input | pt | alpha_t | temp | output | target | flt_min | gamma | weight|
*
* split_pipeline_num is 5: input, pt, alpha_t, temp, output.
* nram_reserved_line_num is 2: flt_min, gamma.
*/
const int32_t split_pipeline_num = 5;
const int32_t nram_reserved_line_num = 2;
int32_t target_deal_size = deal_n * sizeof(int32_t);
int32_t target_deal_size_align = PAD_UP(target_deal_size, NFU_ALIGN_SIZE);
// nram PING/PONG offset
int32_t ping_pong_offset =
compute_size * split_pipeline_num + target_deal_size_align;
// gdram addr
int32_t *base_addr_target =
(int32_t *)target + taskId * loop_per_core * deal_n;
T *base_addr_input = (T *)input + taskId * num_per_core;
T *base_addr_output = output + taskId * num_per_core;
// nram addr
char *nram_input = (char *)nram_buffer;
char *nram_pt = nram_input + compute_size;
char *nram_alpha_t = nram_pt + compute_size;
char *nram_temp = nram_alpha_t + compute_size;
char *nram_output = nram_temp + compute_size;
char *nram_target = nram_output + compute_size;
float *nram_flt_min = NULL;
float *nram_gamma = NULL;
T *nram_weight = NULL;
if (!has_weight) {
nram_flt_min = (float *)(nram_buffer + MAX_NRAM_SIZE -
nram_reserved_line_num * NFU_ALIGN_SIZE);
nram_gamma = nram_flt_min + nfu_align_num;
} else {
int32_t weight_space = PAD_UP(total_c * sizeof(T), NFU_ALIGN_SIZE);
nram_flt_min =
(float *)(nram_buffer + MAX_NRAM_SIZE -
nram_reserved_line_num * NFU_ALIGN_SIZE - weight_space);
nram_gamma = nram_flt_min + nfu_align_num;
nram_weight = (T *)(nram_gamma + nfu_align_num);
__memcpy_async(nram_weight, weight, total_c * sizeof(T), GDRAM2NRAM);
}
// nram set gamma and FLT_MIN
__nramset(nram_gamma, nfu_align_num, gamma);
__nramset(nram_flt_min, nfu_align_num, FLT_MIN);
/*
* Pipeline: The pipeline is processed in three stages: Load, Compute, Store.
* The allocated memory space of NRAM is divided into two parts:
 * PING and PONG. In a single time slice, PING is used to process
 * the IO stream and PONG is used for computation. Both of them are
* processed synchronously until finished.
*
* diagram of PINGPONG:
* |------|-----------------------------------------------------------------|
* | | space |
* |------|-----------------------------------------------------------------|
* | time | Ping | Pong | Ping | Pong | Ping | Pong |
* |------|-----------------------------------------------------------------|
* | 0 | L0 | | | | | |
* | 1 | C0 | L1 | | | | |
* | 2 | S0 | C1 | L2 | | | |
* | 3 | | S1 | C2 | L3 | | |
* | 4 | | | S2 | C3 | L4 | |
* | 5 | | | | S3 | C4 | L5 |
* | 6 | | | | | S4 | C5 |
* | 7 | | | | | | S5 |
* |------|-----------------------------------------------------------------|
*/
// diagram of PINGPONG: L0
if (loop_per_core > 0) {
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
deal_n, total_c, PING, has_weight, ping_pong_offset, 0);
__asm__ volatile("sync;");
}
// diagram of PINGPONG: C0 and L1
if (loop_per_core > 1) {
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PING, ping_pong_offset,
has_weight);
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
deal_n, total_c, PONG, has_weight, ping_pong_offset, deal_num);
__asm__ volatile("sync;");
}
for (int i = 0; i < loop_per_core - 2; ++i) {
if (i % 2 == PING) {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PING,
has_weight, ping_pong_offset, i * deal_num);
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PONG, ping_pong_offset,
has_weight);
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
deal_n, total_c, PING, has_weight, ping_pong_offset,
(i + 2) * deal_num);
} else {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG,
has_weight, ping_pong_offset, i * deal_num);
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PING, ping_pong_offset,
has_weight);
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
deal_n, total_c, PONG, has_weight, ping_pong_offset,
(i + 2) * deal_num);
}
__asm__ volatile("sync;");
}
if (loop_per_core > 1) {
if ((loop_per_core - 2) % 2 == PING) {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PING,
has_weight, ping_pong_offset, (loop_per_core - 2) * deal_num);
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PONG, ping_pong_offset,
has_weight);
} else {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG,
has_weight, ping_pong_offset, (loop_per_core - 2) * deal_num);
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PING, ping_pong_offset,
has_weight);
}
__asm__ volatile("sync;");
}
if (loop_per_core > 0) {
if (loop_per_core == 1) {
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PING, ping_pong_offset,
has_weight);
__asm__ volatile("sync;");
}
if ((loop_per_core - 1) % 2 == PING) {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PING,
has_weight, ping_pong_offset, (loop_per_core - 1) * deal_num);
} else {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG,
has_weight, ping_pong_offset, (loop_per_core - 1) * deal_num);
}
}
  // process the remaining data whose per-core N is less than deal_n
int32_t rem_for_all = total_num - num_per_core * taskDim;
if (rem_for_all == 0) return;
int32_t rem_n_for_all = rem_for_all / total_c;
int32_t rem_n_per_core = (rem_n_for_all + taskDim - 1) / taskDim;
int32_t rem_num_per_core = rem_n_per_core * total_c;
int32_t rem_num_per_core_align = 0;
int32_t rem_core_num = rem_for_all / rem_num_per_core;
int32_t rem_n_for_last = rem_n_for_all % rem_n_per_core;
int32_t rem_num_for_last = rem_n_for_last * total_c;
int32_t rem_num_for_last_align = 0;
if (has_weight) {
int32_t align_c = PAD_UP(total_c, compute_align_num);
rem_num_per_core_align = rem_n_per_core * align_c;
rem_num_for_last_align = rem_n_for_last * align_c;
} else {
rem_num_per_core_align = PAD_UP(rem_num_per_core, compute_align_num);
rem_num_for_last_align = PAD_UP(rem_num_for_last, compute_align_num);
}
int32_t rem_addr_base = num_per_core * taskDim;
int32_t rem_target_addr_base = loop_per_core * deal_n * taskDim;
base_addr_target = (int32_t *)target + rem_target_addr_base;
base_addr_input = (T *)input + rem_addr_base;
base_addr_output = output + rem_addr_base;
if (taskId < rem_core_num) {
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
rem_n_per_core, total_c, PING, has_weight, ping_pong_offset,
taskId * rem_num_per_core);
__asm__ volatile("sync;");
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
rem_num_per_core_align, rem_n_per_core, total_c, PING,
ping_pong_offset, has_weight);
__asm__ volatile("sync;");
storeOutput(base_addr_output, nram_output, rem_n_per_core, total_c, PING,
has_weight, ping_pong_offset, taskId * rem_num_per_core);
} else if (taskId == rem_core_num) {
if (rem_num_for_last == 0) return;
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
rem_n_for_last, total_c, PING, has_weight, ping_pong_offset,
taskId * rem_num_per_core);
__asm__ volatile("sync;");
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
rem_num_for_last_align, rem_n_for_last, total_c, PING,
ping_pong_offset, has_weight);
__asm__ volatile("sync;");
storeOutput(base_addr_output, nram_output, rem_n_for_last, total_c, PING,
has_weight, ping_pong_offset, taskId * rem_num_per_core);
} else {
return;
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelFocalLossSigmoidBackward(
const void *input, const void *target, const void *weight,
const float gamma, const float alpha, const int32_t total_n,
const int32_t deal_n, const int32_t total_c, void *output) {
focalLossSigmoidBackwardBlock((T *)input, (int32_t *)target, (T *)weight,
gamma, alpha, total_n, deal_n, total_c,
(T *)output);
}
} // namespace backward
void KernelFocalLossSigmoidForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const void *input, const void *target,
const void *weight, const int32_t N,
const int32_t C, const float alpha,
const float gamma, void *output) {
if (d_type == CNRT_FLOAT16) {
forward::MLUUnion1KernelFocalLossSigmoidForward<
half><<<k_dim, k_type, queue>>>(input, target, weight, N, C, alpha,
gamma, output);
} else {
forward::MLUUnion1KernelFocalLossSigmoidForward<
float><<<k_dim, k_type, queue>>>(input, target, weight, N, C, alpha,
gamma, output);
}
}
void KernelFocalLossSigmoidBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const void *input, const void *target,
const void *weight, const float gamma,
const float alpha, const int32_t dim_n,
const int32_t deal_n, const int32_t dim_c,
void *output) {
if (d_type == CNRT_FLOAT16) {
backward::MLUUnion1KernelFocalLossSigmoidBackward<
half><<<k_dim, k_type, queue>>>(input, target, weight, gamma, alpha,
dim_n, deal_n, dim_c, output);
} else {
backward::MLUUnion1KernelFocalLossSigmoidBackward<
float><<<k_dim, k_type, queue>>>(input, target, weight, gamma, alpha,
dim_n, deal_n, dim_c, output);
}
}
mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
0 → 100644
View file @
fdeee889
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#define NMS_SIZE (64)
#define COORD_DIM (4)
#define MEMORY_CORE (0x80)
#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score
#define REDUCE_NUM \
(7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024)
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
__nram__ int8_t nram_buffer[SIZE_NRAM_BUF];
__mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF];
__mlu_func__ void pvLock() {
#if __BANG_ARCH__ == 270
if (coreId != MEMORY_CORE) {
__bang_lock(0, 0);
}
#endif
}
__mlu_func__ void pvUnlock() {
#if __BANG_ARCH__ == 270
if (coreId != MEMORY_CORE) {
__bang_unlock(0, 0);
}
#endif
}
enum Addr { SRAM, GDRAM };
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection(
uint32_t *output_box_num, const int output_mode, const int input_layout,
OUT_DT *output_data, const Addr dst, IN_DT *input_data_score,
const IN_DT *input_data_box, const Addr src, IN_DT *buffer,
const int buffer_size, IN_DT *sram, const int core_limit,
const int input_box_num, const int input_stride, const int output_stride,
const int keepNum, const float thresh_iou, const float thresh_score,
const float offset, const int algo) {
  // global value, stored in sram at an offset from the beginning.
const int flag_offset_size = 28;
int32_t *loop_end_flag = (int32_t *)(sram + flag_offset_size);
loop_end_flag[0] = 0;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
const int nms_buffer_count1 = 9;
// temp nram buffer to store selected target.
const int nram_save_limit_count = 256;
float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr
IN_DT *input_score_ptr;
const IN_DT *input_x1_ptr;
const IN_DT *input_y1_ptr;
const IN_DT *input_x2_ptr;
const IN_DT *input_y2_ptr;
input_score_ptr = input_data_score;
input_x1_ptr = input_data_box;
if (input_layout == 0) {
// [boxes_num, 4]
input_y1_ptr = input_x1_ptr + 1;
input_x2_ptr = input_x1_ptr + 2;
input_y2_ptr = input_x1_ptr + 3;
} else if (input_layout == 1) {
// [4, boxes_num]
input_y1_ptr = input_x1_ptr + input_stride;
input_x2_ptr = input_y1_ptr + input_stride;
input_y2_ptr = input_x2_ptr + input_stride;
}
// nram data ptr
IN_DT *x1;
IN_DT *y1;
IN_DT *x2;
IN_DT *y2;
IN_DT *score;
IN_DT *inter_x1;
IN_DT *inter_y1;
IN_DT *inter_x2;
IN_DT *inter_y2;
IN_DT *max_box; // the max score, x1, y1, x2, y2
IN_DT *x1_mask;
IN_DT *y1_mask;
IN_DT *x2_mask;
IN_DT *y2_mask;
OUT_DT *nram_save;
int limit = 0; // find limit when GDRAM or SRAM
int len_core = 0; // the length deal by every core
int max_seg_pad = 0; // the max length every repeat
int repeat = 0;
int remain = 0;
int remain_pad = 0;
int input_offset = 0; // offset of input_data for current core
int nram_save_count = 0;
// mask for collect x1, y1, x2, y2. each mask has 128 elements
const int mask_size = 128;
const int total_mask_size = 512;
if (output_mode == 0) {
limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * sizeof(OUT_DT) -
total_mask_size * sizeof(IN_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
} else {
limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * INFO_NUM * sizeof(OUT_DT) -
total_mask_size * sizeof(IN_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
}
if (core_limit == 1) {
len_core = input_box_num;
input_offset = 0;
} else {
int avg_core = input_box_num / core_limit;
int rem = input_box_num % core_limit;
len_core = avg_core + (taskId < rem ? 1 : 0);
input_offset = avg_core * taskId + (taskId <= rem ? taskId : rem);
}
max_seg_pad = PAD_DOWN(limit, NMS_SIZE);
repeat = len_core / max_seg_pad;
remain = len_core % max_seg_pad;
remain_pad = PAD_UP(remain, NMS_SIZE);
  // if the datatype is half, convert it to float when computing the IoU
int max_seg_iou_compute =
PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE);
int repeat_iou_compute = len_core / max_seg_iou_compute;
int remain_iou_compute = len_core % max_seg_iou_compute;
int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE);
// initial the address point
score = buffer;
x1 = score + max_seg_pad;
y1 = x1 + max_seg_pad;
x2 = y1 + max_seg_pad;
y2 = x2 + max_seg_pad;
inter_x1 = y2 + max_seg_pad;
inter_y1 = inter_x1 + max_seg_pad;
inter_x2 = inter_y1 + max_seg_pad;
inter_y2 = inter_x2 + max_seg_pad;
x1_mask = inter_y2 + max_seg_pad;
y1_mask = x1_mask + mask_size;
x2_mask = y1_mask + mask_size;
y2_mask = x2_mask + mask_size;
max_box = y2_mask + mask_size; // the max score, x1, y1, x2, y2
// offset two line from max_box
nram_save = (OUT_DT *)((char *)max_box + NFU_ALIGN_SIZE);
// set mask for __bang_collect instruction
if (input_layout == 0) {
__nramset((IN_DT *)x1_mask, total_mask_size, (IN_DT)0);
for (int idx = 0; idx < mask_size; idx++) {
int index = (idx % COORD_DIM) * mask_size + idx;
x1_mask[index] = (IN_DT)1.0;
}
}
for (int keep = 0; keep < keepNum; keep++) { // loop until the max_score <= 0
if (core_limit != 1) {
__sync_cluster(); // sync before current loop
}
/******find max start******/
int max_index = 0; // the max score index
int global_max_index = 0; // for U1
float max_area = 0; // the max score area
max_box[0] = 0; // init 0
for (int i = 0; i <= repeat; i++) {
if (i == repeat && remain == 0) {
break;
}
int seg_len = 0; // the length every nms compute
int cpy_len = 0; // the length every nms memcpy
i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad;
      // check whether seg_len exceeds the limit of fp16. 65536 is the largest
      // number that the half data type can express.
if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) {
// seg length exceeds the max num for fp16 datatype!
return;
}
i == repeat ? cpy_len = remain : cpy_len = max_seg_pad;
/******nms load start******/
mluMemcpyDirection_t load_dir = SRAM2NRAM;
if (src == SRAM) {
load_dir = SRAM2NRAM;
} else {
load_dir = GDRAM2NRAM;
}
__nramset(score, seg_len, (IN_DT)0);
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
/******nms load end******/
__bang_max(inter_x1, score, seg_len);
if (inter_x1[0] > max_box[0]) {
max_box[0] = inter_x1[0];
if (sizeof(IN_DT) == sizeof(half)) {
max_index = ((uint16_t *)inter_x1)[1] + input_offset +
i * max_seg_pad; // offset start from head of input_data
} else if (sizeof(IN_DT) == sizeof(float)) {
max_index = ((uint32_t *)inter_x1)[1] + input_offset +
i * max_seg_pad; // offset start from head of input_data
}
}
} // for repeat
int stride = 1;
if (input_layout == 0) {
stride = input_stride;
} else if (input_layout == 1) {
stride = 1;
}
if (core_limit == 1) {
max_box[1] = input_x1_ptr[max_index * stride];
max_box[2] = input_y1_ptr[max_index * stride];
max_box[3] = input_x2_ptr[max_index * stride];
max_box[4] = input_y2_ptr[max_index * stride];
if (algo == 0 || offset == 0.0) {
max_area = ((float)max_box[3] - (float)max_box[1]) *
((float)max_box[4] - (float)max_box[2]);
} else {
max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
((float)max_box[4] - (float)max_box[2] + offset);
}
input_score_ptr[max_index] = 0;
global_max_index = max_index;
((uint32_t *)(max_box + INFO_NUM))[0] = max_index;
} else if (core_limit == 4) {
// find the max with sram
// the max box's x1, y1, x2, y2 on every core
if (coreId != MEMORY_CORE) {
max_box[1] = input_x1_ptr[max_index * stride];
max_box[2] = input_y1_ptr[max_index * stride];
max_box[3] = input_x2_ptr[max_index * stride];
max_box[4] = input_y2_ptr[max_index * stride];
}
((uint32_t *)(max_box + INFO_NUM))[0] = max_index;
// copy every core's box info to sram, form: score---x1---y1---x2---y2---
for (int i = 0; i < INFO_NUM; i++) {
__memcpy(sram + i * core_limit + taskId, max_box + i, 1 * sizeof(IN_DT),
NRAM2SRAM);
}
// copy every core's max_index to sram, use 2 half to store max_index
__memcpy(sram + INFO_NUM * core_limit + taskId * 2, max_box + INFO_NUM,
sizeof(uint32_t),
NRAM2SRAM); // int32_t datatype
__sync_cluster();
// copy score from sram to nram and find the max
__nramset(inter_x1, NMS_SIZE, (IN_DT)0);
__memcpy(inter_x1, sram, core_limit * sizeof(IN_DT), SRAM2NRAM);
__bang_max(max_box, inter_x1, NMS_SIZE);
int max_core = 0;
if (sizeof(IN_DT) == sizeof(half)) {
max_core = ((uint16_t *)max_box)[1];
} else if (sizeof(IN_DT) == sizeof(float)) {
max_core = ((uint32_t *)max_box)[1];
}
// copy the max box from SRAM to NRAM
__memcpy(max_box + 1, sram + 1 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // x1
__memcpy(max_box + 2, sram + 2 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // y1
__memcpy(max_box + 3, sram + 3 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // x2
__memcpy(max_box + 4, sram + 4 * core_limit + max_core, 1 * sizeof(IN_DT),
SRAM2NRAM); // y2
__memcpy(max_box + 5, sram + 5 * core_limit + 2 * max_core,
sizeof(uint32_t), SRAM2NRAM);
if (algo == 0 || offset == 0.0) {
max_area = ((float)max_box[3] - (float)max_box[1]) *
((float)max_box[4] - (float)max_box[2]);
} else {
max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
((float)max_box[4] - (float)max_box[2] + offset);
}
global_max_index = ((uint32_t *)(max_box + INFO_NUM))[0];
input_score_ptr[global_max_index] = 0;
}
// by now, we get: max_score|max_index|max_box|max_area
/******find max end******/
/******nms store start******/
// store to nram
if (float(max_box[0]) > thresh_score) {
OUT_DT *save_ptr;
int save_offset = 0;
int save_str_num = 0;
save_ptr = nram_save;
save_offset = nram_save_count;
save_str_num = nram_save_limit_count;
if (coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
__memcpy(save_ptr + save_offset, (uint32_t *)(max_box + INFO_NUM),
1 * sizeof(uint32_t), NRAM2NRAM, 1 * sizeof(uint32_t),
1 * sizeof(uint32_t), 0);
} else if (output_mode == 1) { // score, x1, y1, x2, y2
__memcpy(save_ptr + save_offset * INFO_NUM, max_box,
INFO_NUM * sizeof(IN_DT), NRAM2NRAM,
INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0);
} else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2---
__memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT),
NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT),
4);
}
}
nram_save_count++;
(*output_box_num)++;
}
// store to sram/gdram
if (*output_box_num != 0) {
mluMemcpyDirection_t store_dir = NRAM2GDRAM;
if (dst == SRAM) {
store_dir = NRAM2SRAM;
} else { // dst == GDRAM
store_dir = NRAM2GDRAM;
}
if ((nram_save_count == nram_save_limit_count) ||
(float(max_box[0]) <= thresh_score) || keep == keepNum - 1) {
if (nram_save_count != 0) {
if (coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
pvLock();
__memcpy(output_data, nram_save,
nram_save_count * sizeof(uint32_t), store_dir);
pvUnlock();
output_data += nram_save_count;
} else if (output_mode == 1) { // score, x1, y1, x2, y2
pvLock();
__memcpy(output_data, nram_save,
nram_save_count * INFO_NUM * sizeof(IN_DT), store_dir);
pvUnlock();
output_data += nram_save_count * INFO_NUM;
} else if (output_mode ==
2) { // score---, x1---, y1---, x2---, y2---
pvLock();
__memcpy(output_data, nram_save, nram_save_count * sizeof(IN_DT),
store_dir, output_stride * sizeof(IN_DT),
nram_save_limit_count * sizeof(IN_DT), 4);
pvUnlock();
output_data += nram_save_count;
}
nram_save_count = 0;
}
}
} // if move data nram->sram/gdram
} // if dst
// if the max score <= 0, end
if (core_limit == 1) {
if (float(max_box[0]) <= thresh_score) {
break;
}
} else {
if (float(max_box[0]) <= thresh_score) {
if (coreId == 0) {
loop_end_flag[0] = 1;
}
}
__sync_cluster();
if (loop_end_flag[0] == 1) {
break;
}
}
/******nms store end******/
    // To preserve accuracy with half data, convert half to float when
    // calculating the IoU.
for (int i = 0; i <= repeat_iou_compute; i++) {
if (i == repeat_iou_compute && remain_iou_compute == 0) {
break;
}
int seg_len = 0; // the length every nms compute
int cpy_len = 0; // the length every nms memcpy
i == repeat_iou_compute ? seg_len = remain_pad_iou_compute
: seg_len = max_seg_iou_compute;
i == repeat_iou_compute ? cpy_len = remain_iou_compute
: cpy_len = max_seg_iou_compute;
/******nms load start******/
mluMemcpyDirection_t load_dir = SRAM2NRAM;
if (src == SRAM) {
load_dir = SRAM2NRAM;
} else {
load_dir = GDRAM2NRAM;
}
__nramset((float *)score, seg_len, 0.0f);
int dt_offset = 0;
if (sizeof(IN_DT) == sizeof(float)) {
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
dt_offset = 0;
} else if (sizeof(IN_DT) == sizeof(half)) {
__nramset(x1, seg_len, half(0));
__memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__bang_half2float((float *)score, (half *)x1, seg_len);
dt_offset = max_seg_iou_compute;
}
if (input_layout == 0) {
// the following number 4 means x1, y1, x2, y2
__memcpy(
inter_x1,
input_x1_ptr + (input_offset + i * max_seg_iou_compute) * COORD_DIM,
cpy_len * COORD_DIM * sizeof(IN_DT), load_dir,
cpy_len * COORD_DIM * sizeof(IN_DT),
cpy_len * COORD_DIM * sizeof(IN_DT), 0);
        // use the collect instruction to transpose the [n, 4] shape into the
        // [4, n] shape to avoid discrete memory accesses.
for (int c_i = 0; c_i < COORD_DIM * seg_len / mask_size; c_i++) {
          // the following number 32 means 32 elements will be selected out by
          // one operation
__bang_collect(x1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
x1_mask, mask_size);
__bang_collect(y1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
y1_mask, mask_size);
__bang_collect(x2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
x2_mask, mask_size);
__bang_collect(y2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
y2_mask, mask_size);
}
} else if (input_layout == 1) {
__memcpy(x1 + dt_offset,
input_x1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__memcpy(y1 + dt_offset,
input_y1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__memcpy(x2 + dt_offset,
input_x2_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__memcpy(y2 + dt_offset,
input_y2_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
}
/******nms load end******/
/******nms compute start******/
if (sizeof(IN_DT) == sizeof(half)) {
__bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute,
seg_len);
}
      // 1. compute IoU
// get the area_I
__nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1
__bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
seg_len); // inter_x1
__nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2
__bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
seg_len); // inter_x2
__bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len);
}
__bang_active_relu((float *)inter_x1, (float *)inter_x1,
seg_len); // inter_w
__nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1
__bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
seg_len); // inter_y1
__nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2
__bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
seg_len); // inter_y2
__bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
}
__bang_active_relu((float *)inter_y1, (float *)inter_y1,
seg_len); // inter_h
__bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
seg_len); // area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
__bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
__bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
__bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len);
}
__bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
seg_len); // area
// get the area_U: area + max_area - area_I
__bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area),
seg_len);
__bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
seg_len); // area_U
      // 2. select the box
      // if the IoU is greater than the threshold, set its score to zero and
      // discard it: area_U > area_I * (1 / thresh)?
if (thresh_iou > 0.0) {
__bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
seg_len);
} else {
__bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou,
seg_len);
}
__bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
__bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
/******nms compute end******/
// update the score
mluMemcpyDirection_t update_dir = NRAM2SRAM;
if (dst == SRAM) {
update_dir = NRAM2SRAM;
} else {
update_dir = NRAM2GDRAM;
}
if (sizeof(IN_DT) == sizeof(half)) {
__bang_float2half_rd((half *)score, (float *)score, seg_len);
}
pvLock();
__memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
cpy_len * sizeof(IN_DT), update_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
pvUnlock();
} // for repeat
} // for keepNum
}
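At a high level, nms_detection implements greedy score-suppression NMS: repeatedly take the highest-scoring box, keep it if its score exceeds thresh_score, then zero the scores of all boxes whose IoU with it exceeds thresh_iou. A minimal single-threaded C++ sketch of the same algorithm (illustrative names; the kernel vectorizes this, splits boxes across cores, and additionally supports the offset/algo variants and several output layouts):

#include <algorithm>
#include <cstddef>
#include <vector>

struct Box { float x1, y1, x2, y2; };

std::vector<size_t> nms_ref(const std::vector<Box> &boxes, std::vector<float> scores,
                            float iou_thresh, float score_thresh, size_t max_out) {
  std::vector<size_t> keep;
  auto area = [](const Box &b) { return (b.x2 - b.x1) * (b.y2 - b.y1); };
  while (!scores.empty() && keep.size() < max_out) {
    size_t best = std::max_element(scores.begin(), scores.end()) - scores.begin();
    if (scores[best] <= score_thresh) break;  // same stop rule as the kernel
    keep.push_back(best);
    scores[best] = 0.0f;
    for (size_t i = 0; i < boxes.size(); ++i) {
      if (scores[i] <= 0.0f) continue;
      float ix1 = std::max(boxes[i].x1, boxes[best].x1);
      float iy1 = std::max(boxes[i].y1, boxes[best].y1);
      float ix2 = std::min(boxes[i].x2, boxes[best].x2);
      float iy2 = std::min(boxes[i].y2, boxes[best].y2);
      float inter = std::max(0.0f, ix2 - ix1) * std::max(0.0f, iy2 - iy1);
      float uni = area(boxes[i]) + area(boxes[best]) - inter;
      if (inter > iou_thresh * uni) scores[i] = 0.0f;  // suppress overlapping box
    }
  }
  return keep;
}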
__mlu_global__ void MLUUnion1KernelNMS(
const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int input_stride,
const int max_output_size, const float iou_threshold,
const float confidence_threshold, const int mode, const int input_layout,
void *workspace, void *result_num, void *output,
const cnrtDataType_t data_type_input, const float offset, const int algo) {
if (data_type_input == CNRT_FLOAT16) {
__memcpy(workspace, input_confidence, input_num_boxes * sizeof(half),
GDRAM2GDRAM);
} else if (data_type_input == CNRT_FLOAT32) {
__memcpy(workspace, input_confidence, input_num_boxes * sizeof(float),
GDRAM2GDRAM);
} else {
}
int output_stride = max_output_size;
uint32_t result_box_num = 0;
if (mode == 0) {
uint32_t *out_data = (uint32_t *)output;
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
half *boxes_data = (half *)input_boxes;
half *confi_data = (half *)workspace;
half *buffer = (half *)nram_buffer;
half *sram = (half *)sram_buffer;
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
sram, taskDim, input_num_boxes, input_stride,
output_stride, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
((uint32_t *)result_num)[0] = result_box_num;
}; break;
case CNRT_FLOAT32: {
float *boxes_data = (float *)input_boxes;
float *confi_data = (float *)workspace;
float *buffer = (float *)nram_buffer;
float *sram = (float *)sram_buffer;
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
sram, taskDim, input_num_boxes, input_stride,
output_stride, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
((uint32_t *)result_num)[0] = result_box_num;
}; break;
}
} else {
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
half *boxes_data = (half *)input_boxes;
half *confi_data = (half *)workspace;
half *out_data = (half *)output;
half *buffer = (half *)nram_buffer;
half *sram = (half *)sram_buffer;
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
sram, taskDim, input_num_boxes, input_stride,
output_stride, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
((uint32_t *)result_num)[0] = result_box_num;
}; break;
case CNRT_FLOAT32: {
float *boxes_data = (float *)input_boxes;
float *confi_data = (float *)workspace;
float *out_data = (float *)output;
float *buffer = (float *)nram_buffer;
float *sram = (float *)sram_buffer;
nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
sram, taskDim, input_num_boxes, input_stride,
output_stride, max_output_size, iou_threshold,
confidence_threshold, offset, algo);
((uint32_t *)result_num)[0] = result_box_num;
}; break;
}
}
}
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection_ux(
int32_t *loop_end_flag, uint32_t &output_box_num, OUT_DT *output_dram,
IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram,
const int input_layout, const int input_num_boxes, const int input_stride,
const int max_output_size, const float thresh_iou, const float thresh_score,
const float offset, const int output_mode, const int algo) {
loop_end_flag[0] = 0;
IN_DT *sram = (IN_DT *)sram_buffer;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
int nms_buffer_count1 = 9;
// temp nram buffer to store selected target.
int nram_save_limit_count = 256;
float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr
IN_DT *input_score_ptr;
const IN_DT *input_x1_ptr;
const IN_DT *input_y1_ptr;
const IN_DT *input_x2_ptr;
const IN_DT *input_y2_ptr;
input_score_ptr = score_data;
input_x1_ptr = boxes_data;
input_y1_ptr = input_x1_ptr + input_stride;
input_x2_ptr = input_y1_ptr + input_stride;
input_y2_ptr = input_x2_ptr + input_stride;
int limit = 0; // find limit when GDRAM or SRAM
int max_seg_pad = 0; // the max length every repeat
int repeat = 0;
int remain = 0;
int remain_pad = 0;
int nram_save_count = 0;
if (output_mode == 0) {
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
} else {
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * INFO_NUM * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
}
// data split
int avg_cluster = input_num_boxes / clusterDim;
int rem_cluster = input_num_boxes % clusterDim;
int len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0);
int cluster_offset = avg_cluster * clusterId +
(clusterId <= rem_cluster ? clusterId : rem_cluster);
int avg_core = len_cluster / coreDim;
int rem_core = len_cluster % coreDim;
int len_core = avg_core + (coreId < rem_core ? 1 : 0);
int core_offset =
avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
int input_offset = cluster_offset + core_offset;
max_seg_pad = PAD_DOWN(limit, NMS_SIZE);
// core 0 of each cluster calculate the max score index
int max_index_avg_core = input_num_boxes / clusterDim;
int max_index_rem_core = input_num_boxes % clusterDim;
int max_index_len_core =
max_index_avg_core + (clusterId < max_index_rem_core ? 1 : 0);
int max_index_input_offset =
max_index_avg_core * clusterId +
(clusterId <= max_index_rem_core ? clusterId : max_index_rem_core);
repeat = max_index_len_core / max_seg_pad;
remain = max_index_len_core % max_seg_pad;
remain_pad = PAD_UP(remain, NMS_SIZE);
  // if the datatype is fp16, convert to fp32 when computing the IoU
int max_seg_iou_compute =
PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE);
int repeat_iou_compute = len_core / max_seg_iou_compute;
int remain_iou_compute = len_core % max_seg_iou_compute;
int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE);
// init the nram ptr
IN_DT *score = (IN_DT *)nram_buffer;
IN_DT *x1 = score + max_seg_pad;
IN_DT *y1 = x1 + max_seg_pad;
IN_DT *x2 = y1 + max_seg_pad;
IN_DT *y2 = x2 + max_seg_pad;
IN_DT *inter_x1 = y2 + max_seg_pad;
IN_DT *inter_y1 = inter_x1 + max_seg_pad;
IN_DT *inter_x2 = inter_y1 + max_seg_pad;
IN_DT *inter_y2 = inter_x2 + max_seg_pad;
IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2
OUT_DT *nram_save =
(OUT_DT *)((char *)max_box +
NFU_ALIGN_SIZE); // offset two line from max_box
mluMemcpyDirection_t input_load_dir = SRAM2NRAM;
mluMemcpyDirection_t input_store_dir = NRAM2SRAM;
input_load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
input_store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
for (int keep = 0; keep < max_output_size;
keep++) { // loop until the max score falls below thresh_score
__sync_all();
/******FIND MAX START******/
int max_index = 0;
int global_max_index = 0; // global index across clusters (UnionX)
float max_area = 0; // the area of the max-score box
max_box[0] = 0; // init 0
if (coreId == 0) {
for (int i = 0; i <= repeat; i++) {
if (i == repeat && remain == 0) {
break;
}
int seg_len = (i == repeat)
? remain_pad
: max_seg_pad; // the length every nms compute
// check whether seg_len exceeds the limit of fp16; 65536 is the largest
// number that fp16 could express.
if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) {
return;
}
int cpy_len = (i == repeat)
? remain
: max_seg_pad; // the length every nms memcpy
/******NMS LOAD START******/
__bang_write_zero(score, seg_len);
__memcpy(score,
input_score_ptr + max_index_input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
/******NMS LOAD END******/
__bang_max(inter_x1, score, seg_len);
if (inter_x1[0] > max_box[0]) {
max_box[0] = inter_x1[0];
if (sizeof(IN_DT) == sizeof(half)) {
max_index =
((uint16_t *)inter_x1)[1] + max_index_input_offset +
i * max_seg_pad; // offset start from head of input_data
} else if (sizeof(IN_DT) == sizeof(float)) {
max_index =
((uint32_t *)inter_x1)[1] + max_index_input_offset +
i * max_seg_pad; // offset start from head of input_data
}
}
} // for repeat
// the max box's x1, y1, x2, y2 on every cluster
max_box[1] = input_x1_ptr[max_index];
max_box[2] = input_y1_ptr[max_index];
max_box[3] = input_x2_ptr[max_index];
max_box[4] = input_y2_ptr[max_index];
((uint32_t *)(max_box + 5))[0] = max_index;
// copy max box info to sram
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
__sync_all();
// copy all partial max to the sram of cluster 0
if (clusterId != 0) {
__memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT),
SRAM2SRAM, 0);
}
__sync_all();
// reduce between clusters to get the global max box
if (clusterId == 0) {
if (coreId == 0) {
__bang_write_zero(inter_x1, NMS_SIZE);
__memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
__bang_max(max_box, inter_x1, NMS_SIZE);
int max_cluster = (sizeof(IN_DT) == sizeof(half))
? ((uint16_t *)max_box)[1]
: ((uint32_t *)max_box)[1];
__memcpy(max_box, sram + max_cluster * REDUCE_NUM,
REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
__sync_cluster();
if (coreId == 0x80 && clusterDim > 1) {
// broadcast global max box to each cluster's sram
for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) {
__memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM,
cluster_idx);
}
}
__sync_cluster();
}
__sync_all();
// copy the global max box to max_box
__memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
if (algo == 0 || offset == 0.0) {
max_area = ((float)max_box[3] - (float)max_box[1]) *
((float)max_box[4] - (float)max_box[2]);
} else {
max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
((float)max_box[4] - (float)max_box[2] + offset);
}
global_max_index = ((uint32_t *)(max_box + 5))[0];
if (coreId != 0x80) {
input_score_ptr[global_max_index] = 0;
}
// by now, we get: max_score|max_index|max_box|max_area
/******FIND MAX END******/
/******NMS STORE START******/
// store to nram
if (float(max_box[0]) > thresh_score) {
OUT_DT *save_ptr;
int save_offset = 0;
int save_str_num = 0;
save_ptr = nram_save;
save_offset = nram_save_count;
save_str_num = nram_save_limit_count;
if (clusterId == 0 && coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0];
} else if (output_mode == 1) { // score, x1, y1, x2, y2
__memcpy(save_ptr + save_offset * INFO_NUM, max_box,
INFO_NUM * sizeof(IN_DT), NRAM2NRAM,
INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0);
} else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2---
__memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT),
NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT),
4);
}
}
nram_save_count++;
output_box_num++;
}
// store to sram/gdram
if (output_box_num != 0) {
if ((nram_save_count == nram_save_limit_count) ||
(float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) {
if (nram_save_count != 0) {
if (clusterId == 0 && coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
pvLock();
__memcpy(output_dram, nram_save,
nram_save_count * sizeof(uint32_t), NRAM2GDRAM);
pvUnlock();
output_dram += nram_save_count;
} else if (output_mode == 1) { // score, x1, y1, x2, y2
pvLock();
__memcpy(output_dram, nram_save,
nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM);
pvUnlock();
output_dram += nram_save_count * INFO_NUM;
} else if (output_mode ==
2) { // score---, x1---, y1---, x2---, y2---
pvLock();
__memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT),
NRAM2GDRAM, max_output_size * sizeof(IN_DT),
nram_save_limit_count * sizeof(IN_DT), 4);
pvUnlock();
output_dram += nram_save_count;
}
nram_save_count = 0;
}
}
} // if move data nram->sram/gdram
} // if output_box_num != 0
if (float(max_box[0]) <= thresh_score) {
if (clusterId == 0 && coreId == 0) {
loop_end_flag[0] = 1; // dram
}
}
__sync_all();
if (loop_end_flag[0] == 1) {
break;
}
/******NMS STORE END******/
// To avoid fp16 precision loss, convert fp16 to fp32 when computing IoU.
for (int i = 0; i <= repeat_iou_compute; i++) {
if (i == repeat_iou_compute && remain_iou_compute == 0) {
break;
}
int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute
: max_seg_iou_compute;
int cpy_len =
(i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute;
/******NMS LOAD START******/
__nramset((float *)score, seg_len, 0.0f);
int dt_offset = 0;
if (sizeof(IN_DT) == sizeof(float)) {
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
dt_offset = 0;
} else if (sizeof(IN_DT) == sizeof(half)) {
__nramset(x1, seg_len, half(0));
__memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), input_load_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
__bang_half2float((float *)score, (half *)x1, seg_len);
dt_offset = max_seg_iou_compute;
}
__memcpy(x1 + dt_offset,
input_x1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), input_load_dir,
max_seg_pad * sizeof(IN_DT), input_num_boxes * sizeof(IN_DT), 3);
/******NMS LOAD END******/
/******NMS COMPUTE START******/
if (sizeof(IN_DT) == sizeof(half)) {
__bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute,
seg_len);
__bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute,
seg_len);
}
// 1. compute IoU
// get the area_I
__nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1
__bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
seg_len); // inter_x1
__nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2
__bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
seg_len); // inter_x2
__bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len);
}
__bang_active_relu((float *)inter_x1, (float *)inter_x1,
seg_len); // inter_w
__nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1
__bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
seg_len); // inter_y1
__nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2
__bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
seg_len); // inter_y2
__bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
}
__bang_active_relu((float *)inter_y1, (float *)inter_y1,
seg_len); // inter_h
__bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
seg_len); // area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
__bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
__bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
__bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len);
}
__bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
seg_len); // area
// get the area_U: area + max_area - area_I
__bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area),
seg_len);
__bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
seg_len); // area_U
// 2. select the box
// If IoU is greater than thresh_iou, zero the score to suppress the box.
// Since IoU = area_I / area_U, the test is performed as
// area_U >= area_I * (1 / thresh_iou) to avoid a per-element division.
if (thresh_iou > 0.0) {
__bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
seg_len);
} else {
__bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou,
seg_len);
}
__bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
__bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
/******NMS COMPUTE END******/
if (sizeof(IN_DT) == 2) {
__bang_float2half_rd((half *)score, (float *)score, seg_len);
}
pvLock();
__memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
cpy_len * sizeof(IN_DT), input_store_dir,
cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
pvUnlock();
} // for repeat
} // for max_output_size
}
__mlu_global__ void MLUUionXKernelNMS(
const void *input_boxes, const void *input_confidence,
const int input_num_boxes, const int input_layout, const int input_stride,
const int max_output_size, const float iou_threshold,
const float confidence_threshold, const float offset,
const cnrtDataType_t data_type_input, const int output_mode, const int algo,
void *workspace, void *result_num, void *output) {
int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
int32_t *loop_end_flag =
(int32_t *)((char *)workspace +
INFO_NUM * input_num_boxes * input_dwidth);
int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth;
int available_sram_size = SIZE_SRAM_BUF - reduce_sram_size;
int cluster_score_size = input_num_boxes * input_dwidth;
int cluster_boxes_size = input_num_boxes * 4 * input_dwidth;
char *sram_score = (char *)sram_buffer + reduce_sram_size;
char *sram_boxes =
(char *)sram_buffer + reduce_sram_size + cluster_score_size;
Addr input_ram = GDRAM;
if ((cluster_score_size + cluster_boxes_size) < available_sram_size) {
input_ram = SRAM;
__memcpy(sram_score, input_confidence, cluster_score_size, GDRAM2SRAM);
__memcpy(sram_boxes, input_boxes, cluster_boxes_size, GDRAM2SRAM);
} else {
__memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM);
}
__sync_cluster();
uint32_t output_box_num = 0;
if (output_mode == 0) {
uint32_t *output_dram = (uint32_t *)output;
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
half *score_data;
half *boxes_data;
score_data =
(input_ram == SRAM) ? (half *)sram_score : (half *)workspace;
boxes_data =
(input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
case CNRT_FLOAT32: {
float *score_data;
float *boxes_data;
score_data =
(input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data =
(input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
}
} else {
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
half *output_dram = (half *)output;
half *score_data;
half *boxes_data;
score_data =
(input_ram == SRAM) ? (half *)sram_score : (half *)workspace;
boxes_data =
(input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
case CNRT_FLOAT32: {
float *output_dram = (float *)output;
float *score_data;
float *boxes_data;
score_data =
(input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
boxes_data =
(input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
boxes_data, input_ram, input_layout, input_num_boxes,
input_stride, max_output_size, iou_threshold,
confidence_threshold, offset, output_mode, algo);
((uint32_t *)result_num)[0] = output_box_num;
}; break;
}
}
}
void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_ptr,
const void *scores_ptr, const int input_num_boxes,
const int input_stride, const int max_output_boxes,
const float iou_threshold, const float offset,
void *workspace_ptr, void *output_size_ptr, void *output_ptr) {
switch (k_type) {
default: { return; }
case CNRT_FUNC_TYPE_BLOCK:
case CNRT_FUNC_TYPE_UNION1: {
MLUUnion1KernelNMS<<<k_dim, k_type, queue>>>(
boxes_ptr, scores_ptr, input_num_boxes, input_stride,
max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0,
/*output_mode=*/0,
/*input_layout=*/1, workspace_ptr, output_size_ptr, output_ptr,
data_type_input, offset, /*algo=*/1);
}; break;
case CNRT_FUNC_TYPE_UNION2:
case CNRT_FUNC_TYPE_UNION4:
case CNRT_FUNC_TYPE_UNION8:
case CNRT_FUNC_TYPE_UNION16: {
MLUUionXKernelNMS<<<k_dim, k_type, queue>>>(
boxes_ptr, scores_ptr, input_num_boxes, /*input_layout=*/1,
input_stride, max_output_boxes, iou_threshold,
/*confidence_threshold=*/0.0, offset, data_type_input,
/*output_mode=*/0, /*algo=*/1, workspace_ptr, output_size_ptr,
output_ptr);
}; break;
}
}
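For reference, the greedy selection rule that nms_detection_ux implements on the MLU can be summarized by a small host-side sketch. This is illustrative only, assuming thresh_iou > 0 and a plain array-of-coordinates layout; the helper name and signature are ours, not part of mmcv.
#include <algorithm>
#include <vector>
// Greedy NMS over boxes stored as x1[], y1[], x2[], y2[] with scores[].
// Keeps at most max_output boxes; 'offset' mirrors the kernel's optional
// +offset on widths and heights (algo == 1).
std::vector<int> nms_cpu_sketch(std::vector<float> scores,
                                const std::vector<float> &x1,
                                const std::vector<float> &y1,
                                const std::vector<float> &x2,
                                const std::vector<float> &y2,
                                float thresh_iou, float thresh_score,
                                int max_output, float offset = 0.f) {
  std::vector<int> keep;
  const int n = static_cast<int>(scores.size());
  if (n == 0) return keep;
  while (static_cast<int>(keep.size()) < max_output) {
    int best = static_cast<int>(
        std::max_element(scores.begin(), scores.end()) - scores.begin());
    if (scores[best] <= thresh_score) break;  // same stop condition as the kernel
    keep.push_back(best);
    float max_area =
        (x2[best] - x1[best] + offset) * (y2[best] - y1[best] + offset);
    for (int i = 0; i < n; ++i) {
      if (scores[i] <= 0.f) continue;
      float iw = std::min(x2[i], x2[best]) - std::max(x1[i], x1[best]) + offset;
      float ih = std::min(y2[i], y2[best]) - std::max(y1[i], y1[best]) + offset;
      float inter = std::max(iw, 0.f) * std::max(ih, 0.f);
      float area = (x2[i] - x1[i] + offset) * (y2[i] - y1[i] + offset);
      float uni = area + max_area - inter;
      // suppress when IoU > thresh_iou, i.e. when area_U < area_I / thresh_iou
      if (uni < inter / thresh_iou) scores[i] = 0.f;
    }
    scores[best] = 0.f;  // the kept box never competes again
  }
  return keep;
}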
mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu
0 → 100644
View file @
fdeee889
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include "psamask_utils.hpp"
#define COMPUTE_COUNT_ALIGN 64
__nram__ char buf[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void swap(T &a, T &b) {
T tmp = a;
a = b;
b = tmp;
}
template <typename T>
__mlu_func__ void storeDataFromNramToDram(T *dst, const T *src,
const PositionInCore &position,
const Shape &shape_full) {
int n_offset = shape_full.h * shape_full.w * shape_full.c;
int h_offset = shape_full.w * shape_full.c;
int w_offset = shape_full.c;
int n_seg = position.n_end - position.n_start;
int h_seg = position.h_end - position.h_start;
int w_seg = position.w_end - position.w_start;
int size = h_seg * w_seg * shape_full.c;
__memcpy(dst + position.n_start * n_offset + position.h_start * h_offset +
position.w_start * w_offset,
src, size * sizeof(T), NRAM2GDRAM, n_offset * sizeof(T),
size * sizeof(T), n_seg - 1);
}
template <typename T>
__mlu_func__ void loadDataFromDramToNram(T *dst, const T *src,
const PositionInCore &position,
const Shape &shape_full) {
int n_offset = shape_full.h * shape_full.w * shape_full.c;
int h_offset = shape_full.w * shape_full.c;
int w_offset = shape_full.c;
int n_seg = position.n_end - position.n_start;
int h_seg = position.h_end - position.h_start;
int w_seg = position.w_end - position.w_start;
int size = h_seg * w_seg * shape_full.c;
__memcpy(dst,
src + position.n_start * n_offset + position.h_start * h_offset +
position.w_start * w_offset,
size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T),
n_seg - 1);
}
// transpose the data from A*B*C*(D*E) to A*D*E*(B*C)
template <typename T>
__mlu_func__ void transposeData(T *dst, T *src, const Shape &shape_seg) {
int align_c = CEIL_ALIGN(shape_seg.c, COMPUTE_COUNT_ALIGN / sizeof(T));
int align_hw =
CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
for (int i = 0; i < shape_seg.n; ++i) {
__bang_transpose(dst, src, align_hw, align_c);
dst += align_hw * align_c;
src += align_hw * align_c;
}
}
template <typename T>
__mlu_func__ void psamaskCollectForward(
const T *x_dram, T *y_dram, const PositionInCore &position,
const Shape &x_full, const Shape &y_full, const Shape &shape_seg,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
T *x_nram = (T *)buf;
T *y_nram =
x_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * x_full.c,
COMPUTE_COUNT_ALIGN / sizeof(T));
loadDataFromDramToNram(x_nram, x_dram, position, x_full);
// fill zeros to output
int elem_count =
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c,
NFU_ALIGN_SIZE / sizeof(T));
__nramset(y_nram, elem_count, (T)0);
int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c;
int y_h_offset = shape_seg.w * shape_seg.c;
int y_w_offset = shape_seg.c;
int x_n_offset = shape_seg.h * shape_seg.w * x_full.c;
int y_c_offset = 1;
int x_h_offset = shape_seg.w * x_full.c;
int x_w_offset = x_full.c;
int x_c_offset = 1;
int x_start = 0;
int y_start = 0;
for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
for (int widx = 0; widx < shape_seg.w; ++widx) {
int h_abs = hidx + position.h_start;
int w_abs = widx + position.w_start;
int y_offset = y_start;
int x_offset = x_start;
y_offset += hidx * y_h_offset + widx * y_w_offset;
x_offset += hidx * x_h_offset + widx * x_w_offset;
const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
const int hend = x_full.h + half_h_mask - h_abs < h_mask
? x_full.h + half_h_mask - h_abs
: h_mask;
const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
const int wend = x_full.w + half_w_mask - w_abs < w_mask
? x_full.w + half_w_mask - w_abs
: w_mask;
// (h, w) is mask-indexed;
// (h + h_abs - half_h_mask, w + w_abs - half_w_mask) is feature-indexed
y_offset += ((hstart + h_abs - half_h_mask) * x_full.w + wstart +
w_abs - half_w_mask) *
y_c_offset;
x_offset += (hstart * w_mask + wstart) * x_c_offset;
int count = wend - wstart;
__memcpy(y_nram + y_offset, x_nram + x_offset, count * sizeof(T),
NRAM2NRAM, y_c_offset * x_full.w * sizeof(T),
x_c_offset * w_mask * sizeof(T), hend - hstart - 1);
}
}
y_start += y_n_offset;
x_start += x_n_offset;
}
storeDataFromNramToDram(y_dram, y_nram, position, y_full);
}
template <typename T>
__mlu_func__ void psamaskDistributeForward(
const T *x_dram, T *y_dram, const PositionInCore &position,
const Shape &x_full, const Shape &y_full, const Shape &shape_seg,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
T *x_nram = (T *)buf;
T *y_nram_temp =
x_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * x_full.c,
COMPUTE_COUNT_ALIGN / sizeof(T));
loadDataFromDramToNram(x_nram, x_dram, position, x_full);
// fill zeros to output
int align_c = CEIL_ALIGN(y_full.c, COMPUTE_COUNT_ALIGN / sizeof(T));
int align_hw =
CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
int elem_count =
CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T));
__nramset(y_nram_temp, elem_count, (T)0);
int y_n_offset = align_hw * align_c;
int y_h_offset = shape_seg.w * align_c;
int y_w_offset = align_c;
int y_c_offset = 1;
int x_n_offset = shape_seg.h * shape_seg.w * x_full.c;
int x_h_offset = shape_seg.w * x_full.c;
int x_w_offset = x_full.c;
int x_c_offset = 1;
int h_feature = y_full.h;
int w_feature = y_full.w;
int y_start = 0;
int x_start = 0;
for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
for (int widx = 0; widx < shape_seg.w; ++widx) {
int h_abs = hidx + position.h_start;
int w_abs = widx + position.w_start;
int y_offset = y_start;
int x_offset = x_start;
y_offset += hidx * y_h_offset + widx * y_w_offset;
x_offset += hidx * x_h_offset + widx * x_w_offset;
const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
const int hend = h_feature + half_h_mask - h_abs < h_mask
? h_feature + half_h_mask - h_abs
: h_mask;
const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
const int wend = w_feature + half_w_mask - w_abs < w_mask
? w_feature + half_w_mask - w_abs
: w_mask;
// (h, w) is mask-indexed;
// (h + h_abs - half_h_mask, w + w_abs - half_w_mask) is feature-indexed
y_offset += ((hstart + h_abs - half_h_mask) * x_full.w + wstart +
w_abs - half_w_mask) *
y_c_offset;
x_offset += (hstart * w_mask + wstart) * x_c_offset;
int count = wend - wstart;
__memcpy(y_nram_temp + y_offset, x_nram + x_offset, count * sizeof(T),
NRAM2NRAM, y_c_offset * w_feature * sizeof(T),
x_c_offset * w_mask * sizeof(T), hend - hstart - 1);
}
}
y_start += y_n_offset;
x_start += x_n_offset;
}
// transpose y
T *y_nram = y_nram_temp + shape_seg.n * align_hw * align_c;
Shape y_seg{shape_seg.n, shape_seg.h, shape_seg.w, y_full.c};
transposeData(y_nram, y_nram_temp, y_seg);
swap(align_c, align_hw);
// store y from nram to dram
int y_n_offset_full = y_full.h * y_full.w * y_full.c;
int y_w_offset_full = y_full.c;
int y_c_offset_full = 1;
int y_dram_start =
position.n_start * y_n_offset_full +
(position.h_start * y_full.w + position.w_start) * y_c_offset_full;
int y_nram_start = 0;
for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
int y_dram_offset = y_dram_start + nidx * y_n_offset_full;
int y_nram_offset = y_nram_start + nidx * align_hw * align_c;
__memcpy(y_dram + y_dram_offset, y_nram + y_nram_offset,
shape_seg.h * shape_seg.w * sizeof(T), NRAM2GDRAM,
y_w_offset_full * sizeof(T), align_c * sizeof(T),
h_feature * w_feature - 1);
}
}
template <typename T>
__mlu_func__ void psamaskCollectBackward(
const T *dy_dram, T *dx_dram, const PositionInCore &position,
const Shape &dy_full, const Shape &dx_full, const Shape &shape_seg,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
T *dy_nram = (T *)buf;
T *dx_nram =
dy_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * dy_full.c,
COMPUTE_COUNT_ALIGN / sizeof(T));
loadDataFromDramToNram(dy_nram, dy_dram, position, dy_full);
// fill zeros to output
int elem_count =
CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c,
NFU_ALIGN_SIZE / sizeof(T));
__nramset(dx_nram, elem_count, (T)0);
int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c;
int dy_h_offset = shape_seg.w * dy_full.c;
int dy_w_offset = dy_full.c;
int dy_c_offset = 1;
int dx_n_offset = shape_seg.h * shape_seg.w * dx_full.c;
int dx_h_offset = shape_seg.w * dx_full.c;
int dx_w_offset = dx_full.c;
int dx_c_offset = 1;
int h_feature = dy_full.h;
int w_feature = dy_full.w;
int dy_start = 0;
int dx_start = 0;
for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
for (int widx = 0; widx < shape_seg.w; ++widx) {
int h_abs = hidx + position.h_start;
int w_abs = widx + position.w_start;
int dy_offset = dy_start;
int dx_offset = dx_start;
dy_offset += hidx * dy_h_offset + widx * dy_w_offset;
dx_offset += hidx * dx_h_offset + widx * dx_w_offset;
const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
const int hend = h_feature + half_h_mask - h_abs < h_mask
? h_feature + half_h_mask - h_abs
: h_mask;
const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
const int wend = w_feature + half_w_mask - w_abs < w_mask
? w_feature + half_w_mask - w_abs
: w_mask;
// (h, w) is mask-indexed;
// (h + h_abs - half_h_mask, w + w_abs - half_w_mask) is feature-indexed
dy_offset += ((hstart + h_abs - half_h_mask) * w_feature + wstart +
w_abs - half_w_mask) *
dy_c_offset;
dx_offset += (hstart * w_mask + wstart) * dx_c_offset;
int count = wend - wstart;
__memcpy(dx_nram + dx_offset, dy_nram + dy_offset, count * sizeof(T),
NRAM2NRAM, dx_c_offset * w_mask * sizeof(T),
dy_c_offset * w_feature * sizeof(T), hend - hstart - 1);
}
}
dy_start += dy_n_offset;
dx_start += dx_n_offset;
}
storeDataFromNramToDram(dx_dram, dx_nram, position, dx_full);
}
template <typename T>
__mlu_func__ void psamaskDistributeBackward(
const T *dy_dram, T *dx_dram, const PositionInCore &position,
const Shape &dy_full, const Shape &dx_full, const Shape &shape_seg,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
// load dy from dram to nram
T *dy_nram_temp = (T *)buf;
int dy_n_offset_full = dy_full.h * dy_full.w * dy_full.c;
int dy_c_offset_full = 1;
int h_feature = dy_full.h;
int w_feature = dy_full.w;
int align_c =
CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
int align_hw =
CEIL_ALIGN(h_feature * w_feature, COMPUTE_COUNT_ALIGN / sizeof(T));
int dy_dram_start =
position.n_start * dy_n_offset_full +
(position.h_start * w_feature + position.w_start) * dy_c_offset_full;
int dy_nram_start = 0;
for (int i = 0; i < shape_seg.n; ++i) {
int dy_nram_offset = dy_nram_start + i * (align_hw * align_c);
int dy_dram_offset = dy_dram_start + i * dy_n_offset_full;
__memcpy(dy_nram_temp + dy_nram_offset, dy_dram + dy_dram_offset,
shape_seg.h * shape_seg.w * sizeof(T), GDRAM2NRAM,
align_c * sizeof(T), dy_full.c * sizeof(T),
h_feature * w_feature - 1);
}
T *dy_nram = dy_nram_temp + shape_seg.n * align_hw * align_c;
Shape dy_seg{shape_seg.n, h_feature, w_feature, shape_seg.h * shape_seg.w};
transposeData(dy_nram, dy_nram_temp, dy_seg);
swap(align_c, align_hw);
// fill zeros to dx
T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c;
int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c;
__nramset(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)), (T)0);
int dy_n_offset_seg = align_hw * align_c;
int dy_h_offset_seg = shape_seg.w * align_c;
int dy_w_offset_seg = align_c;
int dy_c_offset_seg = 1;
int dx_n_offset_seg = shape_seg.h * shape_seg.w * shape_seg.c;
int dx_h_offset_seg = shape_seg.w * shape_seg.c;
int dx_w_offset_seg = shape_seg.c;
int dx_c_offset_seg = 1;
int dy_start = 0;
int dx_start = 0;
for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
for (int widx = 0; widx < shape_seg.w; ++widx) {
int h_abs = hidx + position.h_start;
int w_abs = widx + position.w_start;
int dy_offset = dy_start;
int dx_offset = dx_start;
dy_offset += hidx * dy_h_offset_seg + widx * dy_w_offset_seg;
dx_offset += hidx * dx_h_offset_seg + widx * dx_w_offset_seg;
const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
const int hend = h_feature + half_h_mask - h_abs < h_mask
? h_feature + half_h_mask - h_abs
: h_mask;
const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
const int wend = w_feature + half_w_mask - w_abs < w_mask
? w_feature + half_w_mask - w_abs
: w_mask;
// (h, w) is mask-indexed;
// (h + h_abs - half_h_mask, w + w_abs - half_w_mask) is feature-indexed
dy_offset += ((hstart + h_abs - half_h_mask) * w_feature + wstart +
w_abs - half_w_mask) *
dy_c_offset_seg;
dx_offset += (hstart * w_mask + wstart) * dx_c_offset_seg;
int count = wend - wstart;
__memcpy(dx_nram + dx_offset, dy_nram + dy_offset, count * sizeof(T),
NRAM2NRAM, w_mask * dx_c_offset_seg * sizeof(T),
w_feature * dy_c_offset_seg * sizeof(T), hend - hstart - 1);
}
}
dy_start += dy_n_offset_seg;
dx_start += dx_n_offset_seg;
}
storeDataFromNramToDram(dx_dram, dx_nram, position, dx_full);
}
template <typename T>
__mlu_func__ void psamaskBase(const T *input_dram, T *output_dram,
const Shape &input_full, const Shape &output_full,
LimitParam &limit, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition,
const bool is_forward, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const int n_per_core,
const int h_per_core, const int n_per_cluster,
const int h_per_cluster) {
PositionInCore position_full;
PositionInCore position_seg;
position_full.w_start = 0;
position_full.w_end = output_full.w;
int n_num_in_cluster = n_per_cluster;
int h_num_in_cluster = h_per_cluster;
switch (cluster_partition) {
case PARTITION_N: {
position_full.h_start = 0;
position_full.h_end = input_full.h;
position_full.n_start = taskIdY * n_per_cluster;
int cluster_need = (input_full.n + n_per_cluster - 1) / n_per_cluster;
if (taskIdY >= cluster_need) return;
int n_remainder = input_full.n - (cluster_need - 1) * n_per_cluster;
n_num_in_cluster =
(taskIdY == cluster_need - 1) ? n_remainder : n_per_cluster;
position_full.n_end = position_full.n_start + n_num_in_cluster;
}; break;
case PARTITION_H: {
position_full.n_start = 0;
position_full.n_end = input_full.n;
position_full.h_start = taskIdY * h_per_cluster;
int cluster_need = (input_full.h + h_per_cluster - 1) / h_per_cluster;
if (taskIdY >= cluster_need) return;
int h_remainder = input_full.h - (cluster_need - 1) * h_per_cluster;
h_num_in_cluster =
(taskIdY == cluster_need - 1) ? h_remainder : h_per_cluster;
position_full.h_end = position_full.h_start + h_num_in_cluster;
}; break;
}
switch (core_partition) {
case PARTITION_N: {
position_full.n_start += taskIdX * n_per_core;
int core_need = (n_num_in_cluster + n_per_core - 1) / n_per_core;
if (taskIdX >= core_need) return;
int n_remainder = n_num_in_cluster - (core_need - 1) * n_per_core;
position_full.n_end =
position_full.n_start +
((taskIdX == core_need - 1) ? n_remainder : n_per_core);
}; break;
case PARTITION_H: {
position_full.h_start += taskIdX * h_per_core;
int core_need = (h_num_in_cluster + h_per_core - 1) / h_per_core;
if (taskIdX >= core_need) return;
int h_remainder = h_num_in_cluster - (core_need - 1) * h_per_core;
position_full.h_end =
position_full.h_start +
((taskIdX == core_need - 1) ? h_remainder : h_per_core);
}; break;
}
// the numbers of n, h and w elements to be processed by the current core
int shape_core_n = position_full.n_end - position_full.n_start;
int shape_core_h = position_full.h_end - position_full.h_start;
int shape_core_w = input_full.w;
limit.n = limit.n < shape_core_n ? limit.n : shape_core_n;
limit.h = limit.h < shape_core_h ? limit.h : shape_core_h;
limit.w = limit.w < shape_core_w ? limit.w : shape_core_w;
// load the data to nram according to the limit
for (int nidx = position_full.n_start; nidx < position_full.n_end;
nidx += limit.n) {
position_seg.n_start = nidx;
position_seg.n_end =
position_seg.n_start + (position_full.n_end - nidx < limit.n
? position_full.n_end - nidx
: limit.n);
for (int hidx = position_full.h_start; hidx < position_full.h_end;
hidx += limit.h) {
position_seg.h_start = hidx;
position_seg.h_end =
position_seg.h_start + (position_full.h_end - hidx < limit.h
? position_full.h_end - hidx
: limit.h);
for (int widx = position_full.w_start; widx < position_full.w_end;
widx += limit.w) {
position_seg.w_start = widx;
position_seg.w_end =
position_seg.w_start + (position_full.w_end - widx < limit.w
? position_full.w_end - widx
: limit.w);
// record the segment of output except the size of channel
// channel segments of output and input are the same
Shape shape_seg;
shape_seg.n = position_seg.n_end - position_seg.n_start;
shape_seg.h = position_seg.h_end - position_seg.h_start;
shape_seg.w = position_seg.w_end - position_seg.w_start;
shape_seg.c = output_full.c;
switch (psa_type) {
case COLLECT: {
if (is_forward) {
psamaskCollectForward(input_dram, output_dram, position_seg,
input_full, output_full, shape_seg, h_mask,
w_mask, half_h_mask, half_w_mask);
} else {
psamaskCollectBackward(input_dram, output_dram, position_seg,
input_full, output_full, shape_seg, h_mask,
w_mask, half_h_mask, half_w_mask);
}
} break;
case DISTRIBUTE: {
if (is_forward) {
psamaskDistributeForward(input_dram, output_dram, position_seg,
input_full, output_full, shape_seg,
h_mask, w_mask, half_h_mask,
half_w_mask);
} else {
psamaskDistributeBackward(input_dram, output_dram, position_seg,
input_full, output_full, shape_seg,
h_mask, w_mask, half_h_mask,
half_w_mask);
}
} break;
}
}
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelPsamaskForward(
const T *x, T *y, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int x_c, const int y_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg) {
if (coreId == 0x80) {
return;
}
Shape x_full, y_full;
x_full.n = batch;
x_full.h = h_feature;
x_full.w = w_feature;
x_full.c = x_c;
y_full.n = batch;
y_full.h = h_feature;
y_full.w = w_feature;
y_full.c = y_c;
LimitParam limit;
limit.n = limit_n_seg;
limit.h = limit_h_seg;
limit.w = limit_w_seg;
psamaskBase(x, y, x_full, y_full, limit, psa_type, core_partition,
cluster_partition, true, h_mask, w_mask, half_h_mask, half_w_mask,
n_per_core, h_per_core, n_per_cluster, h_per_cluster);
}
template <typename T>
__mlu_global__ void MLUUnion1KernelPsamaskBackward(
const T *dy, T *dx, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int dx_c, const int dy_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg) {
if (coreId == 0x80) {
return;
}
Shape dy_full, dx_full;
dx_full.n = batch;
dx_full.h = h_feature;
dx_full.w = w_feature;
dx_full.c = dx_c;
dy_full.n = batch;
dy_full.h = h_feature;
dy_full.w = w_feature;
dy_full.c = dy_c;
LimitParam limit;
limit.n = limit_n_seg;
limit.h = limit_h_seg;
limit.w = limit_w_seg;
psamaskBase(dy, dx, dy_full, dx_full, limit, psa_type, core_partition,
cluster_partition, false, h_mask, w_mask, half_h_mask,
half_w_mask, n_per_core, h_per_core, n_per_cluster,
h_per_cluster);
}
void KernelPsamaskForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *x, void *y, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int x_c, const int y_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg) {
MLUUnion1KernelPsamaskForward<<<k_dim, k_type, queue>>>(
static_cast<const float *>(x), static_cast<float *>(y), psa_type,
core_partition, cluster_partition, batch, h_feature, w_feature, h_mask,
w_mask, x_c, y_c, half_h_mask, half_w_mask, n_per_core, h_per_core,
n_per_cluster, h_per_cluster, limit_n_seg, limit_h_seg, limit_w_seg);
}
void KernelPsamaskBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *dy, void *dx, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int dx_c, const int dy_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg) {
MLUUnion1KernelPsamaskBackward<<<k_dim, k_type, queue>>>(
static_cast<const float *>(dy), static_cast<float *>(dx), psa_type,
core_partition, cluster_partition, batch, h_feature, w_feature, h_mask,
w_mask, dx_c, dy_c, half_h_mask, half_w_mask, n_per_core, h_per_core,
n_per_cluster, h_per_cluster, limit_n_seg, limit_h_seg, limit_w_seg);
}
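As a plain-CPU reference for the indexing used above: psamask COLLECT forward copies, for every spatial position, the valid h_mask x w_mask window of mask channels into the feature-indexed channels of the output. The sketch below uses our own helper name and an NHWC layout with y zero-initialized; half_h and half_w are assumed to be (mask - 1) / 2, which the kernel receives precomputed.
#include <algorithm>
#include <vector>
// x: (N, H, W, h_mask * w_mask), y: (N, H, W, H * W), y filled with zeros.
void psamask_collect_forward_cpu(const std::vector<float> &x,
                                 std::vector<float> &y, int N, int H, int W,
                                 int h_mask, int w_mask) {
  const int half_h = (h_mask - 1) / 2, half_w = (w_mask - 1) / 2;
  for (int n = 0; n < N; ++n)
    for (int h = 0; h < H; ++h)
      for (int w = 0; w < W; ++w) {
        const int hstart = std::max(half_h - h, 0);
        const int hend = std::min(H + half_h - h, h_mask);
        const int wstart = std::max(half_w - w, 0);
        const int wend = std::min(W + half_w - w, w_mask);
        for (int hm = hstart; hm < hend; ++hm)
          for (int wm = wstart; wm < wend; ++wm) {
            // (hm, wm) is mask-indexed; (hm + h - half_h, wm + w - half_w)
            // is the corresponding feature-indexed position.
            const int yc = (hm + h - half_h) * W + (wm + w - half_w);
            const int xc = hm * w_mask + wm;
            y[((n * H + h) * W + w) * (H * W) + yc] =
                x[((n * H + h) * W + w) * (h_mask * w_mask) + xc];
          }
      }
}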
mmcv/ops/csrc/common/mlu/psamask_utils.hpp
0 → 100644
View file @
fdeee889
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef PSAMASK_UTILS_HPP_
#define PSAMASK_UTILS_HPP_
typedef enum {
  COLLECT = 0,
  DISTRIBUTE = 1,
} PsamaskType;
typedef enum {
  PARTITION_N = 0,
  PARTITION_H = 1,
} DimPartitionType;
struct PartitionSeg {
  int h_per_cluster;
  int n_per_cluster;
  int h_per_core;
  int n_per_core;
  DimPartitionType cluster_partition;
  DimPartitionType core_partition;
};
struct Shape {
  int n;
  int h;
  int w;
  int c;
};
struct LimitParam {
  int n;
  int h;
  int w;
};
struct PositionInCore {
  int n_start;
  int n_end;
  int h_start;
  int h_end;
  int w_start;
  int w_end;
};
#endif // PSAMASK_UTILS_HPP_
mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
0 → 100644
View file @
fdeee889
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#define ROI_OFFSET 5
__nram__ char buffer[MAX_NRAM_SIZE];
namespace forward {
template <typename T>
__mlu_func__ void bilinearInterpolate(const int input_height,
const int input_width, T y, T x, T *w1,
T *w2, T *w3, T *w4, int *x_low,
int *x_high, int *y_low, int *y_high,
bool *empty) {
// deal with cases where the sampling point is outside the feature map boundary
if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) {
*empty = true;
return;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
int y_low_ = int(y);
int x_low_ = int(x);
if (y_low_ >= input_height - 1) {
*y_high = y_low_ = input_height - 1;
y = (T)y_low_;
} else {
*y_high = y_low_ + 1;
}
if (x_low_ >= input_width - 1) {
*x_high = x_low_ = input_width - 1;
x = T(x_low_);
} else {
*x_high = x_low_ + 1;
}
*y_low = y_low_;
*x_low = x_low_;
T ly = y - y_low_;
T lx = x - x_low_;
T hy = 1.0 - ly;
T hx = 1.0 - lx;
*w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
return;
}
template <typename T>
__mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
T *nram_out, const int roi_bin_grid_h,
const int roi_bin_grid_w, const T roi_start_h,
const T roi_start_w, const int ph,
const int pw, const T bin_size_h,
const T bin_size_w, const float count,
const int input_height, const int input_width,
const int channels, const int cyc_num,
const int max_elements) {
int cyc_channel = max_elements;
for (int i = 0; i < cyc_num; i++) {
int real_channel =
(i == cyc_num - 1) ? channels - i * cyc_channel : cyc_channel;
int align_channel = PAD_UP(real_channel, NFU_ALIGN_SIZE / sizeof(T));
__bang_write_zero(nram_out, align_channel);
uint32_t real_size = real_channel * sizeof(T);
int iy, ix;
for (iy = 0; iy < roi_bin_grid_h; iy++) {
// 1. compute the coordinates of the y axis in the current roi_bin_grid_h
T y = roi_start_h + ph * bin_size_h +
(T)(iy + 0.5) * bin_size_h / (T)(roi_bin_grid_h);
for (ix = 0; ix < roi_bin_grid_w; ix++) {
// 2. compute the coordinates of the x axis in the current
// roi_bin_grid_w
T x = roi_start_w + pw * bin_size_w +
(T)(ix + 0.5) * bin_size_w / (T)(roi_bin_grid_w);
// 3. compute the four weights (w1, w2, w3 and w4), the height indices (y_low
// and y_high) and width indices (x_low and x_high) of the input feature map
// for the current roi bin grid, and the flag (empty) which shows whether x, y
// are out of the input feature map range
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bool empty = false;
bilinearInterpolate(input_height, input_width, y, x, &w1, &w2, &w3, &w4,
&x_low, &x_high, &y_low, &y_high, &empty);
// 4. compute interpolation of the current roi bin grid
// tmp_cyc1, tmp_cyc2, tmp_cyc3 and tmp_cyc4 store the four neighbouring
// input pixel vectors used to compute the interpolation.
T *tmp_cyc1 = nram_in + cyc_channel;
T *tmp_cyc2 = nram_in + cyc_channel * 2;
T *tmp_cyc3 = nram_in + cyc_channel * 3;
T *tmp_cyc4 = nram_in + cyc_channel * 4;
if (empty) { // the sampling point is outside the feature map
__bang_write_zero(nram_in, align_channel);
} else {
__bang_write_zero(nram_in, align_channel);
uint32_t offset1 = (y_low * input_width + x_low) * channels;
uint32_t offset2 = (y_low * input_width + x_high) * channels;
uint32_t offset3 = (y_high * input_width + x_low) * channels;
uint32_t offset4 = (y_high * input_width + x_high) * channels;
T *input1 = (T *)input_core + offset1 + i * cyc_channel;
T *input2 = (T *)input_core + offset2 + i * cyc_channel;
T *input3 = (T *)input_core + offset3 + i * cyc_channel;
T *input4 = (T *)input_core + offset4 + i * cyc_channel;
// load the four pixels (p1, p2, p3 and p4) of input feature map to
// compute interpolation
__memcpy(tmp_cyc1, input1, real_size, GDRAM2NRAM);
__memcpy(tmp_cyc2, input2, real_size, GDRAM2NRAM);
__memcpy(tmp_cyc3, input3, real_size, GDRAM2NRAM);
__memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM);
// interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
__bang_mul_const(tmp_cyc1, tmp_cyc1, w1, align_channel);
__bang_mul_const(tmp_cyc2, tmp_cyc2, w2, align_channel);
__bang_mul_const(tmp_cyc3, tmp_cyc3, w3, align_channel);
__bang_mul_const(tmp_cyc4, tmp_cyc4, w4, align_channel);
__bang_add(nram_in, tmp_cyc1, nram_in, align_channel);
__bang_add(nram_in, tmp_cyc2, nram_in, align_channel);
__bang_add(nram_in, tmp_cyc3, nram_in, align_channel);
__bang_add(nram_in, tmp_cyc4, nram_in, align_channel);
}
// 5. accumulate the interpolated values of this sampling point into the
// bin sum.
__bang_add(nram_out, nram_in, nram_out, align_channel);
} // loop_roi_grid_w
} // loop_roi_grid_h
T count_value = (T)(1.0 / count);
__bang_mul_const(nram_out, nram_out, count_value, align_channel);
__memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM);
} // loop_cyc_num
}
template <typename T>
__mlu_func__ void roialignForwardAvg(
T *input, T *rois, T *output, const bool aligned, const int channels,
const int pooled_height, const int pooled_width, const int input_height,
const int input_width, const int sampling_ratio, const T spatial_scale,
const int num_rois) {
// find the channel limit: the nram space is divided into 6 parts, namely the
// output, the input, and 4 temporaries holding the weighted interpolation
// terms (w1, w2, w3, w4)
// max_elements : 300 : float datatype : 27296, half datatype : 54592
// max_elements : 200 : float datatype : 16384, half datatype : 32768
int max_elements = (PAD_DOWN(MAX_NRAM_SIZE / 6, NFU_ALIGN_SIZE)) / sizeof(T);
int cyc_num = channels / max_elements + (int)(channels % max_elements != 0);
T offset = aligned ? (T)0.5 : (T)0.0;
int task_num = num_rois * pooled_height * pooled_width;
T *nram_out = (T *)buffer;
T *nram_in = nram_out + max_elements;
if (task_num < taskDim) {
if (taskId >= task_num) {
return;
}
}
for (int bin_idx = taskId; bin_idx < task_num; bin_idx = bin_idx + taskDim) {
if (bin_idx >= task_num) {
return;
}
// (n, ph, pw) indexes one channel vector (bin) in the pooled output
int pw = bin_idx % pooled_width;
int ph = (bin_idx / pooled_width) % pooled_height;
int n = bin_idx / pooled_width / pooled_height;
T *roi_id_tmp = rois + n * ROI_OFFSET;
// 1. compute width and height of roi region.
int batch_idx = (int)roi_id_tmp[0];
T roi_x1 = roi_id_tmp[1];
T roi_y1 = roi_id_tmp[2];
T roi_x2 = roi_id_tmp[3];
T roi_y2 = roi_id_tmp[4];
T roi_start_w = roi_x1 * spatial_scale - offset;
T roi_start_h = roi_y1 * spatial_scale - offset;
T roi_end_w = roi_x2 * spatial_scale - offset;
T roi_end_h = roi_y2 * spatial_scale - offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (!aligned) {
roi_width = roi_width > (T)(1.0) ? roi_width : (T)(1.0);
roi_height = roi_height > (T)(1.0) ? roi_height : (T)(1.0);
}
// 2. compute float-type width and height of roi bin region.
T bin_size_w = (T)roi_width / (T)pooled_width;
T bin_size_h = (T)roi_height / (T)pooled_height;
// 3. compute int-type width and height of roi bin region.
int roi_bin_grid_h, roi_bin_grid_w;
roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: int(ceilf(roi_height / pooled_height));
roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: int(ceilf(roi_width / pooled_width));
float count = (float)((roi_bin_grid_h * roi_bin_grid_w) > 1
? roi_bin_grid_h * roi_bin_grid_w
: 1.0);
T *input_core = input + batch_idx * channels * input_width * input_height;
T *output_core = output + bin_idx * channels;
// 4. compute the average pooled value of this bin over all sampling points.
computeChannel(input_core, nram_in, output_core, nram_out, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, ph, pw, bin_size_h,
bin_size_w, count, input_height, input_width, channels,
cyc_num, max_elements);
}
}
__mlu_global__ void MLUUnion1KernelRoiAlignAvg(
const void *input, const void *rois, const int channels, const bool aligned,
const int pooled_height, const int pooled_width, const int input_height,
const int input_width, const int sampling_ratio, const float spatial_scale,
const int num_rois, const cnrtDataType_t data_type, void *output) {
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
switch (data_type) {
case CNRT_FLOAT16: {
roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned,
channels, pooled_height, pooled_width, input_height,
input_width, sampling_ratio,
(half)spatial_scale, num_rois);
}; break;
case CNRT_FLOAT32: {
roialignForwardAvg((float *)input, (float *)rois, (float *)output,
aligned, channels, pooled_height, pooled_width,
input_height, input_width, sampling_ratio,
(float)spatial_scale, num_rois);
}; break;
default:
break;
}
return;
}
} // namespace forward
namespace backward {
__mlu_func__ void bilinearInterpolateGradient(int height, int width, float y,
float x, float *w1, float *w2,
float *w3, float *w4, int *x_low,
int *x_high, int *y_low,
int *y_high) {
if (y < -1.0 || y > height || x < -1.0 || x > width) {
*w1 = 0.0, *w2 = 0.0, *w3 = 0.0, *w4 = 0.0;
*x_low = -1, *x_high = -1, *y_low = -1, *y_high = -1;
return;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
*y_low = (int)y;
*x_low = (int)x;
if (*y_low >= height - 1) {
*y_high = height - 1, *y_low = height - 1;
y = (float)(*y_low);
} else {
*y_high = *y_low + 1;
}
if (*x_low >= width - 1) {
*x_high = width - 1, *x_low = width - 1;
x = (float)(*x_low);
} else {
*x_high = *x_low + 1;
}
float ly = y - *y_low, lx = x - *x_low;
float hy = 1.0 - ly, hx = 1.0 - lx;
*w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
return;
}
template <typename T>
__mlu_func__ void unionRoiAlignBp(
T *grads, T *boxes, T *grads_image, const int boxes_num, const int hi,
const int wi, const int c, const int no, const int ho, const int wo,
const float spatial_scale, const int sampling_ratio, const bool aligned) {
int c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T));
int deal_all = boxes_num * hi * wi;
int deal_this_core = deal_all / taskDim + (int)(taskId < deal_all % taskDim);
for (int i = 0; i < deal_this_core; ++i) {
int bhw_id = i * taskDim + taskId;
int box_id = bhw_id / (hi * wi);
int ih = (bhw_id / wi) % hi;
int iw = bhw_id % wi;
T *box = boxes + box_id * 5;
int image_id = (int)box[0];
T *image_offset = grads_image + image_id * ho * wo * c;
T *grads_ = grads + box_id * hi * wi * c + ih * wi * c + iw * c;
float offset = aligned ? 0.5 : 0.0;
float x1 = box[1] * spatial_scale - offset;
float y1 = box[2] * spatial_scale - offset;
float x2 = box[3] * spatial_scale - offset;
float y2 = box[4] * spatial_scale - offset;
float roi_width = x2 - x1;
float roi_height = y2 - y1;
if (!aligned) {
roi_width = (roi_width > 1.0) ? roi_width : 1.0;
roi_height = (roi_height > 1.0) ? roi_height : 1.0;
}
float bin_size_h = roi_height / hi;
float bin_size_w = roi_width / wi;
int roi_grid_h =
(sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_height / hi);
int roi_grid_w =
(sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_width / wi);
const T count = roi_grid_h * roi_grid_w;
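// Each of the four neighbouring pixels receives grad * w_i / count through an
// atomic add, so bins handled by different cores can safely accumulate into
// the same output pixel.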
if (c_align * sizeof(T) * 2 <= MAX_NRAM_SIZE) {
for (int iy = 0; iy < roi_grid_h; ++iy) {
const float y =
y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h;
for (int ix = 0; ix < roi_grid_w; ++ix) {
const float x =
x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w;
float w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low,
&x_high, &y_low, &y_high);
if (x_low >= 0 && y_low >= 0) {
__memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM);
__bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w1,
c_align);
__bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_low * wo * c + x_low * c,
(T *)buffer + c_align, c);
__bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w2,
c_align);
__bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_low * wo * c + x_high * c,
(T *)buffer + c_align, c);
__bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w3,
c_align);
__bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_high * wo * c + x_low * c,
(T *)buffer + c_align, c);
__bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w4,
c_align);
__bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
1 / count, c_align);
__bang_atomic_add((T *)buffer + c_align,
image_offset + y_high * wo * c + x_high * c,
(T *)buffer + c_align, c);
} // x_low && y_low
} // ix
} // iy
} else {
for (int iy = 0; iy < roi_grid_h; ++iy) {
const float y =
y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h;
for (int ix = 0; ix < roi_grid_w; ++ix) {
const float x =
x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w;
float w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low,
&x_high, &y_low, &y_high);
if (x_low >= 0 && y_low >= 0) {
int deal_once =
PAD_DOWN(MAX_NRAM_SIZE / 2, NFU_ALIGN_SIZE) / sizeof(T);
int c_repeat = c / deal_once + (int)(c % deal_once != 0);
for (int i = 0; i < c_repeat; ++i) {
int deal_c = deal_once;
int align_c = deal_once;
if (i == c_repeat - 1) {
deal_c = c - i * deal_once;
align_c = c_align - i * deal_once;
}
__memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T),
GDRAM2NRAM);
__bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w1,
align_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_low * wo * c + x_low * c + i * deal_once,
(T *)buffer + align_c, deal_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w2,
align_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_low * wo * c + x_high * c + i * deal_once,
(T *)buffer + align_c, deal_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w3,
align_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_high * wo * c + x_low * c + i * deal_once,
(T *)buffer + align_c, deal_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w4,
align_c);
__bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
1 / count, align_c);
__bang_atomic_add(
(T *)buffer + align_c,
image_offset + y_high * wo * c + x_high * c + i * deal_once,
(T *)buffer + align_c, deal_c);
} // for c_repeat
} // x_low >= 0 && y_low >= 0
} // ix
} // iy
} // if c
} // i
}
__mlu_global__ void MLUUnion1KernelRoiAlignBackward(
const void *grads, const void *boxes, void *grads_image,
const cnrtDataType_t dtype, const int boxes_num, const int hi, const int wi,
const int c, const int no, const int ho, const int wo,
const float spatial_scale, const int sampling_ratio, const bool aligned) {
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
switch (dtype) {
case CNRT_FLOAT16: {
unionRoiAlignBp((half *)grads, (half *)boxes, (half *)grads_image,
boxes_num, hi, wi, c, no, ho, wo, spatial_scale,
sampling_ratio, aligned);
}; break;
case CNRT_FLOAT32: {
unionRoiAlignBp((float *)grads, (float *)boxes, (float *)grads_image,
boxes_num, hi, wi, c, no, ho, wo, spatial_scale,
sampling_ratio, aligned);
}; break;
default: { return; }
}
}
} // namespace backward
void KernelRoiAlign(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *input, const void *rois, const int channels,
const bool aligned, const int pooled_height,
const int pooled_width, const int input_height,
const int input_width, const int sampling_ratio,
const float spatial_scale, const int num_rois,
void *output) {
forward::MLUUnion1KernelRoiAlignAvg<<<k_dim, k_type, queue>>>(
input, rois, channels, aligned, pooled_height, pooled_width, input_height,
input_width, sampling_ratio, spatial_scale, num_rois, d_type, output);
}
void KernelRoiAlignBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t dtype,
const void *grads, const void *boxes,
void *grads_image, const int boxes_num,
const int hi, const int wi, const int c,
const int no, const int ho, const int wo,
const float spatial_scale, const int sampling_ratio,
const bool aligned) {
backward::MLUUnion1KernelRoiAlignBackward<<<k_dim, k_type, queue>>>(
grads, boxes, grads_image, dtype, boxes_num, hi, wi, c, no, ho, wo,
spatial_scale, sampling_ratio, aligned);
}
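For reference, the per-bin math in computeChannel / roialignForwardAvg above reduces to the following single-channel CPU sketch: average the bilinear interpolation of roi_bin_grid_h x roi_bin_grid_w sampling points, with samples outside the feature map contributing zero. The helper name and the flat H x W layout are assumptions for illustration, not an mmcv API.
// feat: one channel of the input feature map, row-major H x W.
float roi_align_bin_avg(const float *feat, int H, int W, float roi_start_h,
                        float roi_start_w, float bin_size_h, float bin_size_w,
                        int ph, int pw, int grid_h, int grid_w) {
  float sum = 0.f;
  for (int iy = 0; iy < grid_h; ++iy) {
    for (int ix = 0; ix < grid_w; ++ix) {
      float y =
          roi_start_h + ph * bin_size_h + (iy + 0.5f) * bin_size_h / grid_h;
      float x =
          roi_start_w + pw * bin_size_w + (ix + 0.5f) * bin_size_w / grid_w;
      if (y < -1.f || y > H || x < -1.f || x > W) continue;  // empty sample
      if (y <= 0.f) y = 0.f;
      if (x <= 0.f) x = 0.f;
      int y0 = (int)y, y1, x0 = (int)x, x1;
      if (y0 >= H - 1) { y0 = y1 = H - 1; y = (float)y0; } else { y1 = y0 + 1; }
      if (x0 >= W - 1) { x0 = x1 = W - 1; x = (float)x0; } else { x1 = x0 + 1; }
      float ly = y - y0, lx = x - x0, hy = 1.f - ly, hx = 1.f - lx;
      sum += hy * hx * feat[y0 * W + x0] + hy * lx * feat[y0 * W + x1] +
             ly * hx * feat[y1 * W + x0] + ly * lx * feat[y1 * W + x1];
    }
  }
  const int count = grid_h * grid_w > 1 ? grid_h * grid_w : 1;
  return sum / count;
}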
mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu
0 → 100644
View file @
fdeee889
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include "roi_align_rotated_utils.hpp"
#define ROI_OFFSET 6
#define SAMPLING_NUM 4
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void swap(T &a, T &b) {
T tmp = a;
a = b;
b = tmp;
}
template <typename T>
__mlu_func__ void bilinearInterpolate(const int input_height,
const int input_width, T x, T y,
const T zero_sign, T *w1, T *w2, T *w3,
T *w4, int *x_low, int *x_high,
int *y_low, int *y_high, bool *empty) {
// deal with the case where the point is outside the feature map boundary
if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) {
*empty = true;
return;
}
if (y <= 0) y = (T)0;
if (x <= 0) x = (T)0;
*y_low = int(y);
*x_low = int(x);
if (*y_low >= input_height - 1) {
*y_high = *y_low = input_height - 1;
y = (T)(*y_low);
} else {
*y_high = *y_low + 1;
}
if (*x_low >= input_width - 1) {
*x_high = *x_low = input_width - 1;
x = T(*x_low);
} else {
*x_high = *x_low + 1;
}
T ly = y - *y_low;
T lx = x - *x_low;
T hy = 1.0 - ly;
T hx = 1.0 - lx;
*w1 = hy * hx * zero_sign;
*w2 = hy * lx * zero_sign;
*w3 = ly * hx * zero_sign;
*w4 = ly * lx * zero_sign;
}
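// Note on the weights above: with lx = x - x_low and ly = y - y_low, the four
// corners (y_low, x_low), (y_low, x_high), (y_high, x_low), (y_high, x_high)
// receive the standard bilinear weights (1 - ly)(1 - lx), (1 - ly) * lx,
// ly * (1 - lx) and ly * lx, each pre-scaled by zero_sign (the 1 / bin_dim
// averaging factor) so that the weighted corner values can later be summed
// directly per channel.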
template <typename T>
__mlu_func__ void getRoiBinInfo(const T *rois_dram, const int bin_i,
const RoiAlignRotatedParams ¶ms,
int *batch_idx, int *roi_n, int *pw, int *ph,
T *roi_center_x, T *roi_center_y, T *roi_width,
T *roi_height, T *theta) {
T offset = params.aligned ? (T)0.5 : (T)0.0;
*pw = bin_i % params.pooled_width;
*ph = (bin_i / params.pooled_width) % params.pooled_height;
*roi_n = bin_i / params.pooled_width / params.pooled_height;
const T *roi_info = rois_dram + (*roi_n) * ROI_OFFSET;
*batch_idx = (int)roi_info[0];
*roi_center_x = roi_info[1] * (T)params.spatial_scale - offset;
*roi_center_y = roi_info[2] * (T)params.spatial_scale - offset;
*roi_width = roi_info[3] * (T)params.spatial_scale;
*roi_height = roi_info[4] * (T)params.spatial_scale;
*theta = roi_info[5];
if (params.clockwise) {
*theta = -(*theta);
}
if (!params.aligned) {
*roi_width = *roi_width > (T)1.0 ? *roi_width : (T)1.0;
*roi_height = *roi_height > (T)1.0 ? *roi_height : (T)1.0;
}
}
template <typename T>
__mlu_func__ void roiAlignRotatedForward(const T *input_dram,
const T *rois_dram, const int batch,
const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams ¶ms,
T *output_dram) {
int align_base_128 = NFU_ALIGN_SIZE / sizeof(T);
int channel_max_cap = MAX_NRAM_SIZE / sizeof(T) / (2 * SAMPLING_NUM + 1);
channel_max_cap = channel_max_cap / align_base_128 * align_base_128;
int channel_align = channel < channel_max_cap ? channel : channel_max_cap;
channel_align = CEIL_ALIGN(channel_align, align_base_128);
T *nram_out = (T *)nram_buffer;
T *nram_ping = nram_out + channel_align;
T *nram_pong = nram_ping + channel_align * SAMPLING_NUM;
int bin_first = taskId;
int bin_end = rois_num * params.pooled_height * params.pooled_width;
for (int bin_i = bin_first; bin_i < bin_end; bin_i += taskDim) {
T roi_center_x, roi_center_y, roi_width, roi_height, theta;
int batch_idx, roi_n, pw, ph;
getRoiBinInfo(rois_dram, bin_i, params, &batch_idx, &roi_n, &pw, &ph,
&roi_center_x, &roi_center_y, &roi_width, &roi_height,
&theta);
T bin_size_h = roi_height / params.pooled_height;
T bin_size_w = roi_width / params.pooled_width;
int roi_bin_grid_h =
(params.sample_ratio > 0)
? params.sample_ratio
: __float2int_up((float)roi_height / params.pooled_height);
int roi_bin_grid_w =
(params.sample_ratio > 0)
? params.sample_ratio
: __float2int_up((float)roi_width / params.pooled_width);
T roi_start_y = -roi_height / 2;
T roi_start_x = -roi_width / 2;
const int bin_dim = roi_bin_grid_h * roi_bin_grid_w > 1
? roi_bin_grid_h * roi_bin_grid_w
: 1;
T cos_theta = std::cos(theta);
T sin_theta = std::sin(theta);
T zero_sign = 1.0f / bin_dim;
bool is_first_sample = true;
int src_offset = 0;
int dst_offset = 0;
int c_rem, c_slice, c_slice_align, pongc_slice, pongc_slice_align;
for (int c_offset = 0; c_offset < channel; c_offset += channel_align) {
__nramset(nram_out, channel_align, (T)0);
c_rem = channel - c_offset;
c_slice = channel_align > c_rem ? c_rem : channel_align;
c_slice_align = CEIL_ALIGN(c_slice, align_base_128);
is_first_sample = true;
for (int iy = 0; iy < roi_bin_grid_h; ++iy) {
const T yy = roi_start_y + ph * bin_size_h +
T(iy + 0.5) * bin_size_h / roi_bin_grid_h;
for (int ix = 0; ix < roi_bin_grid_w; ++ix) {
const T xx = roi_start_x + pw * bin_size_w +
T(ix + 0.5) * bin_size_w / roi_bin_grid_w;
int sample_i = iy * roi_bin_grid_w + ix;
T y = yy * cos_theta - xx * sin_theta + roi_center_y;
T x = yy * sin_theta + xx * cos_theta + roi_center_x;
T w1, w2, w3, w4;
bool empty = false;
int x_low, x_high, y_low, y_high;
bilinearInterpolate(height, width, x, y, zero_sign, &w1, &w2, &w3,
&w4, &x_low, &x_high, &y_low, &y_high, &empty);
int sample_wdim = x_high - x_low + 1;
/*******************************************************
| ping | pong |
|------|-----|-----|-----|-----|-----|-----|-----|-----|
|output| p1 | p2 | p3 | p4 | p1 | p2 | p3 | p4 |
|------|-----|-----|-----|-----|-----|-----|-----|-----|
********************************************************/
if (is_first_sample && !empty) {
// load input data from dram to nram
__nramset(nram_ping, SAMPLING_NUM * c_slice_align, (T)0);
for (int h = y_low; h <= y_high; ++h) {
src_offset =
(batch_idx * height * width + h * width + x_low) * channel +
c_offset;
dst_offset = (h - y_low) * SAMPLING_NUM * c_slice_align / 2;
if (c_slice_align == channel) {
__memcpy(nram_ping + dst_offset, input_dram + src_offset,
sample_wdim * channel * sizeof(T), GDRAM2NRAM);
} else {
__memcpy(nram_ping + dst_offset, input_dram + src_offset,
c_slice * sizeof(T), GDRAM2NRAM,
c_slice_align * sizeof(T), channel * sizeof(T),
sample_wdim - 1);
}
}
}
// load next input data to nram
if (sample_i + 1 < bin_dim) {
int p_iy = (sample_i + 1) / roi_bin_grid_w;
int p_ix = (sample_i + 1) % roi_bin_grid_w;
const T p_yy = roi_start_y + ph * bin_size_h +
T(p_iy + 0.5) * bin_size_h / roi_bin_grid_h;
const T p_xx = roi_start_x + pw * bin_size_w +
T(p_ix + 0.5) * bin_size_w / roi_bin_grid_w;
T p_y = p_yy * cos_theta - p_xx * sin_theta + roi_center_y;
T p_x = p_yy * sin_theta + p_xx * cos_theta + roi_center_x;
T p_w1, p_w2, p_w3, p_w4;
bool p_empty = false;
int p_x_low, p_x_high, p_y_low, p_y_high;
bilinearInterpolate(height, width, p_x, p_y, zero_sign, &p_w1,
&p_w2, &p_w3, &p_w4, &p_x_low, &p_x_high,
&p_y_low, &p_y_high, &p_empty);
int p_sample_wdim = p_x_high - p_x_low + 1;
pongc_slice = c_slice;
pongc_slice_align = c_slice_align;
if (!p_empty) {
__nramset(nram_pong, SAMPLING_NUM * pongc_slice_align, (T)0);
for (int h = p_y_low; h <= p_y_high; ++h) {
src_offset =
(batch_idx * height * width + h * width + p_x_low) *
channel +
c_offset;
dst_offset =
(h - p_y_low) * SAMPLING_NUM * pongc_slice_align / 2;
if (pongc_slice_align == channel) {
__memcpy_async(
nram_pong + dst_offset, input_dram + src_offset,
p_sample_wdim * channel * sizeof(T), GDRAM2NRAM);
} else {
__memcpy_async(nram_pong + dst_offset,
input_dram + src_offset,
pongc_slice * sizeof(T), GDRAM2NRAM,
pongc_slice_align * sizeof(T),
channel * sizeof(T), p_sample_wdim - 1);
}
}
}
}
T *tmp_sum = nram_ping + 3 * c_slice_align;
if (empty) {
__nramset(tmp_sum, c_slice_align, T(0));
} else {
__bang_mul_const(nram_ping, nram_ping, w1, c_slice_align);
__bang_mul_const(nram_ping + c_slice_align,
nram_ping + c_slice_align, w2, c_slice_align);
__bang_mul_const(nram_ping + 2 * c_slice_align,
nram_ping + 2 * c_slice_align, w3, c_slice_align);
__bang_mul_const(nram_ping + 3 * c_slice_align,
nram_ping + 3 * c_slice_align, w4, c_slice_align);
__bang_sumpool(tmp_sum, nram_ping, c_slice_align, 1, SAMPLING_NUM,
1, SAMPLING_NUM, 1, 1);
}
__bang_add(nram_out, nram_out, tmp_sum, c_slice_align);
swap(nram_ping, nram_pong);
__asm__ volatile("sync;");
is_first_sample = false;
}
}
// store the result to dram
int output_offset =
((roi_n * params.pooled_height + ph) * params.pooled_width + pw) *
channel +
c_offset;
__memcpy(output_dram + output_offset, nram_out, c_slice * sizeof(T),
NRAM2GDRAM);
}
}
}
template <typename T>
__mlu_func__ void roiAlignRotatedBackward(const T *top_grad_dram,
const T *rois_dram, const int batch,
const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams ¶ms,
T *bottom_grad_dram) {
int align_base_128 = NFU_ALIGN_SIZE / sizeof(T);
int channel_align = CEIL_ALIGN(channel, align_base_128);
unsigned int max_element = MAX_NRAM_SIZE / sizeof(T);
int c_limit = max_element >> 2;
c_limit = c_limit > channel_align ? channel_align : c_limit;
T *nram_ping = (T *)nram_buffer;
T *nram_pong = nram_ping + 2 * c_limit;
T *nram_output = nullptr;
int bin_first = taskId;
int bin_end = rois_num * params.pooled_height * params.pooled_width;
bool is_first_bin = true;
T roi_center_x, roi_center_y, roi_width, roi_height, theta;
int batch_idx, roi_n, pw, ph;
T pong_roi_center_x, pong_roi_center_y, pong_roi_width, pong_roi_height,
pong_theta;
int pong_batch_idx, pong_roi_n, pong_pw, pong_ph;
for (int bin_i = bin_first; bin_i < bin_end; bin_i += taskDim) {
getRoiBinInfo(rois_dram, bin_i, params, &batch_idx, &roi_n, &pw, &ph,
&roi_center_x, &roi_center_y, &roi_width, &roi_height,
&theta);
T bin_size_h = roi_height / params.pooled_height;
T bin_size_w = roi_width / params.pooled_width;
int roi_bin_grid_h =
(params.sample_ratio > 0)
? params.sample_ratio
: __float2int_up((float)roi_height / params.pooled_height);
int roi_bin_grid_w =
(params.sample_ratio > 0)
? params.sample_ratio
: __float2int_up((float)roi_width / params.pooled_width);
T roi_start_y = -roi_height / 2;
T roi_start_x = -roi_width / 2;
const int bin_dim = roi_bin_grid_h * roi_bin_grid_w > 1
? roi_bin_grid_h * roi_bin_grid_w
: 1;
T cos_theta = std::cos(theta);
T sin_theta = std::sin(theta);
T zero_sign = 1.0f / bin_dim;
int c_rem, c_slice, pongc_slice, c_offset;
c_rem = channel;
c_offset = 0;
/****************************************
| ping | pong |
|---------|---------|---------|---------|
| input | output | input | output |
|---------|---------|---------|---------|
*****************************************/
if (is_first_bin) {
// load the first top_grad to nram
c_slice = c_limit < c_rem ? c_limit : c_rem;
int top_grad_offset =
((roi_n * params.pooled_height + ph) * params.pooled_width + pw) *
channel;
__memcpy(nram_ping, top_grad_dram + top_grad_offset, c_slice * sizeof(T),
GDRAM2NRAM);
}
nram_output = nram_ping + c_limit;
while (c_rem > 0) {
c_slice = c_slice < c_rem ? c_slice : c_rem;
// load the next top_grad to nram
if (c_rem - c_slice > 0) {
// load the rest channels to nram
pongc_slice = (c_rem - c_slice > c_slice) ? c_slice : c_rem - c_slice;
int top_grad_offset =
((roi_n * params.pooled_height + ph) * params.pooled_width + pw) *
channel +
c_offset + c_slice;
__memcpy_async(nram_pong, top_grad_dram + top_grad_offset,
pongc_slice * sizeof(T), GDRAM2NRAM);
} else if (bin_i + taskDim < bin_end) {
// load next bin's data to nram
getRoiBinInfo(rois_dram, bin_i + taskDim, params, &pong_batch_idx,
&pong_roi_n, &pong_pw, &pong_ph, &pong_roi_center_x,
&pong_roi_center_y, &pong_roi_width, &pong_roi_height,
&pong_theta);
pongc_slice = c_limit < channel ? c_limit : channel;
int top_grad_offset = ((pong_roi_n * params.pooled_height + pong_ph) *
params.pooled_width +
pong_pw) *
channel;
__memcpy_async(nram_pong, top_grad_dram + top_grad_offset,
c_slice * sizeof(T), GDRAM2NRAM);
}
      // compute the output in a single bin
for (int iy = 0; iy < roi_bin_grid_h; ++iy) {
const T yy = roi_start_y + ph * bin_size_h +
T(iy + 0.5) * bin_size_h / roi_bin_grid_h;
for (int ix = 0; ix < roi_bin_grid_w; ++ix) {
const T xx = roi_start_x + pw * bin_size_w +
T(ix + 0.5) * bin_size_w / roi_bin_grid_w;
T y = yy * cos_theta - xx * sin_theta + roi_center_y;
T x = yy * sin_theta + xx * cos_theta + roi_center_x;
T w1, w2, w3, w4;
bool empty = false;
int x_low, x_high, y_low, y_high;
bilinearInterpolate(height, width, x, y, zero_sign, &w1, &w2, &w3,
&w4, &x_low, &x_high, &y_low, &y_high, &empty);
if (empty) {
continue;
} else {
__bang_mul_const(nram_output, nram_ping, w1, c_limit);
__bang_atomic_add(
(T *)nram_output,
bottom_grad_dram + batch_idx * height * width * channel +
y_low * width * channel + x_low * channel + c_offset,
(T *)nram_output, c_slice);
__bang_mul_const(nram_output, nram_ping, w2, c_limit);
__bang_atomic_add(
(T *)nram_output,
bottom_grad_dram + batch_idx * height * width * channel +
y_low * width * channel + x_high * channel + c_offset,
(T *)nram_output, c_slice);
__bang_mul_const(nram_output, nram_ping, w3, c_limit);
__bang_atomic_add(
(T *)nram_output,
bottom_grad_dram + batch_idx * height * width * channel +
y_high * width * channel + x_low * channel + c_offset,
(T *)nram_output, c_slice);
__bang_mul_const(nram_output, nram_ping, w4, c_limit);
__bang_atomic_add(
(T *)nram_output,
bottom_grad_dram + batch_idx * height * width * channel +
y_high * width * channel + x_high * channel + c_offset,
(T *)nram_output, c_slice);
}
}
}
swap(nram_ping, nram_pong);
c_rem -= c_slice;
c_offset += c_slice;
__asm__ volatile("sync;");
}
is_first_bin = false;
}
}
__mlu_global__ void MLUUnion1KernelRoiAlignRotatedForward(
const void *features, const void *rois, void *output, const int batch,
const int height, const int width, const int channel, const int rois_num,
const RoiAlignRotatedParams rroiAlignParams,
const cnrtDataType_t data_type) {
if (0x80 == coreId) {
return;
}
if (data_type == CNRT_FLOAT32) {
roiAlignRotatedForward((float *)features, (float *)rois, batch, height,
width, channel, rois_num, rroiAlignParams,
(float *)output);
} else {
roiAlignRotatedForward((half *)features, (half *)rois, batch, height, width,
channel, rois_num, rroiAlignParams, (half *)output);
}
}
__mlu_global__ void MLUUnion1KernelRoiAlignRotatedBackward(
const void *top_grad, const void *rois, void *bottom_grad, const int batch,
const int height, const int width, const int channel, const int rois_num,
const RoiAlignRotatedParams rroiAlignParams,
const cnrtDataType_t data_type) {
if (0x80 == coreId) {
return;
}
if (data_type == CNRT_FLOAT32) {
roiAlignRotatedBackward((float *)top_grad, (float *)rois, batch, height,
width, channel, rois_num, rroiAlignParams,
(float *)bottom_grad);
} else {
roiAlignRotatedBackward((half *)top_grad, (half *)rois, batch, height,
width, channel, rois_num, rroiAlignParams,
(half *)bottom_grad);
}
}
void KernelRoiAlignRotatedForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const void *features, const void *rois,
void *output, const int batch, const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams roiAlignRotatedParams) {
MLUUnion1KernelRoiAlignRotatedForward<<<k_dim, k_type, queue>>>(
features, rois, output, batch, height, width, channel, rois_num,
roiAlignRotatedParams, d_type);
}
void KernelRoiAlignRotatedBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const void *top_grad, const void *rois,
void *bottom_grad, const int batch, const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams roiAlignRotatedParams) {
MLUUnion1KernelRoiAlignRotatedBackward<<<k_dim, k_type, queue>>>(
top_grad, rois, bottom_grad, batch, height, width, channel, rois_num,
roiAlignRotatedParams, d_type);
}
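The sketch below is a plain C++ restatement of the per-sample coordinate math in roiAlignRotatedForward above, intended only for host-side sanity checks; the helper name rotatedSamplePoint is hypothetical and is not part of the MLU sources.

#include <cmath>

// Hypothetical host-side helper: returns the (x, y) sampling location for
// grid cell (iy, ix) of output bin (ph, pw), mirroring the kernel above.
struct SamplePoint {
  float x;
  float y;
};

inline SamplePoint rotatedSamplePoint(float roi_center_x, float roi_center_y,
                                      float roi_width, float roi_height,
                                      float theta, int pooled_height,
                                      int pooled_width, int ph, int pw,
                                      int grid_h, int grid_w, int iy, int ix) {
  float bin_size_h = roi_height / pooled_height;
  float bin_size_w = roi_width / pooled_width;
  // Offsets are measured from the ROI center before rotation.
  float yy = -roi_height / 2 + ph * bin_size_h +
             (iy + 0.5f) * bin_size_h / grid_h;
  float xx = -roi_width / 2 + pw * bin_size_w +
             (ix + 0.5f) * bin_size_w / grid_w;
  SamplePoint p;
  p.y = yy * std::cos(theta) - xx * std::sin(theta) + roi_center_y;
  p.x = yy * std::sin(theta) + xx * std::cos(theta) + roi_center_x;
  return p;
}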
mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef ROI_ALIGN_ROTATED_UTILS_HPP_
#define ROI_ALIGN_ROTATED_UTILS_HPP_
struct RoiAlignRotatedParams {
  int pooled_height;
  int pooled_width;
  int sample_ratio;
  float spatial_scale;
  bool aligned;
  bool clockwise;
};
#endif // ROI_ALIGN_ROTATED_UTILS_HPP_
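A minimal sketch of how the struct above might be filled before launching KernelRoiAlignRotatedForward; the concrete values are placeholders, not defaults taken from mmcv.

// Placeholder values; only the field names come from the header above.
RoiAlignRotatedParams params;
params.pooled_height = 7;
params.pooled_width = 7;
params.sample_ratio = 2;           // <= 0 would derive the grid from the ROI size
params.spatial_scale = 1.0f / 16;  // feature-map stride of the backbone
params.aligned = true;
params.clockwise = false;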
mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#define ALIGN_SIZE 64
#define PIPELINE_COMMON_NUM 2
#define PIPELINE_PINGPONG_NUM 10
__nram__ char nram_buffer[MAX_NRAM_SIZE];
namespace forward {
template <typename T>
__mlu_func__ void getRoiBinInfo(T *input_v, T *rois_v, int bin_i, int height,
int width, int channels, int p_height,
int p_width, T spatial_scale, int *bin_x1,
int *bin_y1, int *bin_x2, int *bin_y2,
int *bin_wdim, int *bin_hdim, int *bin_dims,
T **input_base, bool *is_empty) {
int pw = bin_i % p_width;
int ph = (bin_i / p_width) % p_height;
int roi_n = bin_i / p_width / p_height;
/*roi*/
const T *roi_info = rois_v + roi_n * 5; // {{batch, x1, y1, x2, y2},,,}
int batch_index = (int)roi_info[0];
int roi_x1 = round(roi_info[1] * spatial_scale);
int roi_y1 = round(roi_info[2] * spatial_scale);
int roi_x2 = round(roi_info[3] * spatial_scale);
int roi_y2 = round(roi_info[4] * spatial_scale);
int roi_w = roi_x2 - roi_x1 + 1 > 1 ? roi_x2 - roi_x1 + 1 : 1;
int roi_h = roi_y2 - roi_y1 + 1 > 1 ? roi_y2 - roi_y1 + 1 : 1;
/*bin*/
T bin_w = (T)roi_w / (T)p_width;
T bin_h = (T)roi_h / (T)p_height;
*bin_x1 = (int)floor((T)pw * bin_w) + roi_x1;
*bin_x1 = *bin_x1 > 0 ? *bin_x1 : 0;
*bin_x1 = *bin_x1 < width ? *bin_x1 : width;
*bin_y1 = (int)floor((T)ph * bin_h) + roi_y1;
*bin_y1 = *bin_y1 > 0 ? *bin_y1 : 0;
*bin_y1 = *bin_y1 < height ? *bin_y1 : height;
*bin_x2 = (int)ceil((T)(pw + 1) * bin_w) + roi_x1;
*bin_x2 = *bin_x2 > 0 ? *bin_x2 : 0;
*bin_x2 = *bin_x2 < width ? *bin_x2 : width;
*bin_y2 = (int)ceil((T)(ph + 1) * bin_h) + roi_y1;
*bin_y2 = *bin_y2 > 0 ? *bin_y2 : 0;
*bin_y2 = *bin_y2 < height ? *bin_y2 : height;
*input_base = input_v + batch_index * height * width * channels;
*bin_wdim = *bin_x2 - *bin_x1;
*bin_hdim = *bin_y2 - *bin_y1;
*bin_dims = (*bin_hdim) * (*bin_wdim);
*is_empty = (*bin_y2 <= *bin_y1) || (*bin_x2 <= *bin_x1);
}
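// Note on the bin boundaries above: each bin spans
//   [floor(pw * bin_w), ceil((pw + 1) * bin_w)) horizontally and
//   [floor(ph * bin_h), ceil((ph + 1) * bin_h)) vertically,
// offset by the rounded ROI origin and clamped to the feature map, so
// neighbouring bins may overlap by up to one pixel, and a bin whose extent
// collapses after clamping is flagged empty.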
template <typename T>
__mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
int channels, int height, int width,
int p_height, int p_width, int rois_num,
T spatial_scale, T *output_v, int *argmax) {
/*
* NRAM partition
* |---------------------------------------------------|
* | ping |
* |---------------------------------------------------|
* | pong |
* |---------------------------------------------------|
* | out |
* |---------------------------------------------------|
* | argmax |
* |---------------------------------------------------|
* | a |
* |---------------------------------------------------|
* | b |
* |---------------------------------------------------|
*/
uint32_t is_half = sizeof(T) == sizeof(half) ? true : false;
uint32_t t_size = sizeof(T);
uint32_t float_div = NFU_ALIGN_SIZE / sizeof(float);
uint32_t half_div = NFU_ALIGN_SIZE / sizeof(half);
uint32_t channels_align = PAD_UP(channels, float_div);
uint32_t nram_limit = PAD_DOWN(
(MAX_NRAM_SIZE / sizeof(float) - 4 * channels_align) / 2, half_div);
  // nram PING/PONG, output, argmax, a, b
float *nram_ping = (float *)nram_buffer;
float *nram_pong = (float *)nram_buffer + nram_limit;
float *nram_out = (float *)nram_buffer + 2 * nram_limit;
float *nram_argmax = nram_out + channels_align;
float *nram_a = nram_out + 2 * channels_align;
float *nram_b = nram_out + 3 * channels_align;
uint32_t c_bins_num = rois_num * p_height * p_width;
uint32_t task_bins = c_bins_num / taskDim;
uint32_t rem_bins = c_bins_num % taskDim;
if (taskId < rem_bins) {
task_bins += 1;
}
int bin_first =
(c_bins_num / taskDim) * taskId + (taskId > rem_bins ? rem_bins : taskId);
int bins_loop = bin_first + task_bins;
T *input_base = NULL;
T *output_base = output_v + bin_first * channels;
int *argmax_base = NULL != argmax ? argmax + bin_first * channels : NULL;
int bin_x1, bin_y1, bin_x2, bin_y2, bin_wdim, bin_hdim, bin_dims;
int pbin_x1, pbin_y1, pbin_x2, pbin_y2, pbin_wdim, pbin_hdim, pbin_dims;
bool is_empty = false;
bool pong_is_empty = false;
bool is_first_bin = true;
uint32_t src_offset = 0;
uint32_t dst_offset = 0;
uint32_t nram_offset = 0;
uint32_t half_offset =
is_half ? (nram_limit / 2 / half_div * half_div) * 2 : 0;
float *nram_tmp = NULL;
uint32_t c_slice = 0;
uint32_t c_slice_align = 0;
uint32_t pongc_slice = 0;
uint32_t pongc_slice_align = 0;
for (int bin_i = bin_first; bin_i < bins_loop; bin_i++) {
getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i, height, width, channels,
p_height, p_width, (T)spatial_scale, &bin_x1, &bin_y1,
&bin_x2, &bin_y2, &bin_wdim, &bin_hdim, &bin_dims,
&input_base, &is_empty);
uint32_t c_rem = channels;
c_slice = nram_limit / bin_dims / float_div * float_div;
if (is_first_bin && !is_empty) {
c_slice = c_slice > c_rem ? c_rem : c_slice;
c_slice_align = PAD_UP(c_slice, float_div);
for (int h = bin_y1; h < bin_y2; h++) {
src_offset = (h * width + bin_x1) * channels;
nram_offset = (h - bin_y1) * bin_wdim * c_slice_align + half_offset;
if (c_slice_align == channels) {
__memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset,
bin_wdim * c_slice * t_size, GDRAM2NRAM);
} else {
__memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset,
c_slice * t_size, GDRAM2NRAM, c_slice_align * t_size,
channels * t_size, bin_wdim - 1);
}
}
}
uint32_t c_offset = 0;
while (c_rem > 0) {
c_slice = c_slice > c_rem ? c_rem : c_slice;
c_slice_align = PAD_UP(c_slice, float_div);
/*__memcpy_async*/
if (c_rem - c_slice > 0 && !is_empty) {
pongc_slice = c_rem - c_slice > c_slice ? c_slice : c_rem - c_slice;
pongc_slice_align = PAD_UP(pongc_slice, float_div);
for (int h = bin_y1; h < bin_y2; h++) {
src_offset = (h * width + bin_x1) * channels + c_offset;
nram_offset =
(h - bin_y1) * bin_wdim * pongc_slice_align + half_offset;
__memcpy_async((T *)nram_pong + nram_offset,
(T *)input_base + src_offset + c_slice,
pongc_slice * t_size, GDRAM2NRAM,
pongc_slice_align * t_size, channels * t_size,
bin_wdim - 1);
}
} else if (bin_i + 1 < bins_loop) {
getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i + 1, height, width,
channels, p_height, p_width, (T)spatial_scale, &pbin_x1,
&pbin_y1, &pbin_x2, &pbin_y2, &pbin_wdim, &pbin_hdim,
&pbin_dims, &input_base, &pong_is_empty);
pongc_slice = PAD_DOWN(nram_limit / pbin_dims, float_div);
pongc_slice = pongc_slice > channels ? channels : pongc_slice;
pongc_slice_align = PAD_UP(pongc_slice, float_div);
if (!pong_is_empty) {
for (int h = pbin_y1; h < pbin_y2; h++) {
src_offset = (h * width + pbin_x1) * channels;
nram_offset =
(h - pbin_y1) * pbin_wdim * pongc_slice_align + half_offset;
if (pongc_slice_align == channels) {
__memcpy_async((T *)nram_pong + nram_offset,
(T *)input_base + src_offset,
pbin_wdim * pongc_slice * t_size, GDRAM2NRAM);
} else {
__memcpy_async((T *)nram_pong + nram_offset,
(T *)input_base + src_offset, pongc_slice * t_size,
GDRAM2NRAM, pongc_slice_align * t_size,
channels * t_size, pbin_wdim - 1);
}
}
}
}
if (is_empty) {
__nramset((T *)nram_out, c_slice_align, (T)0);
__memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
c_slice * t_size, NRAM2GDRAM);
if (NULL != argmax) {
__nramset((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
__memcpy((int32_t *)argmax_base + dst_offset + c_offset,
(int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM);
}
} else {
if (is_half) {
uint32_t bin_align64 = PAD_UP(bin_dims * c_slice_align, half_div);
__bang_half2float((float *)nram_ping, (half *)nram_ping + half_offset,
bin_align64);
}
__bang_maxpool((float *)nram_out, (float *)nram_ping, c_slice_align,
bin_hdim, bin_wdim, bin_hdim, bin_wdim, 1, 1);
if (is_half) {
uint32_t c_align64 = PAD_UP(c_slice_align, half_div);
__bang_float2half_rd((half *)nram_out, (float *)nram_out, c_align64);
}
__memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
c_slice * t_size, NRAM2GDRAM);
if (NULL != argmax) {
/*compute max_index*/
__bang_maxpool_index((uint32_t *)nram_out, (float *)nram_ping,
c_slice_align, bin_hdim, bin_wdim, bin_hdim,
bin_wdim, 1, 1);
convertInt2Float((float *)nram_argmax, (float *)nram_a,
(int32_t *)nram_out, (float *)nram_b, c_slice_align);
/*compute input_h*/
for (int i = 0; i < c_slice; i++) {
nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim);
}
__bang_add_const((float *)nram_a, (float *)nram_out, (float)bin_y1,
c_slice_align);
__bang_mul_const((float *)nram_ping, (float *)nram_a, (float)width,
c_slice_align);
/*compute input_w*/
__bang_mul_const((float *)nram_a, (float *)nram_out, (float)bin_wdim,
c_slice_align);
__bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a,
c_slice_align);
__bang_add_const((float *)nram_a, (float *)nram_a, (float)bin_x1,
c_slice_align);
__bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a,
c_slice_align);
convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a,
(float *)nram_out, (float *)nram_b, c_slice_align);
__memcpy((int32_t *)argmax_base + dst_offset + c_offset,
(int32_t *)nram_argmax, c_slice * sizeof(int32_t),
NRAM2GDRAM);
}
}
nram_tmp = nram_ping;
nram_ping = nram_pong;
nram_pong = nram_tmp;
c_offset += c_slice;
c_rem -= c_slice;
__asm__ volatile("sync;");
}
dst_offset += channels;
is_first_bin = false;
}
}
__mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type,
const void *input_data,
const void *input_rois, int batch,
int channels, int height, int width,
int pooled_height, int pooled_width,
int rois_num, float spatial_scale,
void *output_data, int *argmax) {
switch (data_type) {
case CNRT_FLOAT16: {
MLUUnion1Roipool((half *)input_data, (half *)input_rois, batch, channels,
height, width, pooled_height, pooled_width, rois_num,
(half)spatial_scale, (half *)output_data, argmax);
}; break;
case CNRT_FLOAT32: {
MLUUnion1Roipool((float *)input_data, (float *)input_rois, batch,
channels, height, width, pooled_height, pooled_width,
rois_num, (float)spatial_scale, (float *)output_data,
argmax);
}; break;
default: {
break;
}
}
}
} // namespace forward
namespace backward {
// Convert index of argmax from global grads_image to local bin in RoI. Vector
// operations do not support int type, so conversion from int to float is
// performed here.
__mlu_func__ void convertIndex(
int32_t *nram_argmax, int32_t *nram_argmax_fp, int32_t *nram_argmax_fp_bk1,
int32_t *nram_argmax_fp_bk2, int32_t *nram_argmax_int,
int32_t *nram_argmax_int_h, int32_t *nram_argmax_int_w,
int32_t *nram_argmax_fp_h, int32_t *nram_argmax_fp_w,
float *nram_atomic_add, float *nram_grads_image, int width, int height,
int wstart, int hstart, int w_compute, int h_compute, int align_c,
int channels, int loop_flag, int loop_id, int true_limit) {
convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
(int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);
  // This step uses scalar division, because the vector division above causes
  // rounding accuracy problems.
for (int i = 0; i < channels; ++i) {
*((float *)nram_argmax_fp + i) = *((float *)nram_argmax_fp + i) / width;
}
// Use 'float2int_tz' to perform '*((int32_t*)nram_argmax + i) / width'
// operation.
convertFloat2Int((int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk1,
(float *)nram_argmax_fp, (float *)nram_argmax_fp_bk2,
align_c);
convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
(int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk2,
align_c);
// Perform 'temp_result - hstart' operation
__bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
align_c);
// Perform 'temp_result1 - temp_result2 * width' operation
__bang_mul_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
align_c);
convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
(int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);
__bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp,
(float *)nram_argmax_fp_w, align_c);
// Perform 'temp_result - wstart' operation
__bang_sub_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w, wstart,
align_c);
// Perform 'temp_result = h * w_compute + w' operation
__bang_mul_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
w_compute, align_c);
__bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
(float *)nram_argmax_fp_w, align_c);
if (loop_flag == 1) {
__bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
(loop_id * true_limit), align_c);
}
convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1,
(float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2,
align_c);
}
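// Note: the sequence above is the vectorized form of, per channel,
//   h_local = argmax / width - hstart;
//   w_local = argmax % width - wstart;
//   index   = h_local * w_compute + w_local (minus loop_id * true_limit when
//             the bin is processed in chunks),
// i.e. it remaps the global argmax position stored by the forward pass to an
// offset inside the bin region currently resident on NRAM.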
template <typename T>
__mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
const int32_t *argmax, T *grads_image,
int channels, int height, int width,
int pooled_height, int pooled_width,
int rois_num, const T spatial_scale,
int high_precision) {
  // Calculate the number of bins processed by each core
int bin_num = rois_num * pooled_height * pooled_width;
int loop =
(bin_num % taskDim) ? (bin_num / taskDim + 1) : (bin_num / taskDim);
int tid = taskId * loop;
if (bin_num % taskDim != 0) {
if (tid >= bin_num) {
return;
} else {
// last part is (bin_num - tid).
loop = bin_num - tid < loop ? bin_num - tid : loop;
}
}
int align_c = PAD_UP(channels, ALIGN_SIZE);
// Common part has 2: grads, argmax; ping-pong each is PIPELINE_PINGPONG_NUM.
int data_size =
PAD_DOWN(((MAX_NRAM_SIZE / sizeof(float) - PIPELINE_COMMON_NUM * align_c -
(PIPELINE_PINGPONG_NUM - 1) * align_c * 2) /
2),
ALIGN_SIZE);
int hw_limit = data_size / align_c;
float *nram_grads = (float *)nram_buffer;
for (int idx = tid; idx < tid + loop; ++idx) {
// (n, ph, pw) is a C in the pooled output
int pw = idx % pooled_width;
int ph = (idx / pooled_width) % pooled_height;
int n = idx / pooled_width / pooled_height;
const T *offset_rois = (const T *)(rois + n * 5);
int roi_batch_ind = int(offset_rois[0]);
// Calculate the roi region on feature maps
int roi_start_w = round(offset_rois[1] * spatial_scale);
int roi_start_h = round(offset_rois[2] * spatial_scale);
int roi_end_w = round(offset_rois[3] * spatial_scale);
int roi_end_h = round(offset_rois[4] * spatial_scale);
// Force malformed rois to 1x1
int roi_width =
roi_end_w - roi_start_w + 1 > 1 ? roi_end_w - roi_start_w + 1 : 1;
int roi_height =
roi_end_h - roi_start_h + 1 > 1 ? roi_end_h - roi_start_h + 1 : 1;
T bin_size_h = (T)roi_height / (T)pooled_height;
T bin_size_w = (T)roi_width / (T)pooled_width;
// The corresponding bin region
int hstart = int(floor((T)ph * bin_size_h));
int wstart = int(floor((T)pw * bin_size_w));
int hend = int(ceil((T)(ph + 1) * bin_size_h));
int wend = int(ceil((T)(pw + 1) * bin_size_w));
// Add roi offsets and clip to input boundaries, min(max(A, B), C);
hstart = hstart + roi_start_h > 0 ? hstart + roi_start_h : 0;
hstart = hstart < height ? hstart : height;
hend = hend + roi_start_h > 0 ? hend + roi_start_h : 0;
hend = hend < height ? hend : height;
wstart = wstart + roi_start_w > 0 ? wstart + roi_start_w : 0;
wstart = wstart < width ? wstart : width;
wend = wend + roi_start_w > 0 ? wend + roi_start_w : 0;
wend = wend < width ? wend : width;
bool is_empty = (hend <= hstart) || (wend <= wstart);
if (!is_empty) {
int h_compute = hend - hstart;
int w_compute = wend - wstart;
int true_limit =
hw_limit < h_compute * w_compute ? hw_limit : h_compute * w_compute;
int loop_int = (h_compute * w_compute) / true_limit;
int rem = (h_compute * w_compute) % true_limit;
int32_t *nram_argmax = (int32_t *)nram_grads + align_c;
int32_t *nram_argmax_fp = (int32_t *)nram_argmax + align_c;
int32_t *nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c;
int32_t *nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c;
int32_t *nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c;
int32_t *nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c;
int32_t *nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c;
int32_t *nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c;
int32_t *nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c;
float *nram_atomic_add = (float *)nram_argmax_fp_w + align_c;
float *nram_grads_image = (float *)nram_atomic_add + align_c;
if (true_limit == h_compute * w_compute) {
/*
* NRAM partition
* |---------------------------------------------------|
* | grads |
* |---------------------------------------------------|
* | argmax |
* |---------------------------------------------------|
* | argmax_temp |
* |---------------------------------------------------|
* | atomic_add |
* |---------------------------------------------------|
* | grads_image |
* |---------------------------------------------------|
*/
// Load the data from GDRAM to NRAM.
__memcpy((T *)nram_grads + align_c * high_precision,
(const T *)grads + (n * pooled_height * pooled_width +
ph * pooled_width + pw) *
channels,
channels * sizeof(T), GDRAM2NRAM);
if (high_precision) {
__bang_half2float((float *)nram_grads,
(half *)nram_grads + align_c * high_precision,
align_c);
}
__memcpy((int32_t *)nram_argmax,
(const int32_t *)argmax + (n * pooled_height * pooled_width +
ph * pooled_width + pw) *
channels,
channels * sizeof(int32_t), GDRAM2NRAM);
// Perform pooling operation on NRAM.
convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1,
nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h,
nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w,
nram_atomic_add, nram_grads_image, width, height, wstart,
hstart, w_compute, h_compute, align_c, channels, 0, 0, 0);
__bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads,
(int32_t *)nram_argmax_int, align_c, h_compute,
w_compute, h_compute, w_compute, h_compute,
w_compute);
if (high_precision) {
__bang_float2half_rd((half *)nram_grads_image,
(float *)nram_grads_image,
h_compute * w_compute * align_c);
}
// Store the result on NRAM back to GDRAM.
for (int hc = 0; hc < h_compute; ++hc) {
for (int wc = 0; wc < w_compute; ++wc) {
T *dst = (T *)nram_atomic_add;
int grad_image_offset = (roi_batch_ind * height * width +
(hc + hstart) * width + wc + wstart) *
channels;
T *src1 = (T *)grads_image + grad_image_offset;
int nram_grads_image_offset = (hc * w_compute + wc) * align_c;
T *src2 = (T *)nram_grads_image + nram_grads_image_offset;
__bang_atomic_add(dst, src1, src2, channels);
}
}
} else if (true_limit > 0) {
/*
* NRAM partition
* |---------------------------------------------------|
* | grads |
* |---------------------------------------------------|
* | argmax |
* |--------------------ping_pong----------------------|
* | argmax_temp | argmax_temp |
* |------------------------|--------------------------|
* | atomic_add | atomic_add |
* |------------------------|--------------------------|
* | grads_image | grads_image |
* |---------------------------------------------------|
*/
// Load the data from GDRAM to NRAM.
__memcpy((T *)nram_grads + align_c * high_precision,
(const T *)grads + (n * pooled_height * pooled_width +
ph * pooled_width + pw) *
channels,
channels * sizeof(T), GDRAM2NRAM);
if (high_precision) {
__bang_half2float((float *)nram_grads,
(half *)nram_grads + align_c * high_precision,
align_c);
}
__memcpy((int32_t *)nram_argmax,
(const int32_t *)argmax + (n * pooled_height * pooled_width +
ph * pooled_width + pw) *
channels,
channels * sizeof(int32_t), GDRAM2NRAM);
int ping_pong = 0;
int ping_pong_offset =
(MAX_NRAM_SIZE / sizeof(float) - align_c * PIPELINE_COMMON_NUM) / 2;
for (int loop_id = 0; loop_id <= loop_int; ++loop_id) {
int size = (loop_id == loop_int) ? rem : true_limit;
if (size == 0) {
break;
}
// Perform pooling operation on NRAM.
nram_argmax_fp =
(int32_t *)nram_argmax + align_c + ping_pong * ping_pong_offset;
nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c;
nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c;
nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c;
nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c;
nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c;
nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c;
nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c;
nram_atomic_add = (float *)nram_argmax_fp_w + align_c;
nram_grads_image = (float *)nram_atomic_add + align_c;
int loop_id_1 = loop_id;
int size_1 = ((loop_id_1) == loop_int) ? rem : true_limit;
if (size_1 == 0) {
break;
}
convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1,
nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h,
nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w,
nram_atomic_add, nram_grads_image, width, height, wstart,
hstart, w_compute, h_compute, align_c, channels, 1,
loop_id_1, true_limit);
__bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads,
(int32_t *)nram_argmax_int, align_c, size_1, 1,
size_1, 1, size_1, 1);
if (high_precision) {
__bang_float2half_rd((half *)nram_grads_image,
(float *)nram_grads_image, size_1 * align_c);
}
// Store the result on NRAM back to GDRAM.
for (int index_size = 0; index_size < size; ++index_size) {
int h = (loop_id * true_limit + index_size) / w_compute;
int w = (loop_id * true_limit + index_size) % w_compute;
T *dst = (T *)nram_atomic_add;
T *grads_image_n =
(T *)grads_image + roi_batch_ind * height * width * channels;
T *src1 = (T *)grads_image_n +
((h + hstart) * width + (w + wstart)) * channels;
T *src2 = (T *)nram_grads_image + index_size * align_c;
__bang_atomic_add(dst, src1, src2, channels);
}
ping_pong = 1 - ping_pong;
}
} else {
/*
* NRAM partition
* |---------------------------------------------------|
* | grads |
* |---------------------------------------------------|
* | argmax |
* |--------------------ping_pong----------------------|
* | argmax_temp | argmax_temp |
* |------------------------|--------------------------|
* | atomic_add | atomic_add |
* |------------------------|--------------------------|
* | grads_image | grads_image |
* |---------------------------------------------------|
*/
int c_limit =
PAD_DOWN(MAX_NRAM_SIZE / sizeof(float) /
(PIPELINE_COMMON_NUM + PIPELINE_PINGPONG_NUM * 2),
ALIGN_SIZE);
int loop_int = channels / c_limit;
int rem = channels % c_limit;
int ping_pong = 0;
int ping_pong_offset =
(MAX_NRAM_SIZE / sizeof(float) - c_limit * PIPELINE_COMMON_NUM) / 2;
for (int loop_id = 0; loop_id <= loop_int; ++loop_id) {
int size = (loop_id == loop_int) ? rem : c_limit;
if (size == 0) {
break;
}
nram_argmax_fp =
(int32_t *)nram_argmax + c_limit + ping_pong * ping_pong_offset;
nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + c_limit;
nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + c_limit;
nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + c_limit;
nram_argmax_int_h = (int32_t *)nram_argmax_int + c_limit;
nram_argmax_int_w = (int32_t *)nram_argmax_int_h + c_limit;
nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + c_limit;
nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + c_limit;
nram_atomic_add = (float *)nram_argmax_fp_w + c_limit;
nram_grads_image = (float *)nram_atomic_add + c_limit;
// This pipeline loads the data from GDRAM to NRAM.
__memcpy((T *)nram_grads + c_limit * high_precision,
(const T *)grads +
n * pooled_height * pooled_width * channels +
ph * pooled_width * channels + pw * channels +
loop_id * c_limit,
size * sizeof(T), GDRAM2NRAM);
if (high_precision) {
__bang_half2float((float *)nram_grads,
(half *)nram_grads + c_limit * high_precision,
c_limit);
}
__memcpy((int32_t *)nram_argmax,
(const int32_t *)argmax +
n * pooled_height * pooled_width * channels +
ph * pooled_width * channels + pw * channels +
loop_id * c_limit,
size * sizeof(int32_t), GDRAM2NRAM);
for (int hc = 0; hc < h_compute; ++hc) {
for (int wc = 0; wc < w_compute; ++wc) {
// This pipeline performs pooling operation on NRAM.
convertIndex(
nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1,
nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h,
nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w,
nram_atomic_add, nram_grads_image, width, height, wstart + wc,
hstart + hc, h_compute, w_compute, c_limit, size, 0, 0, 0);
__bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads,
(int32_t *)nram_argmax_int, c_limit, 1, 1, 1, 1,
1, 1);
if (high_precision) {
__bang_float2half_rd((half *)nram_grads_image,
(float *)nram_grads_image, c_limit);
}
// This pipeline stores the result on NRAM back to GDRAM.
T *dst = (T *)nram_atomic_add;
T *grads_image_n =
(T *)grads_image + roi_batch_ind * height * width * channels;
T *src1 = (T *)grads_image_n +
((hc + hstart) * width + (wc + wstart)) * channels +
loop_id * c_limit;
T *src2 = (T *)nram_grads_image;
__bang_atomic_add(dst, src1, src2, size);
}
}
ping_pong = 1 - ping_pong;
}
}
}
}
}
__mlu_global__ void MLUKernelRoiPoolBackward(
const void *grads, const void *rois, const int *argmax, void *grads_image,
int rois_num, int pooled_height, int pooled_width, int channels, int no,
int height, int width, const float spatial_scale,
const cnrtDataType_t k_dtype) {
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
switch (k_dtype) {
case CNRT_FLOAT16: {
      // Use the float version of the '__bang_maxpool_bp' instruction to
      // increase the bit width.
const int high_precision = 1;
MLUUnion1Roipool((const half *)rois, (const half *)grads,
(const int32_t *)argmax, (half *)grads_image, channels,
height, width, pooled_height, pooled_width, rois_num,
(const half)spatial_scale, high_precision);
}; break;
case CNRT_FLOAT32: {
const int high_precision = 0;
MLUUnion1Roipool((const float *)rois, (const float *)grads,
(const int32_t *)argmax, (float *)grads_image, channels,
height, width, pooled_height, pooled_width, rois_num,
(const float)spatial_scale, high_precision);
}; break;
default: {
break;
}
}
}
} // namespace backward
void KernelRoiPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t data_type,
const void *input_data, const void *input_rois,
const int batch, const int channels, const int height,
const int width, const int pooled_height,
const int pooled_width, const int rois_num,
const float spatial_scale, void *output_data,
int *argmax) {
forward::MLUKernelRoiPool<<<k_dim, k_type, queue>>>(
data_type, input_data, input_rois, batch, channels, height, width,
pooled_height, pooled_width, rois_num, spatial_scale, output_data,
argmax);
}
void KernelRoiPoolBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t k_dtype,
const void *grad_output_ptr, const void *rois_ptr,
const int *argmax_ptr, void *grad_input_ptr,
const int box_num, const int pooled_height,
const int pooled_width, const int channels,
const int batch, const int height, const int width,
const float spatial_scale) {
backward::MLUKernelRoiPoolBackward<<<k_dim, k_type, queue>>>(
grad_output_ptr, rois_ptr, argmax_ptr, grad_input_ptr, box_num,
pooled_height, pooled_width, channels, batch, height, width,
spatial_scale, k_dtype);
}
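For reference, the per-bin max pooling performed by forward::MLUUnion1Roipool above can be reproduced on the host. The sketch below is a CPU restatement under stated assumptions (the helper name roiPoolBinReference is hypothetical; it handles a single batch entry in the channels-last [H, W, C] layout used by the kernel) and is only meant for sanity-checking kernel output on small tensors.

#include <limits>
#include <vector>

// Hypothetical host-side reference for one output bin of RoI Pool.
// `input` is a single batch entry in [H, W, C] (channels-last) order; the bin
// boundaries are assumed to be already clamped, as in getRoiBinInfo above.
void roiPoolBinReference(const std::vector<float>& input, int height, int width,
                         int channels, int bin_y1, int bin_y2, int bin_x1,
                         int bin_x2, std::vector<float>* out_max,
                         std::vector<int>* out_argmax) {
  (void)height;  // kept only for symmetry with the kernel signature
  out_max->assign(channels, 0.f);
  out_argmax->assign(channels, -1);
  if (bin_y2 <= bin_y1 || bin_x2 <= bin_x1) return;  // empty bin: 0 / -1
  for (int c = 0; c < channels; ++c) {
    float best = -std::numeric_limits<float>::infinity();
    int best_idx = -1;
    for (int h = bin_y1; h < bin_y2; ++h) {
      for (int w = bin_x1; w < bin_x2; ++w) {
        float v = input[(h * width + w) * channels + c];
        if (v > best) {
          best = v;
          best_idx = h * width + w;  // argmax is a flat h * width + w index
        }
      }
    }
    (*out_max)[c] = best;
    (*out_argmax)[c] = best_idx;
  }
}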
mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
__nram__ char data_nram[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void mluMultiKernelTinShift(
const T *input, const int *shifts, T *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel) {
for (int cur_channel_index = taskId;
cur_channel_index < batch_size * channel_size;
cur_channel_index += taskDim) {
int n_index = cur_channel_index / channel_size;
int group_id = cur_channel_index % channel_size / group_channel;
int t_shift = shifts[n_index * group_size + group_id];
int index = cur_channel_index % channel_size * hw_size +
n_index * time_size * channel_size * hw_size;
__nramset(data_nram, MAX_NRAM_SIZE, (char)0);
__asm__ volatile("sync;");
if (abs(t_shift) >= time_size) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
time_size - 1);
} else {
if (t_shift > 0) {
__memcpy(data_nram + t_shift * hw_size * sizeof(T), input + index,
hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T),
channel_size * hw_size * sizeof(T), time_size - 1 - t_shift);
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
time_size - 1);
} else {
__memcpy(data_nram, input + (index - t_shift * channel_size * hw_size),
hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T),
channel_size * hw_size * sizeof(T), time_size - 1 + t_shift);
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
time_size - 1);
}
}
__asm__ volatile("sync;");
}
}
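// Note on the shift above: for each (batch, channel) pair the kernel realizes
//   output[n][t][c][:] = input[n][t - shift][c][:]  when 0 <= t - shift < T,
//                        0                          otherwise,
// with shift taken per channel group from shifts[n * group_size + group_id];
// a |shift| >= time_size therefore zeroes the whole temporal sequence.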
template <typename T>
__mlu_func__ void mluHwSplit(const T *input, const int t_shift,
const int time_size, const int hw_size,
const int channel_size, const int index,
const int cur_sequence_index,
const int max_length_per_core, T *output) {
for (int cur_index = index; cur_index < index + hw_size;
cur_index += max_length_per_core) {
int memcpy_size = max_length_per_core;
if (cur_index + max_length_per_core > index + hw_size) {
memcpy_size = index + hw_size - cur_index;
}
if (cur_sequence_index - t_shift < 0 ||
cur_sequence_index - t_shift >= time_size) {
__memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T),
NRAM2GDRAM);
} else {
__memcpy(data_nram, input + cur_index - t_shift * channel_size * hw_size,
memcpy_size * sizeof(T), GDRAM2NRAM);
__memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T),
NRAM2GDRAM);
}
__asm__ volatile("sync;");
}
}
template <typename T>
__mlu_func__ void mluMultiKernelTinShiftSplitSequence(
const T *input, const int *shifts, T *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel,
const int max_number_hw_per_core, const int max_length_per_core) {
const int tmp_max_number_hw_per_core =
max_number_hw_per_core > 0 ? max_number_hw_per_core : 1;
const int loop_time = time_size / tmp_max_number_hw_per_core +
((time_size % tmp_max_number_hw_per_core) > 0 ? 1 : 0);
int segmentime_size = tmp_max_number_hw_per_core;
int res_segment = time_size % tmp_max_number_hw_per_core;
for (int cur_segment_index = taskId;
cur_segment_index < loop_time * batch_size * channel_size;
cur_segment_index += taskDim) {
int n_index = cur_segment_index / loop_time / channel_size;
int group_id = cur_segment_index / loop_time % channel_size / group_channel;
int t_shift = shifts[n_index * group_size + group_id];
int index = n_index * time_size * channel_size * hw_size +
(cur_segment_index / loop_time % channel_size) * hw_size +
cur_segment_index % loop_time * segmentime_size * hw_size *
channel_size;
char *dst_gdram2nram = data_nram;
const T *src_gdram2nram = input + index;
int count_gdram2nram = -1;
int count_nram2gdram = -1;
int next_sequence_index =
index / hw_size / channel_size % time_size + segmentime_size;
int cur_sequence_index = index / hw_size / channel_size % time_size;
__nramset(data_nram, MAX_NRAM_SIZE, (char)0);
__asm__ volatile("sync;");
if (max_number_hw_per_core == 0) {
mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index,
cur_sequence_index, max_length_per_core, output);
continue;
}
if (abs(t_shift) >= time_size) {
if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
res_segment - 1);
} else {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
segmentime_size - 1);
}
continue;
}
if (t_shift == 0) {
if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
dst_gdram2nram = data_nram;
src_gdram2nram = input + index;
count_gdram2nram = res_segment - 1;
count_nram2gdram = res_segment - 1;
} else {
dst_gdram2nram = data_nram;
src_gdram2nram = input + index;
count_gdram2nram = segmentime_size - 1;
count_nram2gdram = segmentime_size - 1;
}
} else if (t_shift > 0) {
int first_index_cur_channel =
n_index * time_size * channel_size * hw_size +
(cur_segment_index / loop_time % channel_size) * hw_size;
if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
dst_gdram2nram = data_nram;
src_gdram2nram =
input +
(index - t_shift * channel_size * hw_size < first_index_cur_channel
? first_index_cur_channel
: index - t_shift * channel_size * hw_size);
count_gdram2nram = res_segment - 1;
count_nram2gdram = res_segment - 1;
if (cur_sequence_index < t_shift && t_shift < next_sequence_index) {
dst_gdram2nram =
data_nram + t_shift % segmentime_size * hw_size * sizeof(T);
count_gdram2nram = res_segment - (t_shift - cur_sequence_index) - 1;
}
} else {
if (t_shift >= next_sequence_index) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
segmentime_size - 1);
continue;
} else if (cur_sequence_index < t_shift &&
t_shift < next_sequence_index) {
dst_gdram2nram =
data_nram + t_shift % segmentime_size * hw_size * sizeof(T);
src_gdram2nram = input + first_index_cur_channel;
count_gdram2nram = segmentime_size - (t_shift % segmentime_size) - 1;
count_nram2gdram = segmentime_size - 1;
} else {
dst_gdram2nram = data_nram;
src_gdram2nram = input + index - t_shift * channel_size * hw_size;
count_gdram2nram = segmentime_size - 1;
count_nram2gdram = segmentime_size - 1;
}
}
} else {
int offset_index = time_size + t_shift;
if (cur_sequence_index >= offset_index) {
if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
res_segment - 1);
continue;
} else {
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
segmentime_size - 1);
continue;
}
} else {
dst_gdram2nram = data_nram;
src_gdram2nram = input + index - t_shift * channel_size * hw_size;
if (cur_sequence_index - t_shift + segmentime_size < time_size) {
count_gdram2nram = segmentime_size - 1;
count_nram2gdram = segmentime_size - 1;
} else {
count_gdram2nram = time_size - (cur_sequence_index - t_shift) - 1;
count_nram2gdram =
(segmentime_size - 1) < (time_size - cur_sequence_index - 1)
? (segmentime_size - 1)
: (time_size - cur_sequence_index - 1);
}
}
}
__memcpy(dst_gdram2nram, src_gdram2nram, hw_size * sizeof(T), GDRAM2NRAM,
hw_size * sizeof(T), channel_size * hw_size * sizeof(T),
count_gdram2nram);
__memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
count_nram2gdram);
__asm__ volatile("sync;");
}
}
__mlu_entry__ void MLUUnion1KernelTinShift(
const void *input, const void *shifts, void *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel,
const cnrtDataType_t data_dtype) {
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
switch (data_dtype) {
case CNRT_FLOAT16: {
mluMultiKernelTinShift((half *)input, (const int *)shifts, (half *)output,
batch_size, time_size, channel_size, hw_size,
group_size, group_channel);
}; break;
case CNRT_FLOAT32: {
mluMultiKernelTinShift((float *)input, (const int *)shifts,
(float *)output, batch_size, time_size,
channel_size, hw_size, group_size, group_channel);
}; break;
default: { return; }
}
}
__mlu_entry__ void MLUUnion1KernelTinShiftSplitSequence(
const void *input, const void *shifts, void *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel,
const int max_number_hw_per_core, const int max_length_per_core,
const cnrtDataType_t data_dtype) {
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
switch (data_dtype) {
case CNRT_FLOAT16: {
mluMultiKernelTinShiftSplitSequence(
(half *)input, (const int *)shifts, (half *)output, batch_size,
time_size, channel_size, hw_size, group_size, group_channel,
max_number_hw_per_core, max_length_per_core);
}; break;
case CNRT_FLOAT32: {
mluMultiKernelTinShiftSplitSequence(
(float *)input, (const int *)shifts, (float *)output, batch_size,
time_size, channel_size, hw_size, group_size, group_channel,
max_number_hw_per_core, max_length_per_core);
}; break;
default: { return; }
}
}
void KernelTinShiftForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *input, const void *shifts, void *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel,
const cnrtDataType_t data_dtype, const int channel_per_core,
const int max_number_hw_per_core, const int max_length_per_core) {
if (channel_per_core >= 1) {
MLUUnion1KernelTinShift<<<k_dim, k_type, queue>>>(
input, shifts, output, batch_size, time_size, channel_size, hw_size,
group_size, group_channel, data_dtype);
} else {
MLUUnion1KernelTinShiftSplitSequence<<<k_dim, k_type, queue>>>(
input, shifts, output, batch_size, time_size, channel_size, hw_size,
group_size, group_channel, max_number_hw_per_core, max_length_per_core,
data_dtype);
}
}
void KernelTinShiftBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *grad_output, const void *shifts, void *grad_input,
const int batch_size, const int time_size, const int channel_size,
const int hw_size, const int group_size, const int group_channel,
const cnrtDataType_t data_dtype, const int channel_per_core,
const int max_number_hw_per_core, const int max_length_per_core) {
if (channel_per_core >= 1) {
MLUUnion1KernelTinShift<<<k_dim, k_type, queue>>>(
grad_output, shifts, grad_input, batch_size, time_size, channel_size,
hw_size, group_size, group_channel, data_dtype);
} else {
MLUUnion1KernelTinShiftSplitSequence<<<k_dim, k_type, queue>>>(
grad_output, shifts, grad_input, batch_size, time_size, channel_size,
hw_size, group_size, group_channel, max_number_hw_per_core,
max_length_per_core, data_dtype);
}
}
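To make the shift semantics explicit, a CPU sketch of the same operation is given below. It assumes the [batch, time, channel, hw] layout and the per-(batch, group) integer shifts used throughout this file; the name tinShiftReference is hypothetical and the function is intended only for host-side verification.

#include <cstddef>
#include <vector>

// Hypothetical host-side reference of the temporal shift performed above.
void tinShiftReference(const std::vector<float>& input,
                       const std::vector<int>& shifts,  // [batch, group_size]
                       int batch, int time, int channels, int hw,
                       int group_size, int group_channel,
                       std::vector<float>* output) {
  output->assign(input.size(), 0.f);  // out-of-range time steps stay zero
  for (int n = 0; n < batch; ++n) {
    for (int c = 0; c < channels; ++c) {
      int shift = shifts[n * group_size + c / group_channel];
      for (int t = 0; t < time; ++t) {
        int src_t = t - shift;
        if (src_t < 0 || src_t >= time) continue;
        for (int i = 0; i < hw; ++i) {
          std::size_t dst =
              ((static_cast<std::size_t>(n) * time + t) * channels + c) * hw + i;
          std::size_t src =
              ((static_cast<std::size_t>(n) * time + src_t) * channels + c) * hw + i;
          (*output)[dst] = input[src];
        }
      }
    }
  }
}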
mmcv/ops/csrc/common/mps/MPSDevice.h
// Copyright © 2022 Apple Inc.
// This file is modified from:
// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSDevice.h
#pragma once
#include <ATen/ATen.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLDevice> MTLDevice_t;
#else
typedef void* MTLDevice;
typedef void* MTLDevice_t;
#endif
using namespace std;
namespace at {
namespace mps {
//-----------------------------------------------------------------
// MPSDevice
//
// MPSDevice is a singleton class that returns the default device
//-----------------------------------------------------------------
class TORCH_API MPSDevice {
 public:
  /**
   * MPSDevice should not be cloneable.
   */
  MPSDevice(MPSDevice& other) = delete;
  /**
   * MPSDevice should not be assignable.
   */
  void operator=(const MPSDevice&) = delete;
  /**
   * Gets single instance of the Device.
   */
  static MPSDevice* getInstance();
  /**
   * Returns the single device.
   */
  MTLDevice_t device() { return _mtl_device; }

  ~MPSDevice();

 private:
  static MPSDevice* _device;
  MTLDevice_t _mtl_device;
  MPSDevice();
};

TORCH_API bool is_available();
TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false);

}  // namespace mps
}  // namespace at
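A minimal usage sketch of the interface declared above (an assumption about call sites, not code from this commit):

// Check for a Metal-capable device before touching the singleton.
if (at::mps::is_available()) {
  MTLDevice_t device = at::mps::MPSDevice::getInstance()->device();
  // ... build Metal command queues / pipelines against `device` ...
}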
mmcv/ops/csrc/common/mps/MPSLibrary.h
#ifndef _MPS_LIBRARY_H_
#define _MPS_LIBRARY_H_
#include <string>
#include <unordered_map>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLComputePipelineState> MTLComputePipelineState_t;
typedef id<MTLLibrary> MTLLibrary_t;
#else
typedef void* MTLComputePipelineState;
typedef void* MTLComputePipelineState_t;
typedef void* MTLLibrary;
typedef void* MTLLibrary_t;
#endif

class MPSLibrary {
 public:
  // disable constructor for singleton
  static MPSLibrary* createFromUrl(const std::string& library_url);
  static MPSLibrary* createFromSource(const std::string& source);
  ~MPSLibrary();

  MTLLibrary_t library() { return _library; }

  MTLComputePipelineState_t getComputePipelineState(
      const std::string& function_name);

 private:
  MTLLibrary_t _library;
  std::unordered_map<std::string, MTLComputePipelineState_t> _pso_map;
};

class MPSLibraryManager {
 public:
  // disable constructor for singleton
  MPSLibraryManager(const MPSLibraryManager&) = delete;
  MPSLibraryManager& operator=(const MPSLibraryManager&) = delete;
  MPSLibraryManager(MPSLibraryManager&&) = delete;
  MPSLibraryManager& operator=(MPSLibraryManager&&) = delete;

  static MPSLibraryManager* getInstance();

  bool hasLibrary(const std::string& name);

  MPSLibrary* getLibrary(const std::string& library_url);

  MPSLibrary* createLibraryFromSouce(const std::string& name,
                                     const std::string& sources);

  ~MPSLibraryManager();

 private:
  MPSLibraryManager();

  std::unordered_map<std::string, std::unique_ptr<MPSLibrary>> _library_map;
};

#endif
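A minimal usage sketch of the classes declared above (an assumption about call sites; the library name, the Metal source string and the kernel function name are placeholders):

// metal_source is a placeholder std::string holding Metal Shading Language code.
MPSLibraryManager* manager = MPSLibraryManager::getInstance();
MPSLibrary* lib = manager->createLibraryFromSouce("mmcv_kernels", metal_source);
MTLComputePipelineState_t pso = lib->getComputePipelineState("roi_align_forward");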