OpenDAS / MMCV / Commits

Commit 6f3c5f1c, authored Jul 11, 2024 by limm

    support v1.4.0

Parent: 6f674c7e
Changes: 339
Showing 20 changed files with 128 additions and 5983 deletions (+128 -5983):
mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh           +63  -53
mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh      +0  -129
mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh             +2  -2
mmcv/ops/csrc/common/cuda/spconv/indice.cuh                          +0  -236
mmcv/ops/csrc/common/cuda/spconv/reordering.cuh                      +0  -160
mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh           +0  -68
mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh         +0  -97
mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh         +21  -21
mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh                  +37  -38
mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh               +5  -52
mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu                +0  -322
mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu                       +0  -552
mmcv/ops/csrc/common/mlu/carafe_utils.hpp                            +0  -95
mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp                       +0  -398
mmcv/ops/csrc/common/mlu/deform_roi_pool_mlu_kernel.mlu              +0  -712
mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu           +0  -888
mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu                        +0  -431
mmcv/ops/csrc/common/mlu/iou3d_utils.hpp                             +0  -695
mmcv/ops/csrc/common/mlu/masked_conv2d_mlu_kernel.mlu                +0  -181
mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu               +0  -853
mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh
@@ -42,23 +42,23 @@ __global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num,
   // params boxes3d: (B, M, 7)
   // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means
   // background points
+  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
   int box_idx = blockIdx.y;
   int bs_idx = blockIdx.z;
-  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
-    if (box_idx >= boxes_num || bs_idx >= batch_size) return;
-    int assign_idx =
-        bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
-    pts_assign[assign_idx] = 0;
-    int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
-    int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
-    T local_x = 0, local_y = 0;
-    int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset,
-                                        local_x, local_y);
-    pts_assign[assign_idx] = cur_in_flag;
-  }
+  if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size) {
+    return;
+  }
+  int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
+  pts_assign[assign_idx] = 0;
+  int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
+  int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
+  T local_x = 0, local_y = 0;
+  int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset,
+                                      local_x, local_y);
+  pts_assign[assign_idx] = cur_in_flag;
 }

 __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,
@@ -69,32 +69,35 @@ __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,
   // params pts_assign: (B, N)
   // params pts_idx: (B, M, 512)
   // params pooled_empty_flag: (B, M)
-  CUDA_1D_KERNEL_LOOP(boxes_idx, boxes_num) {
-    int bs_idx = blockIdx.y;
-    int cnt = 0;
-    for (int k = 0; k < pts_num; k++) {
-      if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num +
-                     boxes_idx]) {
-        if (cnt < sampled_pts_num) {
-          pts_idx[bs_idx * boxes_num * sampled_pts_num +
-                  boxes_idx * sampled_pts_num + cnt] = k;
-          cnt++;
-        } else
-          break;
-      }
-    }
+  int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (boxes_idx >= boxes_num) {
+    return;
+  }
+  int bs_idx = blockIdx.y;
+  int cnt = 0;
+  for (int k = 0; k < pts_num; k++) {
+    if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]) {
+      if (cnt < sampled_pts_num) {
+        pts_idx[bs_idx * boxes_num * sampled_pts_num +
+                boxes_idx * sampled_pts_num + cnt] = k;
+        cnt++;
+      } else
+        break;
+    }
+  }
-    if (cnt == 0) {
-      pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
-    } else if (cnt < sampled_pts_num) {
-      // duplicate same points for sampling
-      for (int k = cnt; k < sampled_pts_num; k++) {
-        int duplicate_idx = k % cnt;
-        int base_offset = bs_idx * boxes_num * sampled_pts_num +
-                          boxes_idx * sampled_pts_num;
-        pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
-      }
-    }
-  }
+  if (cnt == 0) {
+    pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
+  } else if (cnt < sampled_pts_num) {
+    // duplicate same points for sampling
+    for (int k = cnt; k < sampled_pts_num; k++) {
+      int duplicate_idx = k % cnt;
+      int base_offset =
+          bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;
+      pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
+    }
+  }
 }
@@ -109,26 +112,33 @@ __global__ void roipoint_pool3d_forward(
   // params pts_feature: (B, N, C)
   // params pooled_features: (B, M, 512, 3+C)
   // params pooled_empty_flag: (B, M)
+  int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
   int box_idx = blockIdx.y;
   int bs_idx = blockIdx.z;
-  CUDA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) {
-    if (box_idx >= boxes_num || bs_idx >= batch_size) return;
-    if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return;
-    int temp_idx = bs_idx * boxes_num * sampled_pts_num +
-                   box_idx * sampled_pts_num + sample_pt_idx;
-    int src_pt_idx = pts_idx[temp_idx];
-    int dst_feature_offset = temp_idx * (3 + feature_in_len);
-    for (int j = 0; j < 3; j++)
-      pooled_features[dst_feature_offset + j] =
-          xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
-    int src_feature_offset =
-        bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
-    memcpy(pooled_features + dst_feature_offset + 3,
-           pts_feature + src_feature_offset, feature_in_len * sizeof(T));
-  }
+  if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num ||
+      bs_idx >= batch_size) {
+    return;
+  }
+  if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) {
+    return;
+  }
+  int temp_idx = bs_idx * boxes_num * sampled_pts_num +
+                 box_idx * sampled_pts_num + sample_pt_idx;
+  int src_pt_idx = pts_idx[temp_idx];
+  int dst_feature_offset = temp_idx * (3 + feature_in_len);
+  for (int j = 0; j < 3; j++)
+    pooled_features[dst_feature_offset + j] =
+        xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
+  int src_feature_offset =
+      bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
+  memcpy(pooled_features + dst_feature_offset + 3,
+         pts_feature + src_feature_offset, feature_in_len * sizeof(T));
 }
#endif // ROIPOINT_POOL3D_CUDA_KERNEL_CUH
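The recurring edit in this file (and in three_interpolate and three_nn below) replaces MMCV's grid-stride loop macro with the flat per-thread indexing used up to v1.4.0. For reference, a sketch of CUDA_1D_KERNEL_LOOP as it is defined in upstream MMCV's common_cuda_helper.hpp, written from memory rather than quoted from this repository:

// Sketch of the grid-stride loop macro these hunks remove; the exact
// definition lives in mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp.
#define CUDA_1D_KERNEL_LOOP(i, n)                                \
  for (int i = (blockIdx.x * blockDim.x + threadIdx.x); i < (n); \
       i += blockDim.x * gridDim.x)

// With the macro, one launched thread may process several elements
// (i, i + stride, ...), so a modest grid covers any n. With the restored
// v1.4.0 style, each thread handles exactly one index and bails out when
// it is out of range:
//   int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
//   if (pt_idx >= pts_num) return;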
mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh (deleted, 100644 → 0)
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#ifndef ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH
#define ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
template <typename scalar_t>
__global__ void rotated_feature_align_forward_kernel(
    const int nthreads, const int points, const scalar_t* bottom_data,
    const scalar_t* best_bboxes, const scalar_t spatial_scale,
    const int channels, const int height, const int width,
    scalar_t* top_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int w = index % width;
    int h = (index / width) % height;
    int c = (index / width / height) % channels;
    int n = index / width / height / channels;
    const scalar_t* bbox_offset =
        best_bboxes + ((n * height + h) * width + w) * 5;
    scalar_t roi_y = bbox_offset[0] * spatial_scale;
    scalar_t roi_x = bbox_offset[1] * spatial_scale;
    scalar_t px[5] = {roi_x, 0, 0, 0, 0};
    scalar_t py[5] = {roi_y, 0, 0, 0, 0};
    if (points > 1) {
      scalar_t roi_w = bbox_offset[2] * spatial_scale;
      scalar_t roi_h = bbox_offset[3] * spatial_scale;
      scalar_t roi_a = bbox_offset[4];
      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
      scalar_t wx = cosa * w_2, wy = sina * w_2;
      scalar_t hx = -sina * h_2, hy = cosa * h_2;
      px[1] = roi_x + wx + hx;
      py[1] = roi_y + wy + hy;
      px[2] = roi_x - wx + hx;
      py[2] = roi_y - wy + hy;
      px[3] = roi_x - wx - hx;
      py[3] = roi_y - wy - hy;
      px[4] = roi_x + wx - hx;
      py[4] = roi_y + wy - hy;
    }
    const scalar_t* offset_bottom_data =
        bottom_data + (n * channels + c) * height * width;
    scalar_t output_val = bottom_data[index];
    for (int i = 0; i < points; i++) {
      output_val += bilinear_interpolate<scalar_t>(offset_bottom_data, height,
                                                   width, py[i], px[i], i);
    }
    top_data[index] = output_val;
  }
}

template <typename scalar_t>
__global__ void rotated_feature_align_backward_kernel(
    const int nthreads, const int points, const scalar_t* top_diff,
    const scalar_t* best_bboxes, const scalar_t spatial_scale,
    const int channels, const int height, const int width,
    scalar_t* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int w = index % width;
    int h = (index / width) % height;
    int c = (index / width / height) % channels;
    int n = index / width / height / channels;
    const scalar_t* bbox_offset =
        best_bboxes + ((n * height + h) * width + w) * 5;
    scalar_t roi_y = bbox_offset[0] * spatial_scale;
    scalar_t roi_x = bbox_offset[1] * spatial_scale;
    scalar_t px[5] = {roi_x, 0, 0, 0, 0};
    scalar_t py[5] = {roi_y, 0, 0, 0, 0};
    if (points > 1) {
      scalar_t roi_w = bbox_offset[2] * spatial_scale;
      scalar_t roi_h = bbox_offset[3] * spatial_scale;
      scalar_t roi_a = bbox_offset[4];
      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
      scalar_t wx = cosa * w_2, wy = sina * w_2;
      scalar_t hx = -sina * h_2, hy = cosa * h_2;
      px[1] = roi_x + wx + hx;
      py[1] = roi_y + wy + hy;
      px[2] = roi_x - wx + hx;
      py[2] = roi_y - wy + hy;
      px[3] = roi_x - wx - hx;
      py[3] = roi_y - wy - hy;
      px[4] = roi_x + wx - hx;
      py[4] = roi_y + wy - hy;
    }
    scalar_t* offset_bottom_diff =
        bottom_diff + (n * channels + c) * height * width;
    scalar_t value_top_diff = top_diff[index];
    atomicAdd(bottom_diff + index, value_top_diff);
    for (int i = 0; i < points; i++) {
      scalar_t w1, w2, w3, w4;
      int x_low, x_high, y_low, y_high;
      bilinear_interpolate_gradient<scalar_t>(height, width, py[i], px[i], w1,
                                              w2, w3, w4, x_low, x_high,
                                              y_low, y_high, i);
      scalar_t g1 = value_top_diff * w1;
      scalar_t g2 = value_top_diff * w2;
      scalar_t g3 = value_top_diff * w3;
      scalar_t g4 = value_top_diff * w4;
      if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
        atomicAdd(offset_bottom_diff + y_low * width + x_low, g1);
        atomicAdd(offset_bottom_diff + y_low * width + x_high, g2);
        atomicAdd(offset_bottom_diff + y_high * width + x_low, g3);
        atomicAdd(offset_bottom_diff + y_high * width + x_high, g4);
      }
    }
  }
}
#endif // ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH
mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh
@@ -34,7 +34,7 @@ __device__ __forceinline__ static void reduceMax(double *address, double val) {
 }

 // get rid of meaningless warnings when compiling host code
-#ifdef MMCV_WITH_HIP
+#ifdef HIP_DIFF
 __device__ __forceinline__ static void reduceAdd(float *address, float val) {
   atomicAdd(address, val);
 }

@@ -86,7 +86,7 @@ __device__ __forceinline__ static void reduceAdd(double *address, double val) {
 #endif
 }
 #endif  // __CUDA_ARCH__
-#endif  // MMCV_WITH_HIP
+#endif  // HIP_DIFF

 template <typename T>
 __global__ void feats_reduce_kernel(
mmcv/ops/csrc/common/cuda/spconv/indice.cuh (deleted, 100644 → 0)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef INDICE_CU_H_
#define INDICE_CU_H_
#include <utils/spconv/spconv/geometry.h>
#include <utils/spconv/tensorview/tensorview.h>
#include <utils/spconv/tensorview/helper_kernel.cuh>
template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void prepareIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index kernelVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    kernelVolume *= kernelSize[i];
  }
  Index numValidPoints = 0;
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  auto indicePairsDim2 = indicePairs.dim(2);
  Index index;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    numValidPoints = getValidOutPos<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(),
        outSpatialShape.data(), validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(offset, 0, oldNum) = ix;
      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      indicePairs(offset, 1, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}

template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void prepareDeConvIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index kernelVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    kernelVolume *= kernelSize[i];
  }
  Index numValidPoints = 0;
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  auto indicePairsDim2 = indicePairs.dim(2);
  Index index;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    numValidPoints = getValidOutPosTranspose<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(),
        outSpatialShape.data(), validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(offset, 0, oldNum) = ix;
      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      indicePairs(offset, 1, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignGridAndIndiceOutKernel(
    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
    int numAct, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
  Index index;
  auto indicesOutPtr = indicesOut.data();
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    index = indicePairUnique[ix];
    gridsOut[index] = ix;
    index = tv::rowArrayIdxInv<Index, NDim>(
        index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
    indicesOut[ix * (NDim + 1)] = index % batchSize;
  }
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignIndicePairsKernel(
    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
    int numActIn, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  Index index;
  int kernelVolume = indicePairs.dim(0);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    for (int i = 0; i < kernelVolume; ++i) {
      index = indicePairs(i, 1, ix);
      if (index > -1) {
        indicePairs(i, 1, ix) = gridsOut[index];
      }
    }
  }
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void prepareSubMGridKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index index = 0;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,
                                         outSpatialShape.data()) +
            spatialVolume * indicesIn(ix, 0);
    gridsOut[index] = ix;
  }
}

template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void getSubMIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index numValidPoints = 0;
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  Index index = 0;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    numValidPoints = getValidOutPos<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(),
        outSpatialShape.data(), validPoints);
    for (int i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      if (gridsOut[index] > -1) {
        auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
        indicePairs(offset, 1, oldNum) = gridsOut[index];
        indicePairs(offset, 0, oldNum) = ix;
      }
    }
  }
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridKernel(const Index *indicePairUnique,
                                tv::TensorView<IndexGrid> gridsOut,
                                int numAct) {
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    gridsOut[indicePairUnique[ix]] = -1;
  }
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridSubMKernel(
    const Index *indices, tv::TensorView<IndexGrid> gridsOut,
    const tv::SimpleVector<Index, NDim> outSpatialShape, int numAct) {
  int outSpatialShapeReg[NDim];
  for (int i = 0; i < NDim; ++i) {
    outSpatialShapeReg[i] = outSpatialShape[i];
  }
  Index spatialVolume = 1;
  auto indsPtr = indices;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index index;
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    indsPtr = indices + ix * (NDim + 1);
    index = tv::rowArrayIdx<Index, NDim>(indsPtr + 1, outSpatialShapeReg);
    gridsOut[index + spatialVolume * indsPtr[0]] = -1;
  }
}
#endif
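These spconv kernels iterate with tv::KernelLoopX / tv::KernelLoopY from the included tensorview helpers. To the best of my knowledge those helpers are range objects implementing a grid-stride loop; a minimal sketch under that assumption (names and details are mine, not from the deleted headers):

// Sketch only: a range-for equivalent of CUDA_1D_KERNEL_LOOP over the x
// dimension. The real tv::KernelLoopX also takes an optional NumILP
// template argument that multiplies the stride for ILP-unrolled loops.
namespace tv_sketch {
struct KernelLoopX {
  int n;  // number of elements to cover
  struct It {
    int i, step;
    __device__ int operator*() const { return i; }
    __device__ It &operator++() { i += step; return *this; }
    __device__ bool operator!=(const It &o) const { return i < o.i; }
  };
  __device__ It begin() const {
    return {static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x),
            static_cast<int>(gridDim.x * blockDim.x)};
  }
  __device__ It end() const { return {n, 0}; }
};
}  // namespace tv_sketch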
mmcv/ops/csrc/common/cuda/spconv/reordering.cuh (deleted, 100644 → 0)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#include <utils/spconv/tensorview/helper_kernel.cuh>
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void gatherGenericKernel(scalar_t *buffer, const scalar_t *features,
                                    const Index *indices, int size,
                                    int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size)
          buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
              features[inds[ilp] + iy];
      }
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType>
__global__ void gatherVecKernel(scalar_t *buffer, const scalar_t *features,
                                const Index *indices, int size,
                                int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size)
          reinterpret_cast<VecType *>(
              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
              reinterpret_cast<const VecType *>(features)[inds[ilp] + iy];
      }
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void gatherVecBlockKernel(scalar_t *buffer,
                                     const scalar_t *features,
                                     const Index *indices, int size,
                                     int numPlanes) {
  int ILPStrideY[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  features += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;
  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      reinterpret_cast<VecType *>(
          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =
          reinterpret_cast<const VecType *>(
              features)[indices[iy + ILPStrideY[ilp]] * numPlanes +
                        threadIdx.x];
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddGenericKernel(scalar_t *outFeatures,
                                        const scalar_t *buffer,
                                        const Index *indices, int size,
                                        int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size) {
          outFeatures[inds[ilp] + iy] +=
              buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];
        }
      }
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void scatterAddVecBlockKernel(scalar_t *outFeatures,
                                         const scalar_t *buffer,
                                         const Index *indices, int size,
                                         int numPlanes) {
  int ILPStrideY[NumILP];
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  outFeatures += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;
  scalar_t buf[vecloadFactor];
  scalar_t buf2[vecloadFactor];
  Index idx;
  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      reinterpret_cast<VecType *>(buf)[0] =
          reinterpret_cast<VecType *>(outFeatures)[idx];
      reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x];
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        buf[i] += buf2[i];
      }
      reinterpret_cast<VecType *>(outFeatures)[idx] =
          reinterpret_cast<VecType *>(buf)[0];
    }
  }
}
#endif
mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh (deleted, 100644 → 0)
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu
#ifndef STACK_BALL_QUERY_CUDA_KERNEL_CUH
#define STACK_BALL_QUERY_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
template <typename T>
__global__ void stack_ball_query_forward_cuda_kernel(
    int B, int M, float radius, int nsample, const T *new_xyz,
    const int *new_xyz_batch_cnt, const T *xyz, const int *xyz_batch_cnt,
    int *idx) {
  // :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features
  // :param xyz_batch_cnt: (batch_size), [N1, N2, ...]
  // :param new_xyz: (M1 + M2 ..., 3) centers of the ball query
  // :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
  // output:
  //      idx: (M, nsample)
  const T *cur_xyz = xyz;
  int *cur_idx = idx;
  CUDA_1D_KERNEL_LOOP(pt_idx, M) {
    int bs_idx = 0;
    for (int pt_cnt = 0; bs_idx < B; bs_idx++) {
      pt_cnt += new_xyz_batch_cnt[bs_idx];
      if (pt_idx < pt_cnt) break;
    }
    int xyz_batch_start_idx = 0;
    for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k];

    const T *new_xyz_p = new_xyz + pt_idx * 3;
    cur_xyz += xyz_batch_start_idx * 3;
    cur_idx += pt_idx * nsample;

    float radius2 = radius * radius;
    T new_x = new_xyz_p[0];
    T new_y = new_xyz_p[1];
    T new_z = new_xyz_p[2];
    int n = xyz_batch_cnt[bs_idx];

    int cnt = 0;
    for (int k = 0; k < n; ++k) {
      T x = cur_xyz[k * 3 + 0];
      T y = cur_xyz[k * 3 + 1];
      T z = cur_xyz[k * 3 + 2];
      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
             (new_z - z) * (new_z - z);
      if (d2 < radius2) {
        if (cnt == 0) {
          for (int l = 0; l < nsample; ++l) {
            cur_idx[l] = k;
          }
        }
        cur_idx[cnt] = k;
        ++cnt;
        if (cnt >= nsample) break;
      }
    }
    if (cnt == 0) cur_idx[0] = -1;
  }
}
#endif // STACK_BALL_QUERY_CUDA_KERNEL_CUH
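To make the batch bookkeeping in this deleted kernel concrete: suppose B = 2, new_xyz_batch_cnt = [3, 5], and xyz_batch_cnt = [4, 6]. A query with pt_idx = 4 falls in batch 1 (the running count only exceeds 4 after the second entry is added), so xyz_batch_start_idx becomes 4 and cur_xyz skips the 4 support points of batch 0 before the radius search starts. (The numbers are illustrative, not taken from the commit.)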
mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh (deleted, 100644 → 0)
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
#ifndef STACK_GROUP_POINTS_CUDA_KERNEL_CUH
#define STACK_GROUP_POINTS_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#include <stdio.h>
template <typename T>
__global__ void stack_group_points_forward_cuda_kernel(
    int b, int c, int m, int nsample, const T *features,
    const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt,
    T *out) {
  // :param features: (N1 + N2 ..., C) tensor of features to group
  // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the
  // indices of features to group with :param idx: (M1 + M2 ..., nsample) tensor
  // containing the indices of features to group with :param idx_batch_cnt:
  // (batch_size) [M1 + M2 ...] tensor containing the indices of features to
  // group with :return:
  //     output: (M1 + M2, C, nsample) tensor
  CUDA_1D_KERNEL_LOOP(index, m * c * nsample) {
    const T *cur_features = features;
    const int *cur_idx = idx;
    int sample_idx = index % nsample;
    int c_idx = (index / nsample) % c;
    int pt_idx = (index / nsample / c);

    if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return;

    int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
    for (int k = 1; k < b; k++) {
      if (pt_idx < pt_cnt) break;
      pt_cnt += idx_batch_cnt[k];
      bs_idx = k;
    }

    int features_batch_start_idx = 0;
    int features_batch_end_idx = features_batch_cnt[0];
    for (int k = 0; k < bs_idx; k++) {
      features_batch_start_idx += features_batch_cnt[k];
      features_batch_end_idx =
          features_batch_start_idx + features_batch_cnt[k + 1];
    }
    cur_features += features_batch_start_idx * c;

    cur_idx += pt_idx * nsample + sample_idx;
    int in_idx = cur_idx[0] * c + c_idx;
    int out_idx = pt_idx * c * nsample + c_idx * nsample + sample_idx;
    if (in_idx < features_batch_end_idx * c) {
      out[out_idx] = cur_features[in_idx];
    }
  }
}

template <typename T>
__global__ void stack_group_points_backward_cuda_kernel(
    int b, int c, int m, int n, int nsample, const T *grad_out, const int *idx,
    const int *idx_batch_cnt, const int *features_batch_cnt,
    T *grad_features) {
  // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the
  // output from forward :param idx: (M1 + M2 ..., nsample) tensor containing
  // the indices of features to group with :param idx_batch_cnt: (batch_size)
  // [M1 + M2 ...] tensor containing the indices of features to group with
  // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the
  // indices of features to group with :return:
  //     grad_features: (N1 + N2 ..., C) gradient of the features
  CUDA_1D_KERNEL_LOOP(index, m * c * nsample) {
    const T *cur_grad_out = grad_out;
    const int *cur_idx = idx;
    T *cur_grad_features = grad_features;
    int sample_idx = index % nsample;
    int c_idx = (index / nsample) % c;
    int pt_idx = (index / nsample / c);

    if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return;

    int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
    for (int k = 1; k < b; k++) {
      if (pt_idx < pt_cnt) break;
      pt_cnt += idx_batch_cnt[k];
      bs_idx = k;
    }

    int features_batch_start_idx = 0;
    for (int k = 0; k < bs_idx; k++)
      features_batch_start_idx += features_batch_cnt[k];

    cur_grad_out += pt_idx * c * nsample + c_idx * nsample + sample_idx;
    cur_idx += pt_idx * nsample + sample_idx;
    cur_grad_features +=
        (features_batch_start_idx + cur_idx[0]) * c + c_idx;

    atomicAdd(cur_grad_features, cur_grad_out[0]);
  }
}
#endif // GROUP_POINTS_CUDA_KERNEL_CUH
mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh
@@ -20,17 +20,17 @@ __global__ void three_interpolate_forward_cuda_kernel(
   int bs_idx = blockIdx.z;
   int c_idx = blockIdx.y;
-  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
-    if (bs_idx >= b || c_idx >= c) return;
+  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;

-    weight += bs_idx * n * 3 + pt_idx * 3;
-    points += bs_idx * c * m + c_idx * m;
-    idx += bs_idx * n * 3 + pt_idx * 3;
-    out += bs_idx * c * n + c_idx * n;
+  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

-    out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
-                  weight[2] * points[idx[2]];
-  }
+  weight += bs_idx * n * 3 + pt_idx * 3;
+  points += bs_idx * c * m + c_idx * m;
+  idx += bs_idx * n * 3 + pt_idx * 3;
+  out += bs_idx * c * n + c_idx * n;
+
+  out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
+                weight[2] * points[idx[2]];
 }

 template <typename T>
@@ -44,18 +44,18 @@ __global__ void three_interpolate_backward_cuda_kernel(
   int bs_idx = blockIdx.z;
   int c_idx = blockIdx.y;
-  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
-    if (bs_idx >= b || c_idx >= c) return;
+  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;

-    grad_out += bs_idx * c * n + c_idx * n + pt_idx;
-    weight += bs_idx * n * 3 + pt_idx * 3;
-    grad_points += bs_idx * c * m + c_idx * m;
-    idx += bs_idx * n * 3 + pt_idx * 3;
+  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

-    atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
-    atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
-    atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
-  }
+  grad_out += bs_idx * c * n + c_idx * n + pt_idx;
+  weight += bs_idx * n * 3 + pt_idx * 3;
+  grad_points += bs_idx * c * m + c_idx * m;
+  idx += bs_idx * n * 3 + pt_idx * 3;
+
+  atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
+  atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
+  atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
 }
#endif // THREE_INTERPOLATE_CUDA_KERNEL_CUH
mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh
@@ -19,49 +19,48 @@ __global__ void three_nn_forward_cuda_kernel(int b, int n, int m,
   // idx: (B, N, 3)

   int bs_idx = blockIdx.y;
-  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
-    if (bs_idx >= b) return;
+  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (bs_idx >= b || pt_idx >= n) return;

-    unknown += bs_idx * n * 3 + pt_idx * 3;
-    known += bs_idx * m * 3;
-    dist2 += bs_idx * n * 3 + pt_idx * 3;
-    idx += bs_idx * n * 3 + pt_idx * 3;
+  unknown += bs_idx * n * 3 + pt_idx * 3;
+  known += bs_idx * m * 3;
+  dist2 += bs_idx * n * 3 + pt_idx * 3;
+  idx += bs_idx * n * 3 + pt_idx * 3;

-    T ux = unknown[0];
-    T uy = unknown[1];
-    T uz = unknown[2];
+  T ux = unknown[0];
+  T uy = unknown[1];
+  T uz = unknown[2];

-    double best1 = 1e40, best2 = 1e40, best3 = 1e40;
-    int besti1 = 0, besti2 = 0, besti3 = 0;
-    for (int k = 0; k < m; ++k) {
-      T x = known[k * 3 + 0];
-      T y = known[k * 3 + 1];
-      T z = known[k * 3 + 2];
-      T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
-      if (d < best1) {
-        best3 = best2;
-        besti3 = besti2;
-        best2 = best1;
-        besti2 = besti1;
-        best1 = d;
-        besti1 = k;
-      } else if (d < best2) {
-        best3 = best2;
-        besti3 = besti2;
-        best2 = d;
-        besti2 = k;
-      } else if (d < best3) {
-        best3 = d;
-        besti3 = k;
-      }
-    }
+  double best1 = 1e40, best2 = 1e40, best3 = 1e40;
+  int besti1 = 0, besti2 = 0, besti3 = 0;
+  for (int k = 0; k < m; ++k) {
+    T x = known[k * 3 + 0];
+    T y = known[k * 3 + 1];
+    T z = known[k * 3 + 2];
+    T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
+    if (d < best1) {
+      best3 = best2;
+      besti3 = besti2;
+      best2 = best1;
+      besti2 = besti1;
+      best1 = d;
+      besti1 = k;
+    } else if (d < best2) {
+      best3 = best2;
+      besti3 = besti2;
+      best2 = d;
+      besti2 = k;
+    } else if (d < best3) {
+      best3 = d;
+      besti3 = k;
+    }
+  }
-    dist2[0] = best1;
-    dist2[1] = best2;
-    dist2[2] = best3;
-    idx[0] = besti1;
-    idx[1] = besti2;
-    idx[2] = besti3;
-  }
+  dist2[0] = best1;
+  dist2[1] = best2;
+  dist2[2] = best3;
+  idx[0] = besti1;
+  idx[1] = besti2;
+  idx[2] = besti3;
 }
#endif // THREE_NN_CUDA_KERNEL_CUH
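The restored per-thread kernels only work if the host launches a matching 2-D grid: points over x, batches over blockIdx.y. A hedged sketch of such a launch wrapper follows; the DIVUP rounding and THREADS_PER_BLOCK constant mirror MMCV's usual helpers but are written out here as assumptions, and the kernel's parameter order is inferred from its body above:

#include <cuda_runtime.h>

// Hypothetical host wrapper, not taken from this commit.
constexpr int THREADS_PER_BLOCK = 256;
inline int DIVUP(int a, int b) { return (a + b - 1) / b; }

void launch_three_nn(int b, int n, int m, const float *unknown,
                     const float *known, float *dist2, int *idx,
                     cudaStream_t stream) {
  // One thread per query point (x), one grid slice per batch (y); the
  // kernel's `if (bs_idx >= b || pt_idx >= n) return;` guards the rest.
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);
  dim3 threads(THREADS_PER_BLOCK);
  three_nn_forward_cuda_kernel<float>
      <<<blocks, threads, 0, stream>>>(b, n, m, unknown, known, dist2, idx);
}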
mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh
@@ -23,20 +23,20 @@ __global__ void dynamic_voxelize_kernel(
     // To save some computation
     auto points_offset = points + index * num_features;
     auto coors_offset = coors + index * NDim;
-    int c_x = floorf((points_offset[0] - coors_x_min) / voxel_x);
+    int c_x = floor((points_offset[0] - coors_x_min) / voxel_x);
     if (c_x < 0 || c_x >= grid_x) {
       coors_offset[0] = -1;
       continue;
     }

-    int c_y = floorf((points_offset[1] - coors_y_min) / voxel_y);
+    int c_y = floor((points_offset[1] - coors_y_min) / voxel_y);
     if (c_y < 0 || c_y >= grid_y) {
       coors_offset[0] = -1;
       coors_offset[1] = -1;
       continue;
     }

-    int c_z = floorf((points_offset[2] - coors_z_min) / voxel_z);
+    int c_z = floor((points_offset[2] - coors_z_min) / voxel_z);
     if (c_z < 0 || c_z >= grid_z) {
       coors_offset[0] = -1;
       coors_offset[1] = -1;
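The floorf-to-floor change matters because this kernel is templated over the point type: floorf is the float-only libm function, while floor has device overloads for float and double, so double inputs keep double precision. A minimal illustration with assumed names, not from this commit:

// floorf((double)x) would silently round through float precision;
// the overloaded ::floor keeps double inputs in double.
template <typename T>
__device__ int voxel_coord(T p, T origin, T voxel_size) {
  return static_cast<int>(floor((p - origin) / voxel_size));
}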
@@ -101,7 +101,7 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor,
   CUDA_1D_KERNEL_LOOP(index, num_points) {
     auto coor_offset = coor + index * NDim;
     // skip invalid points
-    if (coor_offset[0] == -1) continue;
+    if ((index >= num_points) || (coor_offset[0] == -1)) return;

     int num = 0;
     int coor_x = coor_offset[0];
@@ -122,7 +122,7 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor,
         point_to_pointidx[index] = i;
       } else if (num >= max_points) {
         // out of boundary
-        break;
+        return;
       }
     }
   }
@@ -166,51 +166,4 @@ __global__ void determin_voxel_num(
     }
   }
 }
-
-__global__ void nondeterministic_get_assign_pos(
-    const int nthreads, const int32_t *coors_map, int32_t *pts_id,
-    int32_t *coors_count, int32_t *reduce_count, int32_t *coors_order) {
-  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
-    int coors_idx = coors_map[thread_idx];
-    if (coors_idx > -1) {
-      int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1);
-      pts_id[thread_idx] = coors_pts_pos;
-      if (coors_pts_pos == 0) {
-        coors_order[coors_idx] = atomicAdd(coors_count, 1);
-      }
-    }
-  }
-}
-
-template <typename T>
-__global__ void nondeterministic_assign_point_voxel(
-    const int nthreads, const T *points, const int32_t *coors_map,
-    const int32_t *pts_id, const int32_t *coors_in,
-    const int32_t *reduce_count, const int32_t *coors_order, T *voxels,
-    int32_t *coors, int32_t *pts_count, const int max_voxels,
-    const int max_points, const int num_features, const int NDim) {
-  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
-    int coors_idx = coors_map[thread_idx];
-    int coors_pts_pos = pts_id[thread_idx];
-    if (coors_idx > -1 && coors_pts_pos < max_points) {
-      int coors_pos = coors_order[coors_idx];
-      if (coors_pos < max_voxels) {
-        auto voxels_offset =
-            voxels + (coors_pos * max_points + coors_pts_pos) * num_features;
-        auto points_offset = points + thread_idx * num_features;
-        for (int k = 0; k < num_features; k++) {
-          voxels_offset[k] = points_offset[k];
-        }
-        if (coors_pts_pos == 0) {
-          pts_count[coors_pos] = min(reduce_count[coors_idx], max_points);
-          auto coors_offset = coors + coors_pos * NDim;
-          auto coors_in_offset = coors_in + coors_idx * NDim;
-          for (int k = 0; k < NDim; k++) {
-            coors_offset[k] = coors_in_offset[k];
-          }
-        }
-      }
-    }
-  }
-}
#endif // VOXELIZATION_CUDA_KERNEL_CUH
mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu (deleted, 100644 → 0)
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <float.h>
#include "common_mlu_helper.hpp"
#define COORD_NUM 4
__nram__ char nmem_buf[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void computeDiv(void *nram_dst, void *nram_src0, void *nram_src1,
void *nram_addition, const int32_t deal_num) {
__bang_active_reciphp((T *)nram_dst, (T *)nram_src1, deal_num);
__bang_mul((T *)nram_dst, (T *)nram_src0, (T *)nram_dst, deal_num);
}
template <>
__mlu_func__ void computeDiv<half>(void *nram_dst, void *nram_src0,
void *nram_src1, void *nram_addition,
const int32_t deal_num) {
__bang_half2float((float *)nram_addition, (half *)nram_src1, deal_num);
__bang_active_reciphp((float *)nram_addition, (float *)nram_addition,
deal_num);
__bang_float2half_rd((half *)nram_src1, (float *)nram_addition, deal_num);
__bang_mul((half *)nram_dst, (half *)nram_src0, (half *)nram_src1, deal_num);
}
template <typename T>
__mlu_func__ void bboxOverlapsWorkflow(
T *vec_b1_x1, T *vec_b1_y1, T *vec_b1_x2, T *vec_b1_y2, T *vec_b2_x1,
T *vec_b2_y1, T *vec_b2_x2, T *vec_b2_y2, T *vec_left, T *vec_right,
T *vec_top, T *vec_bottom, const T *bbox1, const T *bbox2, void *ious,
const int32_t offset, const int32_t mode, const int32_t batches_stride,
const int32_t num_bbox1, const int32_t num_bbox2, const bool aligned) {
int32_t task_batch_stride = (num_bbox1 + taskDim - 1) / taskDim;
int32_t batch_start = taskId * task_batch_stride;
int32_t batch_per_task = batch_start + task_batch_stride < num_bbox1
? task_batch_stride
: num_bbox1 - batch_start;
batch_per_task = batch_per_task > 0 ? batch_per_task : (0);
if (aligned) {
int32_t num_loop_cpy = batch_per_task / batches_stride;
int32_t num_rem_cpy_batches = batch_per_task % batches_stride;
num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy;
for (int32_t i = 0; i < num_loop_cpy; i++) {
int32_t index = batch_start + i * batches_stride;
int32_t handle_batches = index + batches_stride > num_bbox1
? num_rem_cpy_batches
: batches_stride;
int32_t b1 = index;
int32_t b2 = index;
int32_t base1 = b1 * COORD_NUM;
__memcpy(vec_b1_x1, &bbox1[base1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b1_y1, &bbox1[base1 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b1_x2, &bbox1[base1 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b1_y2, &bbox1[base1 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
int32_t base2 = b2 * COORD_NUM;
__memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
// get the width and height
__bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride);
__bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride);
__bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride);
__bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride);
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
__bang_write_value(vec_bottom, batches_stride, 0.f);
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
T *width = vec_left;
// height --> vec_right
__bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride);
T *height = vec_right;
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
T *b1_area = vec_top;
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
T *b2_area = vec_b2_x1;
// inter_s = width * height
__bang_mul(height, width, height, batches_stride);
T *inter_s = height;
// offset vector ---> vec_b2_y1
__bang_write_value(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
if (mode == 0) {
__bang_add(b1_area, b1_area, b2_area, batches_stride);
__bang_sub(b1_area, b1_area, inter_s, batches_stride);
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
} else {
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
}
T *base_s = b1_area;
// ious = inter_s / base_s
computeDiv<T>(width, inter_s, base_s, vec_b2_x2, batches_stride);
__memcpy((T *)ious + index, width, handle_batches * sizeof(T),
NRAM2GDRAM);
}
} else {
int32_t num_loop_cpy = num_bbox2 / batches_stride;
int32_t num_rem_cpy_batches = num_bbox2 % batches_stride;
num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy;
for (int32_t i = 0; i < batch_per_task; i++) {
int32_t index1 = batch_start + i;
int32_t b1 = index1;
int32_t base1 = b1 * COORD_NUM;
// set bbox1 and bbox2 to nram
__bang_write_value(vec_b1_x1, batches_stride, bbox1[base1]);
__bang_write_value(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
__bang_write_value(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
__bang_write_value(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
for (int32_t j = 0; j < num_loop_cpy; j++) {
int32_t index2 = j * batches_stride;
int32_t handle_batches = index2 + batches_stride > num_bbox2
? num_rem_cpy_batches
: batches_stride;
int32_t b2 = index2;
int32_t base2 = b2 * COORD_NUM;
// copy bbox2 to nram
__memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
// get the width and height
__bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride);
__bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride);
__bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride);
__bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride);
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
__bang_write_value(vec_bottom, batches_stride, (T)0);
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
T *width = vec_left;
// height --> vec_right
__bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride);
T *height = vec_right;
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
T *b1_area = vec_top;
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
T *b2_area = vec_b2_x1;
// inter_s = width * height
__bang_mul(height, width, height, batches_stride);
T *inter_s = height;
// offset vector ---> vec_b2_y1
__bang_write_value(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
if (mode == 0) {
__bang_add(b1_area, b1_area, b2_area, batches_stride);
__bang_sub(b1_area, b1_area, inter_s, batches_stride);
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
} else {
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
}
T *base_s = b1_area;
// ious = inter_s / base_s
computeDiv<T>(width, inter_s, base_s, vec_b2_x2, batches_stride);
int32_t gdram_offset = index1 * num_bbox2 + index2;
__memcpy((T *)ious + gdram_offset, width, handle_batches * sizeof(T),
NRAM2GDRAM);
}
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelBBoxOverlaps(
const void *bbox1, const void *bbox2, void *ious, const int32_t num_bbox1,
const int32_t num_bbox2, const int32_t mode, const bool aligned,
const int32_t offset) {
/*
* NRAM partition
* |-------------------------------------------------------------|
* | vec_b1_x1 | vec_b1_y1 | vec_b1_x2 | vec_b1_y2 |
* |-------------------------------------------------------------|
* | vec_b2_x1 | vec_b2_y1 | vec_b2_x2 | vec_b2_y2 |
* |-------------------------------------------------------------|
* | vec_left | vec_right | vec_top | vec_bottom |
* |-------------------------------------------------------------|
*
*/
const int32_t align_bytes = PAD_DOWN(MAX_NRAM_SIZE, NFU_ALIGN_SIZE);
const int32_t split_nram_num = 12;
const int32_t nram_stride =
align_bytes / NFU_ALIGN_SIZE / split_nram_num * NFU_ALIGN_SIZE;
void *vec_b1_x1 = nmem_buf;
void *vec_b1_y1 = nmem_buf + nram_stride;
void *vec_b1_x2 = nmem_buf + 2 * nram_stride;
void *vec_b1_y2 = nmem_buf + 3 * nram_stride;
void *vec_b2_x1 = nmem_buf + 4 * nram_stride;
void *vec_b2_y1 = nmem_buf + 5 * nram_stride;
void *vec_b2_x2 = nmem_buf + 6 * nram_stride;
void *vec_b2_y2 = nmem_buf + 7 * nram_stride;
void *vec_left = nmem_buf + 8 * nram_stride;
void *vec_right = nmem_buf + 9 * nram_stride;
void *vec_top = nmem_buf + 10 * nram_stride;
void *vec_bottom = nmem_buf + 11 * nram_stride;
const int32_t vec_length = nram_stride / sizeof(T);
bboxOverlapsWorkflow((T *)vec_b1_x1, (T *)vec_b1_y1, (T *)vec_b1_x2,
(T *)vec_b1_y2, (T *)vec_b2_x1, (T *)vec_b2_y1,
(T *)vec_b2_x2, (T *)vec_b2_y2, (T *)vec_left,
(T *)vec_right, (T *)vec_top, (T *)vec_bottom,
(T *)bbox1, (T *)bbox2, (T *)ious, offset, mode,
vec_length, num_bbox1, num_bbox2, aligned);
}
void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *bbox1, const void *bbox2, void *ious,
const int32_t num_bbox1, const int32_t num_bbox2,
const int32_t mode, const bool aligned,
const int32_t offset) {
if (d_type == CNRT_FLOAT16) {
MLUUnion1KernelBBoxOverlaps<half><<<k_dim, k_type, queue>>>(
bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset);
} else {
MLUUnion1KernelBBoxOverlaps<float><<<k_dim, k_type, queue>>>(
bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset);
}
}
mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu (deleted, 100644 → 0)
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "carafe_utils.hpp"
#include "common_mlu_helper.hpp"
#define INDEX3(n, h, w, c, strN, strH, strW) \
(strN) * (n) + (strH) * (h) + (strW) * (w) + (c)
#define NRAM_BLOCK PAD_DOWN(MAX_NRAM_SIZE / 5, NRAM_ALIGN_SIZE)
__nram__ char nram_buf[MAX_NRAM_SIZE];
namespace forward {
struct BlockId {
int Ho;
int Wo;
int G;
int Cg;
int Kh;
int Kw;
int Hi;
int Wi;
};
// start indices of block
struct BlockStart {
int Ho;
int Wo;
int G;
int Cg;
int Kh;
int Kw;
int Hi;
int Wi;
int C;
};
struct BlockEnd {
int Ho;
int Wo;
int Kh;
int Kw;
int Hi;
int Wi;
};
struct BlockSize {
int Ho;
int Wo;
int G;
int Cg;
int Kh;
int Kw;
int Hi;
int Wi;
};
template <typename T>
__mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
const CarafeForwardParam param,
const CarafeForwardBlockDim block_dim,
const CarafeForwardGridDim grid_dim,
T *output) {
// data block info
BlockId blkId;
BlockStart blkStart;
BlockEnd blkEnd;
BlockSize blkSize;
// set pointers on NRAM arrays
// input_nram[blkDim_(Hi+Kh)-1, blkDim_(Wi+Kw)-1, blkDim_(G*Cg)]
T *input_nram = (T *)nram_buf;
// mask_nram[blkDim_Ho, blkDim_Wo, blkDim_(G*Kh*Kw)]
T *mask_nram = input_nram + param.input_nram_size;
// output_nram[blkDim_Ho, blkDim_Wo, blkDim_(G*Cg)]
T *output_nram = mask_nram + param.mask_nram_size;
// sum_array[blkDim_(G*Cg)]
T *sum_array = output_nram + param.output_nram_size;
/* ===== loop over N, grid_dim(Ho,Wo,G,Cg)
* iterations are distributed over computing cores
*/
for (int loop_index = taskId; loop_index < param.job_num;
loop_index += taskDim) {
// block idx
blkId.Cg = loop_index;
blkId.G = blkId.Cg / grid_dim.Cg;
blkId.Wo = blkId.G / grid_dim.G;
blkId.Ho = blkId.Wo / grid_dim.Wo;
int sample_idx = blkId.Ho / grid_dim.Ho;
blkId.Cg %= grid_dim.Cg;
blkId.G %= grid_dim.G;
blkId.Wo %= grid_dim.Wo;
blkId.Ho %= grid_dim.Ho;
// block starting indices
blkStart.Ho = blkId.Ho * block_dim.Ho;
blkStart.Wo = blkId.Wo * block_dim.Wo;
blkStart.G = blkId.G * block_dim.G;
blkStart.Cg = blkId.Cg * block_dim.Cg;
blkStart.C = blkStart.G * param.Cg + blkStart.Cg;
// block size
blkSize.Ho = block_dim.Ho;
blkSize.Wo = block_dim.Wo;
blkSize.G = block_dim.G;
blkSize.Cg = block_dim.Cg;
// take care of blocks near the end of each dimension
if (blkId.Ho == (grid_dim.Ho - 1)) {
blkSize.Ho = param.Ho - (grid_dim.Ho - 1) * block_dim.Ho;
}
if (blkId.Wo == (grid_dim.Wo - 1)) {
blkSize.Wo = param.Wo - (grid_dim.Wo - 1) * block_dim.Wo;
}
if (blkId.G == (grid_dim.G - 1)) {
blkSize.G = param.group_size - (grid_dim.G - 1) * block_dim.G;
}
if (blkId.Cg == (grid_dim.Cg - 1)) {
blkSize.Cg = param.Cg - (grid_dim.Cg - 1) * block_dim.Cg;
}
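    // Example of the tail clamping above (editor's illustration): if
    // param.Ho == 10 and block_dim.Ho == 4 (so grid_dim.Ho == 3), the last
    // block only covers output rows 8..9, hence
    //   blkSize.Ho = 10 - (3 - 1) * 4 = 2.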
// block end indices
blkEnd.Ho = blkStart.Ho + blkSize.Ho - 1;
blkEnd.Wo = blkStart.Wo + blkSize.Wo - 1;
// set output_nram to zero
__bang_write_value(output_nram, param.output_nram_size, T(0));
// loop blocks of kernel window: grid_dim.(Kh, Kw)
for (blkId.Kh = 0; blkId.Kh < grid_dim.Kh; ++blkId.Kh) {
blkStart.Kh = blkId.Kh * block_dim.Kh;
blkSize.Kh = block_dim.Kh;
if (blkId.Kh == (grid_dim.Kh - 1)) {
blkSize.Kh = param.kernel_size - (grid_dim.Kh - 1) * block_dim.Kh;
}
blkEnd.Kh = blkStart.Kh + blkSize.Kh - 1;
blkStart.Hi = blkStart.Ho / param.scale_factor - param.kernel_size_half +
blkStart.Kh;
blkEnd.Hi =
blkEnd.Ho / param.scale_factor - param.kernel_size_half + blkEnd.Kh;
blkSize.Hi = blkEnd.Hi - blkStart.Hi + 1;
for (blkId.Kw = 0; blkId.Kw < grid_dim.Kw; ++blkId.Kw) {
blkStart.Kw = blkId.Kw * block_dim.Kw;
blkSize.Kw = block_dim.Kw;
if (blkId.Kw == (grid_dim.Kw - 1)) {
blkSize.Kw = param.kernel_size - (grid_dim.Kw - 1) * block_dim.Kw;
}
blkEnd.Kw = blkStart.Kw + blkSize.Kw - 1;
blkStart.Wi = blkStart.Wo / param.scale_factor -
param.kernel_size_half + blkStart.Kw;
blkEnd.Wi =
blkEnd.Wo / param.scale_factor - param.kernel_size_half + blkEnd.Kw;
blkSize.Wi = blkEnd.Wi - blkStart.Wi + 1;
// load input block from gdram2nram
//
// input_nram[ | input[ sample_idx,
// 0:blkSize.Hi-1, | blkStart.Hi + 0:blkSize.Hi-1,
// 0:blkSize.Wi-1, | blkStart.Wi + 0:blkSize.Wi-1,
// 0:blkSize.G-1 | blkStart.G + 0:blkSize.G-1
// 0:blkSize.Cg-1] | blkStart.Cg + 0:blkSize.Cg-1]
//
// To skip out of bound indices:
//
// input_nram[
// hi_start_local:hi_end_local,
// wi_start_local:wi_end_local, ...]
// = input[n,
// hi_start_global:hi_end_global,
// wi_start_global:wi_end_global, ...]
//
int hi_start_local = 0;
int hi_start_global = blkStart.Hi;
if (blkStart.Hi < 0) {
hi_start_local = -blkStart.Hi;
hi_start_global = 0;
}
int wi_start_local = 0;
int wi_start_global = blkStart.Wi;
if (blkStart.Wi < 0) {
wi_start_local = -blkStart.Wi;
wi_start_global = 0;
}
int hi_end_local = blkSize.Hi - 1;
int hi_end_global = blkEnd.Hi;
if (blkEnd.Hi > param.Hi - 1) {
hi_end_global = param.Hi - 1;
hi_end_local -= blkEnd.Hi - hi_end_global;
}
int wi_end_local = blkSize.Wi - 1;
int wi_end_global = blkEnd.Wi;
if (blkEnd.Wi > param.Wi - 1) {
wi_end_global = param.Wi - 1;
wi_end_local -= blkEnd.Wi - wi_end_global;
}
int dst_offset = param.input_nram_stride_h * hi_start_local +
param.input_nram_stride_w * wi_start_local;
T *dst = input_nram + dst_offset;
int src_offset = INDEX3(sample_idx, hi_start_global, wi_start_global,
blkStart.C, param.input_stride_n,
param.input_stride_h, param.input_stride_w);
T *src = input + src_offset;
int input_seg_num_h = hi_end_local - hi_start_local + 1;
int input_seg_num_w = wi_end_local - wi_start_local + 1;
for (int i = 0; i < input_seg_num_h; ++i) {
loadStr3D(dst, src, blkSize.Cg, blkSize.G, input_seg_num_w,
param.input_nram_stride_g, param.input_nram_stride_w,
param.input_stride_g, param.input_stride_w);
dst += param.input_nram_stride_h;
src += param.input_stride_h;
}
/* load mask block from gdram2nram
*
* mask_nram[ | mask[sample_idx,
* 0:blkSize.Ho-1 , | blkStart.Ho + 0:blkSize.Ho-1,
* 0:blkSize.Wo-1, | blkStart.Wo + 0:blkSize.Wo-1,
* 0:blkSize.G-1, | blkStart.G + 0:blkSize.G-1,
* 0:blkSize.Kh-1, | blkStart.Kh + 0:blkSize.Kh-1,
* 0:blkSize.Kw-1] | blkStart.Kw + 0:blkSize.Kw-1]
*/
src_offset = INDEX3(blkStart.Wo, blkStart.G, blkStart.Kh, blkStart.Kw,
param.mask_stride_w, param.mask_stride_g,
param.mask_stride_kh);
src_offset += sample_idx * param.mask_stride_n +
blkStart.Ho * param.mask_stride_h;
for (int ho = 0; ho < blkSize.Ho; ++ho) {
dst = mask_nram + ho * param.mask_nram_stride_h;
src = mask + src_offset + ho * param.mask_stride_h;
for (int wo = 0; wo < blkSize.Wo; ++wo) {
loadStr3D(dst, src, blkSize.Kw, blkSize.Kh, blkSize.G,
param.mask_nram_stride_kh, param.mask_nram_stride_g,
param.mask_stride_kh, param.mask_stride_g);
dst += param.mask_nram_stride_w;
src += param.mask_stride_w;
}
}
// loop each pixel of the output block
for (int ho = 0; ho < blkSize.Ho; ++ho) {
int kernel_hi_start_global = (blkStart.Ho + ho) / param.scale_factor -
param.kernel_size_half + blkStart.Kh;
int kernel_hi_start_local = kernel_hi_start_global - blkStart.Hi;
// int kernel_hi_end_global = kernel_hi_start_global + blkSize.Kh - 1;
// int kernel_hi_end_local = kernel_hi_end_global - blkStart.Hi;
// exclude out of bound indices which should be ignored
int kh_min = hi_start_local - kernel_hi_start_local > 0
? hi_start_local - kernel_hi_start_local
: 0;
int kh_max = hi_end_local - kernel_hi_start_local < blkSize.Kh - 1
? hi_end_local - kernel_hi_start_local
: blkSize.Kh - 1;
for (int wo = 0; wo < blkSize.Wo; ++wo) {
int kernel_wi_start_global =
(blkStart.Wo + wo) / param.scale_factor -
param.kernel_size_half + blkStart.Kw;
int kernel_wi_start_local = kernel_wi_start_global - blkStart.Wi;
// exclude out-of-bound indices, which should be ignored
int kw_min = wi_start_local - kernel_wi_start_local > 0
? wi_start_local - kernel_wi_start_local
: 0;
int kw_max = wi_end_local - kernel_wi_start_local < blkSize.Kw - 1
? wi_end_local - kernel_wi_start_local
: blkSize.Kw - 1;
// output_nram[ho, wo, g, c] = sum(mask_nram[ho, wo, g, kh, kw]
// * input_nram[hi+kh, wi+kw, g, c],
// for (kh,kw) in [0:blkSize.Kw-1] x [0:blkSize.Kh-1])
//
// sum(mask_nram[ho, wo, g, kh, kw]
// * input_nram[hi+kh, wi+kw, g, c], (kh,kw))
//
T *mask_array = mask_nram + param.mask_nram_stride_h * ho +
param.mask_nram_stride_w * wo;
for (int kh = kh_min; kh <= kh_max; ++kh) {
for (int kw = kw_min; kw <= kw_max; ++kw) {
T *src =
input_nram +
param.input_nram_stride_h * (kernel_hi_start_local + kh) +
param.input_nram_stride_w * (kernel_wi_start_local + kw);
int mask_index = param.mask_nram_stride_kh * kh + kw;
// multiply mask weight with channels for each channel group
T *sum = sum_array;
for (int g = 0; g < blkSize.G; ++g) {
__bang_mul_scalar(sum, src, mask_array[mask_index],
param.block_Cg_NFU);
//
// NOTE: Since block_Cg_NFU >= block_Cg_stride,
// overlapped writing may occur on sum_array.
// So this loop must be executed in order to
// avoid data contamination, as shown below.
//
// |-----block_Cg_NFU---------|
// xxxxxxxxxxxxxxxxxxxxyyyzzzzz------------
// |---block_Cg_stride---|^^^^^will be overwritten
// in the next iteration.
//
// x: actual data used, y: not used, z: overwritten
//
sum += param.input_nram_stride_g;
src += param.input_nram_stride_g;
mask_index += param.mask_nram_stride_g;
} // loop blk_G
// add array[blk_G * blk_C] to output_nram
dst = output_nram + param.output_nram_stride_h * ho +
param.output_nram_stride_w * wo;
__bang_add(dst, dst, sum_array, param.output_nram_stride_w);
} // end loop blk_Kw
} // end loop blk_Kh
} // end loop blk_Wo
} // end loop blk_Ho
} // end loop grid_dim.Kw
} // end loop grid_dim.Kh
/* write output from nram2gdram
*
* output_nram[ | output[sample_idx,
* 0:blkSize.Ho-1, | blkStart.Ho + 0:blkSize.Ho-1,
* 0:blkSize.Wo-1, | blkStart.Wo + 0:blkSize.Wo-1,
* 0:blkSize.G-1, | blkStart.G + 0:blkSize.G-1,
* 0:blkSize.Cg-1] | blkStart.Cg + 0:blkSize.Cg-1]
*/
int dst_offset = INDEX3(sample_idx, blkStart.Ho, blkStart.Wo, blkStart.C,
param.output_stride_n, param.output_stride_h,
param.output_stride_w);
T *dst = output + dst_offset;
T *src = output_nram;
for (int i = 0; i < blkSize.Ho; ++i) {
storeStr3D(dst, src, blkSize.Cg, blkSize.G, blkSize.Wo,
param.output_stride_g, param.output_stride_w,
param.output_nram_stride_g, param.output_nram_stride_w);
dst += param.output_stride_h;
src += param.output_nram_stride_h;
}
} // end loop N, grid_dim.(Hi,Wi,G,Cg)
}
template <typename T>
__mlu_global__ void MLUBLOCKKernelCarafeForward(
const void *input, const void *mask, const CarafeForwardParam param,
const CarafeForwardBlockDim block_dim, const CarafeForwardGridDim grid_dim,
void *output) {
carafeForwardBLOCK((T *)input, (T *)mask, param, block_dim, grid_dim,
(T *)output);
}
} // namespace forward
namespace backward {
template <typename T>
__mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
T *grad_input, T *grad_mask, const int n,
const int hi, const int wi, const int c,
const int k_up, const int group,
const int scale) {
char *input_buff = nram_buf;
char *mask_buff = input_buff + NRAM_BLOCK;
char *grad_input_buff = mask_buff + NRAM_BLOCK;
char *grad_output_buff = grad_input_buff + NRAM_BLOCK;
char *grad_mask_buff = grad_output_buff + NRAM_BLOCK;
int wo = wi * scale;
int ho = hi * scale;
int out_num = n * ho * wo * group;
int group_size = c / group;
int repeat = out_num / taskDim + (int)(taskId < out_num % taskDim);
int num_align = PAD_DOWN(NRAM_BLOCK / sizeof(T), NFU_ALIGN_SIZE / sizeof(T));
int num_per_loop = group_size / num_align;
int rem_for_loop = group_size % num_align;
int rem_for_loop_align = PAD_UP(rem_for_loop, NFU_ALIGN_SIZE / sizeof(T));
for (int k = 0; k < repeat; k++) {
int iter = k * taskDim + taskId;
int group_k = iter % group;
int w_k = (iter / group) % wo;
int h_k = (iter / wo / group) % ho;
int n_k = (iter / ho / wo / group) % n;
int h_i = h_k / scale;
int w_i = w_k / scale;
int start_h = h_i - ((k_up - 1) / 2);
int end_h = h_i + ((k_up - 1) / 2) + 1;
int start_w = w_i - ((k_up - 1) / 2);
int end_w = w_i + ((k_up - 1) / 2) + 1;
T *base_mask = (T *)mask + n_k * ho * wo * group * k_up * k_up +
h_k * wo * group * k_up * k_up + w_k * group * k_up * k_up +
group_k * k_up * k_up;
T *base_grad_mask = (T *)grad_mask + n_k * ho * wo * group * k_up * k_up +
h_k * wo * group * k_up * k_up +
w_k * group * k_up * k_up + group_k * k_up * k_up;
__bang_write_zero((T *)grad_input_buff, NRAM_BLOCK / sizeof(T));
__bang_write_zero((T *)grad_mask_buff, NRAM_BLOCK / sizeof(T));
__bang_write_zero((T *)grad_output_buff, NRAM_BLOCK / sizeof(T));
__memcpy((T *)mask_buff, (T *)base_mask, k_up * k_up * sizeof(T),
GDRAM2NRAM);
for (int i = 0; i < num_per_loop; i++) {
__bang_write_zero((T *)input_buff, NRAM_BLOCK / sizeof(T));
T *base_grad_output = (T *)grad_output + n_k * ho * wo * c +
h_k * wo * c + w_k * c + group_k * group_size +
i * num_align;
__memcpy((T *)grad_output_buff, (T *)base_grad_output,
num_align * sizeof(T), GDRAM2NRAM);
for (int ih = start_h; ih < end_h; ih++) {
for (int iw = start_w; iw < end_w; iw++) {
if (ih < 0 || ih > hi - 1 || iw < 0 || iw > wi - 1) {
continue;
}
int mask_ih = ih - h_i + (k_up - 1) / 2;
int mask_iw = iw - w_i + (k_up - 1) / 2;
int mask_index = mask_ih * k_up + mask_iw;
int input_index = n_k * hi * wi * c + ih * wi * c + iw * c +
group_k * group_size + i * num_align;
T *base_input = (T *)input + input_index;
T *base_grad_input = (T *)grad_input + input_index;
__memcpy((T *)input_buff, (T *)base_input, num_align * sizeof(T),
GDRAM2NRAM);
__bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff,
((T *)mask_buff)[mask_index], num_align);
__bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
(T *)grad_input_buff, num_align);
__bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
num_align);
__bang_sumpool((T *)input_buff, (T *)input_buff,
NFU_ALIGN_SIZE / sizeof(T),
num_align / (NFU_ALIGN_SIZE / sizeof(T)), 1,
num_align / (NFU_ALIGN_SIZE / sizeof(T)), 1, 1, 1);
__bang_reduce_sum((T *)input_buff, (T *)input_buff,
NFU_ALIGN_SIZE / sizeof(T));
((T *)grad_mask_buff)[mask_index] += ((T *)input_buff)[0];
}
}
}
if (rem_for_loop) {
__bang_write_zero((T *)input_buff, NRAM_BLOCK / sizeof(T));
T *base_grad_output = (T *)grad_output + n_k * ho * wo * c +
h_k * wo * c + w_k * c + group_k * group_size +
num_per_loop * num_align;
__memcpy((T *)grad_output_buff, (T *)base_grad_output,
rem_for_loop * sizeof(T), GDRAM2NRAM);
for (int ih = start_h; ih < end_h; ih++) {
for (int iw = start_w; iw < end_w; iw++) {
if (ih < 0 || ih > hi - 1 || iw < 0 || iw > wi - 1) {
continue;
}
int mask_ih = ih - h_i + (k_up - 1) / 2;
int mask_iw = iw - w_i + (k_up - 1) / 2;
int mask_index = mask_ih * k_up + mask_iw;
int input_index = n_k * hi * wi * c + ih * wi * c + iw * c +
group_k * group_size + num_per_loop * num_align;
T *base_input = (T *)input + input_index;
T *base_grad_input = (T *)grad_input + input_index;
__memcpy((T *)input_buff, (T *)base_input, rem_for_loop * sizeof(T),
GDRAM2NRAM);
__bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff,
((T *)mask_buff)[mask_index], rem_for_loop_align);
__bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
(T *)grad_input_buff, rem_for_loop);
__bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
rem_for_loop_align);
__bang_sumpool(
(T *)input_buff, (T *)input_buff, NFU_ALIGN_SIZE / sizeof(T),
rem_for_loop_align / (NFU_ALIGN_SIZE / sizeof(T)), 1,
rem_for_loop_align / (NFU_ALIGN_SIZE / sizeof(T)), 1, 1, 1);
__bang_reduce_sum((T *)input_buff, (T *)input_buff,
NFU_ALIGN_SIZE / sizeof(T));
((T *)grad_mask_buff)[mask_index] += ((T *)input_buff)[0];
}
}
}
__memcpy((T *)base_grad_mask, (T *)grad_mask_buff, k_up * k_up * sizeof(T),
NRAM2GDRAM);
}
}
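// In scalar form, CarafeCompute above realises, for each output pixel
// (n, h, w) and group g (editor's summary of the loop nest, not part of the
// original file):
//   grad_input[n, ih, iw, c] += mask[n, h, w, g, kh, kw] * grad_output[n, h, w, c]
//   grad_mask[n, h, w, g, kh, kw] += sum_c input[n, ih, iw, c] * grad_output[n, h, w, c]
// where (ih, iw) = (h / scale + kh - (k_up - 1) / 2,
//                   w / scale + kw - (k_up - 1) / 2) ranges over the
// k_up x k_up window and c ranges over the channels of group g.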
template <typename T>
__mlu_global__ void MLUUnion1KernelCarafeBackward(
const void *input, const void *mask, const void *grad_output,
void *grad_input, void *grad_mask, const int n, const int hi, const int wi,
const int c, const int k_up, const int group, const int scale) {
CarafeCompute((T *)input, (T *)mask, (T *)grad_output, (T *)grad_input,
(T *)grad_mask, n, hi, wi, c, k_up, group, scale);
}
} // namespace backward
void KernelCarafeForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *input, const void *mask,
const CarafeForwardParam ¶m,
const CarafeForwardBlockDim &block_dim,
const CarafeForwardGridDim &grid_dim, void *output) {
if (d_type == CNRT_FLOAT16) {
forward::MLUBLOCKKernelCarafeForward<half><<<k_dim, k_type, queue>>>(
input, mask, param, block_dim, grid_dim, output);
} else {
forward::MLUBLOCKKernelCarafeForward<float><<<k_dim, k_type, queue>>>(
input, mask, param, block_dim, grid_dim, output);
}
}
void KernelCarafeBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t dtype,
const void *input, const void *mask,
const void *grad_output, void *grad_input,
void *grad_mask, const int n, const int hi,
const int wi, const int c, const int k_up,
const int group, const int scale) {
if (dtype == CNRT_FLOAT16) {
backward::MLUUnion1KernelCarafeBackward<half><<<k_dim, k_type, queue>>>(
input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up,
group, scale);
} else {
backward::MLUUnion1KernelCarafeBackward<float><<<k_dim, k_type, queue>>>(
input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up,
group, scale);
}
}
mmcv/ops/csrc/common/mlu/carafe_utils.hpp
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef CARAFE_UTILS_HPP_
#define CARAFE_UTILS_HPP_
#define NRAM_ALIGN_SIZE 64
struct CarafeForwardParam {
  int N;   // batch size
  int Hi;  // input height
  int Wi;  // input width
  int Ci;  // input channels
  int Ho;  // output height
  int Wo;  // output width
  int Cg;  // channels per group

  int kernel_size;       // kernel_size
  int group_size;        // group_size
  int scale_factor;      // scale_factor
  int kernel_size_half;  // kernel half size (K-1)/2
  int kernel_size_sq;    // square of kernel size
  int dtype_size;        // size of tensor data type

  // Host arrays' geometry
  int input_stride_g;
  int input_stride_w;
  int input_stride_h;
  int input_stride_n;
  int input_size;
  int mask_stride_kh;
  int mask_stride_g;
  int mask_stride_w;
  int mask_stride_h;
  int mask_stride_n;
  int mask_size;
  int output_stride_g;
  int output_stride_w;
  int output_stride_h;
  int output_stride_n;
  int output_size;

  // NRAM arrays' geometry
  int input_nram_stride_g;
  int input_nram_stride_w;
  int input_nram_stride_h;
  int input_nram_size;
  int mask_nram_stride_kh;
  int mask_nram_stride_g;
  int mask_nram_stride_w;
  int mask_nram_stride_h;
  int mask_nram_size;
  int output_nram_stride_g;
  int output_nram_stride_w;
  int output_nram_stride_h;
  int output_nram_size;

  // for address/compute alignment
  int align_size_NRAM;  // for addressing on NRAM
  int align_size_NFU;   // for NFU operation length
  int block_Cg_NFU;     // for bang_mul_const

  int job_num;  // total job number
};

struct CarafeForwardBlockDim {
  int Ho;  // block size of output height
  int Wo;  // block size of output width
  int Kh;  // block size of kernel height
  int Kw;  // block size of kernel width
  int G;   // block size of groups
  int Cg;  // block size of channels within a group
  int Hi;  // block size of input height
  int Wi;  // block size of input width
};

struct CarafeForwardGridDim {
  int Ho;  // number of blocks of output height
  int Wo;
  int Kh;
  int Kw;
  int G;
  int Cg;
};
#endif // CARAFE_UTILS_HPP_
mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef COMMON_MLU_HELPER_HPP_
#define COMMON_MLU_HELPER_HPP_
#define NFU_ALIGN_SIZE 128 // Byte
#define REM_FOR_STACK (128 * 1024) // 128KB reserved for cncc
#ifdef __BANG_ARCH__
#define MAX_NRAM_SIZE \
(__MLU_NRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc
#define MAX_SRAM_SIZE \
(__MLU_SRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc
#else
#define MAX_NRAM_SIZE (384 * 1024) // 384KB, initialization value
#define MAX_SRAM_SIZE (1920 * 1024) // 1920KB, initialization value
#endif
#ifndef PAD_UP
#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))
#endif
#ifndef PAD_DOWN
#define PAD_DOWN(x, y) (((x) / (y)) * (y))
#endif
#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y))
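// Quick sanity examples for the alignment macros (editor's illustration,
// not part of the original header): PAD_UP(10, 8) == 16,
// PAD_DOWN(10, 8) == 8, CEIL_ALIGN(10, 8) == 16; for exact multiples of the
// alignment all three leave the value unchanged, e.g. PAD_UP(16, 8) == 16.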
template <typename scalar_t>
__mlu_func__ inline scalar_t min(scalar_t a, scalar_t b) {
  return a < b ? a : b;
}

template <typename scalar_t>
__mlu_func__ inline scalar_t max(scalar_t a, scalar_t b) {
  return a > b ? a : b;
}
/*!
* @brief loads data from global DRAM to NRAM with 2D pattern.
*
* @param[out] dst
* Pointer to NRAM that stores dst data.
* @param[in] src
* Pointer to global DRAM that stores src data.
* @param[in] size
* The byte size of segment in the lower dimension.
* @param[in] dst_str
* The data stride in bytes between segments in the lower dimension of dst.
* @param[in] src_str
* The data stride in bytes between segments in the lower dimension of src.
* @param[in] seg_num
* The total count of data segments in the lower dimension.
*/
template <typename T>
__mlu_func__ void loadStr2D(T *dst, T *src, const int size, const int dst_str,
                            const int src_str, const int seg_num) {
  if (dst_str == src_str && size == src_str) {
    __memcpy(dst, src, src_str * seg_num * sizeof(T), GDRAM2NRAM);
  } else if ((size == src_str || src_str <= dst_str) &&
             src_str * sizeof(T) <= 512) {
    // gather data less than 512Bytes to improve IO efficiency
    T *tmp = (T *)dst + (dst_str - src_str) * seg_num;
    __memcpy(tmp, src, (src_str * (seg_num - 1) + size) * sizeof(T),
             GDRAM2NRAM);
    if (dst_str != src_str) {
      __memcpy(dst, tmp, size * sizeof(T), NRAM2NRAM, dst_str * sizeof(T),
               src_str * sizeof(T), seg_num - 1);
    }
  } else {
    __memcpy(dst, src, size * sizeof(T), GDRAM2NRAM, dst_str * sizeof(T),
             src_str * sizeof(T), seg_num - 1);
  }
}
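// Usage sketch (editor's illustration, not part of the original header):
// copying seg_num = 3 segments of size = 8 elements from a source whose
// segments sit src_str = 10 elements apart into a destination whose segments
// sit dst_str = 16 elements apart:
//   loadStr2D(dst, src, 8, 16, 10, 3);
// Small strided segments (<= 512 bytes each) are first gathered with one bulk
// copy and then redistributed on NRAM, which is cheaper than issuing three
// separate strided GDRAM reads.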
/*!
* @brief loads data from global DRAM to NRAM with 3D pattern.
*
* @param[out] dst
* Pointer to NRAM that stores dst data.
* @param[in] src
* Pointer to global DRAM that stores src data.
* @param[in] size
* The byte size of segment in the lowest dimension.
* @param[in] seg_num_in
* The total count of data segments in the lowest dimension.
* @param[in] seg_num_out
* The total count of data segments in the middle dimension.
* @param[in] dst_str_in
* The data stride in bytes between segments in the lowest dimension of dst.
* @param[in] dst_str_out
* The data stride in bytes between segments in the middle dimension of dst.
* @param[in] src_str_in
* The data stride in bytes between segments in the lowest dimension of src.
* @param[in] src_str_out
* The data stride in bytes between segments in the middle dimension of src.
*/
template <typename T>
__mlu_func__ void loadStr3D(T *dst, T *src, const int size,
                            const int seg_num_in, const int seg_num_out,
                            const int dst_str_in, const int dst_str_out,
                            const int src_str_in, const int src_str_out) {
  T *tmp_dst = dst;
  T *tmp_src = src;
  for (int i = 0; i < seg_num_out; ++i) {
    loadStr2D(tmp_dst, tmp_src, size, dst_str_in, src_str_in, seg_num_in);
    tmp_src += src_str_out;
    tmp_dst += dst_str_out;
  }
}
/*!
* @brief stores data from NRAM to global DRAM with 2D pattern.
*
* @param[out] dst
* Pointer to global DRAM that stores dst data.
* @param[in] src
* Pointer to NRAM that stores src data.
* @param[in] size
* The byte size of segment in the lower dimension.
* @param[in] dst_str
* The data stride in bytes between segments in the lower dimension of dst.
* @param[in] src_str
* The data stride in bytes between segments in the lower dimension of src.
* @param[in] seg_num
* The total count of data segments in the lower dimension.
*/
template <typename T>
__mlu_func__ void storeStr2D(T *dst, T *src, const int size, const int seg_num,
                             const int dst_str, const int src_str) {
  if ((size == dst_str && dst_str <= src_str) && dst_str * sizeof(T) <= 512) {
    // gather data less than 512Bytes to improve IO efficiency
    if (dst_str != src_str) {
      __memcpy(src, src, size * sizeof(T), NRAM2NRAM, dst_str * sizeof(T),
               src_str * sizeof(T), seg_num - 1);
    }
    __memcpy(dst, src, size * seg_num * sizeof(T), NRAM2GDRAM);
  } else {
    __memcpy(dst, src, size * sizeof(T), NRAM2GDRAM, dst_str * sizeof(T),
             src_str * sizeof(T), seg_num - 1);
  }
}
/*!
* @brief stores data from NRAM to global DRAM with 3D pattern.
*
* @param[out] dst
* Pointer to global DRAM that stores dst data.
* @param[in] src
* Pointer to NRAM that stores src data.
* @param[in] size
* The byte size of segment in the lowest dimension.
* @param[in] seg_num_in
* The total count of data segments in the lowest dimension.
* @param[in] seg_num_out
* The total count of data segments in the middle dimension.
* @param[in] dst_str_in
* The data stride in bytes between segments in the lowest dimension of dst.
* @param[in] dst_str_out
* The data stride in bytes between segments in the middle dimension of dst.
* @param[in] src_str_in
* The data stride in bytes between segments in the lowest dimension of src.
* @param[in] src_str_out
* The data stride in bytes between segments in the middle dimension of src.
*/
template <typename T>
__mlu_func__ void storeStr3D(T *dst, T *src, const int size,
                             const int seg_num_in, const int seg_num_out,
                             const int dst_str_in, const int dst_str_out,
                             const int src_str_in, const int src_str_out) {
  T *tmp_dst = dst;
  T *tmp_src = src;
  for (int i = 0; i < seg_num_out; ++i) {
    storeStr2D(tmp_dst, tmp_src, size, seg_num_in, dst_str_in, src_str_in);
    tmp_src += src_str_out;
    tmp_dst += dst_str_out;
  }
}
/*!
* @brief Converts int32 to float32 data type.
*
* @param[out] dst
* Pointer to NRAM that stores float32 type data.
* @param[in,out] dst_addition
* Pointer to NRAM as the workspace of dst, which has the same size as dst.
* It allows empty pointer on MLU300 series.
* @param[in] src
* Pointer to NRAM that stores int32 type data.
* @param[in,out] src_addition
* Pointer to NRAM as the workspace of src, which has a size of 128 Bytes.
* It allows empty pointer on MLU300 series.
* @param[in] src_count
* The count of elements in src.
*/
__mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src,
                                   float *src_addition, const int src_count) {
#if __BANG_ARCH__ >= 300
  __bang_int2float((float *)dst, (int32_t *)src, src_count, 0);
#else
  // get sign bit
  const float move_23bit = 8388608.0;
  // 0x80000000 = 1,000000000,0000000000000000000000000000
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x80000000);
  __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition,
                    src_count * sizeof(float), NFU_ALIGN_SIZE);
  // get 1 or 0 from sign bit
  // judge if odd
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x00000001);
  __bang_cycle_bor((char *)dst_addition, (char *)dst_addition,
                   (char *)src_addition, src_count * sizeof(float),
                   NFU_ALIGN_SIZE);
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x80000001);
  __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count,
                  NFU_ALIGN_SIZE / sizeof(float));
  // minus xor, positive num invariant
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0xffffffff);
  __bang_cycle_mul(dst, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float));
  // convert int32 to float32
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x7fffff);
  __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition,
                    src_count * sizeof(float), NFU_ALIGN_SIZE);
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x4b000000);
  __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition,
                   src_count * sizeof(float), NFU_ALIGN_SIZE);
  __bang_sub_scalar(dst, dst, move_23bit, src_count);
  // add one
  __bang_add(dst, dst, dst_addition, src_count);
  // set sign for float32
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0xffffffff);
  __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x00000001);
  __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x80000000);
  __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
                    (char *)src_addition, src_count * 4, 128);
  __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4);
#endif  // __BANG_ARCH__ >= 300
}
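// Editor's illustration (not part of the original header): a host-side
// sketch of the magic-number trick used by the fallback branch above.
// For 0 <= x < 2^23, OR-ing x into the mantissa of 2^23 (0x4b000000)
// yields the float value 2^23 + x, so subtracting 2^23 recovers float(x)
// exactly. Requires <stdint.h>.
#include <stdint.h>
static inline float int2float_demo(uint32_t x) {
  union { uint32_t u; float f; } v;
  v.u = 0x4b000000u | (x & 0x007fffffu);  // exponent of 2^23, mantissa = x
  return v.f - 8388608.0f;                // 8388608.0f == 2^23
}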
/*!
* @brief Converts float32 to int32 data type with to_zero round mode.
*
* @param[out] dst
* Pointer to NRAM that stores int32 type data.
* @param[in,out] dst_addition
* Pointer to NRAM as the workspace of dst, which has the same size as dst.
* It allows empty pointer on MLU300 series.
* @param[in] src
* Pointer to NRAM that stores float32 type data.
* @param[in,out] src_addition
* Pointer to NRAM as the workspace of src, which has a size of 128 Bytes.
* It allows empty pointer on MLU300 series.
* @param[in] src_count
* The count of elements in src.
*/
__mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
                                   float *src_addition, const int src_count) {
#if __BANG_ARCH__ >= 300
  __bang_float2int_tz((int32_t *)dst, (float *)src, src_count, 0);
#else
  // sign ===> src_addition
  // dst = -1.0 : when src[i] is a negative number
  // dst = +1.0 : when src[i] is a positive number
  const int floatDchar = sizeof(float) / sizeof(char);
  __bang_active_sign((float *)dst, src, src_count);
  // dst_addition = abs(src)
  __bang_mul(dst_addition, src, (float *)dst, src_count);
  // if dst_addition < 1.0, then src_addition + 1, to fix add error
  __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     1.0f);
  __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count,
                  NFU_ALIGN_SIZE / sizeof(float));
  __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count);
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0xbf800000);
  // set negative flag -1.0 = 0xbf800000
  // to mark all src in [x < -1.0]
  __bang_cycle_eq((float *)dst, (float *)dst, (float *)src_addition, src_count,
                  NFU_ALIGN_SIZE / sizeof(float));
  __bang_active_abs(dst_addition, src, src_count);
  __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     8388608.0f);
  // mask shift move 23
  // right shift move 23bit
  __bang_cycle_add_tz(dst_addition, dst_addition, src_addition, src_count,
                      NFU_ALIGN_SIZE / sizeof(float));
  // two's complement for negative
  // dst = 1.0, when src < -1.0
  // dst = 0.0, when src >= -1.0
  __bang_sub(dst_addition, dst_addition, (float *)dst, src_count);
  // to fix max value
  // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0,
  // means max value.
  __bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count);
  __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst,
              src_count * floatDchar);
  // get low 23bit
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     (unsigned)0x007fffff);
  // mask low 23bit is 1
  __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
                    (char *)src_addition, src_count * floatDchar,
                    NFU_ALIGN_SIZE / sizeof(char));
  // set 9 high bit ===> dst
  // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000
  //  1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000
  __bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000);
  __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  // src or dst_addition
  __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition,
             src_count * floatDchar);
  __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count);
  __bang_bor((char *)dst, (char *)dst, (char *)dst_addition,
             src_count * floatDchar);
#endif  // __BANG_ARCH__ >= 300
}
/*!
* @brief Converts float32 to half data type,
* the rounding mode on MLU200 is rd, on MLU300 is rn.
*
* @param[out] dst
* Pointer to NRAM that stores half type data.
* @param[in] src
* Pointer to NRAM that stores float32 type data.
* @param[in] src_count
* The count of elements in src.
*/
__mlu_func__ inline void convertFloat2half(half *dst, float *src,
                                           int src_count) {
#if __BANG_ARCH__ >= 300
  __bang_float2half_rn(dst, src, src_count);
#else
  __bang_float2half_rd(dst, src, src_count);
#endif
}
/*!
 * @brief recursiveSumPool: sums high_dim segments of low_dim elements down to
 *        a single segment, reducing at most kernel_limit segments per
 *        __bang_sumpool call.
 * @param[in,out] dst
 *        Pointer to NRAM that stores the input and output data.
 * @param[in] low_dim
 *        The number of elements in the low (innermost) dimension.
 * @param[in] high_dim
 *        The number of high-dimension segments to be summed.
 * @param[in] kernel_limit
 *        The maximum number of high-dimension segments reduced by one
 *        sumpool call.
 ******************************************************************************/
template <typename T>
__mlu_func__ void recursiveSumPool(T *dst, int low_dim, int high_dim,
                                   int kernel_limit) {
  for (; high_dim > 1;) {
    int repeat_s = high_dim / kernel_limit;
    int remain_s = high_dim % kernel_limit;
    if (remain_s) {
      __bang_sumpool((T *)dst, (T *)dst, low_dim, 1, remain_s, 1, remain_s, 1,
                     1);
    }
    if (repeat_s) {
      __bang_sumpool((T *)dst + (remain_s > 0 ? low_dim : 0),
                     (T *)dst + remain_s * low_dim, low_dim,
                     kernel_limit * repeat_s, 1, kernel_limit, 1, 1,
                     kernel_limit);
    }
    high_dim = repeat_s + (bool)remain_s;
  }
  return;
}
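// Worked example (editor's illustration, not part of the original header):
// with low_dim = 64, high_dim = 5 and kernel_limit = 2, successive passes
// shrink the high dimension 5 -> 3 -> 2 -> 1; after the loop, the
// element-wise sum of all five 64-element segments sits in dst[0..63].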
#endif // COMMON_MLU_HELPER_HPP_
mmcv/ops/csrc/common/mlu/deform_roi_pool_mlu_kernel.mlu
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <iostream>
#include "common_mlu_helper.hpp"
#define ROI_OFFSET 5
#define FOURSPLIT 4
#define FIVESPLIT 5
#define NINESPLIT 9
#define THIRTEENSPLIT 13
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
static __mlu_func__ void bilinearInterpolate(const int input_width, T y, T x,
T *w1, T *w2, T *w3, T *w4,
int *x_low, int *x_high,
const int y_low, bool *is_empty) {
if (x < -1.0 || x > input_width) {
*is_empty = true;
return;
}
if (x <= 0) x = 0;
*x_low = int(x);
if (*x_low >= input_width - 1) {
*x_high = *x_low = input_width - 1;
x = T(*x_low);
} else {
*x_high = *x_low + 1;
}
T ly = y - y_low;
T lx = x - *x_low;
T hy = 1.0 - ly;
T hx = 1.0 - lx;
*w1 = hy * hx;
*w2 = hy * lx;
*w3 = ly * hx;
*w4 = ly * lx;
}
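// Worked example (editor's illustration, not part of the original file):
// with y = 1.25 (so y_low = 1) and x = 2.5 inside the image, the function
// yields x_low = 2, x_high = 3, ly = 0.25, lx = 0.5, and the corner weights
// w1 = 0.375, w2 = 0.375, w3 = 0.125, w4 = 0.125, which sum to 1 as expected
// for bilinear interpolation.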
template <typename T>
__mlu_func__ void MLUUnion1DeformRoIPoolForward(
const T *input, const T *rois, const T *offset, T *output,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, const T spatial_scale,
const int sampling_ratio, const T gamma) {
for (int bin_index = taskId;
bin_index < num_rois * pooled_width * pooled_height;
bin_index += taskDim) {
int out_batch = bin_index / pooled_width / pooled_height;
int out_height = bin_index / pooled_width % pooled_height;
int out_width = bin_index % pooled_width;
const T *cur_roi = rois + out_batch * ROI_OFFSET;
T *nram_rois = (T *)nram_buffer;
__memcpy((void *)nram_rois, (void *)cur_roi, ROI_OFFSET * sizeof(T),
GDRAM2NRAM);
const int roi_batch = nram_rois[0];
T roi_x_min = nram_rois[1] * spatial_scale - 0.5;
T roi_y_min = nram_rois[2] * spatial_scale - 0.5;
const T roi_x_max = nram_rois[3] * spatial_scale - 0.5;
const T roi_y_max = nram_rois[4] * spatial_scale - 0.5;
const T roi_width = roi_x_max - roi_x_min;
const T roi_height = roi_y_max - roi_y_min;
const T bin_width = roi_width / static_cast<T>(pooled_width);
const T bin_height = roi_height / static_cast<T>(pooled_height);
const T *offset_input = input + roi_batch * height * width * channels;
int roi_bin_grid_height =
(sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceilf(roi_height / pooled_height));
int roi_bin_grid_width =
(sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceilf(roi_width / pooled_width));
if (offset != NULL) {
const T *offset_cur = offset +
out_batch * pooled_width * pooled_height * 2 +
out_height * pooled_width + out_width;
roi_x_min += gamma * roi_width * offset_cur[0];
roi_y_min +=
gamma * roi_height * offset_cur[pooled_width * pooled_height];
}
int type_align = NFU_ALIGN_SIZE / sizeof(T);
int channels_max_num_nram = MAX_NRAM_SIZE / sizeof(T);
int channels_nram_split =
channels_max_num_nram / NINESPLIT / type_align * type_align;
int channel_rem = channels % channels_nram_split;
int channel_loops =
channels / channels_nram_split + (channel_rem != 0 ? 1 : 0);
for (int channel_loop_index = 0; channel_loop_index < channel_loops;
++channel_loop_index) {
int channels_num =
channels_nram_split >= channels ? channels : channels_nram_split;
const int channel_offset = channel_loop_index * channels_num;
if (channel_loop_index + 1 == channel_loops && channel_rem != 0) {
channels_num = channel_rem;
}
int channels_align = CEIL_ALIGN(channels_num, type_align);
int nram_limit = (MAX_NRAM_SIZE / sizeof(T) - channels_align) >> 1;
int c_slice = nram_limit / FOURSPLIT / type_align * type_align;
int c_slice_align = 0;
/* NRAM partition
*
* | | ping | pong |
* |----------|-------------------|-------------------|
* | nram_out | p1 | p2 | p3 | p4 | p1 | p2 | p3 | p4 |
*
*/
T *nram_out = (T *)nram_buffer;
T *nram_ping = nram_out + channels_align;
T *nram_pong = nram_ping + nram_limit;
__bang_write_value((T *)nram_out, channels_align, (T)0);
__bang_write_value((T *)nram_ping, FOURSPLIT * c_slice, (T)0);
__bang_write_value((T *)nram_pong, FOURSPLIT * c_slice, (T)0);
const T num_bins =
static_cast<T>(max(roi_bin_grid_height * roi_bin_grid_width, 1));
const T value_div = 1.0f / num_bins;
bool is_ping_empty = true;
for (int iy = 0; iy < roi_bin_grid_height; ++iy) {
T y = roi_y_min + out_height * bin_height +
static_cast<T>(iy + .5f) * bin_height /
static_cast<T>(roi_bin_grid_height);
if (y < -1.0 || y > height) {
is_ping_empty = true;
continue;
}
if (y <= 0) {
y = 0;
}
int y_low = 0, y_high = 0;
y_low = int(y);
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = T(y_low);
} else {
y_high = y_low + 1;
}
for (int ix = 0; ix < roi_bin_grid_width; ++ix) {
T x = roi_x_min + out_width * bin_width +
static_cast<T>(ix + .5f) * bin_width /
static_cast<T>(roi_bin_grid_width);
const int sample_index = iy * roi_bin_grid_width + ix;
int c_rem = channels_num;
c_slice = nram_limit / FOURSPLIT / type_align * type_align;
c_slice_align = 0;
bool is_empty = false;
T w1, w2, w3, w4;
int x_low = 0, x_high = 0;
bilinearInterpolate(width, y, x, &w1, &w2, &w3, &w4, &x_low, &x_high,
y_low, &is_empty);
if (is_empty) {
is_ping_empty = true;
continue;
}
if (is_ping_empty) {
c_slice = c_slice > c_rem ? c_rem : c_slice;
c_slice_align = CEIL_ALIGN(c_slice, type_align);
__bang_write_value(nram_ping, FOURSPLIT * c_slice_align, (T)0);
__asm__ volatile("sync;");
__memcpy(nram_ping,
offset_input + y_low * width * channels +
x_low * channels + channel_offset,
c_slice * sizeof(T), GDRAM2NRAM);
__memcpy(nram_ping + c_slice_align,
offset_input + y_low * width * channels +
x_high * channels + channel_offset,
c_slice * sizeof(T), GDRAM2NRAM);
__memcpy(nram_ping + 2 * c_slice_align,
offset_input + y_high * width * channels +
x_low * channels + channel_offset,
c_slice * sizeof(T), GDRAM2NRAM);
__memcpy(nram_ping + 3 * c_slice_align,
offset_input + y_high * width * channels +
x_high * channels + channel_offset,
c_slice * sizeof(T), GDRAM2NRAM);
is_ping_empty = false;
}
int c_offset = 0;
int pongc_slice = 0;
int pongc_slice_align = 0;
while (c_rem > 0) {
c_slice = c_slice > c_rem ? c_rem : c_slice;
c_slice_align = CEIL_ALIGN(c_slice, type_align);
if (sample_index + 1 < roi_bin_grid_height * roi_bin_grid_width) {
int iy_tmp = (sample_index + 1) / roi_bin_grid_width;
int ix_tmp = (sample_index + 1) % roi_bin_grid_width;
y = roi_y_min + out_height * bin_height +
static_cast<T>(iy_tmp + .5f) * bin_height /
static_cast<T>(roi_bin_grid_height);
x = roi_x_min + out_width * bin_width +
static_cast<T>(ix_tmp + .5f) * bin_width /
static_cast<T>(roi_bin_grid_width);
if (y < -1.0 || y > height) {
is_empty = true;
} else {
T w1_tmp, w2_tmp, w3_tmp, w4_tmp;
if (y <= 0) {
y = 0;
}
y_low = int(y);
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = T(y_low);
} else {
y_high = y_low + 1;
}
bilinearInterpolate(width, y, x, &w1_tmp, &w2_tmp, &w3_tmp,
&w4_tmp, &x_low, &x_high, y_low, &is_empty);
}
pongc_slice = nram_limit / FOURSPLIT / type_align * type_align;
pongc_slice =
pongc_slice > channels_num ? channels_num : pongc_slice;
pongc_slice_align = CEIL_ALIGN(pongc_slice, type_align);
__bang_write_value(nram_pong, FOURSPLIT * pongc_slice_align,
(T)0);
__asm__ volatile("sync;");
if (!is_empty) {
__memcpy_async(nram_pong,
offset_input + y_low * width * channels +
x_low * channels + channel_offset,
pongc_slice * sizeof(T), GDRAM2NRAM);
__memcpy_async(nram_pong + pongc_slice_align,
offset_input + y_low * width * channels +
x_high * channels + channel_offset,
pongc_slice * sizeof(T), GDRAM2NRAM);
__memcpy_async(nram_pong + 2 * pongc_slice_align,
offset_input + y_high * width * channels +
x_low * channels + channel_offset,
pongc_slice * sizeof(T), GDRAM2NRAM);
__memcpy_async(nram_pong + 3 * pongc_slice_align,
offset_input + y_high * width * channels +
x_high * channels + channel_offset,
pongc_slice * sizeof(T), GDRAM2NRAM);
}
}
__bang_mul_scalar(nram_ping, nram_ping, w1, c_slice_align);
__bang_mul_scalar(nram_ping + c_slice_align,
nram_ping + c_slice_align, w2, c_slice_align);
__bang_add(nram_ping, nram_ping, nram_ping + c_slice_align,
c_slice_align);
__bang_mul_scalar(nram_ping + 2 * c_slice_align,
nram_ping + 2 * c_slice_align, w3, c_slice_align);
__bang_add(nram_ping, nram_ping, nram_ping + 2 * c_slice_align,
c_slice_align);
__bang_mul_scalar(nram_ping + 3 * c_slice_align,
nram_ping + 3 * c_slice_align, w4, c_slice_align);
__bang_add(nram_ping, nram_ping, nram_ping + 3 * c_slice_align,
c_slice_align);
__bang_add(nram_out + c_offset, nram_out + c_offset, nram_ping,
c_slice_align);
T *nram_tmp = nram_ping;
nram_ping = nram_pong;
nram_pong = nram_tmp;
c_rem -= c_slice;
c_offset += c_slice;
__asm__ volatile("sync;");
}
}
}
__bang_mul_scalar(nram_out, nram_out, value_div, channels_align);
__memcpy(output + channels * bin_index + channel_offset, nram_out,
channels_num * sizeof(T), NRAM2GDRAM);
}
}
}
__mlu_global__ void MLUKernelDeformRoIPoolForward(
cnrtDataType_t data_type, const void *input, const void *rois,
const void *offset, void *output, const int channels, const int height,
const int width, const int num_rois, const int pooled_height,
const int pooled_width, const float spatial_scale, const int sampling_ratio,
const float gamma) {
switch (data_type) {
case CNRT_FLOAT16: {
MLUUnion1DeformRoIPoolForward((half *)input, (half *)rois, (half *)offset,
(half *)output, channels, height, width,
num_rois, pooled_height, pooled_width,
static_cast<half>(spatial_scale),
sampling_ratio, static_cast<half>(gamma));
}; break;
case CNRT_FLOAT32: {
MLUUnion1DeformRoIPoolForward(
(float *)input, (float *)rois, (float *)offset, (float *)output,
channels, height, width, num_rois, pooled_height, pooled_width,
static_cast<float>(spatial_scale), sampling_ratio,
static_cast<float>(gamma));
}; break;
default: {
break;
}
}
}
void KernelDeformRoIPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t data_type,
const void *input, const void *rois,
const void *offset, void *output,
const int channels, const int height,
const int width, const int num_rois,
const int pooled_height, const int pooled_width,
const float spatial_scale,
const int sampling_ratio, const float gamma) {
MLUKernelDeformRoIPoolForward<<<k_dim, k_type, queue>>>(
data_type, input, rois, offset, output, channels, height, width, num_rois,
pooled_height, pooled_width, spatial_scale, sampling_ratio, gamma);
}
template <typename T>
__mlu_func__ void MLUUnion1DeformRoIPoolBackward(
const T *grad_output, const T *input, const T *rois, const T *offset,
T *grad_input, T *grad_offset, const int channels, const int height,
const int width, const int num_rois, const int pooled_height,
const int pooled_width, const T spatial_scale, const int sampling_ratio,
const T gamma) {
for (int bin_index = taskId;
bin_index < num_rois * pooled_width * pooled_height;
bin_index += taskDim) {
int out_batch = bin_index / pooled_width / pooled_height;
int out_height = bin_index / pooled_width % pooled_height;
int out_width = bin_index % pooled_width;
const T *cur_roi = rois + out_batch * ROI_OFFSET;
T *nram_rois = (T *)nram_buffer;
__memcpy((void *)nram_rois, (void *)cur_roi, ROI_OFFSET * sizeof(T),
GDRAM2NRAM);
const int roi_batch = nram_rois[0];
T roi_x_min = nram_rois[1] * spatial_scale - 0.5;
T roi_y_min = nram_rois[2] * spatial_scale - 0.5;
const T roi_x_max = nram_rois[3] * spatial_scale - 0.5;
const T roi_y_max = nram_rois[4] * spatial_scale - 0.5;
const T roi_width = roi_x_max - roi_x_min;
const T roi_height = roi_y_max - roi_y_min;
const T bin_width = roi_width / static_cast<T>(pooled_width);
const T bin_height = roi_height / static_cast<T>(pooled_height);
const T *offset_input = input + roi_batch * height * width * channels;
T *offset_grad_input = grad_input + roi_batch * height * width * channels;
int roi_bin_grid_height =
(sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceilf(roi_height / pooled_height));
int roi_bin_grid_width =
(sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceilf(roi_width / pooled_width));
if (offset != NULL) {
const T *offset_cur = offset +
out_batch * pooled_width * pooled_height * 2 +
out_height * pooled_width + out_width;
roi_x_min += gamma * roi_width * offset_cur[0];
roi_y_min +=
gamma * roi_height * offset_cur[pooled_width * pooled_height];
}
/* NRAM partition
*
* If offset != NULL, the NRAM partition is as below.
* | |
* ping | pong |
* |---------------------------------------------------------------------|-----------|-----------|
* |nram_tmp1|nram_tmp2|nram_tmp3|nram_tmp4|nram_grad_output|nram_sum_tmp|p1|p2|p3|p4|p1|p2|p3|p4|
*
* If offset == NULL, the ping and pong buffers are not needed.
* | |
* |----------------------------------------------------------------------------------|
* | nram_tmp1 | nram_tmp2 | nram_tmp3 | nram_tmp4 | nram_grad_output |
*
*/
int type_align = NFU_ALIGN_SIZE / sizeof(T);
int channels_max_num_nram = MAX_NRAM_SIZE / sizeof(T);
int channels_nram_split =
channels_max_num_nram / FIVESPLIT / type_align * type_align;
int channel_rem = channels % channels_nram_split;
int channel_loops =
channels / channels_nram_split + (channel_rem != 0 ? 1 : 0);
if (offset != NULL) {
channels_nram_split =
channels_max_num_nram / THIRTEENSPLIT / type_align * type_align;
channel_rem = channels % channels_nram_split;
channel_loops =
channels / channels_nram_split + (channel_rem != 0 ? 1 : 0);
}
for (int channel_loop_index = 0; channel_loop_index < channel_loops;
++channel_loop_index) {
int channels_num =
channels_nram_split >= channels ? channels : channels_nram_split;
const int channel_offset = channel_loop_index * channels_num;
if (channel_loop_index + 1 == channel_loops && channel_rem != 0) {
channels_num = channel_rem;
}
int channels_align = CEIL_ALIGN(channels_num, type_align);
const int32_t nram_sum_tmp_channel = NFU_ALIGN_SIZE / sizeof(T);
int nram_limit = (MAX_NRAM_SIZE / sizeof(T) - 5 * channels_align -
nram_sum_tmp_channel) >>
1;
int c_slice = 0;
int c_slice_align = 0;
T *nram_tmp1 = (T *)nram_buffer;
T *nram_tmp2 = (T *)nram_buffer + channels_align;
T *nram_tmp3 = (T *)nram_buffer + 2 * channels_align;
T *nram_tmp4 = (T *)nram_buffer + 3 * channels_align;
T *nram_grad_output = nram_tmp4 + channels_align;
T *nram_sum_tmp = NULL;
T *nram_ping_input = NULL;
T *nram_pong_input = NULL;
__bang_write_value((T *)nram_grad_output, channels_align, (T)0);
__asm__ volatile("sync;");
if (offset != NULL) {
c_slice = nram_limit / FOURSPLIT / type_align * type_align;
nram_sum_tmp = nram_grad_output + channels_align;
nram_ping_input = nram_sum_tmp + nram_sum_tmp_channel;
nram_pong_input = nram_ping_input + FOURSPLIT * c_slice;
__bang_write_value((T *)nram_sum_tmp, nram_sum_tmp_channel, (T)0);
__bang_write_value((T *)nram_ping_input, FOURSPLIT * c_slice, (T)0);
__bang_write_value((T *)nram_pong_input, FOURSPLIT * c_slice, (T)0);
__asm__ volatile("sync;");
}
const T num_bins =
static_cast<T>(max(roi_bin_grid_height * roi_bin_grid_width, 1));
const T value_div = 1.0f / num_bins;
bool is_ping_empty = true;
__memcpy(nram_grad_output,
grad_output + channels * bin_index + channel_offset,
channels_num * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar(nram_grad_output, nram_grad_output, value_div,
channels_align);
for (int iy = 0; iy < roi_bin_grid_height; ++iy) {
T y = roi_y_min + out_height * bin_height +
static_cast<T>(iy + .5f) * bin_height /
static_cast<T>(roi_bin_grid_height);
T y_tmp = y;
if (y_tmp < -1.0 || y_tmp > height) {
is_ping_empty = true;
continue;
}
if (y_tmp <= 0) {
y_tmp = 0;
}
int y_low = 0, y_high = 0;
y_low = int(y_tmp);
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y_tmp = T(y_low);
} else {
y_high = y_low + 1;
}
for (int ix = 0; ix < roi_bin_grid_width; ++ix) {
T x = roi_x_min + out_width * bin_width +
static_cast<T>(ix + .5f) * bin_width /
static_cast<T>(roi_bin_grid_width);
const int sample_index = iy * roi_bin_grid_width + ix;
int c_rem = channels_num;
bool is_empty = false;
T w1, w2, w3, w4;
int x_low = 0, x_high = 0;
bilinearInterpolate(width, y_tmp, x, &w1, &w2, &w3, &w4, &x_low,
&x_high, y_low, &is_empty);
if (is_empty) {
is_ping_empty = true;
continue;
}
__bang_mul_scalar((T *)nram_tmp1, (T *)nram_grad_output, w1,
channels_align);
__bang_mul_scalar((T *)nram_tmp2, (T *)nram_grad_output, w2,
channels_align);
__bang_mul_scalar((T *)nram_tmp3, (T *)nram_grad_output, w3,
channels_align);
__bang_mul_scalar((T *)nram_tmp4, (T *)nram_grad_output, w4,
channels_align);
__asm__ volatile("sync;");
__bang_atomic_add(
(T *)nram_tmp1,
(T *)(offset_grad_input + (y_low * width + x_low) * channels +
channel_offset),
(T *)nram_tmp1, channels_num);
__bang_atomic_add(
(T *)nram_tmp2,
(T *)(offset_grad_input + (y_low * width + x_high) * channels +
channel_offset),
(T *)nram_tmp2, channels_num);
__bang_atomic_add(
(T *)nram_tmp3,
(T *)(offset_grad_input + (y_high * width + x_low) * channels +
channel_offset),
(T *)nram_tmp3, channels_num);
__bang_atomic_add(
(T *)nram_tmp4,
(T *)(offset_grad_input + (y_high * width + x_high) * channels +
channel_offset),
(T *)nram_tmp4, channels_num);
if (offset != NULL) {
c_slice = nram_limit / FOURSPLIT / type_align * type_align;
c_slice_align = 0;
if (is_ping_empty) {
c_slice = c_slice > c_rem ? c_rem : c_slice;
c_slice_align = CEIL_ALIGN(c_slice, type_align);
__bang_write_value(nram_ping_input, FOURSPLIT * c_slice_align,
(T)0);
__asm__ volatile("sync;");
const T *src_offset1 = offset_input + y_low * width * channels +
x_low * channels + channel_offset;
const T *src_offset2 = offset_input + y_low * width * channels +
x_high * channels + channel_offset;
const T *src_offset3 = offset_input + y_high * width * channels +
x_low * channels + channel_offset;
const T *src_offset4 = offset_input + y_high * width * channels +
x_high * channels + channel_offset;
__memcpy(nram_ping_input, src_offset1, c_slice * sizeof(T),
GDRAM2NRAM);
__memcpy(nram_ping_input + c_slice_align, src_offset2,
c_slice * sizeof(T), GDRAM2NRAM);
__memcpy(nram_ping_input + 2 * c_slice_align, src_offset3,
c_slice * sizeof(T), GDRAM2NRAM);
__memcpy(nram_ping_input + 3 * c_slice_align, src_offset4,
c_slice * sizeof(T), GDRAM2NRAM);
is_ping_empty = false;
}
int c_offset = 0;
int pongc_slice = 0;
int pongc_slice_align = 0;
while (c_rem > 0) {
c_slice = c_slice > c_rem ? c_rem : c_slice;
c_slice_align = CEIL_ALIGN(c_slice, type_align);
if (sample_index + 1 < roi_bin_grid_height * roi_bin_grid_width) {
int iy_tmp = (sample_index + 1) / roi_bin_grid_width;
int ix_tmp = (sample_index + 1) % roi_bin_grid_width;
T y_tmp = roi_y_min + out_height * bin_height +
static_cast<T>(iy_tmp + .5f) * bin_height /
static_cast<T>(roi_bin_grid_height);
T x_tmp = roi_x_min + out_width * bin_width +
static_cast<T>(ix_tmp + .5f) * bin_width /
static_cast<T>(roi_bin_grid_width);
int x_low_tmp = 0, x_high_tmp = 0, y_low_tmp = 0,
y_high_tmp = 0;
if (y_tmp < -1.0 || y_tmp > height) {
is_empty = true;
} else {
T w1_tmp, w2_tmp, w3_tmp, w4_tmp;
if (y_tmp <= 0) {
y_tmp = 0;
}
y_low_tmp = int(y_tmp);
if (y_low_tmp >= height - 1) {
y_high_tmp = y_low_tmp = height - 1;
y_tmp = T(y_low_tmp);
} else {
y_high_tmp = y_low_tmp + 1;
}
bilinearInterpolate(width, y_tmp, x_tmp, &w1_tmp, &w2_tmp,
&w3_tmp, &w4_tmp, &x_low_tmp, &x_high_tmp,
y_low_tmp, &is_empty);
}
pongc_slice = nram_limit / FOURSPLIT / type_align * type_align;
pongc_slice =
pongc_slice > channels_num ? channels_num : pongc_slice;
pongc_slice_align = CEIL_ALIGN(pongc_slice, type_align);
__bang_write_value(nram_pong_input,
FOURSPLIT * pongc_slice_align, (T)0);
__asm__ volatile("sync;");
if (!is_empty) {
const T *src_offset1 = offset_input +
y_low_tmp * width * channels +
x_low_tmp * channels + channel_offset;
const T *src_offset2 = offset_input +
y_low_tmp * width * channels +
x_high_tmp * channels + channel_offset;
const T *src_offset3 = offset_input +
y_high_tmp * width * channels +
x_low_tmp * channels + channel_offset;
const T *src_offset4 = offset_input +
y_high_tmp * width * channels +
x_high_tmp * channels + channel_offset;
__memcpy_async(nram_pong_input, src_offset1,
pongc_slice * sizeof(T), GDRAM2NRAM);
__memcpy_async(nram_pong_input + pongc_slice_align,
src_offset2, pongc_slice * sizeof(T),
GDRAM2NRAM);
__memcpy_async(nram_pong_input + 2 * pongc_slice_align,
src_offset3, pongc_slice * sizeof(T),
GDRAM2NRAM);
__memcpy_async(nram_pong_input + 3 * pongc_slice_align,
src_offset4, pongc_slice * sizeof(T),
GDRAM2NRAM);
}
}
__bang_mul_scalar(nram_tmp1, nram_ping_input + 3 * c_slice_align,
y - y_low, c_slice_align);
__bang_mul_scalar(nram_tmp2, nram_ping_input + c_slice_align,
y_high - y, c_slice_align);
__bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align);
__bang_mul_scalar(nram_tmp2, nram_ping_input + 2 * c_slice_align,
y_low - y, c_slice_align);
__bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align);
__bang_mul_scalar(nram_tmp2, nram_ping_input, y - y_high,
c_slice_align);
__bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align);
__bang_mul_scalar(nram_tmp1, nram_tmp1, gamma * roi_width,
c_slice_align);
__bang_mul(nram_tmp1, nram_grad_output, nram_tmp1, c_slice_align);
const int32_t kernel_width =
c_slice_align / nram_sum_tmp_channel +
(int32_t)(c_slice_align % nram_sum_tmp_channel > 0);
__bang_sumpool(nram_sum_tmp, nram_tmp1, nram_sum_tmp_channel, 1,
kernel_width, 1, kernel_width, kernel_width, 1);
__bang_reduce_sum(nram_sum_tmp, nram_sum_tmp,
nram_sum_tmp_channel);
__bang_atomic_add(
(T *)nram_sum_tmp,
(T *)(grad_offset +
out_batch * pooled_width * pooled_height * 2 +
out_height * pooled_width + out_width),
(T *)nram_sum_tmp, 1);
__bang_write_value((T *)nram_sum_tmp, nram_sum_tmp_channel, (T)0);
__bang_mul_scalar(nram_tmp1, nram_ping_input + 3 * c_slice_align,
x - x_low, c_slice_align);
__bang_mul_scalar(nram_tmp2, nram_ping_input + 2 * c_slice_align,
x_high - x, c_slice_align);
__bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align);
__bang_mul_scalar(nram_tmp2, nram_ping_input + c_slice_align,
x_low - x, c_slice_align);
__bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align);
__bang_mul_scalar(nram_tmp2, nram_ping_input, x - x_high,
c_slice_align);
__bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align);
__bang_mul_scalar(nram_tmp1, nram_tmp1, gamma * roi_height,
c_slice_align);
__bang_mul(nram_tmp1, nram_grad_output, nram_tmp1, c_slice_align);
__bang_sumpool(nram_sum_tmp, nram_tmp1, nram_sum_tmp_channel, 1,
kernel_width, 1, kernel_width, kernel_width, 1);
__bang_reduce_sum(nram_sum_tmp, nram_sum_tmp,
NFU_ALIGN_SIZE / sizeof(T));
__bang_atomic_add(
(T *)nram_sum_tmp,
(T *)(grad_offset +
out_batch * pooled_width * pooled_height * 2 +
pooled_width * pooled_height +
out_height * pooled_width + out_width),
(T *)nram_sum_tmp, 1);
T *nram_tmp = nram_ping_input;
nram_ping_input = nram_pong_input;
nram_pong_input = nram_tmp;
c_rem -= c_slice;
c_offset += c_slice;
__asm__ volatile("sync;");
}
}
}
}
}
}
}
__mlu_global__ void MLUKernelDeformRoIPoolBackward(
cnrtDataType_t data_type, const void *grad_output, const void *input,
const void *rois, const void *offset, void *grad_input, void *grad_offset,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, const float spatial_scale,
const int sampling_ratio, const float gamma) {
switch (data_type) {
case CNRT_FLOAT16: {
MLUUnion1DeformRoIPoolBackward(
(half *)grad_output, (half *)input, (half *)rois, (half *)offset,
(half *)grad_input, (half *)grad_offset, channels, height, width,
num_rois, pooled_height, pooled_width,
static_cast<half>(spatial_scale), sampling_ratio,
static_cast<half>(gamma));
}; break;
case CNRT_FLOAT32: {
MLUUnion1DeformRoIPoolBackward(
(float *)grad_output, (float *)input, (float *)rois, (float *)offset,
(float *)grad_input, (float *)grad_offset, channels, height, width,
num_rois, pooled_height, pooled_width,
static_cast<float>(spatial_scale), sampling_ratio,
static_cast<float>(gamma));
}; break;
default: {
break;
}
}
}
void KernelDeformRoIPoolBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
cnrtDataType_t data_type, const void *grad_output, const void *input,
const void *rois, const void *offset, void *grad_input, void *grad_offset,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, const float spatial_scale,
const int sampling_ratio, const float gamma) {
MLUKernelDeformRoIPoolBackward<<<k_dim, k_type, queue>>>(
data_type, grad_output, input, rois, offset, grad_input, grad_offset,
channels, height, width, num_rois, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
}
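// --- Usage sketch (not from the original file; a minimal host-side launch,
// --- assuming `queue` and all device pointers were prepared by the caller;
// --- the dimension policy below is a hypothetical example):
//   cnrtDim3_t k_dim = {4, 1, 1};  // one UNION1 task group of 4 cores
//   KernelDeformRoIPoolBackward(k_dim, CNRT_FUNC_TYPE_UNION1, queue,
//                               CNRT_FLOAT32, grad_output, input, rois,
//                               offset, grad_input, grad_offset, channels,
//                               height, width, num_rois, pooled_height,
//                               pooled_width, spatial_scale, sampling_ratio,
//                               gamma);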
mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <float.h>
#include "common_mlu_helper.hpp"
#define PING 0
#define PONG 1
__nram__ char nram_buffer[MAX_NRAM_SIZE];
namespace forward {
template <typename T>
__mlu_func__ void loadInput(char *nram_input, T *dram_input, const int32_t size,
const int32_t dst_stride = 0,
const int32_t src_stride = 0,
const int32_t count = 1) {
if (dst_stride == src_stride) {
__memcpy_async(nram_input, dram_input, size * count, GDRAM2NRAM);
} else {
__memcpy_async(nram_input, dram_input, size, GDRAM2NRAM, dst_stride,
src_stride, count - 1);
}
}
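// Illustrative call (values are hypothetical): gather 4 rows of C floats
// from a densely packed (N, C) tensor into NRAM rows padded to c_align:
//   loadInput<float>(nram, dram, C * sizeof(float),
//                    c_align * sizeof(float),  // dst_stride: padded rows
//                    C * sizeof(float),        // src_stride: packed rows
//                    4);                       // count: number of rows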
template <typename T>
__mlu_func__ void loadWeight(char *nram_input, T *dram_input, const int32_t t,
const int32_t c, const int32_t has_weight,
const int32_t partition_nc) {
if (has_weight && partition_nc && t >= 0 && t < c) {
__memcpy_async(nram_input, (T *)dram_input + t, sizeof(T), GDRAM2NRAM);
}
}
template <typename T>
__mlu_func__ void storeOutput(T *dram_output, char *nram_output,
const int32_t size, const int32_t dst_stride = 0,
const int32_t src_stride = 0,
const int32_t count = 1) {
if (dst_stride == src_stride) {
__memcpy_async(dram_output, nram_output, size * count, NRAM2GDRAM);
} else {
__memcpy_async(dram_output, nram_output, size, NRAM2GDRAM, dst_stride,
src_stride, count - 1);
}
}
template <typename T>
__mlu_func__ void compute(T *input, const int32_t *target, const T *weight,
const int32_t has_weight, const int32_t partition_nc,
const int32_t deal_num, const int32_t n_seg,
const int32_t c, const int32_t c_seg,
const int32_t c_start_index, const float alpha,
const float gamma, T *compute_a, T *compute_b,
T *output) {
// set params
const int32_t c_num =
has_weight ? PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T)) : c_seg;
const int32_t c_end_index = c_start_index + c_seg;
const int32_t half_epsilon = 0x0400;
const T epsilon_f =
sizeof(T) == sizeof(float) ? FLT_MIN : *((half *)&half_epsilon);
// 0. alpha_t * p_t^r = alpha * (1 - p) ^ gamma if t == c_i
// = (1 - alpha) * p ^ gamma if t != c_i
__nramset((T *)output, deal_num, (T)(1 - alpha));
__bang_active_sigmoid((T *)compute_b, (T *)input, deal_num);
for (int32_t i = 0; i < n_seg; ++i) {
const int32_t t = *((uint32_t *)target + i);
if (t >= c_start_index && t < c_end_index) {
const uint32_t index = i * c_num + t - c_start_index;
*((T *)input + index) = -1.0 * (*((T *)input + index));
*((T *)compute_b + index) = 1.0 - (*((T *)compute_b + index)) + epsilon_f;
*((T *)output + index) = alpha;
}
}
if (sizeof(T) == sizeof(half)) {
__bang_half2float((float *)compute_a, (half *)compute_b, deal_num);
__bang_active_loghp((float *)compute_a, (float *)compute_a, deal_num);
__bang_mul_const((float *)compute_a, (float *)compute_a, (float)gamma,
deal_num);
__bang_active_exphp((float *)compute_a, (float *)compute_a, deal_num);
__bang_float2half_rd((half *)compute_a, (float *)compute_a, deal_num);
} else {
__bang_active_loghp((T *)compute_a, (T *)compute_b, deal_num);
__bang_mul_const((T *)compute_a, (T *)compute_a, (T)gamma, deal_num);
__bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num);
}
__bang_mul((T *)output, (T *)compute_a, (T *)output, deal_num);
// 1. max = max(0, -x) if t == c_i
// = max(0, x) if t != c_i
__nramset((T *)compute_b, deal_num, (T)0);
__bang_maxequal((T *)compute_b, (T *)compute_b, (T *)input, deal_num);
// 2. -log(p_t) = ln(e^(-max)+ e^(-max-x) + max if t == c_i
// = ln(e^(-max)+ e^(-max+x) + max if t != c_i
__bang_mul_const((T *)compute_a, (T *)compute_b, (T)-1.0, deal_num);
__bang_add((T *)input, (T *)compute_a, (T *)input, deal_num);
__bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num);
__bang_active_exphp((T *)input, (T *)input, deal_num);
__bang_add((T *)compute_a, (T *)compute_a, (T *)input, deal_num);
__bang_active_loghp((T *)compute_a, (T *)compute_a, deal_num);
__bang_add((T *)input, (T *)compute_a, (T *)compute_b, deal_num);
// 3. output = alpha_t * p_t^r * [-log(p_t)]
__bang_mul((T *)output, (T *)output, (T *)input, deal_num);
// 4. with weight
if (has_weight) {
for (int32_t i = 0; i < n_seg; ++i) {
int32_t t = *((int32_t *)target + i);
if (t >= 0 && t < c) {
t = partition_nc ? 0 : t;
__bang_mul_const((T *)output + i * c_num, (T *)output + i * c_num,
*((T *)weight + t), c_num);
}
}
}
}
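// Scalar reference for the vectorized math above (a clarifying sketch, not
// part of the kernel): with p = sigmoid(x) for logit x of channel c_i and
// target class t,
//   p_t     = (t == c_i) ? p : 1 - p
//   alpha_t = (t == c_i) ? alpha : 1 - alpha
//   loss    = alpha_t * (1 - p_t)^gamma * (-log(p_t))
// where -log(p_t) is evaluated in the log-sum-exp form of steps 1-2,
//   max + log(e^(-max) + e^(-max -/+ x)) with max = max(0, -/+ x),
// to stay numerically stable for large |x|.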
template <typename T>
__mlu_func__ void startPipeline(
const T *input, const int32_t *target, const T *weight,
char *nram_compute_a, char *nram_compute_b, char *nram_input,
char *nram_target, char *nram_weight, char *nram_output,
const int32_t has_weight, const int32_t partition_nc,
const int32_t pingpong_offset, const int32_t pingpong_weight_offset,
const int32_t c_offset_num, const int32_t n, const int32_t n_seg,
const int32_t c, const int32_t c_seg, const float alpha, const float gamma,
T *output) {
// with offset
input = (T *)((char *)input + c_offset_num * sizeof(T));
output = (T *)((char *)output + c_offset_num * sizeof(T));
const int32_t c_seg_align_num = PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T));
const int32_t c_num = has_weight ? c_seg_align_num : c_seg;
const int32_t deal_num = PAD_UP(n_seg * c_num, NFU_ALIGN_SIZE / sizeof(T));
const int32_t load_size = c_seg * sizeof(T);
const int32_t dram_stride = c * sizeof(T);
const int32_t nram_stride = c_num * sizeof(T);
if (has_weight && !partition_nc) {
loadInput<T>(nram_weight, (T *)weight, load_size, nram_stride, dram_stride,
1);
__asm__ volatile("sync;\n\t");
}
const int32_t repeat = n / n_seg;
const int32_t remain = n % n_seg;
/*
* Pipeline: The pipeline is processed in three stages: Load, Compute, Store.
* The allocated memory space of NRAM is divided into two parts:
 * PING and PONG. In a single time slice, PING is used to process
 * the IO stream while PONG is used for computation; the two halves are
 * swapped and synchronized at the end of every slice.
*
* diagram of PINGPONG:
* |------|-----------------------------------------------------------------|
* | | space |
* |------|-----------------------------------------------------------------|
* | time | Ping | Pong | Ping | Pong | Ping | Pong |
* |------|-----------------------------------------------------------------|
* | 0 | L0 | | | | | |
* | 1 | C0 | L1 | | | | |
* | 2 | S0 | C1 | L2 | | | |
* | 3 | | S1 | C2 | L3 | | |
* | 4 | | | S2 | C3 | L4 | |
* | 5 | | | | S3 | C4 | L5 |
* | 6 | | | | | S4 | C5 |
* | 7 | | | | | | S5 |
* |------|-----------------------------------------------------------------|
*/
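  // In the steady-state loop below, iteration i stores S(i) and loads
  // L(i+2) through buffer half (i % 2) * pingpong_offset while C(i+1) runs
  // on the other half, so IO and compute never touch the same half within
  // a slice.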
// diagram of PINGPONG: L0
if (repeat > 0) {
loadInput<T>(nram_input, (T *)input, load_size, nram_stride, dram_stride,
n_seg);
loadInput<int32_t>(nram_target, (int32_t *)target, n_seg * sizeof(int32_t));
loadWeight<T>(nram_weight, (T *)weight, *((int32_t *)target), c, has_weight,
partition_nc);
__asm__ volatile("sync;\n\t");
}
// diagram of PINGPONG: C0 and L1
if (repeat > 1) {
compute((T *)nram_input, (int32_t *)nram_target, (T *)nram_weight,
has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)nram_output);
loadInput<T>((char *)nram_input + pingpong_offset, (T *)input + c * n_seg,
load_size, nram_stride, dram_stride, n_seg);
loadInput<int32_t>((char *)nram_target + pingpong_offset,
(int32_t *)target + n_seg, n_seg * sizeof(int32_t));
loadWeight<T>((char *)nram_weight + pingpong_weight_offset, (T *)weight,
*((int32_t *)target + n_seg), c, has_weight, partition_nc);
__asm__ volatile("sync;\n\t");
}
for (int32_t i = 0; i < repeat - 2; ++i) {
storeOutput<T>((T *)output + i * c * n_seg,
nram_output + (i % 2) * pingpong_offset, load_size,
dram_stride, nram_stride, n_seg);
loadInput<T>((char *)nram_input + (i % 2) * pingpong_offset,
(T *)(input) + (i + 2) * c * n_seg, load_size, nram_stride,
dram_stride, n_seg);
loadInput<int32_t>((char *)nram_target + (i % 2) * pingpong_offset,
(int32_t *)target + (i + 2) * n_seg,
n_seg * sizeof(int32_t));
loadWeight<T>((char *)nram_weight + (i % 2) * pingpong_weight_offset,
(T *)weight, *((int32_t *)target + (i + 2) * n_seg), c,
has_weight, partition_nc);
compute((T *)(nram_input + ((i + 1) % 2) * pingpong_offset),
(int32_t *)(nram_target + ((i + 1) % 2) * pingpong_offset),
(T *)(nram_weight +
partition_nc * ((i + 1) % 2) * pingpong_weight_offset),
has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_output + ((i + 1) % 2) * pingpong_offset));
__asm__ volatile("sync;\n\t");
}
if (repeat > 1) {
storeOutput<T>((T *)output + (repeat - 2) * c * n_seg,
(char *)nram_output + (repeat % 2) * pingpong_offset,
load_size, dram_stride, nram_stride, n_seg);
}
if (remain > 0) {
loadInput<T>((char *)nram_input + (repeat % 2) * pingpong_offset,
(T *)input + repeat * c * n_seg, load_size, nram_stride,
dram_stride, remain);
loadInput<int32_t>((char *)nram_target + (repeat % 2) * pingpong_offset,
(int32_t *)target + repeat * n_seg,
remain * sizeof(int32_t));
loadWeight<T>((char *)nram_weight + (repeat % 2) * pingpong_weight_offset,
(T *)weight, *((int32_t *)target + repeat * n_seg), c,
has_weight, partition_nc);
}
if (repeat > 0) {
compute((T *)(nram_input + ((repeat - 1) % 2) * pingpong_offset),
(int32_t *)(nram_target + ((repeat - 1) % 2) * pingpong_offset),
(T *)(nram_weight +
partition_nc * ((repeat - 1) % 2) * pingpong_weight_offset),
has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_output + ((repeat - 1) % 2) * pingpong_offset));
}
__asm__ volatile("sync;\n\t");
if (repeat > 0) {
storeOutput<T>((T *)output + (repeat - 1) * c * n_seg,
(char *)nram_output + ((repeat - 1) % 2) * pingpong_offset,
load_size, dram_stride, nram_stride, n_seg);
}
if (remain > 0) {
int32_t rem_num = PAD_UP(remain * c_num, NFU_ALIGN_SIZE / sizeof(T));
compute((T *)(nram_input + (repeat % 2) * pingpong_offset),
(int32_t *)(nram_target + (repeat % 2) * pingpong_offset),
(T *)(nram_weight +
partition_nc * (repeat % 2) * pingpong_weight_offset),
has_weight, partition_nc, rem_num, remain, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_output + (repeat % 2) * pingpong_offset));
__asm__ volatile("sync;\n\t");
storeOutput<T>((T *)output + repeat * c * n_seg,
(char *)nram_output + (repeat % 2) * pingpong_offset,
load_size, dram_stride, nram_stride, remain);
}
__asm__ volatile("sync;\n\t");
}
template <typename T>
__mlu_func__ void focalLossSigmoidForwardBlock(
const T *input, const int32_t *target, const T *weight, const int32_t n,
const int32_t c, const float alpha, const float gamma, T *output) {
/*
* NRAM partition
* |-----------------------------------------------------------------------|
* | weight |
* |------------------------------- COMPUTE -------------------------------|
* | | |
* | computeA | computeB |
* | | |
* |------------- PING ------------------------------- PONG ---------------|
* | | |
* | input | input |
* | | |
* |-----------------------------------|-----------------------------------|
* | | |
* | output | output |
* | | |
* |-----------------------------------|-----------------------------------|
* | target | target |
* |-----------------------------------|-----------------------------------|
*
* split_pipeline_num is 6: COMPUTE(computeA,computeB), PING(input,output),
* PONG(input,output).
* split_target_num is 2: PING(target), PONG(target).
* weight is not NULL:
* The nram-size of weight is equal to c_align_size when partition input-N.
* The nram-size of weight is equal to NFU_ALIGN_SIZE when partition
* input-NC.
*/
// calculate threshold of c
const int32_t split_pipeline_num = 6;
const int32_t split_target_num = 2;
const int32_t has_weight = weight != NULL;
const int32_t threshold_c =
PAD_DOWN((MAX_NRAM_SIZE - split_target_num * sizeof(int32_t)) /
(split_pipeline_num + has_weight),
NFU_ALIGN_SIZE) /
sizeof(T);
const int32_t c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T));
const int32_t c_align_size = c_align * sizeof(T);
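  // Illustrative sizing (assumptions for clarity: 384 KB of NRAM and
  // NFU_ALIGN_SIZE = 128, both platform-dependent): for float data with no
  // weight,
  //   threshold_c = PAD_DOWN((393216 - 2 * 4) / 6, 128) / 4 = 16352,
  // so rows with c <= 16352 take the partition-N branch below and larger
  // rows fall through to the partition-NC branch.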
if (c <= threshold_c) {
// partition inputN
int32_t c_num = c;
int32_t reservered_align_size =
(split_target_num + split_pipeline_num) * NFU_ALIGN_SIZE;
int32_t weight_size = 0;
if (has_weight) {
c_num = c_align;
reservered_align_size = split_target_num * NFU_ALIGN_SIZE;
weight_size = c_align_size;
}
const int32_t remain_size =
MAX_NRAM_SIZE - weight_size - reservered_align_size;
const int32_t n_seg =
remain_size / (split_pipeline_num * c_num * sizeof(T) +
split_target_num * sizeof(int32_t));
const int32_t split_pipeline_size =
PAD_UP(c_num * n_seg * sizeof(T), NFU_ALIGN_SIZE);
const int32_t compute_size = 2 * split_pipeline_size;
const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2;
char *nram_weight = (char *)nram_buffer;
char *nram_compute_a = nram_weight + has_weight * c_align_size;
char *nram_compute_b = nram_compute_a + split_pipeline_size;
char *nram_input = nram_compute_b + split_pipeline_size;
char *nram_output = nram_input + split_pipeline_size;
char *nram_target = nram_output + split_pipeline_size;
startPipeline<T>(input, target, weight, nram_compute_a, nram_compute_b,
nram_input, nram_target, nram_weight, nram_output,
has_weight, 0, pingpong_offset, 0, 0, n, n_seg, c, c,
alpha, gamma, output);
} else {
// partition inputNC
const int32_t weight_size = has_weight * NFU_ALIGN_SIZE;
const int32_t remain_size = MAX_NRAM_SIZE - weight_size;
const int32_t split_pipeline_size = PAD_DOWN(
(remain_size - split_target_num * NFU_ALIGN_SIZE) / split_pipeline_num,
NFU_ALIGN_SIZE);
const int32_t c_seg = split_pipeline_size / sizeof(T);
const int32_t n_seg = 1;
const int32_t compute_size = 2 * split_pipeline_size;
const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2;
const int32_t pingpong_weight_offset = weight_size / 2;
char *nram_weight = (char *)nram_buffer;
char *nram_compute_a = nram_weight + weight_size;
char *nram_compute_b = nram_compute_a + split_pipeline_size;
char *nram_input = nram_compute_b + split_pipeline_size;
char *nram_output = nram_input + split_pipeline_size;
char *nram_target = nram_output + split_pipeline_size;
const int32_t loop_num = (c + c_seg - 1) / c_seg;
const int32_t partition_nc = 1;
for (int32_t i = 0; i < loop_num; ++i) {
const int32_t c_index = i * c_seg;
const int32_t c_seg_curr = i == (loop_num - 1) ? c - c_index : c_seg;
startPipeline<T>(input, target, weight, nram_compute_a, nram_compute_b,
nram_input, nram_target, nram_weight, nram_output,
has_weight, partition_nc, pingpong_offset,
pingpong_weight_offset, c_index, n, n_seg, c, c_seg_curr,
alpha, gamma, output);
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelFocalLossSigmoidForward(
const void *input, const void *target, const void *weight, const int32_t N,
const int32_t C, const float alpha, const float gamma, void *output) {
const int32_t n_seg = N / taskDim + (taskId == taskDim - 1) * (N % taskDim);
const T *input_offset = (T *)input + N / taskDim * taskId * C;
const int32_t *target_offset = (int32_t *)target + N / taskDim * taskId;
T *output_offset = (T *)output + N / taskDim * taskId * C;
focalLossSigmoidForwardBlock((T *)input_offset, (int32_t *)target_offset,
(T *)weight, n_seg, C, alpha, gamma,
(T *)output_offset);
}
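// Row split across tasks: every task takes N / taskDim rows and the last
// task also absorbs the remainder. E.g. (illustrative) N = 10, taskDim = 4
// gives per-task row counts {2, 2, 2, 4}.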
} // namespace forward
namespace backward {
template <typename T>
__mlu_func__ void loadInput(char *nram_input, char *nram_target,
                            const T *gdram_input, const int32_t *gdram_target,
                            const int32_t deal_n, const int32_t total_c,
                            const bool pingpong_flag, const bool has_weight,
                            const int32_t nram_offset,
                            const int32_t gdram_offset) {
  if (pingpong_flag == PONG) {
    nram_input += nram_offset;
    nram_target += nram_offset;
  }
__memcpy_async(nram_target, gdram_target + gdram_offset / total_c,
deal_n * sizeof(int32_t), GDRAM2NRAM);
char *nram_input_load = nram_input;
int32_t compute_align_size = 2 * NFU_ALIGN_SIZE;
if (has_weight) {
if (sizeof(T) == sizeof(half)) {
int32_t compute_align_num = compute_align_size / sizeof(float);
int32_t align_c = PAD_UP(total_c, compute_align_num);
int32_t compute_size = deal_n * align_c * sizeof(float);
nram_input_load += compute_size / 2;
}
int32_t align_c = PAD_UP(total_c, NFU_ALIGN_SIZE / sizeof(T));
int32_t total_c_size = total_c * sizeof(T);
int32_t align_c_size = align_c * sizeof(T);
__memcpy_async(nram_input_load, gdram_input + gdram_offset, total_c_size,
GDRAM2NRAM, align_c_size, total_c_size, deal_n - 1);
} else {
if (sizeof(T) == sizeof(half)) {
int32_t compute_size =
PAD_UP(deal_n * total_c * sizeof(float), compute_align_size);
nram_input_load += compute_size / 2;
}
int32_t load_size = deal_n * total_c * sizeof(T);
__memcpy_async(nram_input_load, gdram_input + gdram_offset, load_size,
GDRAM2NRAM);
}
}
template <typename T>
__mlu_func__ void sigmoid(T *dst_data, const T *src_data,
const int32_t elem_count) {
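  // Elementwise sigmoid(x) = 1 / (1 + e^(-x)), composed from the negate,
  // exp, add-one and reciprocal primitives below.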
__bang_mul_const(dst_data, (T *)src_data, T(-1), elem_count);
__bang_active_exphp(dst_data, dst_data, elem_count);
__bang_add_const(dst_data, dst_data, T(1), elem_count);
__bang_active_reciphp(dst_data, dst_data, elem_count);
}
template <typename T>
__mlu_func__ void coreCompute(char *nram_input, const T *nram_weight,
const float *nram_flt_min, char *nram_pt,
char *nram_alpha_t, char *nram_temp,
char *nram_target, const float *nram_gamma,
char *nram_output, const float alpha,
const int32_t compute_num, const int32_t deal_n,
const int32_t total_c, const bool pingpong_flag,
const int32_t nram_offset,
const bool has_weight) {
if (pingpong_flag == PONG) {
nram_input += nram_offset;
nram_pt += nram_offset;
nram_alpha_t += nram_offset;
nram_temp += nram_offset;
nram_output += nram_offset;
nram_target += nram_offset;
}
if (sizeof(T) == sizeof(half)) {
const int32_t compute_size = compute_num * sizeof(float);
char *nram_input_load = nram_input + compute_size / 2;
__bang_half2float((float *)nram_input, (half *)nram_input_load,
compute_num);
}
// 0. alpha_t = alpha - 1
__nramset((float *)nram_alpha_t, compute_num, (float)(alpha - 1.0));
// 1. pt = 1 - sigmoid(x)
sigmoid((float *)nram_pt, (float *)nram_input, compute_num);
__bang_mul_const((float *)nram_pt, (float *)nram_pt, (float)(-1),
compute_num);
__bang_add_const((float *)nram_pt, (float *)nram_pt, (float)1, compute_num);
// 2. pt = target[n] == c ? sigmoid(x) : 1 - sigmoid(x)
// alpha_t = target[n] == c ? alpha : alpha - 1
const int32_t nfu_align_num = NFU_ALIGN_SIZE / sizeof(float);
for (int n = 0; n < deal_n; n++) {
const int32_t target_value = ((int32_t *)nram_target)[n];
if (target_value >= total_c || target_value < 0) continue;
int32_t c_offset = 0;
if (has_weight) {
int32_t c_align_num = nfu_align_num;
if (sizeof(T) == sizeof(half)) {
c_align_num += nfu_align_num;
}
c_offset = PAD_UP(total_c, c_align_num);
} else {
c_offset = total_c;
}
int32_t idx = n * c_offset + target_value;
*((float *)nram_pt + idx) = 1.0 - *((float *)nram_pt + idx);
*((float *)nram_alpha_t + idx) = alpha;
}
// 3. temp = -alpha_t * e^(gamma * log(max(1 - pt, FLT_MIN))
__bang_mul_const((float *)nram_temp, (float *)nram_pt, (float)(-1),
compute_num);
__bang_add_const((float *)nram_temp, (float *)nram_temp, (float)(1),
compute_num);
__bang_cycle_maxequal((float *)nram_temp, (float *)nram_temp,
(float *)nram_flt_min, compute_num, nfu_align_num);
__bang_active_loghp((float *)nram_temp, (float *)nram_temp, compute_num);
__bang_cycle_mul((float *)nram_temp, (float *)nram_temp, (float *)nram_gamma,
compute_num, nfu_align_num);
__bang_active_exphp((float *)nram_temp, (float *)nram_temp, compute_num);
__bang_mul((float *)nram_temp, (float *)nram_temp, (float *)nram_alpha_t,
compute_num);
__bang_mul_const((float *)nram_temp, (float *)nram_temp, (float)(-1),
compute_num);
// 4. output = 1 - pt - gamma * pt * log(max(pt, FLT_MIN))
__bang_cycle_maxequal((float *)nram_output, (float *)nram_pt,
(float *)nram_flt_min, compute_num, nfu_align_num);
__bang_active_loghp((float *)nram_output, (float *)nram_output, compute_num);
__bang_mul((float *)nram_output, (float *)nram_output, (float *)nram_pt,
compute_num);
__bang_cycle_mul((float *)nram_output, (float *)nram_output,
(float *)nram_gamma, compute_num, nfu_align_num);
__bang_add((float *)nram_output, (float *)nram_output, (float *)nram_pt,
compute_num);
__bang_mul_const((float *)nram_output, (float *)nram_output, (float)(-1),
compute_num);
__bang_add_const((float *)nram_output, (float *)nram_output, (float)(1),
compute_num);
// 5. output = output * temp
__bang_mul((float *)nram_output, (float *)nram_output, (float *)nram_temp,
compute_num);
if (sizeof(T) == sizeof(half)) {
__bang_float2half_rd((half *)nram_output, (float *)nram_output,
compute_num);
}
if (has_weight) {
// with weight
for (int n = 0; n < deal_n; n++) {
int32_t c_align_num = nfu_align_num;
if (sizeof(T) == sizeof(half)) {
c_align_num += nfu_align_num;
}
int32_t align_c = PAD_UP(total_c, c_align_num);
int32_t target_value = ((int32_t *)nram_target)[n];
T weight_value = nram_weight[target_value];
__bang_mul_const((T *)nram_output + n * align_c,
(T *)nram_output + n * align_c, weight_value, align_c);
}
}
}
template <typename T>
__mlu_func__ void storeOutput(T *gdram_output, const char *nram_output,
const int32_t deal_n, const int32_t total_c,
const bool pingpong_flag, const bool has_weight,
const int32_t nram_offset,
const int32_t gdram_offset) {
if (pingpong_flag == PONG) {
nram_output += nram_offset;
}
const int32_t store_size = deal_n * total_c * sizeof(T);
if (has_weight) {
int32_t align_c = PAD_UP(total_c, NFU_ALIGN_SIZE / sizeof(T));
int32_t total_c_size = total_c * sizeof(T);
int32_t align_c_size = align_c * sizeof(T);
__memcpy_async(gdram_output + gdram_offset, nram_output, total_c_size,
NRAM2GDRAM, total_c_size, align_c_size, deal_n - 1);
} else {
__memcpy_async(gdram_output + gdram_offset, nram_output, store_size,
NRAM2GDRAM);
}
}
template <typename T>
__mlu_func__ void focalLossSigmoidBackwardBlock(
const T *input, const int32_t *target, const T *weight, const float gamma,
const float alpha, const int32_t total_n, const int32_t deal_n,
const int32_t total_c, T *output) {
// params per time slice
int32_t deal_num = deal_n * total_c;
int32_t deal_size = deal_num * sizeof(float);
int32_t compute_num = 0;
int32_t compute_size = 0;
int32_t compute_align_size = NFU_ALIGN_SIZE;
const int32_t nfu_align_num = NFU_ALIGN_SIZE / sizeof(T);
if (sizeof(T) == sizeof(half)) {
compute_align_size += NFU_ALIGN_SIZE;
}
const int32_t compute_align_num = compute_align_size / sizeof(float);
bool has_weight = false;
if (weight != NULL) {
has_weight = true;
int32_t align_c = PAD_UP(total_c, compute_align_num);
compute_num = deal_n * align_c;
compute_size = compute_num * sizeof(float);
} else {
compute_size = PAD_UP(deal_size, compute_align_size);
compute_num = compute_size / sizeof(float);
}
// params per core
int32_t total_num = total_n * total_c;
int32_t num_per_core = PAD_DOWN(total_num / taskDim, deal_num);
int32_t loop_per_core = num_per_core / deal_num;
/* NRAM partition:
*
* |-----------------ping pong--------------------|
* |input | pt | alpha_t | temp | output | target | flt_min | gamma | weight|
*
* split_pipeline_num is 5: input, pt, alpha_t, temp, output.
* nram_reserved_line_num is 2: flt_min, gamma.
*/
const int32_t split_pipeline_num = 5;
const int32_t nram_reserved_line_num = 2;
int32_t target_deal_size = deal_n * sizeof(int32_t);
int32_t target_deal_size_align = PAD_UP(target_deal_size, NFU_ALIGN_SIZE);
// nram PING/PONG offset
int32_t ping_pong_offset =
compute_size * split_pipeline_num + target_deal_size_align;
// gdram addr
int32_t *base_addr_target =
(int32_t *)target + taskId * loop_per_core * deal_n;
T *base_addr_input = (T *)input + taskId * num_per_core;
T *base_addr_output = output + taskId * num_per_core;
// nram addr
char *nram_input = (char *)nram_buffer;
char *nram_pt = nram_input + compute_size;
char *nram_alpha_t = nram_pt + compute_size;
char *nram_temp = nram_alpha_t + compute_size;
char *nram_output = nram_temp + compute_size;
char *nram_target = nram_output + compute_size;
float *nram_flt_min = NULL;
float *nram_gamma = NULL;
T *nram_weight = NULL;
if (!has_weight) {
nram_flt_min = (float *)(nram_buffer + MAX_NRAM_SIZE -
nram_reserved_line_num * NFU_ALIGN_SIZE);
nram_gamma = nram_flt_min + nfu_align_num;
} else {
int32_t weight_space = PAD_UP(total_c * sizeof(T), NFU_ALIGN_SIZE);
nram_flt_min =
(float *)(nram_buffer + MAX_NRAM_SIZE -
nram_reserved_line_num * NFU_ALIGN_SIZE - weight_space);
nram_gamma = nram_flt_min + nfu_align_num;
nram_weight = (T *)(nram_gamma + nfu_align_num);
__memcpy_async(nram_weight, weight, total_c * sizeof(T), GDRAM2NRAM);
}
// nram set gamma and FLT_MIN
__nramset(nram_gamma, nfu_align_num, gamma);
__nramset(nram_flt_min, nfu_align_num, FLT_MIN);
/*
* Pipeline: The pipeline is processed in three stages: Load, Compute, Store.
* The allocated memory space of NRAM is divided into two parts:
 * PING and PONG. In a single time slice, PING is used to process
 * the IO stream while PONG is used for computation; the two halves are
 * swapped and synchronized at the end of every slice.
*
* diagram of PINGPONG:
* |------|-----------------------------------------------------------------|
* | | space |
* |------|-----------------------------------------------------------------|
* | time | Ping | Pong | Ping | Pong | Ping | Pong |
* |------|-----------------------------------------------------------------|
* | 0 | L0 | | | | | |
* | 1 | C0 | L1 | | | | |
* | 2 | S0 | C1 | L2 | | | |
* | 3 | | S1 | C2 | L3 | | |
* | 4 | | | S2 | C3 | L4 | |
* | 5 | | | | S3 | C4 | L5 |
* | 6 | | | | | S4 | C5 |
* | 7 | | | | | | S5 |
* |------|-----------------------------------------------------------------|
*/
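  // Unlike the forward pass, the buffer half here is selected by an
  // explicit PING/PONG flag plus ping_pong_offset; the loop parity (i % 2)
  // decides which half is stored/loaded while the opposite half is
  // computed.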
// diagram of PINGPONG: L0
if (loop_per_core > 0) {
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
deal_n, total_c, PING, has_weight, ping_pong_offset, 0);
__asm__ volatile("sync;");
}
// diagram of PINGPONG: C0 and L1
if (loop_per_core > 1) {
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PING, ping_pong_offset,
has_weight);
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
deal_n, total_c, PONG, has_weight, ping_pong_offset, deal_num);
__asm__ volatile("sync;");
}
for (int i = 0; i < loop_per_core - 2; ++i) {
if (i % 2 == PING) {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PING,
has_weight, ping_pong_offset, i * deal_num);
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PONG, ping_pong_offset,
has_weight);
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
deal_n, total_c, PING, has_weight, ping_pong_offset,
(i + 2) * deal_num);
} else {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG,
has_weight, ping_pong_offset, i * deal_num);
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PING, ping_pong_offset,
has_weight);
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
deal_n, total_c, PONG, has_weight, ping_pong_offset,
(i + 2) * deal_num);
}
__asm__ volatile("sync;");
}
if (loop_per_core > 1) {
if ((loop_per_core - 2) % 2 == PING) {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PING,
has_weight, ping_pong_offset, (loop_per_core - 2) * deal_num);
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PONG, ping_pong_offset,
has_weight);
} else {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG,
has_weight, ping_pong_offset, (loop_per_core - 2) * deal_num);
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PING, ping_pong_offset,
has_weight);
}
__asm__ volatile("sync;");
}
if (loop_per_core > 0) {
if (loop_per_core == 1) {
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PING, ping_pong_offset,
has_weight);
__asm__ volatile("sync;");
}
if ((loop_per_core - 1) % 2 == PING) {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PING,
has_weight, ping_pong_offset, (loop_per_core - 1) * deal_num);
} else {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG,
has_weight, ping_pong_offset, (loop_per_core - 1) * deal_num);
}
}
// process the remaining rows whose per-core count is less than deal_n
int32_t rem_for_all = total_num - num_per_core * taskDim;
if (rem_for_all == 0) return;
int32_t rem_n_for_all = rem_for_all / total_c;
int32_t rem_n_per_core = (rem_n_for_all + taskDim - 1) / taskDim;
int32_t rem_num_per_core = rem_n_per_core * total_c;
int32_t rem_num_per_core_align = 0;
int32_t rem_core_num = rem_for_all / rem_num_per_core;
int32_t rem_n_for_last = rem_n_for_all % rem_n_per_core;
int32_t rem_num_for_last = rem_n_for_last * total_c;
int32_t rem_num_for_last_align = 0;
if (has_weight) {
int32_t align_c = PAD_UP(total_c, compute_align_num);
rem_num_per_core_align = rem_n_per_core * align_c;
rem_num_for_last_align = rem_n_for_last * align_c;
} else {
rem_num_per_core_align = PAD_UP(rem_num_per_core, compute_align_num);
rem_num_for_last_align = PAD_UP(rem_num_for_last, compute_align_num);
}
int32_t rem_addr_base = num_per_core * taskDim;
int32_t rem_target_addr_base = loop_per_core * deal_n * taskDim;
base_addr_target = (int32_t *)target + rem_target_addr_base;
base_addr_input = (T *)input + rem_addr_base;
base_addr_output = output + rem_addr_base;
if (taskId < rem_core_num) {
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
rem_n_per_core, total_c, PING, has_weight, ping_pong_offset,
taskId * rem_num_per_core);
__asm__ volatile("sync;");
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
rem_num_per_core_align, rem_n_per_core, total_c, PING,
ping_pong_offset, has_weight);
__asm__ volatile("sync;");
storeOutput(base_addr_output, nram_output, rem_n_per_core, total_c, PING,
has_weight, ping_pong_offset, taskId * rem_num_per_core);
} else if (taskId == rem_core_num) {
if (rem_num_for_last == 0) return;
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
rem_n_for_last, total_c, PING, has_weight, ping_pong_offset,
taskId * rem_num_per_core);
__asm__ volatile("sync;");
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
rem_num_for_last_align, rem_n_for_last, total_c, PING,
ping_pong_offset, has_weight);
__asm__ volatile("sync;");
storeOutput(base_addr_output, nram_output, rem_n_for_last, total_c, PING,
has_weight, ping_pong_offset, taskId * rem_num_per_core);
} else {
return;
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelFocalLossSigmoidBackward(
const void *input, const void *target, const void *weight,
const float gamma, const float alpha, const int32_t total_n,
const int32_t deal_n, const int32_t total_c, void *output) {
focalLossSigmoidBackwardBlock((T *)input, (int32_t *)target, (T *)weight,
gamma, alpha, total_n, deal_n, total_c,
(T *)output);
}
} // namespace backward
void KernelFocalLossSigmoidForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const void *input, const void *target,
const void *weight, const int32_t N,
const int32_t C, const float alpha,
const float gamma, void *output) {
if (d_type == CNRT_FLOAT16) {
forward::MLUUnion1KernelFocalLossSigmoidForward<
half><<<k_dim, k_type, queue>>>(input, target, weight, N, C, alpha,
gamma, output);
} else {
forward::MLUUnion1KernelFocalLossSigmoidForward<
float><<<k_dim, k_type, queue>>>(input, target, weight, N, C, alpha,
gamma, output);
}
}
void KernelFocalLossSigmoidBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const void *input, const void *target,
const void *weight, const float gamma,
const float alpha, const int32_t dim_n,
const int32_t deal_n, const int32_t dim_c,
void *output) {
if (d_type == CNRT_FLOAT16) {
backward::MLUUnion1KernelFocalLossSigmoidBackward<
half><<<k_dim, k_type, queue>>>(input, target, weight, gamma, alpha,
dim_n, deal_n, dim_c, output);
} else {
backward::MLUUnion1KernelFocalLossSigmoidBackward<
float><<<k_dim, k_type, queue>>>(input, target, weight, gamma, alpha,
dim_n, deal_n, dim_c, output);
}
}
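// --- Usage sketch (not from the original file; illustrative values, with
// --- `queue` and device pointers assumed to be prepared by the caller):
//   cnrtDim3_t dim = {4, 1, 1};
//   KernelFocalLossSigmoidForward(dim, CNRT_FUNC_TYPE_UNION1, queue,
//                                 CNRT_FLOAT32, input, target,
//                                 /*weight=*/NULL, N, C, /*alpha=*/0.25f,
//                                 /*gamma=*/2.0f, output);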
mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include "iou3d_utils.hpp"
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
/* NRAM buffer
* Suppose we deal with N boxes at a time.
----------------------------------------------------------------
| Basic |score (1N)+ |intersect_pts(48N)| |
| |valid_box(1N) |+ ordered_pts(48N)| temp_long(72N) |
| |+ temp_buffer(10N)| | |
|--------------------------|------------------|----------------|
| Reuse | null | null |rotated_pts(16N)|
|-------|------------------|------------------|----------------|
---------------------------------------------------------------------------
| Basic | dist_ram(24N) | valid_pts(24N) |box1(5N) |box1_buffer(5KB) |
| | |+ nums_in_ram(1N)|+ box2(5N)|+nram_save(5KB) |
|--------------------------|-----------------|----------|-----------------|
| Reuse | vec_buffer(5N) | null | null | null |
|-------|------------------|-----------------|----------|-----------------|
Total Basic Memory Size = 239N * sizeof(float) + 10KB
*/
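// Cross-check of the 239N figure from the layout above: score(1) +
// valid_box(1) + temp_buffer(10) + intersect_pts(48) + ordered_pts(48) +
// temp_long(72) + dist_ram(24) + valid_pts(24) + nums_in_ram(1) + box1(5) +
// box2(5) = 239 floats per box; the extra 10KB covers the fixed
// box1_buffer and nram_save areas.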
__nram__ char nram_buffer[MAX_NRAM_SIZE];
__mlu_shared__ char sram_buffer[SIZE_SRAM_BUF];
template <typename T>
__mlu_func__ void iou3D_detection(int32_t &result_box_num, int32_t *output_data,
const T *boxes_data, float *scores_data,
const int core_limit, const int input_box_num,
const float iou_threshold,
mluMemcpyDirection_t scores_load_dir,
mluMemcpyDirection_t scores_store_dir,
mluMemcpyDirection_t boxes_load_dir) {
  // NRAM is divided into per-box slots, counted in bytes (see copies_of_nram below)
const int nram_save_limit_count = 256;
int box_read_limit_count = 256;
float div_thresh_iou = 1.0 / iou_threshold;
  // every box requires 239 * sizeof(float) bytes in nram;
const int32_t copies_of_nram = 239 * sizeof(float);
const int32_t limit = (MAX_NRAM_SIZE - 5 * box_read_limit_count * sizeof(T) -
nram_save_limit_count * sizeof(int32_t)) /
copies_of_nram;
// x,y,z,dx,dy,dz,angle
const T *input_x_ptr = boxes_data;
const T *input_y_ptr = input_x_ptr + input_box_num;
const T *input_dx_ptr = input_y_ptr + 2 * input_box_num;
const T *input_dy_ptr = input_dx_ptr + input_box_num;
const T *input_angle_ptr = input_dy_ptr + 2 * input_box_num;
float *input_score_ptr = scores_data;
// data split
int avg_cluster = 0;
int rem_cluster = 0;
int len_cluster = 0;
int cluster_offset = 0;
if (clusterDim > 0) {
// union
avg_cluster = input_box_num / clusterDim;
rem_cluster = input_box_num % clusterDim;
len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0);
cluster_offset = avg_cluster * clusterId +
(clusterId <= rem_cluster ? clusterId : rem_cluster);
} else {
// block
len_cluster = input_box_num;
cluster_offset = 0;
}
int len_core = input_box_num;
int input_offset = 0;
if (core_limit > 1) {
int avg_core = len_cluster / coreDim;
int rem_core = len_cluster % coreDim;
len_core = avg_core + (coreId < rem_core ? 1 : 0);
int core_offset =
avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
input_offset = cluster_offset + core_offset;
}
int32_t max_seg_pad = IOU3D_DOWN(limit, IOU3D_SIZE);
int repeat_iou_compute = len_core / max_seg_pad;
int remain_iou_compute = len_core % max_seg_pad;
// basic consistent memory layout
void *score = ((char *)nram_buffer);
void *valid_box = ((char *)score) + 1 * max_seg_pad * sizeof(float);
void *temp_buffer = ((char *)valid_box) + 1 * max_seg_pad * sizeof(float);
void *intersect_pts_x =
((char *)temp_buffer) + 10 * max_seg_pad * sizeof(float);
void *intersect_pts_y =
((char *)intersect_pts_x) + 24 * max_seg_pad * sizeof(float);
void *ordered_pts_x =
((char *)intersect_pts_y) + 24 * max_seg_pad * sizeof(float);
void *ordered_pts_y =
((char *)ordered_pts_x) + 24 * max_seg_pad * sizeof(float);
void *temp_long_1 =
((char *)ordered_pts_y) + 24 * max_seg_pad * sizeof(float);
void *temp_long_2 = ((char *)temp_long_1) + 24 * max_seg_pad * sizeof(float);
void *temp_long_3 = ((char *)temp_long_2) + 24 * max_seg_pad * sizeof(float);
void *dist_ram = ((char *)temp_long_3) + 24 * max_seg_pad * sizeof(float);
void *valid_pts = ((char *)dist_ram) + 24 * max_seg_pad * sizeof(float);
void *nums_in_ram = ((char *)valid_pts) + 24 * max_seg_pad * sizeof(float);
T *box1 = (T *)(((char *)nums_in_ram) + 1 * max_seg_pad * sizeof(float));
T *box2 = (T *)(((char *)box1) + 5 * max_seg_pad * sizeof(float));
void *box1_buffer = ((char *)box2) + 5 * max_seg_pad * sizeof(float);
int32_t *nram_save =
(int32_t *)(((char *)box1_buffer) + 5 * box_read_limit_count * sizeof(T));
// nram_save ~ nram_save_limit_count * sizeof(int32_t)
int nram_save_count = 0;
// reuse memory
void *rotated_pts1_x = ((char *)dist_ram);
void *rotated_pts1_y =
((char *)rotated_pts1_x) + 4 * max_seg_pad * sizeof(float);
void *rotated_pts2_x =
((char *)rotated_pts1_y) + 4 * max_seg_pad * sizeof(float);
void *rotated_pts2_y =
((char *)rotated_pts2_x) + 4 * max_seg_pad * sizeof(float);
void *vec_buffer = ((char *)temp_long_1) + 5 * max_seg_pad * sizeof(float);
// vec_buffer ~ 16 * max_seg_pad * sizeof(float)
  // First, initialize the ram with all 0; otherwise nan/inf may cause unexpected results
__bang_write_zero((unsigned char *)nram_buffer, copies_of_nram * max_seg_pad);
  // the shift by 8 and mask 0xff rely on box_read_limit_count being initialized to 256
const int max_box_seg_id = (input_box_num - 1) >> 8;
const int last_rem_box_number = ((input_box_num - 1) & 0xff) + 1;
for (int32_t cur_box = 0; cur_box < input_box_num; ++cur_box) {
__sync_all();
int box_seg_id = cur_box >> 8, box_id = cur_box & 0xff;
box_read_limit_count = box_seg_id == max_box_seg_id ? last_rem_box_number
: box_read_limit_count;
if (box_id == 0) {
// x,y,z,dx,dy,dz,angle
int offset_num = box_seg_id << 8;
// x
__memcpy((char *)box1_buffer, input_x_ptr + offset_num,
box_read_limit_count * 1 * sizeof(T), boxes_load_dir,
box_read_limit_count * 1 * sizeof(T),
box_read_limit_count * 1 * sizeof(T), 0);
// y
__memcpy((char *)box1_buffer + box_read_limit_count * 1 * sizeof(T),
input_y_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
box_read_limit_count * 1 * sizeof(T), 0);
// dx
__memcpy((char *)box1_buffer + box_read_limit_count * 2 * sizeof(T),
input_dx_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
box_read_limit_count * 1 * sizeof(T), 0);
// dy
__memcpy((char *)box1_buffer + box_read_limit_count * 3 * sizeof(T),
input_dy_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
box_read_limit_count * 1 * sizeof(T), 0);
// angle
__memcpy((char *)box1_buffer + box_read_limit_count * 4 * sizeof(T),
input_angle_ptr + offset_num,
box_read_limit_count * 1 * sizeof(T), boxes_load_dir,
box_read_limit_count * 1 * sizeof(T),
box_read_limit_count * 1 * sizeof(T), 0);
}
if (((float *)input_score_ptr)[cur_box] == 0) {
continue;
}
// save result
nram_save[nram_save_count] = cur_box;
result_box_num++;
nram_save_count++;
if (clusterId == 0 && coreId == 0 &&
nram_save_count == nram_save_limit_count) {
pvLock();
__memcpy(output_data, nram_save, nram_save_count * sizeof(int32_t),
NRAM2GDRAM);
pvUnlock();
output_data += nram_save_count;
nram_save_count = 0;
}
// prepare box1
// x
__bang_write_value((float *)box1, max_seg_pad,
float(((T *)box1_buffer)[box_id]));
// y
__bang_write_value(
(float *)box1 + max_seg_pad, max_seg_pad,
float(((T *)box1_buffer)[box_id + 1 * box_read_limit_count]));
// dx
__bang_write_value(
(float *)box1 + max_seg_pad * 2, max_seg_pad,
float(((T *)box1_buffer)[box_id + 2 * box_read_limit_count]));
// dy
__bang_write_value(
(float *)box1 + max_seg_pad * 3, max_seg_pad,
float(((T *)box1_buffer)[box_id + 3 * box_read_limit_count]));
// angle
__bang_write_value(
(float *)box1 + max_seg_pad * 4, max_seg_pad,
float(((T *)box1_buffer)[box_id + 4 * box_read_limit_count]));
float max_area = 1.0f *
((T *)box1_buffer)[box_id + 2 * box_read_limit_count] *
((T *)box1_buffer)[box_id + 3 * box_read_limit_count];
// update score
for (int i = 0; i <= repeat_iou_compute; i++) {
if (i == repeat_iou_compute && remain_iou_compute == 0) {
break;
}
int seg_len = max_seg_pad;
int cpy_len =
(i == repeat_iou_compute) ? remain_iou_compute : max_seg_pad;
// int half_offset = std::is_same<T, half>::value ? max_seg_pad * 5 : 0;
int half_offset = (sizeof(T) == sizeof(half)) ? max_seg_pad * 5 : 0;
// score
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(float), scores_load_dir,
cpy_len * sizeof(float), cpy_len * sizeof(float), 0);
// x
__memcpy(box2 + half_offset, input_x_ptr + input_offset + i * max_seg_pad,
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
cpy_len * 1 * sizeof(T), 0);
// y
__memcpy(box2 + half_offset + seg_len * 1,
input_y_ptr + input_offset + i * max_seg_pad,
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
cpy_len * 1 * sizeof(T), 0);
// dx
__memcpy(box2 + half_offset + seg_len * 2,
input_dx_ptr + input_offset + i * max_seg_pad,
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
cpy_len * 1 * sizeof(T), 0);
// dy
__memcpy(box2 + half_offset + seg_len * 3,
input_dy_ptr + input_offset + i * max_seg_pad,
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
cpy_len * 1 * sizeof(T), 0);
// angle
__memcpy(box2 + half_offset + seg_len * 4,
input_angle_ptr + input_offset + i * max_seg_pad,
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
cpy_len * 1 * sizeof(T), 0);
// if (std::is_same<T, half>::value) {
if (sizeof(T) == sizeof(half)) {
__bang_half2float((float *)box2, (half *)(box2 + half_offset),
seg_len * 5);
}
// Calculate rotated vertices
void *temp1_ram = ((char *)temp_buffer);
void *temp2_ram = ((char *)temp_buffer) + seg_len * sizeof(float);
void *temp3_ram = ((char *)temp_buffer) + 2 * seg_len * sizeof(float);
void *temp4_ram = ((char *)temp_buffer) + 3 * seg_len * sizeof(float);
getRotatedVertices((float *)rotated_pts1_x, (float *)rotated_pts1_y,
(float *)box1, (float *)temp1_ram, (float *)temp2_ram,
(float *)temp3_ram, (float *)temp4_ram, seg_len);
getRotatedVertices((float *)rotated_pts2_x, (float *)rotated_pts2_y,
(float *)box2, (float *)temp1_ram, (float *)temp2_ram,
(float *)temp3_ram, (float *)temp4_ram, seg_len);
__bang_write_zero((float *)valid_pts, 24 * seg_len);
__bang_write_zero((float *)nums_in_ram, seg_len);
__bang_write_value(((float *)valid_box), seg_len, 1.0f);
void *vec1_x = ((char *)vec_buffer);
void *vec1_y = ((char *)vec1_x) + 4 * seg_len * sizeof(float);
void *vec2_x = ((char *)vec1_y) + 4 * seg_len * sizeof(float);
void *vec2_y = ((char *)vec2_x) + 4 * seg_len * sizeof(float);
void *temp5_ram = ((char *)temp_buffer) + 4 * seg_len * sizeof(float);
void *temp6_ram = ((char *)temp_buffer) + 5 * seg_len * sizeof(float);
void *temp7_ram = ((char *)temp_buffer) + 6 * seg_len * sizeof(float);
void *temp8_ram = ((char *)temp_buffer) + 7 * seg_len * sizeof(float);
void *temp9_ram = ((char *)temp_buffer) + 8 * seg_len * sizeof(float);
void *temp10_ram = ((char *)temp_buffer) + 9 * seg_len * sizeof(float);
// Get all intersection points
getIntersectPts(
(float *)rotated_pts1_x, (float *)rotated_pts1_y,
(float *)rotated_pts2_x, (float *)rotated_pts2_y, (float *)vec1_x,
(float *)vec1_y, (float *)vec2_x, (float *)vec2_y,
(float *)intersect_pts_x, (float *)intersect_pts_y,
(float *)valid_pts, (float *)nums_in_ram, (float *)temp1_ram,
(float *)temp2_ram, (float *)temp3_ram, (float *)temp4_ram,
(float *)temp5_ram, (float *)temp6_ram, (float *)temp7_ram,
(float *)temp8_ram, (float *)temp9_ram, (float *)temp10_ram, seg_len);
// Where nums_in <= 2, set valid_box to false
__bang_write_value((float *)temp9_ram, COMPUTE_COUNT_ALIGN, (float)2);
__bang_cycle_gt((float *)temp1_ram, (float *)nums_in_ram,
(float *)temp9_ram, seg_len, COMPUTE_COUNT_ALIGN);
__bang_and((float *)valid_box, (float *)valid_box, (float *)temp1_ram,
seg_len);
__bang_cycle_and((float *)valid_pts, (float *)valid_pts,
(float *)valid_box, 24 * seg_len, seg_len);
// Convex-hull-graham to order the intersection points in clockwise order
// and find the contour area
convexHullGraham(
(float *)intersect_pts_x, (float *)intersect_pts_y,
(float *)ordered_pts_x, (float *)ordered_pts_y, (float *)dist_ram,
(float *)valid_box, (float *)valid_pts, (float *)nums_in_ram,
(float *)temp7_ram, (float *)temp8_ram, (float *)temp9_ram,
(float *)temp_long_1, (float *)temp_long_2, (float *)temp_long_3,
seg_len, seg_len);
// Calculate polygon area
// set temp1 = intersection part area
polygonArea((float *)ordered_pts_x, (float *)ordered_pts_y,
(float *)valid_box, (float *)valid_pts, (float *)nums_in_ram,
(float *)temp1_ram, (float *)temp2_ram, (float *)temp3_ram,
(float *)temp4_ram, (float *)temp5_ram, (float *)temp6_ram,
(float *)temp7_ram, (float *)temp8_ram, (float *)temp9_ram,
seg_len);
// area
__bang_mul((float *)temp2_ram, (float *)box2 + seg_len * 2,
(float *)box2 + seg_len * 3, seg_len);
// get the area_U: area + max_area - area_I
__bang_add_scalar((float *)temp2_ram, (float *)temp2_ram, float(max_area),
seg_len);
__bang_sub((float *)temp2_ram, (float *)temp2_ram, (float *)temp1_ram,
seg_len); // area_U
if (iou_threshold > 0.0) {
__bang_mul_scalar((float *)temp1_ram, (float *)temp1_ram,
div_thresh_iou, seg_len);
} else {
__bang_mul_scalar((float *)temp2_ram, (float *)temp2_ram, iou_threshold,
seg_len);
}
__bang_ge((float *)temp1_ram, (float *)temp2_ram, (float *)temp1_ram,
seg_len);
__bang_mul((float *)score, (float *)score, (float *)temp1_ram, seg_len);
pvLock();
__memcpy(input_score_ptr + input_offset + i * max_seg_pad, score,
cpy_len * sizeof(float), scores_store_dir,
cpy_len * sizeof(float), cpy_len * sizeof(float), 0);
pvUnlock();
}
}
if (clusterId == 0 && coreId == 0 && nram_save_count) {
pvLock();
__memcpy(output_data, nram_save, nram_save_count * sizeof(int32_t),
NRAM2GDRAM);
pvUnlock();
}
}
__mlu_global__ void MLUBlockorUnionIKernelOU3D(
const void *input_boxes, const int input_box_num, const float iou_threshold,
const cnrtDataType_t data_type_input, void *workspace, void *result_num,
void *output) {
int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
mluMemcpyDirection_t scores_load_dir = GDRAM2NRAM;
mluMemcpyDirection_t scores_store_dir = NRAM2GDRAM;
mluMemcpyDirection_t boxes_load_dir = GDRAM2NRAM;
float *scores_data = (float *)workspace;
float *boxes_data = (float *)input_boxes;
const int cluster_score_size = input_box_num * sizeof(float);
const int cluster_boxes_size = input_box_num * 7 * input_dwidth;
char *sram_score = (char *)sram_buffer;
char *sram_boxes = (char *)sram_buffer + cluster_score_size;
if (clusterDim == 1 && SIZE_SRAM_BUF > cluster_score_size) {
scores_data = (float *)sram_score;
scores_load_dir = SRAM2NRAM;
scores_store_dir = NRAM2SRAM;
if (coreId == 0x80) {
__sramset((void *)sram_buffer, input_box_num, 1.0f);
}
} else {
if (coreId == 0) {
__gdramset(scores_data, input_box_num, 1.0f);
}
}
if (clusterDim == 1 &&
SIZE_SRAM_BUF - cluster_score_size >= cluster_boxes_size) {
boxes_load_dir = SRAM2NRAM;
boxes_data = (float *)sram_boxes;
if (coreId == 0x80) {
__memcpy((char *)boxes_data, (char *)input_boxes, cluster_boxes_size,
GDRAM2SRAM);
}
}
__sync_cluster();
int32_t result_box_num = 0;
int32_t *out_data = (int32_t *)output;
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
iou3D_detection(result_box_num, out_data, (half *)boxes_data, scores_data,
taskDim, input_box_num, iou_threshold, scores_load_dir,
scores_store_dir, boxes_load_dir);
}; break;
case CNRT_FLOAT32: {
iou3D_detection(result_box_num, out_data, boxes_data, scores_data,
taskDim, input_box_num, iou_threshold, scores_load_dir,
scores_store_dir, boxes_load_dir);
}; break;
}
((int32_t *)result_num)[0] = result_box_num;
}
void KernelIou3d(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_dram,
const int input_box_num, const float iou_threshold,
void *workspace, void *output_size, void *output) {
switch (k_type) {
default: { return; }
case CNRT_FUNC_TYPE_BLOCK:
case CNRT_FUNC_TYPE_UNION1:
case CNRT_FUNC_TYPE_UNION2:
case CNRT_FUNC_TYPE_UNION4:
case CNRT_FUNC_TYPE_UNION8:
case CNRT_FUNC_TYPE_UNION16: {
MLUBlockorUnionIKernelOU3D<<<k_dim, k_type, queue>>>(
(void *)boxes_dram, input_box_num, iou_threshold, data_type_input,
workspace, output_size, output);
}; break;
}
}
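// --- Usage sketch (not from the original file; illustrative values):
// --- `workspace` must hold at least input_box_num floats, since the kernel
// --- rescores boxes in place there; kept indices land in `output` and
// --- their count in `output_size`.
//   cnrtDim3_t dim = {4, 1, 1};
//   KernelIou3d(dim, CNRT_FUNC_TYPE_UNION1, queue, CNRT_FLOAT32, boxes_dram,
//               input_box_num, /*iou_threshold=*/0.5f, workspace,
//               output_size, output);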
mmcv/ops/csrc/common/mlu/iou3d_utils.hpp
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef IOU3D_UTILS_HPP_
#define IOU3D_UTILS_HPP_
#include "common_mlu_helper.hpp"
#define IOU3D_SIZE 64
#define IOU3D_UP(x, y) (x / y + (int)(x % y > 0)) * y
#define IOU3D_DOWN(x, y) (x / y) * y
#define SIZE_NRAM_BUF (MAX_NRAM_SIZE)
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
#define COMPUTE_COUNT_ALIGN 64
#define INFO_NUM (5) // score, x1, y1, x2, y2
#define REDUCE_NUM \
(7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
#define SINGLE_BOX_DIM 5
#define MEMORY_CORE (0x80)
__mlu_func__
void
pvLock
()
{
#if __BANG_ARCH__ == 270
if
(
coreId
!=
MEMORY_CORE
)
{
__bang_lock
(
0
,
0
);
}
#endif
}
__mlu_func__
void
pvUnlock
()
{
#if __BANG_ARCH__ == 270
if
(
coreId
!=
MEMORY_CORE
)
{
__bang_unlock
(
0
,
0
);
}
#endif
}
// cross2d<T>(A, B) = A.x * B.y - A.y * B.x;
template
<
typename
T
>
inline
__mlu_func__
void
cross2d
(
T
*
result
,
const
T
*
p1_x
,
const
T
*
p1_y
,
const
T
*
p2_x
,
const
T
*
p2_y
,
const
int
&
length
,
T
*
temp_ram
)
{
__bang_mul
((
T
*
)
temp_ram
,
(
T
*
)
p1_x
,
(
T
*
)
p2_y
,
length
);
__bang_mul
((
T
*
)
result
,
(
T
*
)
p1_y
,
(
T
*
)
p2_x
,
length
);
__bang_sub
((
T
*
)
result
,
(
T
*
)
temp_ram
,
(
T
*
)
result
,
length
);
}
// dot2d<T>(A, B) = A.x * B.x + A.y * B.y
template
<
typename
T
>
inline
__mlu_func__
void
dot2d
(
T
*
result
,
const
T
*
p1_x
,
const
T
*
p1_y
,
const
T
*
p2_x
,
const
T
*
p2_y
,
const
int
&
length
,
T
*
temp_ram
)
{
__bang_mul
((
T
*
)
temp_ram
,
(
T
*
)
p1_x
,
(
T
*
)
p2_x
,
length
);
__bang_mul
((
T
*
)
result
,
(
T
*
)
p1_y
,
(
T
*
)
p2_y
,
length
);
__bang_add
((
T
*
)
result
,
(
T
*
)
temp_ram
,
(
T
*
)
result
,
length
);
}
template <typename T>
__mlu_func__ void getRotatedVertices(T *pts_x, T *pts_y, T *box, T *temp1,
                                     T *temp2, T *temp3, T *temp4,
                                     const uint32_t &actual_compute_box_num) {
  // T cosTheta2 = (T)cos(theta) * 0.5f; -- temp1
  // T sinTheta2 = (T)sin(theta) * 0.5f; -- temp2
  // theta is the box's 5th data: a, rotated radian;
#if __BANG_ARCH__ >= 300
  __bang_cos((float *)temp1, ((float *)box) + 4 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_sin((float *)temp2, ((float *)box) + 4 * actual_compute_box_num,
             actual_compute_box_num);
#else
  __bang_taylor4_cos((T *)temp1, ((T *)box) + 4 * actual_compute_box_num,
                     (T *)temp3, (T *)temp4, actual_compute_box_num);
  __bang_taylor4_sin((T *)temp2, ((T *)box) + 4 * actual_compute_box_num,
                     (T *)temp3, (T *)temp4, actual_compute_box_num);
#endif
  __bang_mul_scalar((T *)temp1, (T *)temp1, (T)0.5, actual_compute_box_num);
  __bang_mul_scalar((T *)temp2, (T *)temp2, (T)0.5, actual_compute_box_num);

  // Temp3 = sinTheta2 * box.h;
  // Temp4 = cosTheta2 * box.w;
  __bang_mul((T *)temp3, (T *)temp2, ((T *)box) + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_mul((T *)temp4, (T *)temp1, ((T *)box) + 2 * actual_compute_box_num,
             actual_compute_box_num);

  // pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
  // pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
  __bang_sub((T *)pts_x, (T *)box, (T *)temp3, actual_compute_box_num);
  __bang_sub((T *)pts_x, (T *)pts_x, (T *)temp4, actual_compute_box_num);
  __bang_add((T *)pts_x + 1 * actual_compute_box_num, (T *)box, (T *)temp3,
             actual_compute_box_num);
  __bang_sub((T *)pts_x + 1 * actual_compute_box_num,
             (T *)pts_x + 1 * actual_compute_box_num, (T *)temp4,
             actual_compute_box_num);

  // Temp3 = cosTheta2 * box.h;
  // Temp4 = sinTheta2 * box.w;
  __bang_mul((T *)temp3, (T *)temp1, box + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_mul((T *)temp4, (T *)temp2, box + 2 * actual_compute_box_num,
             actual_compute_box_num);

  // pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
  // pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
  __bang_add((T *)pts_y, (T *)box + 1 * actual_compute_box_num, (T *)temp3,
             actual_compute_box_num);
  __bang_sub((T *)pts_y, (T *)pts_y, (T *)temp4, actual_compute_box_num);
  __bang_sub((T *)pts_y + 1 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num, (T *)temp3,
             actual_compute_box_num);
  __bang_sub((T *)pts_y + 1 * actual_compute_box_num,
             (T *)pts_y + 1 * actual_compute_box_num, (T *)temp4,
             actual_compute_box_num);

  // pts[2].x = 2 * box.x_ctr - pts[0].x;
  // pts[3].x = 2 * box.x_ctr - pts[1].x;
  __bang_add((T *)pts_x + 2 * actual_compute_box_num, (T *)box, (T *)box,
             actual_compute_box_num);
  __bang_sub((T *)pts_x + 2 * actual_compute_box_num,
             (T *)pts_x + 2 * actual_compute_box_num, (T *)pts_x,
             actual_compute_box_num);
  __bang_add((T *)pts_x + 3 * actual_compute_box_num, (T *)box, (T *)box,
             actual_compute_box_num);
  __bang_sub((T *)pts_x + 3 * actual_compute_box_num,
             (T *)pts_x + 3 * actual_compute_box_num,
             (T *)pts_x + 1 * actual_compute_box_num, actual_compute_box_num);

  // pts[2].y = 2 * box.y_ctr - pts[0].y;
  // pts[3].y = 2 * box.y_ctr - pts[1].y;
  __bang_add((T *)pts_y + 2 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num, actual_compute_box_num);
  __bang_sub((T *)pts_y + 2 * actual_compute_box_num,
             (T *)pts_y + 2 * actual_compute_box_num, (T *)pts_y,
             actual_compute_box_num);
  __bang_add((T *)pts_y + 3 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num, actual_compute_box_num);
  __bang_sub((T *)pts_y + 3 * actual_compute_box_num,
             (T *)pts_y + 3 * actual_compute_box_num,
             (T *)pts_y + 1 * actual_compute_box_num, actual_compute_box_num);
}
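getRotatedVertices evaluates the four corners of every (x_ctr, y_ctr, w, h, a) box with vectorized BANG ops. A scalar sketch of the same corner formulas (illustrative reference only, assuming the same five-value box layout) is:

#include <cmath>

// Scalar sketch of the corner math in getRotatedVertices above.
// box = {x_ctr, y_ctr, w, h, a}; the corners come out in the same
// pts[0..3] order as the vectorized code.
void rotated_vertices_ref(const float box[5], float pts_x[4], float pts_y[4]) {
  const float cosTheta2 = std::cos(box[4]) * 0.5f;
  const float sinTheta2 = std::sin(box[4]) * 0.5f;
  pts_x[0] = box[0] - sinTheta2 * box[3] - cosTheta2 * box[2];
  pts_y[0] = box[1] + cosTheta2 * box[3] - sinTheta2 * box[2];
  pts_x[1] = box[0] + sinTheta2 * box[3] - cosTheta2 * box[2];
  pts_y[1] = box[1] - cosTheta2 * box[3] - sinTheta2 * box[2];
  // The remaining corners are point reflections through the box center.
  pts_x[2] = 2 * box[0] - pts_x[0];
  pts_y[2] = 2 * box[1] - pts_y[0];
  pts_x[3] = 2 * box[0] - pts_x[1];
  pts_y[3] = 2 * box[1] - pts_y[1];
}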
template <typename T>
__mlu_func__ void getIntersectPts(T *rotated_pts1_x, T *rotated_pts1_y,
                                  T *rotated_pts2_x, T *rotated_pts2_y,
                                  T *vec1_x, T *vec1_y, T *vec2_x, T *vec2_y,
                                  T *intersect_pts_x, T *intersect_pts_y,
                                  T *valid_pts, T *nums_in_ram, T *temp1_ram,
                                  T *temp2_ram, T *temp3_ram, T *temp4_ram,
                                  T *temp5_ram, T *temp6_ram, T *temp7_ram,
                                  T *temp8_ram, T *temp9_ram, T *temp10_ram,
                                  const uint32_t &actual_compute_box_num) {
  // Initialize const data to ram
  // temp3 = const 1e-14(@float), length = COMPUTE_COUNT_ALIGN
#if __BANG_ARCH__ >= 300
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, (T)1e-14);
#else
  // NOTE: Since active_reciphp function has strict value range,
  //       [2.2205e-16, 2e6]@float, [0.00391, 65504]@half
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, (float)1e-14);
#endif
  // temp4 = const T(0), length = COMPUTE_COUNT_ALIGN
  __bang_write_value((T *)temp4_ram, COMPUTE_COUNT_ALIGN, (T)0);
  // temp5 = const T(1), length = COMPUTE_COUNT_ALIGN
  __bang_write_value((T *)temp5_ram, COMPUTE_COUNT_ALIGN, (T)1);

  // Line vector, from p1 to p2 is: p1+(p2-p1)*t, t=[0,1]
  // for i = 0~3, vec[i] = pts[(i+1)%4] - pts[i]
  __bang_sub((T *)vec1_x, (T *)rotated_pts1_x + actual_compute_box_num,
             (T *)rotated_pts1_x, 3 * actual_compute_box_num);
  __bang_sub((T *)vec1_x + 3 * actual_compute_box_num, (T *)rotated_pts1_x,
             (T *)rotated_pts1_x + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_sub((T *)vec1_y, (T *)rotated_pts1_y + actual_compute_box_num,
             (T *)rotated_pts1_y, 3 * actual_compute_box_num);
  __bang_sub((T *)vec1_y + 3 * actual_compute_box_num, (T *)rotated_pts1_y,
             (T *)rotated_pts1_y + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_sub((T *)vec2_x, (T *)rotated_pts2_x + actual_compute_box_num,
             (T *)rotated_pts2_x, 3 * actual_compute_box_num);
  __bang_sub((T *)vec2_x + 3 * actual_compute_box_num, (T *)rotated_pts2_x,
             (T *)rotated_pts2_x + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_sub((T *)vec2_y, (T *)rotated_pts2_y + actual_compute_box_num,
             (T *)rotated_pts2_y, 3 * actual_compute_box_num);
  __bang_sub((T *)vec2_y + 3 * actual_compute_box_num, (T *)rotated_pts2_y,
             (T *)rotated_pts2_y + 3 * actual_compute_box_num,
             actual_compute_box_num);

  // First, line test - test all line combos for intersection, 4x4 possible
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      // T det = cross2d<T>(vec2[j], vec1[i]) -- temp2
      cross2d<T>((T *)temp2_ram, (T *)vec2_x + j * actual_compute_box_num,
                 (T *)vec2_y + j * actual_compute_box_num,
                 (T *)vec1_x + i * actual_compute_box_num,
                 (T *)vec1_y + i * actual_compute_box_num,
                 actual_compute_box_num, (T *)temp1_ram);
      // temp8 = sign(det), since active_reciphp only receives positive values
      __bang_active_sign((T *)temp8_ram, (T *)temp2_ram,
                         actual_compute_box_num);
      // deal with parallel lines, temp2 = fabs(det), temp1 = temp2 > 1e-14
      __bang_active_abs((T *)temp2_ram, (T *)temp2_ram,
                        actual_compute_box_num);
      __bang_cycle_gt((T *)temp1_ram, (T *)temp2_ram, (T *)temp3_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      // Where temp1 = false, set recip input to 1, avoiding recip(0), cause
      // inf
      __bang_not((T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num);
      __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp1_ram,
                 actual_compute_box_num);
      __bang_add((T *)temp2_ram, (T *)temp2_ram, (T *)temp9_ram,
                 actual_compute_box_num);
      // temp2 = 1/temp2, use mult (1/temp2) instead of div temp2
#if __BANG_ARCH__ >= 300
      __bang_recip((float *)temp2_ram, (float *)temp2_ram,
                   actual_compute_box_num);
#else
      // NOTE: active_reciphp function has strict value range:
      //       [2.2205e-16, 2e6]@float, [0.00391, 65504]@half
      __bang_active_reciphp((T *)temp2_ram, (T *)temp2_ram,
                            actual_compute_box_num);
#endif
      // Restore temp2 invalid box value 1 and sign-bit
      __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp1_ram,
                 actual_compute_box_num);
      __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp8_ram,
                 actual_compute_box_num);

      // auto vec12 = pts2[j] - pts1[i], (temp6, temp7) = (x, y)
      __bang_sub((T *)temp6_ram,
                 (T *)rotated_pts2_x + j * actual_compute_box_num,
                 (T *)rotated_pts1_x + i * actual_compute_box_num,
                 actual_compute_box_num);
      __bang_sub((T *)temp7_ram,
                 (T *)rotated_pts2_y + j * actual_compute_box_num,
                 (T *)rotated_pts1_y + i * actual_compute_box_num,
                 actual_compute_box_num);

      // T t1 = cross2d<T>(vec2[j], vec12) mult (1/det) -- temp8
      cross2d<T>((T *)temp8_ram, (T *)vec2_x + j * actual_compute_box_num,
                 (T *)vec2_y + j * actual_compute_box_num, (T *)temp6_ram,
                 (T *)temp7_ram, actual_compute_box_num, (T *)temp9_ram);
      __bang_mul((T *)temp8_ram, (T *)temp8_ram, (T *)temp2_ram,
                 actual_compute_box_num);

      // temp1 &= (t1 >= 0.0f && t1 <= 1.0f) -- temp9
      __bang_cycle_ge((T *)temp9_ram, (T *)temp8_ram, (T *)temp4_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp9_ram,
                 actual_compute_box_num);
      __bang_cycle_le((T *)temp9_ram, (T *)temp8_ram, (T *)temp5_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp9_ram,
                 actual_compute_box_num);

      // T t2 = cross2d<T>(vec1[i], vec12) mult temp2 -- temp9
      // NOTE: temp8(t1) is used after, reuse temp7(p2_y) as cross2d temp ram
      cross2d<T>((T *)temp9_ram, (T *)vec1_x + i * actual_compute_box_num,
                 (T *)vec1_y + i * actual_compute_box_num, (T *)temp6_ram,
                 (T *)temp7_ram, actual_compute_box_num, (T *)temp7_ram);
      __bang_mul((T *)temp9_ram, (T *)temp9_ram, (T *)temp2_ram,
                 actual_compute_box_num);

      // temp1 &= (t2 >= 0.0f && t2 <= 1.0f) -- temp9
      __bang_cycle_ge((T *)temp7_ram, (T *)temp9_ram, (T *)temp4_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp7_ram,
                 actual_compute_box_num);
      __bang_cycle_le((T *)temp7_ram, (T *)temp9_ram, (T *)temp5_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp7_ram,
                 actual_compute_box_num);

      // intersections = (pts1[i] + vec1[i] * t1) * temp1
      __bang_mul((T *)temp9_ram, (T *)vec1_x + i * actual_compute_box_num,
                 (T *)temp8_ram, actual_compute_box_num);
      __bang_add((T *)temp9_ram,
                 (T *)rotated_pts1_x + i * actual_compute_box_num,
                 (T *)temp9_ram, actual_compute_box_num);
      __bang_mul((T *)intersect_pts_x + (4 * i + j) * actual_compute_box_num,
                 (T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num);
      __bang_mul((T *)temp9_ram, (T *)vec1_y + i * actual_compute_box_num,
                 (T *)temp8_ram, actual_compute_box_num);
      __bang_add((T *)temp9_ram,
                 (T *)rotated_pts1_y + i * actual_compute_box_num,
                 (T *)temp9_ram, actual_compute_box_num);
      __bang_mul((T *)intersect_pts_y + (4 * i + j) * actual_compute_box_num,
                 (T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num);

      // Assign `valid_pts` bit and accumulate `nums_in` of valid points of
      // each box pair
      __bang_or((T *)valid_pts + (4 * i + j) * actual_compute_box_num,
                (T *)valid_pts + (4 * i + j) * actual_compute_box_num,
                (T *)temp1_ram, actual_compute_box_num);
      __bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram,
                 actual_compute_box_num);
    }
  }

  // Check for vertices of rect1 inside rect2
  // temp5 = ABdotAB
  dot2d<T>((T *)temp5_ram, (T *)vec2_x, (T *)vec2_y, (T *)vec2_x, (T *)vec2_y,
           actual_compute_box_num, (T *)temp9_ram);
  // temp6 = ADdotAD
  dot2d<T>((T *)temp6_ram, (T *)vec2_x + 3 * actual_compute_box_num,
           (T *)vec2_y + 3 * actual_compute_box_num,
           (T *)vec2_x + 3 * actual_compute_box_num,
           (T *)vec2_y + 3 * actual_compute_box_num, actual_compute_box_num,
           (T *)temp9_ram);
  // assume ABCD is the rectangle, and P is the point to be judged
  // P is inside ABCD iff. P's projection on AB lies within AB
  // and P's projection on AD lies within AD
  for (int i = 0; i < 4; i++) {
    // AP = pts1[i] - pts2[0] = (temp7, temp8)
    __bang_sub((T *)temp7_ram,
               (T *)rotated_pts1_x + i * actual_compute_box_num,
               (T *)rotated_pts2_x, actual_compute_box_num);
    __bang_sub((T *)temp8_ram,
               (T *)rotated_pts1_y + i * actual_compute_box_num,
               (T *)rotated_pts2_y, actual_compute_box_num);

    // temp9 = APdotAB = dot2d<T>(AP, AB)
    dot2d<T>((T *)temp9_ram, (T *)temp7_ram, (T *)temp8_ram, (T *)vec2_x,
             (T *)vec2_y, actual_compute_box_num, (T *)temp2_ram);
    // temp10 = APdotAD = -dot2d<T>(AP, DA)
    dot2d<T>((T *)temp10_ram, (T *)temp7_ram, (T *)temp8_ram,
             (T *)vec2_x + 3 * actual_compute_box_num,
             (T *)vec2_y + 3 * actual_compute_box_num, actual_compute_box_num,
             (T *)temp2_ram);
    __bang_mul_scalar((T *)temp10_ram, (T *)temp10_ram, (T)-1,
                      actual_compute_box_num);

    // ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
    //  (APdotAD <= ADdotAD))
    __bang_cycle_ge((T *)temp1_ram, (T *)temp9_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_cycle_ge((T *)temp2_ram, (T *)temp10_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp9_ram, (T *)temp5_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp10_ram, (T *)temp6_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);

    // 16 means the 4x4 possible intersection points above
    __bang_mul((T *)intersect_pts_x + (16 + i) * actual_compute_box_num,
               (T *)temp1_ram,
               (T *)rotated_pts1_x + i * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)intersect_pts_y + (16 + i) * actual_compute_box_num,
               (T *)temp1_ram,
               (T *)rotated_pts1_y + i * actual_compute_box_num,
               actual_compute_box_num);

    // assign valid_pts bit and accumulate nums of valid points of each box
    // pair
    __bang_or((T *)valid_pts + (16 + i) * actual_compute_box_num,
              (T *)valid_pts + (16 + i) * actual_compute_box_num,
              (T *)temp1_ram, actual_compute_box_num);
    __bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram,
               actual_compute_box_num);
  }

  // Reverse the check - check for vertices of rect2 inside rect1
  // temp5 = ABdotAB
  dot2d<T>((T *)temp5_ram, (T *)vec1_x, (T *)vec1_y, (T *)vec1_x, (T *)vec1_y,
           actual_compute_box_num, (T *)temp9_ram);
  // temp6 = ADdotAD
  dot2d<T>((T *)temp6_ram, (T *)vec1_x + 3 * actual_compute_box_num,
           (T *)vec1_y + 3 * actual_compute_box_num,
           (T *)vec1_x + 3 * actual_compute_box_num,
           (T *)vec1_y + 3 * actual_compute_box_num, actual_compute_box_num,
           (T *)temp9_ram);
  for (int i = 0; i < 4; i++) {
    // AP = pts2[i] - pts1[0] = (temp7, temp8)
    __bang_sub((T *)temp7_ram,
               (T *)rotated_pts2_x + i * actual_compute_box_num,
               (T *)rotated_pts1_x, actual_compute_box_num);
    __bang_sub((T *)temp8_ram,
               (T *)rotated_pts2_y + i * actual_compute_box_num,
               (T *)rotated_pts1_y, actual_compute_box_num);

    // temp9 = APdotAB = dot2d<T>(AP, AB)
    dot2d<T>((T *)temp9_ram, (T *)temp7_ram, (T *)temp8_ram, (T *)vec1_x,
             (T *)vec1_y, actual_compute_box_num, (T *)temp2_ram);
    // temp10 = APdotAD = -dot2d<T>(AP, DA)
    dot2d<T>((T *)temp10_ram, (T *)temp7_ram, (T *)temp8_ram,
             (T *)vec1_x + 3 * actual_compute_box_num,
             (T *)vec1_y + 3 * actual_compute_box_num, actual_compute_box_num,
             (T *)temp2_ram);
    __bang_mul_scalar((T *)temp10_ram, (T *)temp10_ram, (T)-1,
                      actual_compute_box_num);

    // ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
    //  (APdotAD <= ADdotAD))
    __bang_cycle_ge((T *)temp1_ram, (T *)temp9_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_cycle_ge((T *)temp2_ram, (T *)temp10_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp9_ram, (T *)temp5_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp10_ram, (T *)temp6_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);

    // 20 means the (4x4+4) possible intersection points above
    __bang_mul((T *)intersect_pts_x + (20 + i) * actual_compute_box_num,
               (T *)temp1_ram,
               (T *)rotated_pts2_x + i * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)intersect_pts_y + (20 + i) * actual_compute_box_num,
               (T *)temp1_ram,
               (T *)rotated_pts2_y + i * actual_compute_box_num,
               actual_compute_box_num);

    // assign valid_pts bit and accumulate nums of valid points of each box
    // pair
    __bang_or((T *)valid_pts + (20 + i) * actual_compute_box_num,
              (T *)valid_pts + (20 + i) * actual_compute_box_num,
              (T *)temp1_ram, actual_compute_box_num);
    __bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram,
               actual_compute_box_num);
  }
}
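The 4x4 line test above parameterizes each pair of edges as p1 + vec1 * t1 and p2 + vec2 * t2 and keeps the crossing point when both parameters fall in [0, 1]. A scalar version of one such test (an illustrative sketch, not the kernel) looks like:

#include <cmath>

// Scalar sketch of one iteration of the 4x4 line test in getIntersectPts:
// edges are p + vec * t with t in [0, 1]; det = cross2d(vec2, vec1).
// Returns true and writes the intersection when the segments cross.
bool segment_intersect_ref(float p1x, float p1y, float v1x, float v1y,
                           float p2x, float p2y, float v2x, float v2y,
                           float *ix, float *iy) {
  const float det = v2x * v1y - v2y * v1x;
  if (std::fabs(det) <= 1e-14f) return false;  // parallel edges
  const float vec12x = p2x - p1x, vec12y = p2y - p1y;
  const float t1 = (v2x * vec12y - v2y * vec12x) / det;
  const float t2 = (v1x * vec12y - v1y * vec12x) / det;
  if (t1 < 0.f || t1 > 1.f || t2 < 0.f || t2 > 1.f) return false;
  *ix = p1x + v1x * t1;
  *iy = p1y + v1y * t1;
  return true;
}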
template <typename T>
__mlu_func__ void convexHullGraham(
    T *intersect_pts_x, T *intersect_pts_y, T *ordered_pts_x,
    T *ordered_pts_y, T *dist_ram, T *valid_box, T *valid_pts,
    T *nums_in_ram, T *temp1_ram, T *temp2_ram, T *temp3_ram, T *temp_long_1,
    T *temp_long_2, T *temp_long_3, const uint32_t &actual_box_num,
    const uint32_t &actual_compute_box_num) {
  // Step1. Find the point with minimum y; if more than one point has the
  //        same minimum y, pick the one with the minimum x.
  // set p[i].y to max_y_value if not valid_pts, to avoid invalid result
  // 24 means all possible intersection points
  __bang_max((T *)temp2_ram, (T *)intersect_pts_y,
             24 * actual_compute_box_num);
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN,
                     ((T *)temp2_ram)[0]);
  __bang_not((T *)temp_long_1, (T *)valid_pts, 24 * actual_compute_box_num);
  __bang_cycle_mul((T *)temp_long_1, (T *)temp_long_1, (T *)temp3_ram,
                   24 * actual_compute_box_num, COMPUTE_COUNT_ALIGN);
  __bang_mul((T *)temp_long_2, (T *)intersect_pts_y, (T *)valid_pts,
             24 * actual_compute_box_num);
  __bang_add((T *)temp_long_2, (T *)temp_long_2, (T *)temp_long_1,
             24 * actual_compute_box_num);
  // temp2 = min_y_value(temp_long_2), use min_pool, channel=box_num, h=1,
  // w=24
  __bang_minpool((T *)temp2_ram, (T *)temp_long_2, actual_compute_box_num, 1,
                 24, 1, 24, 1, 24);
  __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)valid_box,
             actual_compute_box_num);

  // set p[i].x to max_x_value if not min_y point
  __bang_max((T *)temp1_ram, (T *)intersect_pts_x,
             24 * actual_compute_box_num);
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN,
                     ((T *)temp1_ram)[0]);
  __bang_cycle_eq((T *)temp_long_1, (T *)temp_long_2, (T *)temp2_ram,
                  24 * actual_compute_box_num, actual_compute_box_num);
  __bang_and((T *)temp_long_1, (T *)temp_long_1, (T *)valid_pts,
             24 * actual_compute_box_num);
  __bang_not((T *)temp_long_3, (T *)temp_long_1, 24 * actual_compute_box_num);
  __bang_cycle_mul((T *)temp_long_3, (T *)temp_long_3, (T *)temp3_ram,
                   24 * actual_compute_box_num, COMPUTE_COUNT_ALIGN);
  __bang_mul((T *)temp_long_1, (T *)intersect_pts_x, (T *)temp_long_1,
             24 * actual_compute_box_num);
  __bang_add((T *)temp_long_1, (T *)temp_long_1, (T *)temp_long_3,
             24 * actual_compute_box_num);
  // temp3 = min_x_value(temp_long_1), use min_pool, channel=box_num, h=1,
  // w=24
  __bang_minpool((T *)temp3_ram, (T *)temp_long_1, actual_compute_box_num, 1,
                 24, 1, 24, 1, 24);
  __bang_mul((T *)temp3_ram, (T *)temp3_ram, (T *)valid_box,
             actual_compute_box_num);

  // Step2. All points subtract starting-point (for sorting in the next step)
  __bang_cycle_sub((T *)ordered_pts_x, (T *)intersect_pts_x, (T *)temp3_ram,
                   24 * actual_compute_box_num, actual_compute_box_num);
  __bang_cycle_sub((T *)ordered_pts_y, (T *)intersect_pts_y, (T *)temp2_ram,
                   24 * actual_compute_box_num, actual_compute_box_num);
  __bang_mul((T *)ordered_pts_x, (T *)ordered_pts_x, (T *)valid_pts,
             24 * actual_compute_box_num);
  __bang_mul((T *)ordered_pts_y, (T *)ordered_pts_y, (T *)valid_pts,
             24 * actual_compute_box_num);

  // Step3. Sort every intersection point according to their relative
  //        cross-product values (essentially sorting according to angles)
  //        If the angles are the same, sort according to distance to origin
  dot2d<T>((T *)dist_ram, (T *)ordered_pts_x, (T *)ordered_pts_y,
           (T *)ordered_pts_x, (T *)ordered_pts_y,
           24 * actual_compute_box_num, (T *)temp_long_3);

  T temp, temp_nums_in, temp_dist_1, temp_dist_2;
  T temp1_x, temp1_y;
  T temp2_x, temp2_y;
  for (int i = 0; i < actual_box_num; i++) {
    if (((T *)valid_box)[i]) {
      // make sure all nums_in[i] points are at the front
      for (int ii = 0; ii < 23; ii++) {
        for (int jj = ii + 1; jj < 24; jj++) {
          int ii_index = ii * actual_compute_box_num + i;
          int jj_index = jj * actual_compute_box_num + i;
          // ii point is not valid and jj point is valid, swap jj for ii
          if ((!((T *)valid_pts)[ii_index]) && ((T *)valid_pts)[jj_index]) {
            ((T *)ordered_pts_x)[ii_index] = ((T *)ordered_pts_x)[jj_index];
            ((T *)ordered_pts_y)[ii_index] = ((T *)ordered_pts_y)[jj_index];
            ((T *)dist_ram)[ii_index] = ((T *)dist_ram)[jj_index];
            ((T *)valid_pts)[ii_index] = true;
            ((T *)ordered_pts_x)[jj_index] = 0;
            ((T *)ordered_pts_y)[jj_index] = 0;
            ((T *)dist_ram)[jj_index] = 0;
            ((T *)valid_pts)[jj_index] = false;
            break;
          }
        }
      }
      temp_nums_in = ((T *)nums_in_ram)[i];
      // make original q[0] = min_x, min_y before sort
      for (int ii = 1; ii < temp_nums_in; ii++) {
        int ii_index = ii * actual_compute_box_num + i;
        if (((T *)dist_ram)[ii_index] == 0) {
          // swap q[ii_index] and q[0]
          ((T *)ordered_pts_x)[ii_index] = ((T *)ordered_pts_x)[i];
          ((T *)ordered_pts_y)[ii_index] = ((T *)ordered_pts_y)[i];
          ((T *)dist_ram)[ii_index] = ((T *)dist_ram)[i];
          ((T *)ordered_pts_x)[i] = 0;
          ((T *)ordered_pts_y)[i] = 0;
          ((T *)dist_ram)[i] = 0;
          break;
        }
      }
      for (int ii = 1; ii < temp_nums_in - 1; ii++) {
        for (int jj = ii + 1; jj < temp_nums_in; jj++) {
          int ii_index = ii * actual_compute_box_num + i;
          int jj_index = jj * actual_compute_box_num + i;
          temp1_x = ((T *)ordered_pts_x)[ii_index];
          temp1_y = ((T *)ordered_pts_y)[ii_index];
          temp2_x = ((T *)ordered_pts_x)[jj_index];
          temp2_y = ((T *)ordered_pts_y)[jj_index];
          // calculate cross product and sort q (ordered_pts)
          temp = (temp1_x * temp2_y) - (temp1_y * temp2_x);
          temp_dist_1 = ((T *)dist_ram)[ii_index];
          temp_dist_2 = ((T *)dist_ram)[jj_index];
          if ((temp < (T)-1e-6) ||
              ((fabs(temp) < (T)1e-6) && (temp_dist_1 > temp_dist_2))) {
            ((T *)ordered_pts_x)[ii_index] = temp2_x;
            ((T *)ordered_pts_y)[ii_index] = temp2_y;
            ((T *)ordered_pts_x)[jj_index] = temp1_x;
            ((T *)ordered_pts_y)[jj_index] = temp1_y;
            ((T *)dist_ram)[ii_index] = temp_dist_2;
            ((T *)dist_ram)[jj_index] = temp_dist_1;
          }
        }
      }

      // Step4:
      // Make sure there are at least 2 points (that don't overlap with each
      // other) in the stack
      int k;  // index of the non-overlapped second point
      for (k = 1; k < temp_nums_in; k++) {
        if (((T *)dist_ram)[k * actual_compute_box_num + i] > (T)1e-8) {
          break;
        }
      }
      if (k == temp_nums_in) {
        // We reach the end, which means the convex hull is just one point
        // set valid_box = 0, to get ious = 0
        ((T *)valid_box)[i] = 0;
        continue;
      }
      // q[1] = q[k];
      ((T *)ordered_pts_x)[actual_compute_box_num + i] =
          ((T *)ordered_pts_x)[k * actual_compute_box_num + i];
      ((T *)ordered_pts_y)[actual_compute_box_num + i] =
          ((T *)ordered_pts_y)[k * actual_compute_box_num + i];

      // Step 5:
      // Finally we can start the scanning process.
      // When a non-convex relationship between the 3 points is found
      // (either concave shape or duplicated points),
      // we pop the previous point from the stack
      // until the 3-point relationship is convex again, or
      // until the stack only contains two points
      int m = 2;  // 2 points in the stack
      for (int j = k + 1; j < temp_nums_in; j++) {
        // while (m > 1 && cross2d<T>(q[j] - q[m - 2], q[m - 1] - q[m - 2]) >=
        //        0) {
        //   m--;
        // }
        temp1_x = ((T *)ordered_pts_x)[j * actual_compute_box_num + i] -
                  ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
        temp1_y = ((T *)ordered_pts_y)[j * actual_compute_box_num + i] -
                  ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
        temp2_x = ((T *)ordered_pts_x)[(m - 1) * actual_compute_box_num + i] -
                  ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
        temp2_y = ((T *)ordered_pts_y)[(m - 1) * actual_compute_box_num + i] -
                  ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
        temp = (temp1_x * temp2_y) - (temp1_y * temp2_x);
        while ((m > 1) && (temp >= 0)) {
          m--;
          if (m > 1) {
            temp1_x =
                ((T *)ordered_pts_x)[j * actual_compute_box_num + i] -
                ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
            temp1_y =
                ((T *)ordered_pts_y)[j * actual_compute_box_num + i] -
                ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
            temp2_x =
                ((T *)ordered_pts_x)[(m - 1) * actual_compute_box_num + i] -
                ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
            temp2_y =
                ((T *)ordered_pts_y)[(m - 1) * actual_compute_box_num + i] -
                ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
            temp = (temp1_x * temp2_y) - (temp1_y * temp2_x);
          }
        }
        // q[m++] = q[j];
        ((T *)ordered_pts_x)[m * actual_compute_box_num + i] =
            ((T *)ordered_pts_x)[j * actual_compute_box_num + i];
        ((T *)ordered_pts_y)[m * actual_compute_box_num + i] =
            ((T *)ordered_pts_y)[j * actual_compute_box_num + i];
        m++;
      }
      // set last(24-m) valid_pts to false, to erase invalid q in polygon area
      for (int j = m; j < temp_nums_in; j++) {
        ((T *)valid_pts)[j * actual_compute_box_num + i] = 0;
      }
      ((T *)nums_in_ram)[i] = m;
    }
  }
}
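convexHullGraham sorts the candidate points around the bottom-most start point by cross product (falling back to distance on near-collinear ties) and then pops non-convex turns off the stack. A compact scalar Graham scan over the same ideas (an illustrative sketch using std::sort in place of the in-place bubble sort; it assumes the points were already translated so the start point sits at the origin) could read:

#include <algorithm>
#include <cmath>
#include <vector>

struct Pt { float x, y; };

// Scalar sketch of the scan in convexHullGraham above. q[0] is the start
// point at the origin. Returns the number m of hull points kept at the
// front of q.
int graham_scan_ref(std::vector<Pt> &q) {
  auto cross = [](const Pt &a, const Pt &b) { return a.x * b.y - a.y * b.x; };
  // Sort by angle via cross product; on near-collinear ties, nearer first.
  std::sort(q.begin() + 1, q.end(), [&](const Pt &a, const Pt &b) {
    float c = cross(a, b);
    if (std::fabs(c) > 1e-6f) return c > 0.f;
    return a.x * a.x + a.y * a.y < b.x * b.x + b.y * b.y;
  });
  int m = 2;  // q[0] and q[1] always stay on the stack
  for (int j = 2; j < (int)q.size(); ++j) {
    // Pop while the turn q[m-2] -> q[m-1] -> q[j] is not convex.
    while (m > 1 &&
           cross({q[j].x - q[m - 2].x, q[j].y - q[m - 2].y},
                 {q[m - 1].x - q[m - 2].x, q[m - 1].y - q[m - 2].y}) >= 0.f) {
      m--;
    }
    q[m++] = q[j];
  }
  return m;
}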
template <typename T>
__mlu_func__ void polygonArea(T *ordered_pts_x, T *ordered_pts_y,
                              T *valid_box, T *valid_pts, T *nums_in_ram,
                              T *temp1_ram, T *temp2_ram, T *temp3_ram,
                              T *temp4_ram, T *temp5_ram, T *temp6_ram,
                              T *temp7_ram, T *temp8_ram, T *temp9_ram,
                              const uint32_t &actual_compute_box_num) {
  // Set where nums_in <= 2, valid_box = false
  __bang_write_value((T *)temp9_ram, COMPUTE_COUNT_ALIGN, (T)2);
  __bang_cycle_gt((T *)temp1_ram, (T *)nums_in_ram, (T *)temp9_ram,
                  actual_compute_box_num, COMPUTE_COUNT_ALIGN);
  __bang_and((T *)valid_box, (T *)valid_box, (T *)temp1_ram,
             actual_compute_box_num);

  // temp1 = area, initialize with all 0
  __bang_write_zero((T *)temp1_ram, actual_compute_box_num);
  __bang_max((T *)temp7_ram, (T *)nums_in_ram, actual_compute_box_num);
  // temp_nums_in = max(nums_in)
  T temp_nums_in = ((T *)temp7_ram)[0];
  for (int i = 1; i < temp_nums_in - 1; i++) {
    // q[i] - q[0]: (temp6, temp7)
    __bang_sub((T *)temp6_ram,
               (T *)ordered_pts_x + i * actual_compute_box_num,
               (T *)ordered_pts_x, actual_compute_box_num);
    __bang_sub((T *)temp7_ram,
               (T *)ordered_pts_y + i * actual_compute_box_num,
               (T *)ordered_pts_y, actual_compute_box_num);
    __bang_mul((T *)temp6_ram, (T *)temp6_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)temp7_ram, (T *)temp7_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    // q[i + 1] - q[0]: (temp8, temp9)
    __bang_sub((T *)temp8_ram,
               (T *)ordered_pts_x + (i + 1) * actual_compute_box_num,
               (T *)ordered_pts_x, actual_compute_box_num);
    __bang_sub((T *)temp9_ram,
               (T *)ordered_pts_y + (i + 1) * actual_compute_box_num,
               (T *)ordered_pts_y, actual_compute_box_num);
    __bang_mul((T *)temp8_ram, (T *)temp8_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)temp9_ram, (T *)temp9_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    // area += fabs(cross2d<T>(q[i] - q[0], q[i + 1] - q[0]));
    __bang_mul((T *)temp4_ram, (T *)temp6_ram, (T *)temp9_ram,
               actual_compute_box_num);
    __bang_mul((T *)temp5_ram, (T *)temp7_ram, (T *)temp8_ram,
               actual_compute_box_num);
    __bang_sub((T *)temp3_ram, (T *)temp4_ram, (T *)temp5_ram,
               actual_compute_box_num);
    __bang_active_abs((T *)temp3_ram, (T *)temp3_ram,
                      actual_compute_box_num);
    __bang_add((T *)temp1_ram, (T *)temp1_ram, (T *)temp3_ram,
               actual_compute_box_num);
  }
  // Set where valid_box = false, intersection = 0
  __bang_mul((T *)temp1_ram, (T *)temp1_ram, (T *)valid_box,
             actual_compute_box_num);
  // area = area / 2.0
  __bang_mul_scalar((T *)temp1_ram, (T *)temp1_ram, (T)0.5,
                    actual_compute_box_num);
}
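polygonArea accumulates |cross(q[i] - q[0], q[i+1] - q[0])| over the hull fan and halves the sum. The scalar equivalent (illustrative only) is the familiar fan-triangulation form of the shoelace formula:

#include <cmath>

// Scalar sketch of the fan triangulation in polygonArea above: with q[0] as
// the fan apex, area = 0.5 * sum_i |cross(q[i] - q[0], q[i+1] - q[0])|.
float polygon_area_ref(const float *qx, const float *qy, int n) {
  float area = 0.f;
  for (int i = 1; i + 1 < n; ++i) {
    const float ax = qx[i] - qx[0], ay = qy[i] - qy[0];
    const float bx = qx[i + 1] - qx[0], by = qy[i + 1] - qy[0];
    area += std::fabs(ax * by - ay * bx);
  }
  return area * 0.5f;
}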
#endif // IOU3D_UTILS_HPP_
mmcv/ops/csrc/common/mlu/masked_conv2d_mlu_kernel.mlu
deleted
100755 → 0
View file @
6f674c7e
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void MLUUnion1MaskedIm2colForward(
const T *feature, const int height, const int width, const int channels,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int32_t *mask_h_idx, const int32_t *mask_w_idx, const int mask_cnt,
T *data_col) {
for (int index = taskId; index < mask_cnt; index += taskDim) {
const int h_col = mask_h_idx[index];
const int w_col = mask_w_idx[index];
const int h_offset = h_col - pad_h;
const int w_offset = w_col - pad_w;
int h_start = h_offset;
int h_end = h_offset + kernel_h - 1;
int w_start = w_offset;
int w_end = w_start + kernel_w - 1;
if (h_start >= height || w_start >= width || h_end < 0 || w_end < 0) {
continue;
} else {
int h_start_valid = max(0, h_start);
int h_end_valid = min(height - 1, h_end);
int w_start_valid = max(0, w_start);
int w_end_valid = min(width - 1, w_end);
__memcpy(
data_col + index * kernel_h * kernel_w * channels +
((h_start_valid - h_start) * kernel_w +
(w_start_valid - w_start)) *
channels,
feature + h_start_valid * width * channels + w_start_valid * channels,
(w_end_valid - w_start_valid + 1) * channels * sizeof(T), GDRAM2GDRAM,
kernel_w * channels * sizeof(T), width * channels * sizeof(T),
h_end_valid - h_start_valid);
}
}
}
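The forward kernel above strided-copies, for each masked output position, the valid rows of a kernel_h x kernel_w x channels window out of the HWC feature map. A plain CPU reference of the same gather (an illustrative sketch; the _ref name is hypothetical, layout assumed channels-last as in the kernel) is:

#include <algorithm>

// CPU reference for MLUUnion1MaskedIm2colForward above (illustrative).
// feature: HWC (height x width x channels); data_col holds one
// kernel_h*kernel_w*channels column per masked position; out-of-image taps
// stay zero (data_col is assumed zero-initialized).
void masked_im2col_ref(const float *feature, int height, int width,
                       int channels, int kernel_h, int kernel_w, int pad_h,
                       int pad_w, const int *mask_h_idx, const int *mask_w_idx,
                       int mask_cnt, float *data_col) {
  for (int index = 0; index < mask_cnt; ++index) {
    const int h_offset = mask_h_idx[index] - pad_h;
    const int w_offset = mask_w_idx[index] - pad_w;
    float *col = data_col + index * kernel_h * kernel_w * channels;
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        const int h = h_offset + kh, w = w_offset + kw;
        if (h < 0 || h >= height || w < 0 || w >= width) continue;
        const float *src = feature + (h * width + w) * channels;
        std::copy(src, src + channels, col + (kh * kernel_w + kw) * channels);
      }
    }
  }
}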
template <typename T>
__mlu_func__ void MLUUnion1MaskedCol2imForward(const T *col, const int height,
const int width,
const int channels,
const int32_t *mask_h_idx,
const int32_t *mask_w_idx,
const int mask_cnt, T *im) {
const int channels_max_num_nram = MAX_NRAM_SIZE / sizeof(T);
if (channels <= channels_max_num_nram) {
const int deal_num = channels_max_num_nram / channels;
int mask_per_core = mask_cnt / taskDim;
const int mask_remain = mask_cnt % taskDim;
mask_per_core += taskId < mask_remain ? 1 : 0;
int index_start = taskId < mask_remain
? taskId * mask_per_core
: taskId * mask_per_core + mask_remain;
int loop = mask_per_core / deal_num;
int remain_num = mask_per_core % deal_num;
T *nram_col = (T *)nram_buffer;
for (int index = 0; index < loop; ++index) {
int cur_index = index_start + index * deal_num;
__memcpy(nram_col, col + cur_index * channels,
deal_num * channels * sizeof(T), GDRAM2NRAM);
for (int i = 0; i < deal_num; ++i) {
int mask_index = cur_index + i;
const int h_im = mask_h_idx[mask_index];
const int w_im = mask_w_idx[mask_index];
// if(h_im>=height || w_im>=width) continue;
__memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels,
channels * sizeof(T), NRAM2GDRAM);
}
}
if (remain_num > 0) {
int cur_index = index_start + loop * deal_num;
__memcpy(nram_col, col + cur_index * channels,
remain_num * channels * sizeof(T), GDRAM2NRAM);
for (int i = 0; i < remain_num; ++i) {
int mask_index = cur_index + i;
const int h_im = mask_h_idx[mask_index];
const int w_im = mask_w_idx[mask_index];
// if(h_im>=height || w_im>=width) continue;
__memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels,
channels * sizeof(T), NRAM2GDRAM);
}
}
} else {
for (int index = taskId; index < mask_cnt; index += taskDim) {
const int m_index = index % mask_cnt;
const int h_im = mask_h_idx[m_index];
const int w_im = mask_w_idx[m_index];
// if(h_im>=height || w_im>=width) continue;
__memcpy(im + (h_im * width + w_im) * channels, col + index * channels,
channels * sizeof(T), GDRAM2GDRAM);
}
}
}
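The col2im path splits mask_cnt across the taskDim cores so that the first mask_cnt % taskDim cores each take one extra element. The same split, isolated into a small helper (illustrative restatement of the index arithmetic above):

// Balanced per-core split as used above: the first (total % num_cores)
// cores each take one extra element, and start offsets stay contiguous.
void split_work(int total, int num_cores, int core_id, int *start,
                int *count) {
  const int per_core = total / num_cores;
  const int remain = total % num_cores;
  *count = per_core + (core_id < remain ? 1 : 0);
  *start = core_id < remain ? core_id * (per_core + 1)
                            : core_id * per_core + remain;
}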
__mlu_global__ void MLUKernelMaskedIm2colForward(
const void *feature, const int height, const int width, const int channels,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt,
void *data_col, const cnrtDataType_t data_dtype) {
if (coreId == 0x80) {
return;
}
switch (data_dtype) {
case CNRT_FLOAT16: {
MLUUnion1MaskedIm2colForward((half *)feature, height, width, channels,
kernel_h, kernel_w, pad_h, pad_w,
(int32_t *)mask_h_idx, (int32_t *)mask_w_idx,
mask_cnt, (half *)data_col);
}; break;
case CNRT_FLOAT32: {
MLUUnion1MaskedIm2colForward((float *)feature, height, width, channels,
kernel_h, kernel_w, pad_h, pad_w,
(int32_t *)mask_h_idx, (int32_t *)mask_w_idx,
mask_cnt, (float *)data_col);
}; break;
default: {
break;
}
}
}
__mlu_global__ void MLUKernelMaskedCol2imForward(
const void *col, const int height, const int width, const int channels,
const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt,
void *im, const cnrtDataType_t data_dtype) {
if (coreId == 0x80) {
return;
}
switch (data_dtype) {
case CNRT_FLOAT16: {
MLUUnion1MaskedCol2imForward((half *)col, height, width, channels,
(int32_t *)mask_h_idx, (int32_t *)mask_w_idx,
mask_cnt, (half *)im);
}; break;
case CNRT_FLOAT32: {
MLUUnion1MaskedCol2imForward((float *)col, height, width, channels,
(int32_t *)mask_h_idx, (int32_t *)mask_w_idx,
mask_cnt, (float *)im);
}; break;
default: {
break;
}
}
}
void KernelMaskedIm2colForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
cnrtDataType_t k_dtype, const void *im_ptr, const int height,
const int width, const int channels, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const void *mask_h_idx_ptr,
const void *mask_w_idx_ptr, const int mask_cnt, void *col_ptr) {
MLUKernelMaskedIm2colForward<<<k_dim, k_type, queue>>>(
im_ptr, height, width, channels, kernel_h, kernel_w, pad_h, pad_w,
mask_h_idx_ptr, mask_w_idx_ptr, mask_cnt, col_ptr, k_dtype);
}
void KernelMaskedCol2imForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t k_dtype,
const void *col_ptr, const int height,
const int width, const int channels,
const void *mask_h_idx_ptr,
const void *mask_w_idx_ptr, const int mask_cnt,
void *im_ptr) {
MLUKernelMaskedCol2imForward<<<k_dim, k_type, queue>>>(
col_ptr, height, width, channels, mask_h_idx_ptr, mask_w_idx_ptr,
mask_cnt, im_ptr, k_dtype);
}
mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu
deleted
100644 → 0
View file @
6f674c7e
/*************************************************************************
* Copyright (C) 2022 by Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include <math.h>
/****************************************************************************************
*
* NRAM partition forward:
* | spatial_shapes | data_value_p1_ping | data_value_p2_ping |
* | data_value_p3_ping | data_value_p4_ping | data_col_ping |
* | data_value_p1_pong | data_value_p2_pong | data_value_p3_pong |
* | data_value_p4_pong | data_col_pong | auxiliary_a |
* | auxiliary_b |
* | 128bytes | deal_size | deal_size |
* | deal_size | deal_size | deal_size |
* | deal_size | deal_size | deal_size |
* | deal_size | deal_size | deal_size |
* | deal_size |
*
****************************************************************************************/
/****************************************************************************************
*
* NRAM partition backward:
* | grad_output_nram | grad_output_nram_temp | grad_weight |
* | grad_h_weight | grad_w_weight | top_grad |
* | top_grad_temp | spatial_shapes_nram | sampling_loc_nram |
* | deal_size | deal_size | deal_size |
* | deal_size | deal_size | deal_size |
* | deal_size | deal_size | 64bytes |
*
****************************************************************************************/
#define TWELVE_SPLIT 12
#define ALIGN_NUM 64
#define ALIGN_NUM_FOR_REDUCE 32
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void loadNeighborPointsData(
const T *data_value_gdram, T *data_value_p1_nram, T *data_value_p2_nram,
T *data_value_p3_nram, T *data_value_p4_nram, const size_t deal_num,
const int32_t &width, const int32_t &height, const int32_t &num_heads,
const int32_t &channels, const T &x, const T &y, const int32_t &head_idx) {
const int32_t w_low = floorf(x);
const int32_t h_low = floorf(y);
const int32_t w_high = w_low + 1;
const int32_t h_high = h_low + 1;
const int32_t w_stride = num_heads * channels;
const int32_t h_stride = width * w_stride;
const int32_t h_low_ptr_offset = h_low * h_stride;
const int32_t h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int32_t w_low_ptr_offset = w_low * w_stride;
const int32_t w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int32_t base_ptr_offset = head_idx * channels;
// top-left point
if (h_low >= 0 && w_low >= 0) {
const int32_t v1_offset =
h_low_ptr_offset + w_low_ptr_offset + base_ptr_offset;
__memcpy_async(data_value_p1_nram, data_value_gdram + v1_offset,
deal_num * sizeof(T), GDRAM2NRAM);
}
// top-right point
if (h_low >= 0 && w_high <= width - 1) {
const int32_t v2_offset =
h_low_ptr_offset + w_high_ptr_offset + base_ptr_offset;
__memcpy_async(data_value_p2_nram, data_value_gdram + v2_offset,
deal_num * sizeof(T), GDRAM2NRAM);
}
// bottom-left point
if (h_high <= height - 1 && w_low >= 0) {
const int32_t v3_offset =
h_high_ptr_offset + w_low_ptr_offset + base_ptr_offset;
__memcpy_async(data_value_p3_nram, data_value_gdram + v3_offset,
deal_num * sizeof(T), GDRAM2NRAM);
}
// bottom-right point
if (h_high <= height - 1 && w_high <= width - 1) {
const int32_t v4_offset =
h_high_ptr_offset + w_high_ptr_offset + base_ptr_offset;
__memcpy_async(data_value_p4_nram, data_value_gdram + v4_offset,
deal_num * sizeof(T), GDRAM2NRAM);
}
}
template <typename T>
__mlu_func__ void bilinearInterpolation(
T *data_value_p1_nram, T *data_value_p2_nram, T *data_value_p3_nram,
T *data_value_p4_nram, T *sample_point_value, T *auxiliary_b,
const size_t deal_num, const int32_t &width, const int32_t &height,
const T &x, const T &y) {
const int32_t w_low = floorf(x);
const int32_t h_low = floorf(y);
const int32_t w_high = w_low + 1;
const int32_t h_high = h_low + 1;
const T lw = x - w_low;
const T lh = y - h_low;
const T hw = 1 - lw;
const T hh = 1 - lh;
const T w1 = hh * hw;
const T w2 = hh * lw;
const T w3 = lh * hw;
const T w4 = lh * lw;
__bang_write_value((T *)sample_point_value, deal_num, (T)0);
// top-left point
if (h_low >= 0 && w_low >= 0) {
// sample_point_value += v1 * w1
__bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p1_nram, (T)w1,
deal_num);
__bang_add((T *)sample_point_value, (T *)sample_point_value,
(T *)auxiliary_b, deal_num);
}
// top-right point
if (h_low >= 0 && w_high <= width - 1) {
// sample_point_value += v2 * w2
__bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p2_nram, (T)w2,
deal_num);
__bang_add((T *)sample_point_value, (T *)sample_point_value,
(T *)auxiliary_b, deal_num);
}
// bottom-left point
if (h_high <= height - 1 && w_low >= 0) {
// sample_point_value += v3 * w3
__bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p3_nram, (T)w3,
deal_num);
__bang_add((T *)sample_point_value, (T *)sample_point_value,
(T *)auxiliary_b, deal_num);
}
// bottom-right point
if (h_high <= height - 1 && w_high <= width - 1) {
// sample_point_value += v4 * w4
__bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p4_nram, (T)w4,
deal_num);
__bang_add((T *)sample_point_value, (T *)sample_point_value,
(T *)auxiliary_b, deal_num);
}
}
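bilinearInterpolation above applies the standard four-tap weights channel-wise across the deal_num lanes. Per sample point, the scalar weights reduce to (a minimal sketch of the same w1..w4 math):

#include <cmath>

// Scalar sketch of the four bilinear weights used channel-wise above:
// (x, y) sits between integer corners (w_low, h_low) .. (w_high, h_high).
void bilinear_weights_ref(float x, float y, float w[4]) {
  const float lw = x - std::floor(x);  // x - w_low
  const float lh = y - std::floor(y);  // y - h_low
  const float hw = 1.f - lw, hh = 1.f - lh;
  w[0] = hh * hw;  // top-left     (h_low,  w_low)
  w[1] = hh * lw;  // top-right    (h_low,  w_high)
  w[2] = lh * hw;  // bottom-left  (h_high, w_low)
  w[3] = lh * lw;  // bottom-right (h_high, w_high)
}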
template <typename T>
__mlu_global__ void MLUKernelMsDeformAttnForward(
const char *data_value_gdram, const char *data_spatial_shapes_gdram,
const char *data_level_start_index_gdram,
const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram,
const int32_t batch_size, const int32_t num_keys, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_queries,
const int32_t num_points, char *data_col_gdram) {
if (coreId == 0x80) {
return;
}
const size_t spatial_size = PAD_UP(2 * sizeof(int32_t), NFU_ALIGN_SIZE);
const size_t span_num_deal =
PAD_DOWN((MAX_NRAM_SIZE - spatial_size) / TWELVE_SPLIT / sizeof(T),
NFU_ALIGN_SIZE);
const size_t align_num = NFU_ALIGN_SIZE;
const int32_t channels_seg_num = channels / span_num_deal;
const size_t channels_rem = channels % span_num_deal;
const size_t channels_align_rem = CEIL_ALIGN(channels_rem, align_num);
char *data_spatial_shapes_nram = nram_buffer;
char *ping_data_value_p1_nram = data_spatial_shapes_nram + spatial_size;
char *ping_data_value_p2_nram =
ping_data_value_p1_nram + span_num_deal * sizeof(T);
char *ping_data_value_p3_nram =
ping_data_value_p2_nram + span_num_deal * sizeof(T);
char *ping_data_value_p4_nram =
ping_data_value_p3_nram + span_num_deal * sizeof(T);
char *ping_data_col_nram =
ping_data_value_p4_nram + span_num_deal * sizeof(T);
char *pong_data_value_p1_nram =
ping_data_col_nram + span_num_deal * sizeof(T);
char *pong_data_value_p2_nram =
pong_data_value_p1_nram + span_num_deal * sizeof(T);
char *pong_data_value_p3_nram =
pong_data_value_p2_nram + span_num_deal * sizeof(T);
char *pong_data_value_p4_nram =
pong_data_value_p3_nram + span_num_deal * sizeof(T);
char *pong_data_col_nram =
pong_data_value_p4_nram + span_num_deal * sizeof(T);
char *auxiliary_a = pong_data_col_nram + span_num_deal * sizeof(T);
char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T);
const size_t ping_pong_gap = 5 * span_num_deal * sizeof(T);
size_t data_col_ping_pong_idx = 0;
int32_t block_num_per_core = (batch_size * num_queries * num_heads) / taskDim;
const int32_t block_num_rem =
(batch_size * num_queries * num_heads) % taskDim;
const int32_t idx_start = taskId < (block_num_rem + 1)
? taskId * (block_num_per_core + 1)
: taskId * block_num_per_core + block_num_rem;
block_num_per_core =
taskId < block_num_rem
? (batch_size * num_queries * num_heads) / taskDim + 1
: (batch_size * num_queries * num_heads) / taskDim;
for (int32_t cur_idx = idx_start; cur_idx < idx_start + block_num_per_core;
++cur_idx) {
// cur_idx = batch_idx * num_queries * num_heads + query_idx * num_heads +
// head_idx
const int32_t head_idx = cur_idx % num_heads;
const int32_t batch_idx = (cur_idx / num_heads) / num_queries;
const char *data_value_gdram_start =
data_value_gdram +
batch_idx * num_keys * num_heads * channels * sizeof(T);
const char *data_sampling_loc_gdram_start =
data_sampling_loc_gdram +
cur_idx * num_levels * num_points * 2 * sizeof(T);
const char *data_attn_weight_gdram_start =
data_attn_weight_gdram + cur_idx * num_levels * num_points * sizeof(T);
char *data_col_gdram_start =
data_col_gdram + cur_idx * channels * sizeof(T);
for (int32_t c_seg_idx = 0; c_seg_idx < channels_seg_num; ++c_seg_idx) {
__bang_write_value(
(T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap),
span_num_deal, (T)0);
// load data
// level_idx = 0, point_idx = 0
__memcpy(data_spatial_shapes_nram, data_spatial_shapes_gdram,
2 * sizeof(int32_t), GDRAM2NRAM);
int32_t spatial_h = ((int32_t *)data_spatial_shapes_nram)[0];
int32_t spatial_w = ((int32_t *)data_spatial_shapes_nram)[1];
const char *data_value_ptr =
data_value_gdram_start + c_seg_idx * span_num_deal * sizeof(T);
T loc_w = ((T *)data_sampling_loc_gdram_start)[0];
T loc_h = ((T *)data_sampling_loc_gdram_start)[1];
T weight = ((T *)data_attn_weight_gdram_start)[0];
T x = loc_w * spatial_w - 0.5;
T y = loc_h * spatial_h - 0.5;
if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) {
loadNeighborPointsData(
(T *)data_value_ptr, (T *)ping_data_value_p1_nram,
(T *)ping_data_value_p2_nram, (T *)ping_data_value_p3_nram,
(T *)ping_data_value_p4_nram, span_num_deal, spatial_w, spatial_h,
num_heads, channels, x, y, head_idx);
}
T spatial_h_next_point = 0;
T spatial_w_next_point = 0;
T weight_next_point = 0;
T x_next_point = 0;
T y_next_point = 0;
__asm__ volatile("sync;");
for (int32_t level_idx = 0; level_idx < num_levels; ++level_idx) {
for (int32_t point_idx = 0; point_idx < num_points; ++point_idx) {
// load data
if (point_idx == num_points - 1 && level_idx == num_levels - 1) {
// last point: nothing left to load, go straight to compute
} else if (point_idx == num_points - 1) {
const int32_t level_start_id =
((int32_t *)data_level_start_index_gdram)[level_idx + 1];
const int32_t spatial_h_ptr = (level_idx + 1) << 1;
__memcpy(
data_spatial_shapes_nram,
data_spatial_shapes_gdram + spatial_h_ptr * sizeof(int32_t),
2 * sizeof(int32_t), GDRAM2NRAM);
spatial_h_next_point = ((int32_t *)data_spatial_shapes_nram)[0];
spatial_w_next_point = ((int32_t *)data_spatial_shapes_nram)[1];
data_value_ptr = data_value_gdram_start +
(level_start_id * num_heads * channels +
c_seg_idx * span_num_deal) *
sizeof(T);
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1];
x_next_point = loc_w * spatial_w_next_point - 0.5;
y_next_point = loc_h * spatial_h_next_point - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
y_next_point < spatial_h_next_point &&
x_next_point < spatial_w_next_point) {
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
span_num_deal, spatial_w_next_point, spatial_h_next_point,
num_heads, channels, x_next_point, y_next_point, head_idx);
}
} else {
spatial_h_next_point = spatial_h;
spatial_w_next_point = spatial_w;
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1];
x_next_point = loc_w * spatial_w - 0.5;
y_next_point = loc_h * spatial_h - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
y_next_point < spatial_h && x_next_point < spatial_w) {
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
span_num_deal, spatial_w, spatial_h, num_heads, channels,
x_next_point, y_next_point, head_idx);
}
}
// compute
if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) {
bilinearInterpolation(
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)auxiliary_a, (T *)auxiliary_b, span_num_deal, spatial_w,
spatial_h, x, y);
__bang_mul_scalar((T *)auxiliary_a, (T *)auxiliary_a, (T)weight,
span_num_deal);
__bang_add((T *)(ping_data_col_nram +
data_col_ping_pong_idx * ping_pong_gap),
(T *)(ping_data_col_nram +
data_col_ping_pong_idx * ping_pong_gap),
(T *)auxiliary_a, span_num_deal);
}
spatial_w = spatial_w_next_point;
spatial_h = spatial_h_next_point;
weight = weight_next_point;
x = x_next_point;
y = y_next_point;
__asm__ volatile("sync;");
}
}
// store
__memcpy_async(
data_col_gdram_start + c_seg_idx * span_num_deal * sizeof(T),
ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap,
span_num_deal * sizeof(T), NRAM2GDRAM);
data_col_ping_pong_idx = (data_col_ping_pong_idx + 1) % 2;
}
if (channels_rem > 0) {
__bang_write_value(
(T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap),
channels_align_rem, (T)0);
// load data
// level_idx = 0, point_idx = 0
__memcpy(data_spatial_shapes_nram, data_spatial_shapes_gdram,
2 * sizeof(int32_t), GDRAM2NRAM);
int32_t spatial_h = ((int32_t *)data_spatial_shapes_nram)[0];
int32_t spatial_w = ((int32_t *)data_spatial_shapes_nram)[1];
const char *data_value_ptr =
data_value_gdram_start + channels_seg_num * span_num_deal * sizeof(T);
T loc_w = ((T *)data_sampling_loc_gdram_start)[0];
T loc_h = ((T *)data_sampling_loc_gdram_start)[1];
T weight = ((T *)data_attn_weight_gdram_start)[0];
T x = loc_w * spatial_w - 0.5;
T y = loc_h * spatial_h - 0.5;
if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) {
loadNeighborPointsData(
(T *)data_value_ptr, (T *)ping_data_value_p1_nram,
(T *)ping_data_value_p2_nram, (T *)ping_data_value_p3_nram,
(T *)ping_data_value_p4_nram, channels_rem, spatial_w, spatial_h,
num_heads, channels, x, y, head_idx);
}
T spatial_h_next_point = 0;
T spatial_w_next_point = 0;
T weight_next_point = 0;
T x_next_point = 0;
T y_next_point = 0;
__asm__ volatile("sync;");
for (int32_t level_idx = 0; level_idx < num_levels; ++level_idx) {
for (int32_t point_idx = 0; point_idx < num_points; ++point_idx) {
// load data
if (point_idx == num_points - 1 && level_idx == num_levels - 1) {
// last point: nothing left to load, go straight to compute
} else if (point_idx == num_points - 1) {
const int32_t level_start_id =
((int32_t *)data_level_start_index_gdram)[level_idx + 1];
const int32_t spatial_h_ptr = (level_idx + 1) << 1;
__memcpy(
data_spatial_shapes_nram,
data_spatial_shapes_gdram + spatial_h_ptr * sizeof(int32_t),
2 * sizeof(int32_t), GDRAM2NRAM);
spatial_h_next_point = ((int32_t *)data_spatial_shapes_nram)[0];
spatial_w_next_point = ((int32_t *)data_spatial_shapes_nram)[1];
data_value_ptr = data_value_gdram_start +
(level_start_id * num_heads * channels +
channels_seg_num * span_num_deal) *
sizeof(T);
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1];
x_next_point = loc_w * spatial_w_next_point - 0.5;
y_next_point = loc_h * spatial_h_next_point - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
y_next_point < spatial_h_next_point &&
x_next_point < spatial_w_next_point) {
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
channels_rem, spatial_w_next_point, spatial_h_next_point,
num_heads, channels, x_next_point, y_next_point, head_idx);
}
} else {
spatial_w_next_point = spatial_w;
spatial_h_next_point = spatial_h;
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1];
x_next_point = loc_w * spatial_w - 0.5;
y_next_point = loc_h * spatial_h - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
y_next_point < spatial_h && x_next_point < spatial_w) {
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
channels_rem, spatial_w, spatial_h, num_heads, channels,
x_next_point, y_next_point, head_idx);
}
}
// compute
if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) {
bilinearInterpolation(
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)auxiliary_a, (T *)auxiliary_b, channels_align_rem,
spatial_w, spatial_h, x, y);
__bang_mul_scalar((T *)auxiliary_a, (T *)auxiliary_a, (T)weight,
channels_align_rem);
__bang_add((T *)(ping_data_col_nram +
data_col_ping_pong_idx * ping_pong_gap),
(T *)(ping_data_col_nram +
data_col_ping_pong_idx * ping_pong_gap),
(T *)auxiliary_a, channels_align_rem);
}
spatial_w = spatial_w_next_point;
spatial_h = spatial_h_next_point;
weight = weight_next_point;
x = x_next_point;
y = y_next_point;
__asm__ volatile("sync;");
}
}
// store
__memcpy_async(
data_col_gdram_start + channels_seg_num * span_num_deal * sizeof(T),
ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap,
channels_rem * sizeof(T), NRAM2GDRAM);
data_col_ping_pong_idx = (data_col_ping_pong_idx + 1) % 2;
}
}
__asm__ volatile("sync;");
return;
}
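The forward kernel overlaps the async GDRAM-to-NRAM loads of the next sample point with computation on the current one by alternating two buffer sets, which is what the "% 2" ping-pong indexing above implements. Schematically (generic C++ stand-in, not BANG; the array writes stand in for the async copies):

#include <cstdio>

// Schematic of the ping-pong pipeline in the forward kernel above: while
// slot `cur` is being consumed, the next point is staged into slot
// `1 - cur`, so loads and compute of adjacent points overlap.
int main() {
  int buffer[2] = {0, 0};
  const int num_points = 4;
  int cur = 0;
  buffer[cur] = 0;  // stage point 0 (stands in for the first async copy)
  for (int p = 0; p < num_points; ++p) {
    if (p + 1 < num_points) buffer[1 - cur] = p + 1;  // prefetch next point
    printf("compute on point %d from slot %d\n", buffer[cur], cur);
    cur = 1 - cur;  // swap ping and pong (matches the "% 2" indexing above)
  }
  return 0;
}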
template __mlu_global__ void MLUKernelMsDeformAttnForward<float>(
const char *data_value_gdram, const char *data_spatial_shapes_gdram,
const char *data_level_start_index_gdram,
const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram,
const int32_t batch_size, const int32_t num_keys, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_queries,
const int32_t num_points, char *data_col_gdram);
void KernelMsDeformAttnForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const char *data_value_gdram,
const char *data_spatial_shapes_gdram,
const char *data_level_start_index_gdram,
const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram,
const int32_t batch_size, const int32_t num_keys, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_queries,
const int32_t num_points, char *data_col_gdram) {
MLUKernelMsDeformAttnForward<float><<<k_dim, k_type, queue>>>(
data_value_gdram, data_spatial_shapes_gdram, data_level_start_index_gdram,
data_sampling_loc_gdram, data_attn_weight_gdram, batch_size, num_keys,
num_heads, channels, num_levels, num_queries, num_points, data_col_gdram);
}
template <typename T>
void __mlu_func__ msDeformAttnCol2imBilinear(
T *top_grad_temp, const int32_t &height, const int32_t &width, const T &w1,
const T &w2, const T &w3, const T &w4, const int32_t &h_low,
const int32_t &w_low, const int32_t &h_high, const int32_t &w_high,
const int32_t &base_ptr, const int32_t &h_low_ptr_offset,
const int32_t &w_low_ptr_offset, const int32_t &h_high_ptr_offset,
const int32_t &w_high_ptr_offset, const T &hh, const T &hw, const T &lh,
const T &lw, T *top_grad, const T &data_attn_weight, T *grad_h_weight,
T *grad_w_weight, T *grad_value, T *grad_output_nram, T *grad_weight,
T *grad_sampling_loc, T *grad_attn_weight, T *grad_output_nram_temp,
const int32_t &deal_num, const int32_t &deal_num_real,
const T *data_value_ptr) {
if (h_low >= 0 && w_low >= 0) {
int32_t offset1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
__memcpy(grad_output_nram, data_value_ptr + offset1,
deal_num_real * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar(grad_weight, grad_output_nram, hw, deal_num);
__bang_sub(grad_h_weight, grad_h_weight, grad_weight, deal_num);
__bang_mul_scalar(grad_weight, grad_output_nram, hh, deal_num);
__bang_sub(grad_w_weight, grad_w_weight, grad_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad_temp, w1, deal_num);
// for calc grad_attn_weight
__bang_mul_scalar(grad_output_nram, grad_output_nram, w1, deal_num);
__bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + offset1),
(T *)top_grad_temp, deal_num_real);
}
if (h_low >= 0 && w_high <= width - 1) {
int32_t offset2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
__memcpy(grad_output_nram_temp, data_value_ptr + offset2,
deal_num_real * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, lw, deal_num);
__bang_sub(grad_h_weight, grad_h_weight, grad_weight, deal_num);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, hh, deal_num);
__bang_add(grad_w_weight, grad_w_weight, grad_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad_temp, w2, deal_num);
__bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w2,
deal_num);
__bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp,
deal_num);
__bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + offset2),
(T *)top_grad_temp, deal_num_real);
}
if (h_high <= height - 1 && w_low >= 0) {
int32_t offset3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
__memcpy(grad_output_nram_temp, data_value_ptr + offset3,
deal_num_real * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, hw, deal_num);
__bang_add(grad_h_weight, grad_h_weight, grad_weight, deal_num);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, lh, deal_num);
__bang_sub(grad_w_weight, grad_w_weight, grad_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad_temp, w3, deal_num);
    // accumulate this corner's contribution to grad_attn_weight
__bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w3,
deal_num);
__bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp,
deal_num);
__bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + offset3),
(T *)top_grad_temp, deal_num_real);
}
if (h_high <= height - 1 && w_high <= width - 1) {
int32_t offset4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
__memcpy(grad_output_nram_temp, data_value_ptr + offset4,
deal_num_real * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, lw, deal_num);
__bang_add(grad_h_weight, grad_h_weight, grad_weight, deal_num);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, lh, deal_num);
__bang_add(grad_w_weight, grad_w_weight, grad_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad_temp, w4, deal_num);
    // accumulate this corner's contribution to grad_attn_weight
__bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w4,
deal_num);
__bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp,
deal_num);
__bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + offset4),
(T *)top_grad_temp, deal_num_real);
}
__bang_mul(grad_output_nram, grad_output_nram, top_grad, deal_num);
#if __BANG_ARCH__ >= 322
recursiveSumPool(grad_output_nram, 1, deal_num_real, ALIGN_NUM_FOR_REDUCE);
#else
const int32_t align_num_on_200 = NFU_ALIGN_SIZE / sizeof(float);
recursiveSumPool(grad_output_nram, align_num_on_200,
deal_num / align_num_on_200, ALIGN_NUM_FOR_REDUCE);
__bang_reduce_sum(grad_output_nram, grad_output_nram,
NFU_ALIGN_SIZE / sizeof(float));
#endif
__bang_atomic_add((T *)grad_output_nram, (T *)grad_attn_weight,
(T *)grad_output_nram, 1);
__bang_mul_scalar(grad_w_weight, grad_w_weight, width, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num);
__bang_mul(grad_w_weight, grad_w_weight, top_grad_temp, deal_num);
#if __BANG_ARCH__ >= 322
recursiveSumPool(grad_w_weight, 1, deal_num_real, ALIGN_NUM_FOR_REDUCE);
#else
recursiveSumPool(grad_w_weight, align_num_on_200, deal_num / align_num_on_200,
ALIGN_NUM_FOR_REDUCE);
__bang_reduce_sum(grad_w_weight, grad_w_weight,
NFU_ALIGN_SIZE / sizeof(float));
#endif
__bang_atomic_add((T *)grad_w_weight, (T *)(grad_sampling_loc),
(T *)grad_w_weight, 1);
__bang_mul_scalar(grad_h_weight, grad_h_weight, height, deal_num);
__bang_mul(grad_h_weight, grad_h_weight, top_grad_temp, deal_num);
#if __BANG_ARCH__ >= 322
recursiveSumPool(grad_h_weight, 1, deal_num_real, ALIGN_NUM_FOR_REDUCE);
#else
recursiveSumPool(grad_h_weight, align_num_on_200, deal_num / align_num_on_200,
ALIGN_NUM_FOR_REDUCE);
__bang_reduce_sum(grad_h_weight, grad_h_weight,
NFU_ALIGN_SIZE / sizeof(float));
#endif
__bang_atomic_add((T *)grad_h_weight, (T *)(grad_sampling_loc + 1),
(T *)grad_h_weight, 1);
}
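// ---------------------------------------------------------------------------
// Illustrative scalar reference (a hypothetical helper for exposition only;
// nothing in this file calls it). It spells out, for a single channel, the
// same bilinear backward that msDeformAttnCol2imBilinear performs vectorized
// over `deal_num` channels with NRAM buffers and atomic adds:
//   sampled value  S = w1*v1 + w2*v2 + w3*v3 + w4*v4
//   dL/dv_i        = w_i * attn_weight * top_grad
//   dL/dattn       = S * top_grad
//   dL/dlh, dL/dlw follow by differentiating the w_i in lh/lw; the kernel
//   later rescales them by spatial_h/spatial_w since h_im = loc_h * spatial_h
//   - 0.5 (and likewise for w_im).
static inline void scalarCol2imBilinearRef(
    float v1, float v2, float v3, float v4,  // values at the four corners
    float lh, float lw,                      // fractional offsets in [0, 1)
    float top_grad, float attn_weight,
    float grad_v[4], float *grad_lh, float *grad_lw, float *grad_attn) {
  const float hh = 1.0f - lh, hw = 1.0f - lw;
  const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  const float g = top_grad * attn_weight;
  grad_v[0] = w1 * g;  // corner (h_low,  w_low)
  grad_v[1] = w2 * g;  // corner (h_low,  w_high)
  grad_v[2] = w3 * g;  // corner (h_high, w_low)
  grad_v[3] = w4 * g;  // corner (h_high, w_high)
  *grad_attn += (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4) * top_grad;
  *grad_lh += (-hw * v1 - lw * v2 + hw * v3 + lw * v4) * g;
  *grad_lw += (-hh * v1 + hh * v2 - lh * v3 + lh * v4) * g;
}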
__mlu_global__ void MLUUnion1KernelMsDeformAttnBackward(
const float *data_value, const int32_t *spatial_shapes,
const int32_t *data_level_start_index, const float *data_sampling_loc,
const float *data_attn_weight, const float *grad_output,
const int32_t batch, const int32_t spatial_size, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_query,
const int32_t num_points, float *grad_value, float *grad_sampling_loc,
float *grad_attn_weight) {
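  // The MPU core (coreId 0x80) takes no part in the computation; only the
  // IPU cores proceed past this point.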
if (coreId == 0x80) {
return;
}
const int32_t split_num = 8;
const int32_t spatial_shapes_size = 64;
int32_t deal_num = PAD_DOWN(
(MAX_NRAM_SIZE - spatial_shapes_size) / split_num / sizeof(float),
ALIGN_NUM);
float *grad_output_nram = (float *)nram_buffer;
float *grad_output_nram_temp = (float *)nram_buffer + deal_num;
float *grad_weight = (float *)nram_buffer + 2 * deal_num;
float *grad_h_weight = (float *)nram_buffer + 3 * deal_num;
float *grad_w_weight = (float *)nram_buffer + 4 * deal_num;
float *top_grad = (float *)nram_buffer + 5 * deal_num;
float *top_grad_temp = (float *)nram_buffer + 6 * deal_num;
int32_t *spatial_shapes_nram =
(int32_t *)((float *)nram_buffer + 7 * deal_num);
float *sampling_loc_nram =
(float *)nram_buffer + 7 * deal_num + 2 * sizeof(int32_t);
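  // NRAM layout: eight deal_num-sized float slices. Slices 0-6 are the
  // scratch buffers named above; spatial_shapes_nram and sampling_loc_nram
  // live at the start of slice 7. (The sampling_loc_nram offset advances
  // 2*sizeof(int32_t) float *slots*, i.e. 32 bytes rather than 8, which
  // still lands well inside the slice, so the over-stride is harmless.)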
const int32_t total_num = batch * num_query * num_heads * num_levels;
int32_t num_per_core = total_num / taskDim;
int32_t num_rem = total_num % taskDim;
num_per_core = num_per_core + int32_t(taskId < num_rem);
int32_t start_per_core =
num_rem > taskId
? (taskId * num_per_core)
: ((num_per_core + 1) * num_rem + (taskId - num_rem) * num_per_core);
int32_t end_per_core = start_per_core + num_per_core;
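  // Work is block-partitioned over the taskDim cores, with the first num_rem
  // cores taking one extra item. E.g. total_num = 10, taskDim = 4 gives
  // per-core counts 3, 3, 2, 2 starting at 0, 3, 6, 8.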
const int32_t C_repeat = channels / deal_num;
const int32_t C_tail = channels % deal_num;
const int32_t qid_stride = num_heads * channels;
int32_t base_ptr = 0;
for (int32_t num_loop = start_per_core; num_loop < end_per_core; ++num_loop) {
const int32_t l_col = num_loop % num_levels;
const int32_t m_col = num_loop / num_levels % num_heads;
const int32_t q_col = num_loop / num_levels / num_heads % num_query;
const int32_t b_col = num_loop / num_query / num_heads / num_levels;
int32_t data_weight_ptr = num_loop * num_points;
int32_t data_loc_w_ptr = data_weight_ptr << 1;
const int32_t value_offset = b_col * spatial_size * num_heads * channels;
const int32_t level_start_id = data_level_start_index[l_col];
int32_t spatial_h_ptr = l_col << 1;
int32_t grad_output_offset = b_col * num_query * num_heads * channels +
q_col * num_heads * channels +
m_col * channels;
__memcpy(spatial_shapes_nram, spatial_shapes + spatial_h_ptr,
2 * sizeof(int32_t), GDRAM2NRAM);
const int32_t spatial_h = spatial_shapes_nram[0];
const int32_t spatial_w = spatial_shapes_nram[1];
const int32_t value_ptr_offset = value_offset + level_start_id * qid_stride;
const float *data_value_ptr = data_value + value_ptr_offset;
float *grad_value_ptr = grad_value + value_ptr_offset;
const int32_t grad_attn_weight_out = num_loop * num_points;
const int32_t grad_sampling_loc_out = num_loop * num_points * 2;
for (int32_t p_col = 0; p_col < num_points; ++p_col) {
__memcpy(sampling_loc_nram, data_sampling_loc + data_loc_w_ptr,
2 * sizeof(float), GDRAM2NRAM);
const float loc_w = sampling_loc_nram[0];
const float loc_h = sampling_loc_nram[1];
const float weight = data_attn_weight[data_weight_ptr];
const float h_im = loc_h * spatial_h - 0.5;
const float w_im = loc_w * spatial_w - 0.5;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
const int32_t h_low = floorf(h_im);
const int32_t w_low = floorf(w_im);
const int32_t h_high = h_low + 1;
const int32_t w_high = w_low + 1;
const float lh = h_im - h_low;
const float lw = w_im - w_low;
const float hh = 1.0 - lh;
const float hw = 1.0 - lw;
const int32_t w_stride = num_heads * channels;
const int32_t h_stride = spatial_w * w_stride;
const int32_t h_low_ptr_offset = h_low * h_stride;
const int32_t h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int32_t w_low_ptr_offset = w_low * w_stride;
const int32_t w_high_ptr_offset = w_low_ptr_offset + w_stride;
float w1 = hh * hw;
float w2 = hh * lw;
float w3 = lh * hw;
float w4 = lh * lw;
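        // Bilinear corner weights: w1/w2/w3/w4 pair with (h_low, w_low),
        // (h_low, w_high), (h_high, w_low), (h_high, w_high); see the scalar
        // reference sketch above.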
for (int32_t C_loop = 0; C_loop < C_repeat; ++C_loop) {
base_ptr = m_col * channels + C_loop * deal_num;
__bang_write_zero(grad_weight, 3 * deal_num);
__bang_write_zero(grad_output_nram, deal_num);
__memcpy(top_grad,
grad_output + grad_output_offset + C_loop * deal_num,
deal_num * sizeof(float), GDRAM2NRAM);
msDeformAttnCol2imBilinear(
top_grad_temp, spatial_h, spatial_w, w1, w2, w3, w4, h_low, w_low,
h_high, w_high, base_ptr, h_low_ptr_offset, w_low_ptr_offset,
h_high_ptr_offset, w_high_ptr_offset, hh, hw, lh, lw, top_grad,
weight, grad_h_weight, grad_w_weight, grad_value_ptr,
grad_output_nram, grad_weight,
grad_sampling_loc + grad_sampling_loc_out + p_col * 2,
grad_attn_weight + grad_attn_weight_out + p_col,
grad_output_nram_temp, deal_num, deal_num, data_value_ptr);
}
if (C_tail != 0) {
base_ptr = m_col * channels + C_repeat * deal_num;
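          // Clearing all eight NRAM slices here also wipes the spatial-shape
          // and sampling-loc scratch, which is safe: both were already read
          // into locals and are re-copied at the top of each point loop.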
__bang_write_zero(grad_output_nram, 8 * deal_num);
__memcpy(top_grad,
grad_output + grad_output_offset + C_repeat * deal_num,
C_tail * sizeof(float), GDRAM2NRAM);
msDeformAttnCol2imBilinear(
top_grad_temp, spatial_h, spatial_w, w1, w2, w3, w4, h_low, w_low,
h_high, w_high, base_ptr, h_low_ptr_offset, w_low_ptr_offset,
h_high_ptr_offset, w_high_ptr_offset, hh, hw, lh, lw, top_grad,
weight, grad_h_weight, grad_w_weight, grad_value_ptr,
grad_output_nram, grad_weight,
grad_sampling_loc + grad_sampling_loc_out + p_col * 2,
grad_attn_weight + grad_attn_weight_out + p_col,
grad_output_nram_temp, deal_num, C_tail, data_value_ptr);
}
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
}
}
}
__mlu_global__ void MLUUnion1KernelMsDeformAttnBackward(
const float *data_value, const int32_t *spatial_shapes,
const int32_t *data_level_start_index, const float *data_sampling_loc,
const float *data_attn_weight, const float *grad_output,
const int32_t batch, const int32_t spatial_size, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_query,
const int32_t num_points, float *grad_value, float *grad_sampling_loc,
float *grad_attn_weight);
void KernelMsDeformAttnBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const float *data_value,
const int32_t *spatial_shapes, const int32_t *data_level_start_index,
const float *data_sampling_loc, const float *data_attn_weight,
const float *grad_output, const int32_t batch, const int32_t spatial_size,
const int32_t num_heads, const int32_t channels, const int32_t num_levels,
const int32_t num_query, const int32_t num_points, float *grad_value,
float *grad_sampling_loc, float *grad_attn_weight) {
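  // Host-side wrapper for the backward pass: launches the Union1 kernel on
  // `queue`. The taskDim implied by k_dim/k_type is what the per-core
  // partitioning at the top of the kernel divides work across; d_type is
  // unused here since only the float path is implemented.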
MLUUnion1KernelMsDeformAttnBackward<<<k_dim, k_type, queue>>>(
data_value, spatial_shapes, data_level_start_index, data_sampling_loc,
data_attn_weight, grad_output, batch, spatial_size, num_heads, channels,
num_levels, num_query, num_points, grad_value, grad_sampling_loc,
grad_attn_weight);
}