OpenDAS / MMCV · Commits

Commit fdeee889 — authored May 25, 2025 by limm

    release v1.6.1 of mmcv

Parent: df465820
Changes: 457 files in the commit; this view shows 20 changed files with 2759 additions and 40 deletions (+2759, −40).
Changed files shown below (20 of the 457 in the commit):

  mmcv/ops/csrc/pytorch/cuda/sparse_reordering.cu        +157   −0
  mmcv/ops/csrc/pytorch/cuda/spconv_ops_cuda.cu          +474   −0
  mmcv/ops/csrc/pytorch/cuda/three_interpolate_cuda.cu     +2   −2
  mmcv/ops/csrc/pytorch/cuda/three_nn_cuda.cu              +1   −1
  mmcv/ops/csrc/pytorch/cuda/voxelization_cuda.cu         +98   −0
  mmcv/ops/csrc/pytorch/diff_iou_rotated.cpp              +14   −0
  mmcv/ops/csrc/pytorch/fused_spconv_ops.cpp              +34   −0
  mmcv/ops/csrc/pytorch/iou3d.cpp                         +21   −37
  mmcv/ops/csrc/pytorch/min_area_polygons.cpp             +11   −0
  mmcv/ops/csrc/pytorch/mlu/bbox_overlaps_mlu.cpp        +100   −0
  mmcv/ops/csrc/pytorch/mlu/focal_loss_sigmoid_mlu.cpp   +332   −0
  mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp                  +130   −0
  mmcv/ops/csrc/pytorch/mlu/psamask_mlu.cpp              +308   −0
  mmcv/ops/csrc/pytorch/mlu/roi_align_mlu.cpp            +206   −0
  mmcv/ops/csrc/pytorch/mlu/roi_align_rotated_mlu.cpp    +232   −0
  mmcv/ops/csrc/pytorch/mlu/roi_pool_mlu.cpp             +275   −0
  mmcv/ops/csrc/pytorch/mlu/tin_shift_mlu.cpp            +203   −0
  mmcv/ops/csrc/pytorch/mps/bbox_overlaps_mps.mm          +99   −0
  mmcv/ops/csrc/pytorch/points_in_polygons.cpp            +15   −0
  mmcv/ops/csrc/pytorch/prroi_pool.cpp                    +47   −0
mmcv/ops/csrc/pytorch/cuda/sparse_reordering.cu  (new file, mode 100644)

// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/spconv/reordering.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>

#include <chrono>
#include <limits>
#include <spconv/reordering.cuh>
#include <type_traits>
#include <utils/spconv/tensorview/helper_kernel.cuh>

#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"

namespace functor {
template <typename scalar_t, typename Index>
struct SparseGatherFunctor<tv::TorchGPU, scalar_t, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::TorchGPU& d, tv::TensorView<scalar_t> buffer,
                  tv::TensorView<const scalar_t> features,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = features.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
    mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
                                 &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;

      int nHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (nHotBlock >= NumTLP) {
            gatherVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
                                 vecload_type_t>
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(buffer.data(), features.data(),
                                    indices.data(), nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size - nHotBlock > 0) {
            gatherVecKernel<scalar_t, Index, int(NumTLP), NumILP,
                            vecload_type_t>
                <<<dim3(1, numPlanes / NumTLP),
                   dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
                   d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
                                    features.data(),
                                    indices.data() + nHotBlock,
                                    size - nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });

    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      gatherGenericKernel<scalar_t, Index, NumTLP, NumILP>
          <<<dim3(tv::launch::DivUp(size, NumTLP),
                  tv::launch::DivUp(numPlanes, NumTLP)),
             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              buffer.data(), features.data(), indices.data(), size, numPlanes);
      TV_CHECK_CUDA_ERR();
    }
  }
};

template <typename scalar_t, typename Index>
struct SparseScatterAddFunctor<tv::TorchGPU, scalar_t, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::TorchGPU& d, tv::TensorView<scalar_t> outFeatures,
                  tv::TensorView<const scalar_t> buffer,
                  tv::TensorView<const Index> indices, int size, bool stable) {
    if (size <= 0) return;
    int numPlanes = outFeatures.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor =
        sizeof(vecload_type_t) / sizeof(scalar_t);  // important for half.
    mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
                                 &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;
      int nHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (nHotBlock >= NumTLP) {
            scatterAddVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), buffer.data(),
                                    indices.data(), nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size - nHotBlock > 0) {
            scatterAddGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP),
                   dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                    outFeatures.data(), buffer.data() + nHotBlock * numPlanes,
                    indices.data() + nHotBlock, size - nHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });
    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      scatterAddGenericKernel<scalar_t, Index, NumTLP, NumILP>
          <<<dim3(tv::launch::DivUp(size, NumTLP),
                  tv::launch::DivUp(numPlanes, NumTLP)),
             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              outFeatures.data(), buffer.data(), indices.data(), size,
              numPlanes);
      TV_CHECK_CUDA_ERR();
    }
  }
};
}  // namespace functor

#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index)                             \
  template struct functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, Index>; \
  template struct functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t,     \
                                                   Index>;

#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);

DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);

#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
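The gather/scatter functors above pick a vector load type at compile time: int2 (8 bytes) when scalar_t is at::Half, int4 (16 bytes) otherwise, so vectorized kernels move a fixed number of bytes per load and vecloadFactor tells how many scalar elements that covers. A minimal standalone sketch of that selection, with illustrative names that are not part of the file above (and plain CUDA `half` standing in for at::Half):

#include <cuda_fp16.h>
#include <cstdio>
#include <type_traits>

// Sketch only: reproduce the vecload_type_t / vecloadFactor logic in isolation.
template <typename scalar_t>
struct VecLoad {
  using type =
      std::conditional_t<std::is_same<scalar_t, half>::value, int2, int4>;
  // Number of scalar elements moved by a single vector load.
  static constexpr int factor = sizeof(type) / sizeof(scalar_t);
};

int main() {
  printf("half  : %d elems per load\n", VecLoad<half>::factor);    // 8 B / 2 B = 4
  printf("float : %d elems per load\n", VecLoad<float>::factor);   // 16 B / 4 B = 4
  printf("double: %d elems per load\n", VecLoad<double>::factor);  // 16 B / 8 B = 2
  return 0;
}

This is why the block-kernel launches above divide one thread dimension by vecloadFactor: each thread already covers several feature channels per load.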
mmcv/ops/csrc/pytorch/cuda/spconv_ops_cuda.cu  (new file, mode 100644)

#include <cuda_runtime_api.h>
#include <torch/script.h>
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/reordering.h>

#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"

template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  at::cuda::CUDAGuard device_guard(indices.device());
  bool subM = _subM != 0;
  bool transpose = _transpose != 0;
  auto numAct = indices.size(0);
  auto coorDim = indices.size(1) - 1;

  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");

  auto kernelVolume = kernelSize[0];
  for (int i = 1; i < kernelSize.size(); ++i) {
    kernelVolume *= kernelSize[i];
  }
  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");

  auto outputVolume = outSpatialShape[0];
  for (int i = 1; i < outSpatialShape.size(); ++i) {
    outputVolume *= outSpatialShape[i];
  }
  torch::Tensor indicePairs =
      torch::full({kernelVolume, 2, numAct}, -1,
                  torch::dtype(torch::kInt32).device(indices.device()));
  torch::Tensor indiceNum = torch::zeros(
      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
  torch::Tensor gridOut =
      torch::full({batchSize * outputVolume}, -1,
                  torch::dtype(torch::kInt32).device(indices.device()));
  int64_t numActOut = -1;

  tv::SimpleVector<int, NDim> outSpatialShape32;
  tv::SimpleVector<int, NDim> kernelSize32;
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> padding32;
  tv::SimpleVector<int, NDim> dilation32;
  auto indicePairUnique = torch::full(
      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
      torch::dtype(torch::kInt32).device(indices.device()));
  for (int i = 0; i < NDim; ++i) {
    outSpatialShape32.push_back(outSpatialShape[i]);
    kernelSize32.push_back(kernelSize[i]);
    if (subM) {
      stride32.push_back(1);
      padding32.push_back(kernelSize[i] / 2);
      dilation32.push_back(dilation[i]);
    } else {
      stride32.push_back(stride[i]);
      padding32.push_back(padding[i]);
      dilation32.push_back(dilation[i]);
    }
  }

  if (subM) {
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
    } else {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
    }
    return {indices, indicePairs, indiceNum};
  } else {
    torch::Tensor outInds =
        torch::zeros({numAct * kernelVolume, coorDim + 1},
                     torch::dtype(torch::kInt32).device(indices.device()));
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
          transpose);
    } else {
      auto getIndicePairFtorP1 =
          functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,
                                                 NDim>();
      auto getIndicePairFtorP2 =
          functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,
                                                 NDim>();
      numActOut = getIndicePairFtorP1(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
          padding32, dilation32, outSpatialShape32, transpose);
      if (numActOut > 0) {
        auto res = torch::_unique(indicePairUnique);
        indicePairUnique = std::get<0>(res);
        numActOut = getIndicePairFtorP2(
            tv::TorchGPU(), tv::torch2tv<const int>(indices),
            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose);
      }
    }
    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
  }
}

template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  at::cuda::CUDAGuard device_guard(indices.device());
  bool subM = _subM != 0;
  bool transpose = _transpose != 0;
  auto numAct = indices.size(0);
  auto coorDim = indices.size(1) - 1;

  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");

  auto kernelVolume = kernelSize[0];
  for (int i = 1; i < kernelSize.size(); ++i) {
    kernelVolume *= kernelSize[i];
  }
  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");

  auto outputVolume = outSpatialShape[0];
  for (int i = 1; i < outSpatialShape.size(); ++i) {
    outputVolume *= outSpatialShape[i];
  }
  TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");

  torch::Tensor indicePairs =
      torch::full({kernelVolume, 2, numAct}, -1,
                  torch::dtype(torch::kInt32).device(indices.device()));
  torch::Tensor indiceNum = torch::zeros(
      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
  int64_t numActOut = -1;

  tv::SimpleVector<int, NDim> outSpatialShape32;
  tv::SimpleVector<int, NDim> kernelSize32;
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> padding32;
  tv::SimpleVector<int, NDim> dilation32;
  auto indicePairUnique = torch::full(
      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
      torch::dtype(torch::kInt32).device(indices.device()));
  for (int i = 0; i < NDim; ++i) {
    outSpatialShape32.push_back(outSpatialShape[i]);
    kernelSize32.push_back(kernelSize[i]);
    if (subM) {
      stride32.push_back(1);
      padding32.push_back(kernelSize[i] / 2);
      dilation32.push_back(dilation[i]);
    } else {
      stride32.push_back(stride[i]);
      padding32.push_back(padding[i]);
      dilation32.push_back(dilation[i]);
    }
  }

  if (subM) {
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
      gridOut.fill_(-1);
    } else {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose, true);
    }
    return {indices, indicePairs, indiceNum};
  } else {
    torch::Tensor outInds =
        torch::zeros({numAct * kernelVolume, coorDim + 1},
                     torch::dtype(torch::kInt32).device(indices.device()));
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
          transpose, true);
      gridOut.fill_(-1);
    } else {
      auto getIndicePairFtorP1 =
          functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,
                                                 NDim>();
      auto getIndicePairFtorP2 =
          functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,
                                                 NDim>();
      numActOut = getIndicePairFtorP1(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
          padding32, dilation32, outSpatialShape32, transpose);
      if (numActOut > 0) {
        auto res = torch::_unique(indicePairUnique);
        indicePairUnique = std::get<0>(res);
        numActOut = getIndicePairFtorP2(
            tv::TorchGPU(), tv::torch2tv<const int>(indices),
            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
            true);
      }
    }
    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
  }
}

torch::Tensor IndiceConvForwardCUDAKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
    torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,
    int64_t _subM) {
  at::cuda::CUDAGuard device_guard(features.device());
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;

  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumCpu.data_ptr<int>(),
                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
  int indicePairMaxOffset =
      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;

  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());

  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  if (subM) {
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
  }
  double totalGatherTime = 0;
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        features.scalar_type(), "IndiceConvForwardKernel", [&] {
          auto outputBufferBlob = torch::from_blob(
              outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
          auto inputBufferBlob = torch::from_blob(
              inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);

          if (device == torch::kCPU) {
            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
            gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
          } else {
            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
                gatherFtor;
            gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
            TV_CHECK_CUDA_ERR();
            /* slower than SparseGatherFunctor, may due to int->long conversion
            auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
            auto indicePairBlob =
                torch::from_blob(indicePairLong.data_ptr<long>(), {nHot},
                                 indicePairOptions);
            torch::index_select_out(inputBufferBlob, features, 0,
                                    indicePairBlob);*/
          }
          torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);

          if (device == torch::kCPU) {
            functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
                scatterFtor;
            scatterFtor(
                tv::CPU(), tv::torch2tv<scalar_t>(output),
                tv::torch2tv<const scalar_t>(outputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                true);
          } else {
            functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
                scatterFtor;
            scatterFtor(
                tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
                tv::torch2tv<const scalar_t>(outputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
                true);
            TV_CHECK_CUDA_ERR();
          }
        });
  }
  return output;
}

std::vector<torch::Tensor> IndiceConvBackwardCUDAKernelLauncher(
    torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
    int64_t _subM) {
  at::cuda::CUDAGuard device_guard(features.device());
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;

  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumCpu.data_ptr<int>(),
                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
  int indicePairMaxOffset =
      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;

  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  auto filterShape = filters.sizes();
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);

  filters = filters.view({-1, numInPlanes, numOutPlanes});
  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
  if (subM) {
    auto filterGradSub = filtersGrad[indicePairMaxOffset];
    torch::mm_out(filterGradSub, features.t(), outGrad);
    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
  }
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        features.scalar_type(), "IndiceConvBackwardKernel", [&] {
          if (device == torch::kCPU) {
            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
            functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtorOut;
            gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
            gatherFtorOut(
                tv::CPU(), tv::torch2tv<scalar_t>(outputBuffer),
                tv::torch2tv<const scalar_t>(outGrad),
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                nHot);
          } else {
            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
                gatherFtor;
            functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
                gatherFtorOut;
            gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
                       tv::torch2tv<const scalar_t>(features),
                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                       nHot);
            TV_CHECK_CUDA_ERR();
            gatherFtorOut(
                tv::TorchGPU(), tv::torch2tv<scalar_t>(outputBuffer),
                tv::torch2tv<const scalar_t>(outGrad),
                tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                nHot);
            TV_CHECK_CUDA_ERR();
          }
          auto filterGradSub = filtersGrad[i];
          auto outputBufferBlob = torch::from_blob(
              outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
          auto inputBufferBlob = torch::from_blob(
              inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);

          torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
          torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
          if (device == torch::kCPU) {
            functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
                scatterFtor;
            scatterFtor(
                tv::CPU(), tv::torch2tv<scalar_t>(inputGrad),
                tv::torch2tv<const scalar_t>(inputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
          } else {
            functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
                scatterFtor;
            scatterFtor(
                tv::TorchGPU(), tv::torch2tv<scalar_t>(inputGrad),
                tv::torch2tv<const scalar_t>(inputBuffer),
                tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
            TV_CHECK_CUDA_ERR();
          }
        });
  }
  return {inputGrad, filtersGrad.view(filterShape)};
}

template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<2>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<3>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<4>(
    torch::Tensor indices, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

template std::vector<torch::Tensor>
GetIndicePairsBackwardCUDAKernelLauncher<2>(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);

template std::vector<torch::Tensor>
GetIndicePairsBackwardCUDAKernelLauncher<3>(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
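The explicit instantiations at the end of the file exist because the launcher templates are defined inside a .cu translation unit: callers in ordinary .cpp files only see declarations, so the NDim specializations they link against (2, 3, 4 for the forward launcher; 2, 3 for the backward one) must be emitted here. A reduced sketch of that pattern with illustrative names, not mmcv's API:

// launcher.h — what plain .cpp callers include (declaration only).
template <unsigned NDim>
int CountActiveSites(int numAct);

// launcher.cu — the definition lives next to the CUDA code.
template <unsigned NDim>
int CountActiveSites(int numAct) {
  return numAct * static_cast<int>(NDim);  // stand-in for the real kernel work
}

// Emit exactly the specializations other translation units will need.
template int CountActiveSites<2>(int numAct);
template int CountActiveSites<3>(int numAct);
template int CountActiveSites<4>(int numAct);

Without these lines the linker would report undefined symbols for each NDim used elsewhere in the extension.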
mmcv/ops/csrc/pytorch/cuda/three_interpolate_cuda.cu

@@ -23,7 +23,7 @@ void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();

   // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);
+  dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
   dim3 threads(THREADS_PER_BLOCK);

   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -51,7 +51,7 @@ void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();

   // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);
+  dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
   dim3 threads(THREADS_PER_BLOCK);

   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
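The change in both launchers (and in three_nn below) swaps DIVUP for GET_BLOCKS when computing the grid's x-dimension. DIVUP is plain ceiling division, while mmcv's GET_BLOCKS helper additionally caps the block count so the grid dimension cannot grow without bound for very large n, with kernels expected to stride over the remaining work. A hedged sketch of the two helpers; the 4096 cap mirrors mmcv's CUDA helper but treat the exact value as an assumption:

#include <algorithm>

#define THREADS_PER_BLOCK 512

// Plain ceiling division: blocks needed to cover N items, one item per thread.
#define DIVUP(m, n) (((m) + (n) - 1) / (n))

// Capped variant: same ceiling division, but never request more than
// kMaxGridNum blocks.
inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
  const int kMaxGridNum = 4096;  // assumed mmcv-style cap
  return std::min(DIVUP(N, num_threads), kMaxGridNum);
}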
mmcv/ops/csrc/pytorch/cuda/three_nn_cuda.cu

@@ -21,7 +21,7 @@ void ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown,
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();

   // blockIdx.x(col), blockIdx.y(row)
-  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);
+  dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), b);
   dim3 threads(THREADS_PER_BLOCK);

   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
mmcv/ops/csrc/pytorch/cuda/voxelization_cuda.cu

@@ -145,6 +145,104 @@ int HardVoxelizeForwardCUDAKernelLauncher(
   return voxel_num_int;
 }

+int NondeterministicHardVoxelizeForwardCUDAKernelLauncher(
+    const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
+    at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
+    const std::vector<float> coors_range, const int max_points,
+    const int max_voxels, const int NDim = 3) {
+  at::cuda::CUDAGuard device_guard(points.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  const int num_points = points.size(0);
+  const int num_features = points.size(1);
+
+  if (num_points == 0) return 0;
+
+  dim3 blocks(
+      std::min(at::cuda::ATenCeilDiv(num_points, THREADS_PER_BLOCK), 4096));
+  dim3 threads(THREADS_PER_BLOCK);
+
+  const float voxel_x = voxel_size[0];
+  const float voxel_y = voxel_size[1];
+  const float voxel_z = voxel_size[2];
+  const float coors_x_min = coors_range[0];
+  const float coors_y_min = coors_range[1];
+  const float coors_z_min = coors_range[2];
+  const float coors_x_max = coors_range[3];
+  const float coors_y_max = coors_range[4];
+  const float coors_z_max = coors_range[5];
+
+  const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
+  const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
+  const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
+
+  // map points to voxel coors
+  at::Tensor temp_coors =
+      at::zeros({num_points, NDim}, points.options().dtype(at::kInt));
+
+  // 1. link point to corresponding voxel coors
+  AT_DISPATCH_ALL_TYPES(
+      points.scalar_type(), "hard_voxelize_kernel", ([&] {
+        dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
+            points.contiguous().data_ptr<scalar_t>(),
+            temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
+            coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
+            coors_z_max, grid_x, grid_y, grid_z, num_points, num_features,
+            NDim);
+      }));
+
+  at::Tensor coors_map;
+  at::Tensor reduce_count;
+
+  auto coors_clean = temp_coors.masked_fill(temp_coors.lt(0).any(-1, true), -1);
+
+  std::tie(temp_coors, coors_map, reduce_count) =
+      at::unique_dim(coors_clean, 0, true, true, false);
+
+  if (temp_coors[0][0].lt(0).item<bool>()) {
+    // the first element of temp_coors is (-1,-1,-1) and should be removed
+    temp_coors = temp_coors.slice(0, 1);
+    coors_map = coors_map - 1;
+  }
+
+  int num_coors = temp_coors.size(0);
+  temp_coors = temp_coors.to(at::kInt);
+  coors_map = coors_map.to(at::kInt);
+
+  at::Tensor coors_count = at::zeros({1}, coors_map.options());
+  at::Tensor coors_order = at::empty({num_coors}, coors_map.options());
+  at::Tensor pts_id = at::zeros({num_points}, coors_map.options());
+  reduce_count = at::zeros({num_coors}, coors_map.options());
+
+  AT_DISPATCH_ALL_TYPES(
+      points.scalar_type(), "get_assign_pos", ([&] {
+        nondeterministic_get_assign_pos<<<blocks, threads, 0, stream>>>(
+            num_points, coors_map.contiguous().data_ptr<int32_t>(),
+            pts_id.contiguous().data_ptr<int32_t>(),
+            coors_count.contiguous().data_ptr<int32_t>(),
+            reduce_count.contiguous().data_ptr<int32_t>(),
+            coors_order.contiguous().data_ptr<int32_t>());
+      }));
+
+  AT_DISPATCH_ALL_TYPES(
+      points.scalar_type(), "assign_point_to_voxel", ([&] {
+        nondeterministic_assign_point_voxel<scalar_t>
+            <<<blocks, threads, 0, stream>>>(
+                num_points, points.contiguous().data_ptr<scalar_t>(),
+                coors_map.contiguous().data_ptr<int32_t>(),
+                pts_id.contiguous().data_ptr<int32_t>(),
+                temp_coors.contiguous().data_ptr<int32_t>(),
+                reduce_count.contiguous().data_ptr<int32_t>(),
+                coors_order.contiguous().data_ptr<int32_t>(),
+                voxels.contiguous().data_ptr<scalar_t>(),
+                coors.contiguous().data_ptr<int32_t>(),
+                num_points_per_voxel.contiguous().data_ptr<int32_t>(),
+                max_voxels, max_points, num_features, NDim);
+      }));
+
+  AT_CUDA_CHECK(cudaGetLastError());
+  return max_voxels < num_coors ? max_voxels : num_coors;
+}
+
 void DynamicVoxelizeForwardCUDAKernelLauncher(
     const at::Tensor &points, at::Tensor &coors,
     const std::vector<float> voxel_size, const std::vector<float> coors_range,
mmcv/ops/csrc/pytorch/diff_iou_rotated.cpp  (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
                                                   Tensor num_valid) {
  return DISPATCH_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl,
                              vertices, mask, num_valid);
}

Tensor diff_iou_rotated_sort_vertices_forward(Tensor vertices, Tensor mask,
                                              Tensor num_valid) {
  return diff_iou_rotated_sort_vertices_forward_impl(vertices, mask, num_valid);
}
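diff_iou_rotated.cpp (like min_area_polygons.cpp and fused_spconv_ops.cpp below) is a thin device-agnostic wrapper: the _impl function forwards through DISPATCH_DEVICE_IMPL, which selects whichever backend (CUDA, MLU, ...) registered itself for that op via REGISTER_DEVICE_IMPL. A simplified sketch of the registry idea only, not mmcv's actual macros or types:

#include <functional>
#include <map>
#include <stdexcept>
#include <string>

// One op name -> map of device name to callable. Real mmcv keys on a
// DeviceType enum, infers the device from the tensor arguments, and hides
// registration behind macros; this keeps only the shape of the mechanism.
using OpFn = std::function<int(int)>;

static std::map<std::string, std::map<std::string, OpFn>>& registry() {
  static std::map<std::string, std::map<std::string, OpFn>> r;
  return r;
}

struct RegisterDeviceImpl {
  RegisterDeviceImpl(const std::string& op, const std::string& device, OpFn fn) {
    registry()[op][device] = std::move(fn);
  }
};

int dispatch_device_impl(const std::string& op, const std::string& device,
                         int arg) {
  auto& impls = registry()[op];
  auto it = impls.find(device);
  if (it == impls.end())
    throw std::runtime_error(op + " is not implemented for " + device);
  return it->second(arg);
}

// A backend file registers its implementation at static-initialization time.
static RegisterDeviceImpl reg_cuda("sort_vertices", "cuda",
                                   [](int x) { return x * 2; });

The MLU files later in this commit use exactly that registration side of the pattern (REGISTER_DEVICE_IMPL(..., MLU, ...)).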
mmcv/ops/csrc/pytorch/fused_spconv_ops.cpp  (new file, mode 100644)

// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

torch::Tensor fused_indice_conv_batchnorm_forward_impl(
    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
    int64_t _inverse, int64_t _subM) {
  return DISPATCH_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl,
                              features, filters, bias, indicePairs, indiceNum,
                              numActOut, _inverse, _subM);
}

torch::Tensor fused_indice_conv_batchnorm_forward(
    torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
    torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
    int64_t _inverse, int64_t _subM) {
  return fused_indice_conv_batchnorm_forward_impl(features, filters, bias,
                                                  indicePairs, indiceNum,
                                                  numActOut, _inverse, _subM);
}
mmcv/ops/csrc/pytorch/iou3d.cpp

@@ -19,31 +19,24 @@ void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
                        num_b, boxes_b, ans_overlap);
 }

-void iou3d_boxes_iou_bev_forward_impl(const int num_a, const Tensor boxes_a,
-                                      const int num_b, const Tensor boxes_b,
-                                      Tensor ans_iou) {
-  DISPATCH_DEVICE_IMPL(iou3d_boxes_iou_bev_forward_impl, num_a, boxes_a, num_b,
-                       boxes_b, ans_iou);
-}
-
-void iou3d_nms_forward_impl(const Tensor boxes, unsigned long long *mask,
-                            int boxes_num, float nms_overlap_thresh) {
-  DISPATCH_DEVICE_IMPL(iou3d_nms_forward_impl, boxes, mask, boxes_num,
+void iou3d_nms3d_forward_impl(const Tensor boxes, unsigned long long *mask,
+                              int boxes_num, float nms_overlap_thresh) {
+  DISPATCH_DEVICE_IMPL(iou3d_nms3d_forward_impl, boxes, mask, boxes_num,
                        nms_overlap_thresh);
 }

-void iou3d_nms_normal_forward_impl(const Tensor boxes, unsigned long long *mask,
-                                   int boxes_num, float nms_overlap_thresh) {
-  DISPATCH_DEVICE_IMPL(iou3d_nms_normal_forward_impl, boxes, mask, boxes_num,
+void iou3d_nms3d_normal_forward_impl(const Tensor boxes,
+                                     unsigned long long *mask, int boxes_num,
+                                     float nms_overlap_thresh) {
+  DISPATCH_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, boxes, mask, boxes_num,
                        nms_overlap_thresh);
 }

 void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                      Tensor ans_overlap) {
-  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
+  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
   // params boxes_b: (M, 5)
   // params ans_overlap: (N, M)
   int num_a = boxes_a.size(0);
   int num_b = boxes_b.size(0);
@@ -51,20 +44,9 @@ void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
                                        ans_overlap);
 }

-void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
-                                 Tensor ans_iou) {
-  // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
-  // params boxes_b: (M, 5)
-  // params ans_overlap: (N, M)
-  int num_a = boxes_a.size(0);
-  int num_b = boxes_b.size(0);
-
-  iou3d_boxes_iou_bev_forward_impl(num_a, boxes_a, num_b, boxes_b, ans_iou);
-}
-
-void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
-                       float nms_overlap_thresh) {
-  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
+void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
+                         float nms_overlap_thresh) {
+  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
   // params keep: (N)
   CHECK_CONTIGUOUS(boxes);
   CHECK_CONTIGUOUS(keep);
@@ -73,13 +55,14 @@ void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
   int64_t *keep_data = keep.data_ptr<int64_t>();
   int64_t *keep_num_data = keep_num.data_ptr<int64_t>();

-  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+  const int col_blocks =
+      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
   Tensor mask =
       at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
   unsigned long long *mask_data =
       (unsigned long long *)mask.data_ptr<int64_t>();
-  iou3d_nms_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);
+  iou3d_nms3d_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);

   at::Tensor mask_cpu = mask.to(at::kCPU);
   unsigned long long *mask_host =
@@ -105,9 +88,9 @@ void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
   }
 }

-void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
-                              float nms_overlap_thresh) {
-  // params boxes: (N, 5) [x1, y1, x2, y2, ry]
+void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
+                                float nms_overlap_thresh) {
+  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
   // params keep: (N)
   CHECK_CONTIGUOUS(boxes);
@@ -117,14 +100,15 @@ void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
   int64_t *keep_data = keep.data_ptr<int64_t>();
   int64_t *keep_num_data = keep_num.data_ptr<int64_t>();

-  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+  const int col_blocks =
+      (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
   Tensor mask =
       at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
   unsigned long long *mask_data =
       (unsigned long long *)mask.data_ptr<int64_t>();
-  iou3d_nms_normal_forward_impl(boxes, mask_data, boxes_num,
-                                nms_overlap_thresh);
+  iou3d_nms3d_normal_forward_impl(boxes, mask_data, boxes_num,
+                                  nms_overlap_thresh);

   at::Tensor mask_cpu = mask.to(at::kCPU);
   unsigned long long *mask_host =
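In both NMS wrappers the mask tensor is a boxes_num × col_blocks matrix of 64-bit words, where col_blocks is a ceiling division of boxes_num by THREADS_PER_BLOCK_NMS; the rewritten col_blocks expression is simply that ceiling division spelled out without the DIVUP macro. A hedged sketch of how such a bitmask is typically consumed on the host to fill keep (this mirrors the greedy loop in the elided part of the file; the value 64 is assumed to match the one-bit-per-box layout):

#include <cstdint>
#include <vector>

// mask_host[i * col_blocks + k], bit j set  =>  box i overlaps box k*64 + j
// above the IoU threshold. Boxes are assumed pre-sorted by score.
std::vector<int64_t> collect_keep(const uint64_t* mask_host, int boxes_num) {
  const int kThreads = 64;  // assumed THREADS_PER_BLOCK_NMS
  const int col_blocks = (boxes_num + kThreads - 1) / kThreads;
  std::vector<uint64_t> removed(col_blocks, 0);
  std::vector<int64_t> keep;
  for (int i = 0; i < boxes_num; ++i) {
    const int block = i / kThreads, bit = i % kThreads;
    if (removed[block] & (1ULL << bit)) continue;  // already suppressed
    keep.push_back(i);
    // Suppress every box that overlaps the box we just kept.
    for (int k = 0; k < col_blocks; ++k)
      removed[k] |= mask_host[i * col_blocks + k];
  }
  return keep;
}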
mmcv/ops/csrc/pytorch/min_area_polygons.cpp  (new file, mode 100644)

// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void min_area_polygons_impl(const Tensor pointsets, Tensor polygons) {
  DISPATCH_DEVICE_IMPL(min_area_polygons_impl, pointsets, polygons);
}

void min_area_polygons(const Tensor pointsets, Tensor polygons) {
  min_area_polygons_impl(pointsets, polygons);
}
mmcv/ops/csrc/pytorch/mlu/bbox_overlaps_mlu.cpp  (new file, mode 100644)

/*************************************************************************
 * Copyright (C) 2021 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                        cnrtQueue_t queue, const cnrtDataType_t d_type,
                        const void *bbox1, const void *bbox2, void *ious,
                        const int32_t num_bbox1, const int32_t num_bbox2,
                        const int32_t mode, const bool aligned,
                        const int32_t offset);

static void policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
                       const int32_t batch_num_all) {
  auto union_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  auto core_num = union_num * core_dim;

  // Union1 policyFunc
  *k_type = CNRT_FUNC_TYPE_UNION1;
  k_dim->x = core_dim;
  auto need_core_num = PAD_UP(batch_num_all, core_dim);
  k_dim->y =
      (need_core_num < core_num) ? (need_core_num / core_dim) : union_num;
  k_dim->z = 1;

  return;
}

void BBoxOverlapsMLUKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
                                   Tensor ious, const int32_t mode,
                                   const bool aligned, const int32_t offset) {
  // check dtype
  TORCH_CHECK(
      bboxes1.scalar_type() == at::kFloat || bboxes1.scalar_type() == at::kHalf,
      "Data type of input should be Float or Half. But now input type is ",
      bboxes1.scalar_type(), ".");
  TORCH_CHECK(bboxes1.scalar_type() == bboxes2.scalar_type(),
              "bboxes1's dtype should be the same with bboxes2's dtype.");

  // params check
  TORCH_CHECK(bboxes1.dim() == 2, "bboxes1 should be a 2d tensor, got ",
              bboxes1.dim(), "D");
  TORCH_CHECK(bboxes2.dim() == 2, "bboxes2 should be a 2d tensor, got ",
              bboxes2.dim(), "D");

  auto rows = bboxes1.size(0);
  auto cols = bboxes2.size(0);
  auto batch_num_all = rows;

  if (rows * cols == 0) {
    // return if zero element
    return;
  }

  // calculate task dimension
  cnrtDim3_t k_dim;
  cnrtFunctionType_t k_type;
  policyFunc(&k_dim, &k_type, batch_num_all);

  // get compute queue
  cnrtQueue_t queue = torch_mlu::getCurQueue();

  // get dtype of input
  cnrtDataType_t d_type = torch_mlu::toCnrtDtype(bboxes1.dtype());

  // get ptr of tensors
  auto bboxes1_impl = torch_mlu::getMluTensorImpl(bboxes1);
  auto bboxes1_ptr = bboxes1_impl->cnnlMalloc();
  auto bboxes2_impl = torch_mlu::getMluTensorImpl(bboxes2);
  auto bboxes2_ptr = bboxes2_impl->cnnlMalloc();
  auto ious_impl = torch_mlu::getMluTensorImpl(ious);
  auto ious_ptr = ious_impl->cnnlMalloc();

  // launch kernel
  CNLOG(INFO) << "Launch Kernel MLUUnion1BboxOverlapsKernel";
  CNLOG(INFO) << "kDim :[ " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z
              << " ]";
  KernelBBoxOverlaps(k_dim, k_type, queue, d_type, bboxes1_ptr, bboxes2_ptr,
                     ious_ptr, rows, cols, mode, aligned, offset);
}

void bbox_overlaps_mlu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                       const int mode, const bool aligned, const int offset) {
  BBoxOverlapsMLUKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
}

void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                        const int mode, const bool aligned, const int offset);
REGISTER_DEVICE_IMPL(bbox_overlaps_impl, MLU, bbox_overlaps_mlu);
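policyFunc above sizes the MLU launch: it always requests a UNION1 job, fixes k_dim->x to the number of cores per cluster, and scales k_dim->y by how many clusters the padded batch actually needs. A standalone sketch of that arithmetic with PAD_UP written out explicitly (illustrative names, outside the CNRT types):

// PAD_UP(x, y): round x up to the next multiple of y (y > 0).
constexpr int pad_up(int x, int y) { return ((x + y - 1) / y) * y; }

// Mirror of the launch-sizing decision: how many clusters (k_dim.y) to use.
int clusters_for(int batch_num_all, int cluster_count, int cores_per_cluster) {
  const int core_num = cluster_count * cores_per_cluster;
  const int need_core_num = pad_up(batch_num_all, cores_per_cluster);
  return (need_core_num < core_num) ? (need_core_num / cores_per_cluster)
                                    : cluster_count;
}
// e.g. 10 rows on 8 clusters x 4 cores: pad_up(10, 4) = 12 -> 3 clusters.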
mmcv/ops/csrc/pytorch/mlu/focal_loss_sigmoid_mlu.cpp
0 → 100644
View file @
fdeee889
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <string>
#include <vector>
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void
KernelFocalLossSigmoidForward
(
cnrtDim3_t
k_dim
,
cnrtFunctionType_t
k_type
,
cnrtQueue_t
queue
,
const
cnrtDataType_t
d_type
,
const
void
*
input
,
const
void
*
target
,
const
void
*
weight
,
const
int32_t
N
,
const
int32_t
C
,
const
float
alpha
,
const
float
gamma
,
void
*
output
);
void
KernelFocalLossSigmoidBackward
(
cnrtDim3_t
k_dim
,
cnrtFunctionType_t
k_type
,
cnrtQueue_t
queue
,
const
cnrtDataType_t
d_type
,
const
void
*
input
,
const
void
*
target
,
const
void
*
weight
,
const
float
gamma
,
const
float
alpha
,
const
int32_t
dim_n
,
const
int32_t
deal_n
,
const
int32_t
dim_c
,
void
*
output
);
// Policy Function for Forward
static
void
policyFuncForward
(
cnrtDim3_t
*
k_dim
,
cnrtFunctionType_t
*
k_type
,
const
Tensor
&
input
,
const
Tensor
&
target
,
const
Tensor
&
weight
)
{
auto
N
=
input
.
size
(
0
);
auto
C
=
input
.
size
(
1
);
const
size_t
nram_size
=
torch_mlu
::
getDeviceAttr
(
cnrtAttrNramSizePerMcore
);
const
size_t
c_align_size
=
PAD_UP
((
C
*
input
.
itemsize
()),
NFU_ALIGN_SIZE
);
const
int
split_target_num
=
2
;
const
int
split_pipeline_num
=
6
;
const
int
has_weight
=
weight
.
data_ptr
()
!=
nullptr
;
const
int
target_data_width
=
target
.
scalar_type
()
==
at
::
kLong
?
target
.
itemsize
()
/
2
:
target
.
itemsize
();
const
int
threshold_c
=
PAD_DOWN
((
nram_size
-
split_target_num
*
sizeof
(
int
))
/
(
split_pipeline_num
+
has_weight
),
NFU_ALIGN_SIZE
)
/
input
.
itemsize
();
int
n_seg
=
1
;
if
(
C
<=
threshold_c
)
{
int
c_size
=
C
*
input
.
itemsize
();
int
reservered_align_size
=
(
split_target_num
+
split_pipeline_num
)
*
NFU_ALIGN_SIZE
;
int
wegiht_size
=
0
;
if
(
has_weight
)
{
c_size
=
c_align_size
;
reservered_align_size
=
split_target_num
*
NFU_ALIGN_SIZE
;
wegiht_size
=
c_align_size
;
}
// n_seg * c_size * split_pipeline_num + n_seg * target.itemsize() *
// split_target_num
// + weight_size + reservered_align_size <= nram_size
n_seg
=
(
nram_size
-
wegiht_size
-
reservered_align_size
)
/
(
split_pipeline_num
*
c_size
+
split_target_num
*
sizeof
(
int32_t
));
}
auto
seg_num
=
n_seg
==
0
?
N
:
(
N
+
n_seg
-
1
)
/
n_seg
;
auto
core_dim
=
torch_mlu
::
getDeviceAttr
(
cnrtAttrMcorePerCluster
);
auto
cluster_num
=
torch_mlu
::
getDeviceAttr
(
cnrtAttrClusterCount
);
auto
core_num
=
core_dim
*
cluster_num
;
k_dim
->
x
=
*
k_type
;
k_dim
->
y
=
seg_num
>
core_num
?
cluster_num
:
(
seg_num
+
core_dim
-
1
)
/
core_dim
;
k_dim
->
z
=
1
;
}
// Policy Function for Backward
static
void
policyFuncBackward
(
cnrtDim3_t
*
k_dim
,
cnrtFunctionType_t
*
k_type
)
{
// set Union1 Job
*
k_type
=
CNRT_FUNC_TYPE_UNION1
;
k_dim
->
x
=
torch_mlu
::
getDeviceAttr
(
cnrtAttrMcorePerCluster
);
k_dim
->
y
=
torch_mlu
::
getDeviceAttr
(
cnrtAttrClusterCount
);
k_dim
->
z
=
1
;
}
void
SigmoidFocalLossForwardMLUKernelLauncher
(
Tensor
input
,
Tensor
target
,
Tensor
weight
,
Tensor
output
,
const
float
gamma
,
const
float
alpha
)
{
// params check
TORCH_CHECK
(
gamma
>=
0
,
"gamma should be greater than or equal to 0. "
,
"But now gamma is "
,
gamma
,
"."
);
// check dtype
TORCH_CHECK
(
input
.
scalar_type
()
==
at
::
kFloat
||
input
.
scalar_type
()
==
at
::
kHalf
,
"Data type of input should be Float or Half. But now input type is "
,
input
.
scalar_type
(),
"."
);
TORCH_CHECK
(
(
target
.
scalar_type
()
==
at
::
kInt
||
target
.
scalar_type
()
==
at
::
kLong
),
"target type should be Int or Long. "
,
"But now target type is "
,
target
.
scalar_type
(),
"."
);
if
(
weight
.
data_ptr
()
!=
nullptr
)
{
TORCH_CHECK
(
weight
.
scalar_type
()
==
input
.
scalar_type
(),
"Data types of input and weight should be the same. But now "
"input type is "
,
input
.
scalar_type
(),
", weight type is "
,
weight
.
scalar_type
(),
"."
);
}
else
{
CNLOG
(
INFO
)
<<
"weight is a empty tensor."
;
}
// return if zero-element
if
(
input
.
numel
()
==
0
||
target
.
numel
()
==
0
||
output
.
numel
()
==
0
)
{
return
;
}
// calculate task dimension
cnrtDim3_t
k_dim
;
cnrtFunctionType_t
k_type
=
CNRT_FUNC_TYPE_UNION1
;
policyFuncForward
(
&
k_dim
,
&
k_type
,
input
,
target
,
weight
);
auto
core_dim
=
torch_mlu
::
getDeviceAttr
(
cnrtAttrMcorePerCluster
);
// get compute queue
auto
queue
=
torch_mlu
::
getCurQueue
();
// get ptr of tensors
auto
input_impl
=
torch_mlu
::
getMluTensorImpl
(
input
);
auto
input_ptr
=
input_impl
->
cnnlMalloc
();
auto
target_impl
=
torch_mlu
::
getMluTensorImpl
(
target
);
auto
target_ptr
=
target_impl
->
cnnlMalloc
();
auto
weight_impl
=
torch_mlu
::
getMluTensorImpl
(
weight
);
auto
weight_ptr
=
weight_impl
->
cnnlMalloc
();
auto
output_impl
=
torch_mlu
::
getMluTensorImpl
(
output
);
auto
output_ptr
=
output_impl
->
cnnlMalloc
();
// get dtype of input
cnrtDataType_t
d_type
=
torch_mlu
::
toCnrtDtype
(
input
.
dtype
());
CNLOG
(
INFO
)
<<
"Launch Kernel KernelFocalLossSigmoidForward<<<Union"
<<
k_type
/
core_dim
<<
", "
<<
k_dim
.
x
<<
", "
<<
k_dim
.
y
<<
", "
<<
k_dim
.
z
<<
">>>"
;
// launch kernel
KernelFocalLossSigmoidForward
(
k_dim
,
k_type
,
queue
,
d_type
,
input_ptr
,
target_ptr
,
weight_ptr
,
input
.
size
(
0
),
input
.
size
(
1
),
alpha
,
gamma
,
output_ptr
);
}
void
getDealNAndThresholdC
(
const
int
compute_data_bytes
,
const
int
target_data_bytes
,
const
int
total_c
,
int
*
deal_n_ptr
,
int
*
threshold_c_ptr
,
const
bool
has_weight
,
const
bool
is_half
)
{
/* NRAM partition:
*
* |-----------------ping pong--------------------|
* |input | pt | alpha_t | temp | output | target | flt_min | gamma | weight|
*
* split_pipeline_num is 5: including input, pt, alpha_t, temp, output.
*/
const
int
nram_split_num
=
5
;
const
int
nram_split_pingpong
=
2
;
const
int
max_nram_size
=
torch_mlu
::
getDeviceAttr
(
cnrtAttrNramSizePerMcore
);
int32_t
compute_align_size
=
NFU_ALIGN_SIZE
;
if
(
is_half
)
{
compute_align_size
+=
NFU_ALIGN_SIZE
;
}
const
int32_t
compute_align_num
=
compute_align_size
/
compute_data_bytes
;
  // reservered_align_size: including input(ping pong), pt(ping pong),
  // alpha_t(ping pong), temp(ping pong),
  // output(ping pong), target(ping pong),
  // flt_min and gamma.
  const int reservered_align_size =
      ((nram_split_num + 1) * nram_split_pingpong + 2) * compute_align_size;
  int nram_pingpong_size = max_nram_size - reservered_align_size;

  int compute_c = total_c;
  int threshold_c = 0;
  if (has_weight) {
    // reserved space for weight to align
    nram_pingpong_size -= NFU_ALIGN_SIZE;

    // threshold_c * nram_split_pingpong * compute_data_bytes * nram_split_num +
    //     nram_split_pingpong * target_data_bytes +
    //     threshold_c * compute_data_bytes <= nram_pingpong_size
    threshold_c =
        (nram_pingpong_size - nram_split_pingpong * target_data_bytes) /
        (compute_data_bytes * (nram_split_num * nram_split_pingpong + 1));
    threshold_c = PAD_DOWN(threshold_c, compute_align_num);
    int weight_space = PAD_UP(total_c * compute_data_bytes, NFU_ALIGN_SIZE);

    // reserved space for weight
    nram_pingpong_size -= weight_space;
    compute_c = PAD_UP(total_c, compute_align_num);
  } else {
    // threshold_c * nram_split_pingpong * compute_data_bytes * nram_split_num +
    //     nram_split_pingpong * target_data_bytes <= nram_pingpong_size
    threshold_c =
        (nram_pingpong_size / nram_split_pingpong - target_data_bytes) /
        (nram_split_num * compute_data_bytes);
  }
  // deal_n * compute_c * nram_split_pingpong * compute_data_bytes *
  //     nram_split_num + deal_n * nram_split_pingpong * target_data_bytes <=
  //     nram_pingpong_size
  *deal_n_ptr =
      nram_pingpong_size /
      ((nram_split_num * compute_c * compute_data_bytes + target_data_bytes) *
       nram_split_pingpong);
  *threshold_c_ptr = threshold_c;
}

void SigmoidFocalLossBackwardMLUKernelLauncher(Tensor input, Tensor target,
                                               Tensor weight, Tensor output,
                                               const float gamma,
                                               const float alpha) {
  // params check
  TORCH_CHECK(gamma >= 0, "gamma should be greater than or equal to 0. ",
              "But now gamma is ", gamma, ".");

  // check dtype
  TORCH_CHECK(
      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
      "Data type of input should be Float or Half. But now input type is ",
      input.scalar_type(), ".");
  TORCH_CHECK(
      (target.scalar_type() == at::kInt || target.scalar_type() == at::kLong),
      "target type should be Int or Long. ", "But now target type is ",
      target.scalar_type(), ".");

  bool has_weight = false;
  if (weight.data_ptr() != nullptr) {
    TORCH_CHECK(weight.scalar_type() == input.scalar_type(),
                "Data types of input and weight should be the same. But now "
                "input type is ",
                input.scalar_type(), ", weight type is ", weight.scalar_type(),
                ".");
    has_weight = true;
  } else {
    CNLOG(INFO) << "weight is a empty tensor.";
  }

  auto dim_c = input.size(1);
  const int compute_data_bytes = sizeof(float);
  // target supports only INT on MLU device while it keeps LONG on host side,
  // so target.itemsize() / 2
  const int target_data_bytes = target.scalar_type() == at::kLong
                                    ? (target.itemsize() / 2)
                                    : target.itemsize();
  int deal_n = 0;
  int threshold_c = 0;
  bool is_half = false;
  if (input.scalar_type() == at::kHalf) {
    is_half = true;
  }
  // calculate deal_n and threshold_c
  getDealNAndThresholdC(compute_data_bytes, target_data_bytes, dim_c, &deal_n,
                        &threshold_c, has_weight, is_half);

  // check C
  TORCH_CHECK(threshold_c >= dim_c,
              "input.size(1) should be in the range of [0, ", threshold_c,
              "]. ", "But now input.size(1) is ", dim_c, ".");

  if (input.numel() == 0 || target.numel() == 0 || output.numel() == 0) {
    // return if zero-element
    return;
  }

  // set task dimension
  cnrtDim3_t k_dim;
  cnrtFunctionType_t k_type;
  policyFuncBackward(&k_dim, &k_type);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  // get ptr of tensors
  auto input_impl = torch_mlu::getMluTensorImpl(input);
  auto input_ptr = input_impl->cnnlMalloc();
  auto target_impl = torch_mlu::getMluTensorImpl(target);
  auto target_ptr = target_impl->cnnlMalloc();
  auto weight_impl = torch_mlu::getMluTensorImpl(weight);
  auto weight_ptr = weight_impl->cnnlMalloc();
  auto output_impl = torch_mlu::getMluTensorImpl(output);
  auto output_ptr = output_impl->cnnlMalloc();

  // get dtype of input
  cnrtDataType_t d_type = torch_mlu::toCnrtDtype(input.dtype());
  auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  auto dim_n = input.size(0);

  CNLOG(INFO) << "Launch Kernel KernelFocalLossSigmoidBackward<<<Union"
              << k_type / core_dim << ", " << k_dim.x << ", " << k_dim.y
              << ", " << k_dim.z << ">>>";

  // launch kernel
  KernelFocalLossSigmoidBackward(k_dim, k_type, queue, d_type, input_ptr,
                                 target_ptr, weight_ptr, gamma, alpha, dim_n,
                                 deal_n, dim_c, output_ptr);
}

void sigmoid_focal_loss_forward_mlu(Tensor input, Tensor target, Tensor weight,
                                    Tensor output, float gamma, float alpha) {
  SigmoidFocalLossForwardMLUKernelLauncher(input, target, weight, output,
                                           gamma, alpha);
}

void sigmoid_focal_loss_backward_mlu(Tensor input, Tensor target,
                                     Tensor weight, Tensor grad_input,
                                     float gamma, float alpha) {
  SigmoidFocalLossBackwardMLUKernelLauncher(input, target, weight, grad_input,
                                            gamma, alpha);
}

void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target,
                                     Tensor weight, Tensor output, float gamma,
                                     float alpha);

void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
                                      Tensor weight, Tensor grad_input,
                                      float gamma, float alpha);

REGISTER_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, MLU,
                     sigmoid_focal_loss_forward_mlu);
REGISTER_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, MLU,
                     sigmoid_focal_loss_backward_mlu);
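The deal_n and threshold_c values above come from the NRAM-budget inequalities written in the comments. A minimal host-side sketch of that arithmetic, using illustrative constants in place of the real device attributes (the NRAM size, split count, and alignment below are assumptions, not the values the launcher queries), is:

#include <cstdio>

int main() {
  // Illustrative values only; the real launcher reads them from the device.
  const int max_nram_size = 512 * 1024;  // assumed NRAM bytes per core
  const int nram_split_num = 5;          // assumed number of ping-pong buffers
  const int nram_split_pingpong = 2;     // double buffering
  const int compute_align_size = 128;    // assumed alignment in bytes
  const int compute_data_bytes = sizeof(float);
  const int target_data_bytes = sizeof(int);
  const int compute_c = 80;              // assumed padded channel count

  const int reserved =
      ((nram_split_num + 1) * nram_split_pingpong + 2) * compute_align_size;
  const int pingpong = max_nram_size - reserved;

  // deal_n * (nram_split_num * compute_c * compute_data_bytes +
  //           target_data_bytes) * nram_split_pingpong <= pingpong
  const int deal_n =
      pingpong /
      ((nram_split_num * compute_c * compute_data_bytes + target_data_bytes) *
       nram_split_pingpong);
  std::printf("rows handled per NRAM pass: %d\n", deal_n);
  return 0;
}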
mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp
0 → 100644
/*************************************************************************
 * Copyright (C) 2021 by Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
               const cnrtDataType_t data_type_input, const void *boxes_ptr,
               const void *scores_ptr, const int input_num_boxes,
               const int input_stride, const int max_output_boxes,
               const float iou_threshold, const float offset,
               void *workspace_ptr, void *output_size_ptr, void *output_ptr);

int selectUnionType(uint32_t use_job, int box_num_per_core) {
  // the box_num_per_core should be at least 256, otherwise the real IO
  // bandwidth would be very low
  while (box_num_per_core < 256 && use_job >= 4) {
    box_num_per_core *= 2;
    use_job /= 2;
  }
  return use_job;
}

Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
                            int offset) {
  // dimension parameters check
  TORCH_CHECK(boxes.dim() == 2, "boxes should be a 2d tensor, got ",
              boxes.dim(), "D");
  TORCH_CHECK(boxes.size(1) == 4,
              "boxes should have 4 elements in dimension 1, got ",
              boxes.size(1));
  TORCH_CHECK(scores.dim() == 1, "scores should be a 1d tensor, got ",
              scores.dim(), "D");

  // data type check
  TORCH_CHECK(boxes.scalar_type() == scores.scalar_type(),
              "boxes should have the same type as scores");
  TORCH_CHECK(
      boxes.scalar_type() == at::kFloat || boxes.scalar_type() == at::kHalf,
      "data type of boxes should be Float or Half, got ", boxes.scalar_type());

  if (boxes.numel() == 0) {
    return at::empty({0}, boxes.options().dtype(at::kLong));
  }

  int input_num_boxes = boxes.size(0);
  int input_stride = boxes.size(0);
  int max_output_boxes = boxes.size(0);
  cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
  cnrtDim3_t k_dim;
  cnrtJobType_t k_type;

  uint32_t union_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  uint32_t job_limit = union_number * core_dim;
  uint32_t core_number = union_number * core_dim;
  int box_num_per_core = (input_num_boxes + core_number - 1) / core_number;
  // initiate k_type as Union1
  k_dim.x = core_dim;
  k_dim.y = 1;
  k_dim.z = 1;
  k_type = CNRT_FUNC_TYPE_UNION1;
  int use_job = selectUnionType(job_limit, box_num_per_core);
  if (use_job < 4) {
    k_dim.x = 1;
    k_type = CNRT_FUNC_TYPE_BLOCK;
  } else if (use_job == 4) {
    k_dim.x = core_dim;
    k_type = CNRT_FUNC_TYPE_UNION1;
  } else {
    k_dim.x = use_job;
    k_type = (cnrtFunctionType_t)use_job;
  }

  // transpose boxes (n, 4) to (4, n) for better performance
  auto boxes_t = boxes.transpose(0, 1);
  auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes_t);
  auto scores_ = torch_mlu::cnnl::ops::cnnl_contiguous(scores);

  auto output = at::empty({max_output_boxes}, boxes.options().dtype(at::kLong));
  auto output_size = at::empty({1}, scores.options().dtype(at::kInt));

  // workspace
  const int info_num = 5;  // x1, x2, y1, y2 and score
  size_t space_size = 0;
  if (boxes.scalar_type() == at::kHalf) {
    space_size = input_num_boxes * sizeof(int16_t) * info_num + sizeof(float);
  } else {
    space_size = input_num_boxes * sizeof(float) * info_num + sizeof(float);
  }
  auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
  auto boxes_ptr = boxes_impl->cnnlMalloc();
  auto scores_impl = torch_mlu::getMluTensorImpl(scores_);
  auto scores_ptr = scores_impl->cnnlMalloc();
  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
  auto workspace_ptr = workspace_impl->cnnlMalloc();
  auto output_impl = torch_mlu::getMluTensorImpl(output);
  auto output_ptr = output_impl->cnnlMalloc();
  auto output_size_impl = torch_mlu::getMluTensorImpl(output_size);
  auto output_size_ptr = output_size_impl->cnnlMalloc();

  CNLOG(INFO) << "Launch Kernel MLUUnionX NMS<<<Union" << k_type / core_dim
              << ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z
              << ">>>";
  KernelNms(k_dim, k_type, queue, data_type_input, boxes_ptr, scores_ptr,
            input_num_boxes, input_stride, max_output_boxes, iou_threshold,
            offset, workspace_ptr, output_size_ptr, output_ptr);

  int output_num = *static_cast<int *>(output_size.cpu().data_ptr());
  return output.slice(0, 0, output_num);
}

Tensor nms_mlu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  return NMSMLUKernelLauncher(boxes, scores, iou_threshold, offset);
}

Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);

REGISTER_DEVICE_IMPL(nms_impl, MLU, nms_mlu);
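selectUnionType above keeps halving the number of jobs until every core sees at least 256 boxes. The same trade-off can be checked on the host with a standalone copy of the loop; the job limit and box count below are only example figures, not values taken from a real device:

#include <cstdint>
#include <cstdio>

static int selectUnionType(uint32_t use_job, int box_num_per_core) {
  // Same loop as the launcher: fewer jobs in exchange for more boxes per core.
  while (box_num_per_core < 256 && use_job >= 4) {
    box_num_per_core *= 2;
    use_job /= 2;
  }
  return use_job;
}

int main() {
  const uint32_t job_limit = 32;     // e.g. 8 clusters x 4 cores (assumed)
  const int input_num_boxes = 4096;  // example box count
  const int per_core = (input_num_boxes + job_limit - 1) / job_limit;  // 128
  std::printf("use_job = %d\n", selectUnionType(job_limit, per_core));
  return 0;
}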
mmcv/ops/csrc/pytorch/mlu/psamask_mlu.cpp
0 → 100644
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include <algorithm>

#include "psamask_utils.hpp"
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

#define COMPUTE_COUNT_ALIGN 64

void KernelPsamaskForward(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const void *x, void *y, const PsamaskType psa_type,
    const DimPartitionType core_partition,
    const DimPartitionType cluster_partition, const int batch,
    const int h_feature, const int w_feature, const int h_mask,
    const int w_mask, const int x_c, const int y_c, const int half_h_mask,
    const int half_w_mask, const int n_per_core, const int h_per_core,
    const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
    const int limit_h_seg, const int limit_w_seg);

void KernelPsamaskBackward(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const void *dy, void *dx, const PsamaskType psa_type,
    const DimPartitionType core_partition,
    const DimPartitionType cluster_partition, const int batch,
    const int h_feature, const int w_feature, const int h_mask,
    const int w_mask, const int dx_c, const int dy_c, const int half_h_mask,
    const int half_w_mask, const int n_per_core, const int h_per_core,
    const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
    const int limit_h_seg, const int limit_w_seg);

namespace {
void policyFunc(cnrtDim3_t *k_dim_ptr, cnrtFunctionType_t *f_type_ptr,
                PartitionSeg *partition_ptr, const int n,
                const int h_feature) {
  unsigned int core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  unsigned int cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  unsigned int use_cluster_num = cluster_num;
  unsigned int use_core_num = core_dim;

  if (n >= cluster_num || n >= h_feature) {
    partition_ptr->cluster_partition = PARTITION_N;
    partition_ptr->n_per_cluster = (n + cluster_num - 1) / cluster_num;
    partition_ptr->h_per_cluster = h_feature;
    use_cluster_num =
        (n + partition_ptr->n_per_cluster - 1) / partition_ptr->n_per_cluster;
  } else {
    partition_ptr->cluster_partition = PARTITION_H;
    partition_ptr->h_per_cluster = (h_feature + cluster_num - 1) / cluster_num;
    partition_ptr->n_per_cluster = n;
    use_cluster_num = (h_feature + partition_ptr->h_per_cluster - 1) /
                      partition_ptr->h_per_cluster;
  }

  if (partition_ptr->n_per_cluster >= core_dim ||
      partition_ptr->n_per_cluster >= partition_ptr->h_per_cluster) {
    partition_ptr->core_partition = PARTITION_N;
    partition_ptr->n_per_core =
        (partition_ptr->n_per_cluster + core_dim - 1) / core_dim;
    partition_ptr->h_per_core = partition_ptr->h_per_cluster;
    use_core_num =
        (partition_ptr->n_per_cluster + partition_ptr->n_per_core - 1) /
        partition_ptr->n_per_core;
  } else {
    partition_ptr->core_partition = PARTITION_H;
    partition_ptr->h_per_core =
        (partition_ptr->h_per_cluster + core_dim - 1) / core_dim;
    partition_ptr->n_per_core = partition_ptr->n_per_cluster;
    use_core_num =
        (partition_ptr->h_per_cluster + partition_ptr->h_per_core - 1) /
        partition_ptr->h_per_core;
  }
  *k_dim_ptr = {core_dim, use_cluster_num, 1};
}
}  // namespace

bool findLimit(const int shape_core_n, const int shape_core_h,
               const int shape_core_w, const int shape_core_ci,
               const int shape_core_co, int *limit_n_seg_ptr,
               int *limit_h_seg_ptr, int *limit_w_seg_ptr,
               const int psa_type) {
  const bool need_temp = psa_type == 1;
  const int input_bytes = sizeof(float);
  int limit_n_seg = shape_core_n;
  int limit_h_seg = shape_core_h;
  int limit_w_seg = shape_core_w;

  const int max_nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
  const int align_base_128 = NFU_ALIGN_SIZE / input_bytes;
  const int align_base_64 = COMPUTE_COUNT_ALIGN / input_bytes;
  const int align_co = CEIL_ALIGN(shape_core_co, align_base_64);
  const int align_w = CEIL_ALIGN(shape_core_w, align_base_64);
  const int align_hw = CEIL_ALIGN(shape_core_h * shape_core_w, align_base_64);
  const int max_num = max_nram_size / input_bytes;

  int n_limit =
      max_num /
      (CEIL_ALIGN(shape_core_h * shape_core_w * shape_core_ci,
                  align_base_128) +
       align_hw * align_co * (1 + need_temp));
  if (n_limit > 0) {
    n_limit = std::min(n_limit, shape_core_n);
    limit_n_seg = n_limit;
  } else {
    int h_limit =
        max_num / (CEIL_ALIGN(shape_core_w * shape_core_ci, align_base_128) +
                   align_w * align_co * (1 + need_temp));
    if (h_limit > 0) {
      h_limit = std::min(h_limit, shape_core_h);
      limit_h_seg = h_limit;
      limit_n_seg = 1;
    } else {
      int w_limit =
          max_num / (CEIL_ALIGN(shape_core_ci, align_base_128) +
                     CEIL_ALIGN(align_co, align_base_128) * (1 + need_temp));
      if (w_limit > 0 && w_limit >= (COMPUTE_COUNT_ALIGN / input_bytes)) {
        w_limit = std::min(w_limit, shape_core_w);
        w_limit = w_limit / (COMPUTE_COUNT_ALIGN / input_bytes) *
                  (COMPUTE_COUNT_ALIGN / input_bytes);
        limit_w_seg = w_limit;
        limit_h_seg = 1;
        limit_n_seg = 1;
      } else {
        CNLOG(INFO) << "The size of input channel is too large.";
        return false;
      }
    }
  }
  *limit_n_seg_ptr = limit_n_seg;
  *limit_h_seg_ptr = limit_h_seg;
  *limit_w_seg_ptr = limit_w_seg;
  return true;
}

void PSAMaskForwardMLUKernelLauncher(const int psa_type, const Tensor x,
                                     Tensor y, const int num_,
                                     const int h_feature, const int w_feature,
                                     const int h_mask, const int w_mask,
                                     const int half_h_mask,
                                     const int half_w_mask) {
  // params check
  TORCH_CHECK(x.scalar_type() == at::kFloat, "x type should be Float, got ",
              x.scalar_type());
  TORCH_CHECK(y.scalar_type() == x.scalar_type(),
              "y should have the same type as x");
  TORCH_CHECK(x.dim() == 4, "x should be a 4d tensor, got ", x.dim(), "D");
  TORCH_CHECK(y.dim() == 4, "y should be a 4d tensor, got ", y.dim(), "D");

  int x_c = x.size(1);
  int y_c = y.size(1);
  TORCH_CHECK(h_mask * w_mask == x_c,
              "channel of x should be the same as h_mask * w_mask");
  TORCH_CHECK(h_feature * w_feature == y_c,
              "channel of y should be the same as h_feature * w_feature");
  TORCH_CHECK(psa_type == 0 || psa_type == 1,
              "psa_type only suppurts 'COLLECT' and 'DISTRIBUTE' currently");

  if (x.numel() == 0) {
    CNLOG(INFO) << "skip zero-element tensor";
    return;
  }

  cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1;
  cnrtDim3_t k_dim;
  PartitionSeg partition_info;
  policyFunc(&k_dim, &k_type, &partition_info, num_, h_feature);
  int n_limit_seg, h_limit_seg, w_limit_seg;
  bool ret =
      findLimit(partition_info.n_per_core, partition_info.h_per_core,
                w_feature, x_c, y_c, &n_limit_seg, &h_limit_seg, &w_limit_seg,
                psa_type);
  if (ret != true) {
    return;
  }

  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(x.dim());
  auto x_tensor = torch_mlu::cnnl::ops::cnnl_contiguous(x, memory_format);
  at::Tensor y_tmp =
      at::empty({num_, y_c, h_feature, w_feature}, x.options(), memory_format);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  // get ptr of tensors
  auto x_impl = torch_mlu::getMluTensorImpl(x_tensor);
  auto x_ptr = x_impl->cnnlMalloc();
  auto y_impl = torch_mlu::getMluTensorImpl(y_tmp);
  auto y_ptr = y_impl->cnnlMalloc();

  KernelPsamaskForward(
      k_dim, k_type, queue, x_ptr, y_ptr, (PsamaskType)psa_type,
      partition_info.core_partition, partition_info.cluster_partition, num_,
      h_feature, w_feature, h_mask, w_mask, x_c, y_c, half_h_mask, half_w_mask,
      partition_info.n_per_core, partition_info.h_per_core,
      partition_info.n_per_cluster, partition_info.h_per_cluster, n_limit_seg,
      h_limit_seg, w_limit_seg);

  y.copy_(y_tmp);
}

void PSAMaskBackwardMLUKernelLauncher(const int psa_type, const Tensor dy,
                                      Tensor dx, const int num_,
                                      const int h_feature, const int w_feature,
                                      const int h_mask, const int w_mask,
                                      const int half_h_mask,
                                      const int half_w_mask) {
  // params check
  TORCH_CHECK(dy.scalar_type() == at::kFloat, "dy type should be Float, got ",
              dy.scalar_type());
  TORCH_CHECK(dx.scalar_type() == dy.scalar_type(),
              "dx should have the same type as dy");
  TORCH_CHECK(dy.dim() == 4, "dy should be a 4d tensor, got ", dy.dim(), "D");
  TORCH_CHECK(dx.dim() == 4, "dx should be a 4d tensor, got ", dx.dim(), "D");

  int dy_c = dy.size(1);
  int dx_c = dx.size(1);
  TORCH_CHECK(h_feature * w_feature == dy_c,
              "channel of dy should be the same as h_feature * w_feature");
  TORCH_CHECK(h_mask * w_mask == dx_c,
              "channel of dx should be the same as h_mask * w_mask");
  TORCH_CHECK(psa_type == 0 || psa_type == 1,
              "psa_type only suppurts 'COLLECT' and 'DISTRIBUTE' currently");

  if (dx.numel() == 0) {
    CNLOG(INFO) << "skip zero-element tensor";
    return;
  }

  cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1;
  cnrtDim3_t k_dim;
  PartitionSeg partition_info;
  policyFunc(&k_dim, &k_type, &partition_info, num_, h_feature);
  int n_limit_seg, h_limit_seg, w_limit_seg;
  bool ret =
      findLimit(partition_info.n_per_core, partition_info.h_per_core,
                w_feature, dx_c, dy_c, &n_limit_seg, &h_limit_seg,
                &w_limit_seg, psa_type);
  if (ret != true) {
    return;
  }

  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(dy.dim());
  auto dy_tensor = torch_mlu::cnnl::ops::cnnl_contiguous(dy, memory_format);
  at::Tensor dx_tmp = at::empty({num_, dx_c, h_feature, w_feature},
                                dy.options(), memory_format);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  // get ptr of tensors
  auto dx_impl = torch_mlu::getMluTensorImpl(dx_tmp);
  auto dx_ptr = dx_impl->cnnlMalloc();
  auto dy_impl = torch_mlu::getMluTensorImpl(dy_tensor);
  auto dy_ptr = dy_impl->cnnlMalloc();

  KernelPsamaskBackward(
      k_dim, k_type, queue, dy_ptr, dx_ptr, (PsamaskType)psa_type,
      partition_info.core_partition, partition_info.cluster_partition, num_,
      h_feature, w_feature, h_mask, w_mask, dx_c, dy_c, half_h_mask,
      half_w_mask, partition_info.n_per_core, partition_info.h_per_core,
      partition_info.n_per_cluster, partition_info.h_per_cluster, n_limit_seg,
      h_limit_seg, w_limit_seg);

  dx.copy_(dx_tmp);
}

void psamask_forward_mlu(const int psa_type, const Tensor input, Tensor output,
                         const int num_, const int h_feature,
                         const int w_feature, const int h_mask,
                         const int w_mask, const int half_h_mask,
                         const int half_w_mask) {
  PSAMaskForwardMLUKernelLauncher(psa_type, input, output, num_, h_feature,
                                  w_feature, h_mask, w_mask, half_h_mask,
                                  half_w_mask);
}

void psamask_backward_mlu(const int psa_type, const Tensor grad_output,
                          Tensor grad_input, const int num_,
                          const int h_feature, const int w_feature,
                          const int h_mask, const int w_mask,
                          const int half_h_mask, const int half_w_mask) {
  PSAMaskBackwardMLUKernelLauncher(psa_type, grad_output, grad_input, num_,
                                   h_feature, w_feature, h_mask, w_mask,
                                   half_h_mask, half_w_mask);
}

void psamask_forward_impl(const int psa_type, const Tensor input,
                          Tensor output, const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask);

void psamask_backward_impl(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask);

REGISTER_DEVICE_IMPL(psamask_forward_impl, MLU, psamask_forward_mlu);
REGISTER_DEVICE_IMPL(psamask_backward_impl, MLU, psamask_backward_mlu);
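The psamask policyFunc above first decides whether to split the work across clusters along the batch dimension (PARTITION_N) or along the feature height (PARTITION_H). A reduced sketch of just that first decision, with the ceil divisions spelled out and an assumed cluster count in place of the device attribute:

#include <cstdio>

int main() {
  const int cluster_num = 8;  // assumed device cluster count
  const int n = 4;            // batch size (example)
  const int h_feature = 56;   // feature-map height (example)

  int n_per_cluster, h_per_cluster;
  const bool partition_n = (n >= cluster_num || n >= h_feature);
  if (partition_n) {
    n_per_cluster = (n + cluster_num - 1) / cluster_num;
    h_per_cluster = h_feature;
  } else {
    h_per_cluster = (h_feature + cluster_num - 1) / cluster_num;
    n_per_cluster = n;
  }
  std::printf("%s split: n_per_cluster=%d h_per_cluster=%d\n",
              partition_n ? "PARTITION_N" : "PARTITION_H", n_per_cluster,
              h_per_cluster);
  return 0;
}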
mmcv/ops/csrc/pytorch/mlu/roi_align_mlu.cpp
0 → 100644
/*************************************************************************
 * Copyright (C) 2021 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

void KernelRoiAlign(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                    cnrtQueue_t queue, const cnrtDataType_t d_type,
                    const void *input, const void *rois, const int channels,
                    const bool aligned, const int pooled_height,
                    const int pooled_width, const int input_height,
                    const int input_width, const int sampling_ratio,
                    const float spatial_scale, const int num_rois,
                    void *output);

void KernelRoiAlignBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                            cnrtQueue_t queue, const cnrtDataType_t dtype,
                            const void *grads, const void *boxes,
                            void *grads_image, const int boxes_num,
                            const int hi, const int wi, const int c,
                            const int no, const int ho, const int wo,
                            const float spatial_scale,
                            const int sampling_ratio, const bool aligned);

void ROIAlignForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                      Tensor argmax_y, Tensor argmax_x,
                                      int aligned_height, int aligned_width,
                                      float spatial_scale, int sampling_ratio,
                                      int pool_mode, bool aligned) {
  // params check
  TORCH_CHECK(
      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
      "input type should be Float or Half, got ", input.scalar_type());
  TORCH_CHECK(rois.scalar_type() == input.scalar_type(),
              "rois should have the same type as input");
  TORCH_CHECK(input.dim() == 4, "input should be a 4d tensor, got ",
              input.dim(), "D");
  TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
              "D");
  TORCH_CHECK(pool_mode == 1, "pool_mode only suppurts 'avg' currently");

  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
  auto input_tensor =
      torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);

  auto num_rois = rois.size(0);
  auto channels = input.size(1);
  int height = input.size(2);
  int width = input.size(3);

  if (output.numel() == 0) {
    output = at::zeros({num_rois, channels, aligned_height, aligned_width},
                       input.options());
    return;
  }

  at::Tensor output_tmp =
      at::empty({num_rois, channels, aligned_height, aligned_width},
                input.options(), memory_format);

  // get tensor impl
  auto self_impl = torch_mlu::getMluTensorImpl(input_tensor);
  auto rois_impl = torch_mlu::getMluTensorImpl(rois);
  auto output_impl = torch_mlu::getMluTensorImpl(output_tmp);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  // get the mlu ptr
  auto self_ptr = self_impl->cnnlMalloc();
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto output_ptr = output_impl->cnnlMalloc();

  cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1;
  cnrtDim3_t k_dim;
  k_dim.x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  k_dim.y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  k_dim.z = 1;
  cnrtDataType_t data_type = torch_mlu::toCnrtDtype(input.dtype());

  KernelRoiAlign(k_dim, k_type, queue, data_type, self_ptr, rois_ptr, channels,
                 aligned, aligned_height, aligned_width, height, width,
                 sampling_ratio, spatial_scale, num_rois, output_ptr);

  output.copy_(output_tmp);
}

static int nearestPower2(int x) {
  x--;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  x++;
  return x;
}

void ROIAlignBackwardMLUKernelLauncher(Tensor grad, Tensor rois,
                                       Tensor argmax_y, Tensor argmax_x,
                                       Tensor grad_input, int aligned_height,
                                       int aligned_width, float spatial_scale,
                                       int sampling_ratio, int pool_mode,
                                       bool aligned) {
  // params check
  TORCH_CHECK(
      grad.scalar_type() == at::kFloat || grad.scalar_type() == at::kHalf,
      "grad type should be Float or Half, got ", grad.scalar_type());
  TORCH_CHECK(rois.scalar_type() == grad.scalar_type(),
              "rois should have the same type as grad");
  TORCH_CHECK(grad.dim() == 4, "grad should be a 4d tensor, got ", grad.dim(),
              "D");
  TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
              "D");
  TORCH_CHECK(pool_mode == 1, "pool_mode only suppurts 'avg' currently");

  int batch_size = grad_input.size(0);
  int channels = grad_input.size(1);
  int height = grad_input.size(2);
  int width = grad_input.size(3);

  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad.dim());
  auto grad_ = torch_mlu::cnnl::ops::cnnl_contiguous(grad, memory_format);
  auto grad_input_ = at::empty({batch_size, channels, height, width},
                               grad.options(), memory_format)
                         .zero_();

  int boxes_num = rois.size(0);
  int hi = grad.size(2);
  int wi = grad.size(3);
  int c = grad.size(1);

  int no = grad_input.size(0);
  int ho = grad_input.size(2);
  int wo = grad_input.size(3);

  // get tensor impl
  auto grad_impl = torch_mlu::getMluTensorImpl(grad_);
  auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);
  auto rois_impl = torch_mlu::getMluTensorImpl(rois);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  // get the mlu ptr
  auto grad_ptr = grad_impl->cnnlMalloc();
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto grad_input_ptr = grad_input_impl->cnnlMalloc();

  cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1;
  int need_core = nearestPower2(boxes_num);
  int union_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  uint32_t dim_x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  uint32_t dim_y = (need_core - 1) / dim_x + 1;
  dim_y = (dim_y > union_number) ? union_number : dim_y;
  cnrtDim3_t k_dim = {dim_x, dim_y, 1};
  cnrtDataType_t k_dtype = torch_mlu::toCnrtDtype(grad.dtype());

  KernelRoiAlignBackward(k_dim, k_type, queue, k_dtype, grad_ptr, rois_ptr,
                         grad_input_ptr, boxes_num, hi, wi, c, no, ho, wo,
                         spatial_scale, sampling_ratio, aligned);
  grad_input.copy_(grad_input_);
}

void roi_align_forward_mlu(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax_y, Tensor argmax_x,
                           int aligned_height, int aligned_width,
                           float spatial_scale, int sampling_ratio,
                           int pool_mode, bool aligned) {
  ROIAlignForwardMLUKernelLauncher(input, rois, output, argmax_y, argmax_x,
                                   aligned_height, aligned_width,
                                   spatial_scale, sampling_ratio, pool_mode,
                                   aligned);
}

void roi_align_backward_mlu(Tensor grad_output, Tensor rois, Tensor argmax_y,
                            Tensor argmax_x, Tensor grad_input,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  ROIAlignBackwardMLUKernelLauncher(grad_output, rois, argmax_y, argmax_x,
                                    grad_input, aligned_height, aligned_width,
                                    spatial_scale, sampling_ratio, pool_mode,
                                    aligned);
}

void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned);

void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned);

REGISTER_DEVICE_IMPL(roi_align_forward_impl, MLU, roi_align_forward_mlu);
REGISTER_DEVICE_IMPL(roi_align_backward_impl, MLU, roi_align_backward_mlu);
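The RoI-align backward launch above sizes its grid from nearestPower2(boxes_num). A standalone check of that bit trick and the resulting (dim_x, dim_y), with assumed per-cluster core and cluster counts in place of the device attributes:

#include <cstdint>
#include <cstdio>

static int nearestPower2(int x) {
  // Round x up to the next power of two by smearing the highest set bit down.
  x--;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  x++;
  return x;
}

int main() {
  const uint32_t dim_x = 4;         // assumed cores per cluster
  const uint32_t union_number = 8;  // assumed cluster count
  const int tests[] = {1, 3, 100, 1000};
  for (int boxes_num : tests) {
    const int need_core = nearestPower2(boxes_num);
    uint32_t dim_y = (need_core - 1) / dim_x + 1;
    if (dim_y > union_number) dim_y = union_number;
    std::printf("boxes=%4d need_core=%4d k_dim={%u,%u,1}\n", boxes_num,
                need_core, dim_x, dim_y);
  }
  return 0;
}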
mmcv/ops/csrc/pytorch/mlu/roi_align_rotated_mlu.cpp
0 → 100644
/*************************************************************************
 * Copyright (C) 2022 by Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
#include "roi_align_rotated_utils.hpp"

namespace {
void policyFunc(int bin_num, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
  unsigned int core_num = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  unsigned int cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  *k_type = CNRT_FUNC_TYPE_UNION1;
  k_dim->x = core_num;
  unsigned int use_cluster = (bin_num + core_num - 1) / core_num;
  k_dim->y = use_cluster > cluster_num ? cluster_num : use_cluster;
  k_dim->z = 1;
}
}  // namespace

void KernelRoiAlignRotatedForward(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const cnrtDataType_t d_type, const void *features, const void *rois,
    void *output, const int batch, const int height, const int width,
    const int channel, const int rois_num,
    const RoiAlignRotatedParams roiAlignRotatedParams);

void KernelRoiAlignRotatedBackward(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const cnrtDataType_t d_type, const void *top_grad, const void *rois,
    void *bottom_grad, const int batch, const int height, const int width,
    const int channel, const int rois_num,
    const RoiAlignRotatedParams roiAlignRotatedParams);

void ROIAlignRotatedForwardMLUKernelLauncher(Tensor input, Tensor rois,
                                             Tensor output, int pooled_height,
                                             int pooled_width,
                                             float spatial_scale,
                                             int sampling_ratio, bool aligned,
                                             bool clockwise) {
  TORCH_CHECK(((input.scalar_type() == output.scalar_type()) &&
               (output.scalar_type() == rois.scalar_type())),
              "data types of input, rois and output should be the same, ",
              "but now input type is ", input.scalar_type(), ", rois type is ",
              rois.scalar_type(), ", output type is ", output.scalar_type(),
              ".");
  TORCH_CHECK(
      (input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf),
      "input type should be Float or Half, got ", input.scalar_type(), ".");
  TORCH_CHECK(input.dim() == 4, "input should be a 4d tensor, got ",
              input.dim(), "D.");
  TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
              "D.");
  TORCH_CHECK(output.dim() == 4, "output should be a 4d tensor, got ",
              output.dim(), "D.");
  TORCH_CHECK((rois.size(0) == output.size(0)),
              "the 1st dimensions of rois and output should be the same, ",
              "but now the 1st dimension of rois is ", rois.size(0),
              ", and output is ", output.size(0), ".");
  TORCH_CHECK((input.size(1) == output.size(1)),
              "the 2nd dimensions of input and output should be the same, ",
              "but now the 2nd dimension of input is ", input.size(1),
              ", and output is ", output.size(1), ".");

  int channel = input.size(1);
  int width = input.size(3);
  int height = input.size(2);
  int batch = input.size(0);
  int rois_nums = rois.size(0);
  cnrtDataType_t d_type = torch_mlu::toCnrtDtype(input.dtype());

  // return if zero-elements
  if (input.numel() == 0) {
    CNLOG(INFO) << "Skip the zero-elements case.";
    return;
  }

  RoiAlignRotatedParams roiAlignRotatedParams{pooled_height,  pooled_width,
                                              sampling_ratio, spatial_scale,
                                              aligned,        clockwise};
  cnrtDim3_t k_dim;
  cnrtFunctionType_t k_type;
  policyFunc(rois_nums * pooled_height * pooled_width, &k_dim, &k_type);

  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
  auto input_tensor =
      torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
  at::Tensor output_tmp =
      at::empty({batch, channel, pooled_height, pooled_width}, input.options(),
                memory_format);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  // get ptr of tensors
  auto input_impl = torch_mlu::getMluTensorImpl(input_tensor);
  auto input_ptr = input_impl->cnnlMalloc();
  auto rois_impl = torch_mlu::getMluTensorImpl(rois);
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto output_impl = torch_mlu::getMluTensorImpl(output_tmp);
  auto output_ptr = output_impl->cnnlMalloc();

  KernelRoiAlignRotatedForward(k_dim, k_type, queue, d_type, input_ptr,
                               rois_ptr, output_ptr, batch, height, width,
                               channel, rois_nums, roiAlignRotatedParams);
  output.copy_(output_tmp);
}

void ROIAlignRotatedBackwardMLUKernelLauncher(
    Tensor top_grad, Tensor rois, Tensor bottom_grad, int pooled_height,
    int pooled_width, float spatial_scale, int sampling_ratio, bool aligned,
    bool clockwise) {
  TORCH_CHECK(((top_grad.scalar_type() == bottom_grad.scalar_type()) &&
               (bottom_grad.scalar_type() == rois.scalar_type())),
              "data types of top_grad, rois and bottom_grad should be ",
              "the same, but now top_grad type is ", top_grad.scalar_type(),
              ", rois type is ", rois.scalar_type(), ", bottom_grad type is ",
              bottom_grad.scalar_type(), ".");
  TORCH_CHECK((bottom_grad.scalar_type() == at::kFloat ||
               bottom_grad.scalar_type() == at::kHalf),
              "Data type of bottom_grad should be Float ro Half, got ",
              bottom_grad.scalar_type(), ".");
  TORCH_CHECK(bottom_grad.dim() == 4,
              "bottom_grad should be a 4d tensor, got ", top_grad.dim(), "D.");
  TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
              "D.");
  TORCH_CHECK(top_grad.dim() == 4, "top_grad should be a 4d tensor, got ",
              bottom_grad.dim(), "D.");
  TORCH_CHECK((rois.size(0) == top_grad.size(0)),
              "the 1st dimensions of rois and top_grad should be the same, ",
              "but now the 1st dimension of rois is ", rois.size(0),
              ", and top_grad is ", top_grad.size(0), ".");
  TORCH_CHECK((bottom_grad.size(1) == top_grad.size(1)),
              "the 2nd dimensions of bottom_grad and top_grad should be ",
              "the same, but now the 2nd dimension of bottom_grad is ",
              bottom_grad.size(1), ", and top_grad is ", top_grad.size(1),
              ".");

  int channel = bottom_grad.size(1);
  int width = bottom_grad.size(3);
  int height = bottom_grad.size(2);
  int batch = bottom_grad.size(0);
  int rois_nums = rois.size(0);
  cnrtDataType_t d_type = torch_mlu::toCnrtDtype(bottom_grad.dtype());

  // return if zero-elements
  if (bottom_grad.numel() == 0) {
    CNLOG(INFO) << "Skip the zero-elements case.";
    return;
  }

  RoiAlignRotatedParams roiAlignRotatedParams{pooled_height,  pooled_width,
                                              sampling_ratio, spatial_scale,
                                              aligned,        clockwise};
  cnrtDim3_t k_dim;
  cnrtFunctionType_t k_type;
  policyFunc(rois_nums * pooled_height * pooled_width, &k_dim, &k_type);

  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(top_grad.dim());
  auto top_grad_tensor =
      torch_mlu::cnnl::ops::cnnl_contiguous(top_grad, memory_format);
  at::Tensor bottom_grad_tmp = at::empty({batch, channel, height, width},
                                         top_grad.options(), memory_format)
                                   .zero_();

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  // get ptr of tensors
  auto bottom_grad_impl = torch_mlu::getMluTensorImpl(bottom_grad_tmp);
  auto bottom_grad_ptr = bottom_grad_impl->cnnlMalloc();
  auto rois_impl = torch_mlu::getMluTensorImpl(rois);
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto top_grad_impl = torch_mlu::getMluTensorImpl(top_grad_tensor);
  auto top_grad_ptr = top_grad_impl->cnnlMalloc();

  KernelRoiAlignRotatedBackward(k_dim, k_type, queue, d_type, top_grad_ptr,
                                rois_ptr, bottom_grad_ptr, batch, height,
                                width, channel, rois_nums,
                                roiAlignRotatedParams);
  bottom_grad.copy_(bottom_grad_tmp);
}

void roi_align_rotated_forward_mlu(Tensor input, Tensor rois, Tensor output,
                                   int aligned_height, int aligned_width,
                                   float spatial_scale, int sampling_ratio,
                                   bool aligned, bool clockwise) {
  ROIAlignRotatedForwardMLUKernelLauncher(input, rois, output, aligned_height,
                                          aligned_width, spatial_scale,
                                          sampling_ratio, aligned, clockwise);
}

void roi_align_rotated_backward_mlu(Tensor top_grad, Tensor rois,
                                    Tensor bottom_grad, int aligned_height,
                                    int aligned_width, float spatial_scale,
                                    int sampling_ratio, bool aligned,
                                    bool clockwise) {
  ROIAlignRotatedBackwardMLUKernelLauncher(top_grad, rois, bottom_grad,
                                           aligned_height, aligned_width,
                                           spatial_scale, sampling_ratio,
                                           aligned, clockwise);
}

void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
                                    int aligned_height, int aligned_width,
                                    float spatial_scale, int sampling_ratio,
                                    bool aligned, bool clockwise);

void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int aligned_height,
                                     int aligned_width, float spatial_scale,
                                     int sampling_ratio, bool aligned,
                                     bool clockwise);

REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, MLU,
                     roi_align_rotated_forward_mlu);
REGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, MLU,
                     roi_align_rotated_backward_mlu);
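Both rotated RoI-align launchers above derive the task shape from a single number, bin_num = rois * pooled_h * pooled_w, then spread those bins over clusters. A minimal sketch of that policy, with assumed constants standing in for the device attributes:

#include <cstdio>

int main() {
  const unsigned int core_num = 4;     // assumed cores per cluster
  const unsigned int cluster_num = 8;  // assumed cluster count
  const int rois = 512, pooled_h = 7, pooled_w = 7;  // example shapes

  const int bin_num = rois * pooled_h * pooled_w;
  unsigned int use_cluster = (bin_num + core_num - 1) / core_num;
  if (use_cluster > cluster_num) use_cluster = cluster_num;
  // Matches the launcher: k_dim = {core_num, use_cluster, 1}, UNION1 job type.
  std::printf("bin_num=%d k_dim={%u,%u,1}\n", bin_num, core_num, use_cluster);
  return 0;
}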
mmcv/ops/csrc/pytorch/mlu/roi_pool_mlu.cpp
0 → 100644
/*************************************************************************
 * Copyright (C) 2022 Cambricon.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"

void KernelRoiPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                          cnrtQueue_t queue, cnrtDataType_t data_type,
                          const void *input_data, const void *input_rois,
                          const int batch, const int channels,
                          const int height, const int width,
                          const int pooled_height, const int pooled_width,
                          const int rois_num, const float spatial_scale,
                          void *output_data, int *argmax);

void KernelRoiPoolBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                           cnrtQueue_t queue, cnrtDataType_t k_dtype,
                           const void *grad_output_ptr, const void *rois_ptr,
                           const int *argmax_ptr, void *grad_input_ptr,
                           const int box_num, const int pooled_height,
                           const int pooled_width, const int channels,
                           const int batch, const int height, const int width,
                           const float spatial_scale);

// policy function for forward
static void policyFuncForward(const int bin_num, cnrtDim3_t *k_dim,
                              cnrtFunctionType_t *k_type) {
  auto core_num = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  auto cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  *k_type = CNRT_FUNC_TYPE_UNION1;
  k_dim->x = core_num;
  unsigned int use_cluster = bin_num / core_num + (bin_num % core_num > 0);
  k_dim->y = use_cluster > cluster_num ? cluster_num : use_cluster;
  k_dim->z = 1;
}

void ROIPoolForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                     Tensor argmax, int pooled_height,
                                     int pooled_width, float spatial_scale) {
  // Check dtype.
  TORCH_CHECK(
      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
      "input type should be Float or Half, got ", input.scalar_type());
  TORCH_CHECK(input.scalar_type() == rois.scalar_type(),
              "rois should have the same type as input");
  // Check dtype relationship.
  TORCH_CHECK(
      argmax.scalar_type() == at::kLong || argmax.scalar_type() == at::kInt,
      "argmax type should be Int or Long, got ", argmax.scalar_type());
  // Check shape.
  TORCH_CHECK(input.dim() == 4, "input should be 4d tensor, got ",
              input.dim(), "D");
  TORCH_CHECK(rois.dim() == 2, "rois should be 2d tensor, got ", rois.dim(),
              "D");
  TORCH_CHECK(argmax.dim() == 4, "argmax should be 4d tensor, got ",
              argmax.dim(), "D");
  TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,
              "spatial_scale should be within (0, 1], got ", spatial_scale);

  // compute kernel params
  auto batch = input.size(0);
  auto height = input.size(2);
  auto width = input.size(3);
  auto channels = input.size(1);
  auto rois_num = output.size(0);

  if (output.numel() == 0) {
    output = at::zeros({rois_num, channels, pooled_height, pooled_width},
                       input.options());
    return;
  }
  if (argmax.numel() == 0) {
    argmax = at::zeros({rois_num, channels, pooled_height, pooled_width},
                       argmax.options());
    return;
  }

  // zero element check
  if (input.numel() == 0 || rois.numel() == 0 || output.numel() == 0 ||
      argmax.numel() == 0) {
    return;
  }

  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
  auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
  at::Tensor output_ =
      at::empty({rois_num, channels, pooled_height, pooled_width},
                input.options(), memory_format);
  at::Tensor argmax_ =
      at::empty({rois_num, channels, pooled_height, pooled_width},
                argmax.options(), memory_format);

  // calculate task dimension
  cnrtDim3_t k_dim;
  cnrtFunctionType_t k_type;
  policyFuncForward(rois_num * pooled_height * pooled_width, &k_dim, &k_type);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  // get ptr of tensors
  auto input_impl = torch_mlu::getMluTensorImpl(input_);
  auto input_ptr = input_impl->cnnlMalloc();
  auto rois_impl = torch_mlu::getMluTensorImpl(rois);
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto output_impl = torch_mlu::getMluTensorImpl(output_);
  auto output_ptr = output_impl->cnnlMalloc();
  auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_);
  auto argmax_ptr = argmax_impl->cnnlMalloc();

  // get comput dtype of input
  cnrtDataType_t data_type = torch_mlu::toCnrtDtype(input_.dtype());

  // launch kernel
  CNLOG(INFO) << "Launch Kernel MLUKernelRoiPoolForward<<<" << k_dim.x << ", "
              << k_dim.y << ", " << k_dim.z << ">>>";
  KernelRoiPoolForward(k_dim, k_type, queue, data_type, input_ptr, rois_ptr,
                       batch, channels, height, width, pooled_height,
                       pooled_width, rois_num, spatial_scale, output_ptr,
                       (int *)argmax_ptr);
  output.copy_(output_);
  argmax.copy_(argmax_);
}

// policy function for backward
static void policyFuncBackward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
  *k_type = CNRT_FUNC_TYPE_UNION1;
  k_dim->x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  k_dim->y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  k_dim->z = 1;
}

void ROIPoolBackwardMLUKernelLauncher(Tensor grad_output, Tensor rois,
                                      Tensor argmax, Tensor grad_input,
                                      int pooled_height, int pooled_width,
                                      float spatial_scale) {
  // Check dtype.
  TORCH_CHECK(
      argmax.scalar_type() == at::kLong || argmax.scalar_type() == at::kInt,
      "argmax type should be Int or Long, got ", argmax.scalar_type());
  TORCH_CHECK((grad_output.scalar_type() == at::kFloat ||
               grad_output.scalar_type() == at::kHalf),
              "grad_output type should be FLoat or Half, got ",
              grad_output.scalar_type());
  // Check dtype relationship.
  TORCH_CHECK((rois.scalar_type() == grad_output.scalar_type()),
              "rois should have the same type as grad_output");
  // Check shape.
  TORCH_CHECK(grad_output.dim() == 4, "grad_output should be 4d tensor, got ",
              grad_output.dim(), "D");
  TORCH_CHECK(rois.dim() == 2, "rois should be 2d tensor, got ", rois.dim(),
              "D");
  TORCH_CHECK(argmax.dim() == 4, "argmax should be 4d tensor, got ",
              argmax.dim(), "D");
  TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,
              "spatial_scale should be within (0, 1], got ", spatial_scale);

  // Check relationship between tensor.
  // Check the relationship of n.
  TORCH_CHECK(grad_output.size(0) == rois.size(0), "grad_output.size(0) = ",
              grad_output.size(0), ", while rois.size(0) = ", rois.size(0),
              ". They should be the same.");
  // Check the relationship of channels.
  TORCH_CHECK(grad_output.size(1) == argmax.size(1), "grad_output.size(1) = ",
              grad_output.size(1), ", while argmax.size(1) = ",
              argmax.size(1), ". They should be the same.");
  // Check the relationship of height and width.
  TORCH_CHECK(grad_output.size(2) == argmax.size(2), "argmax.size(2) = ",
              argmax.size(2), ", while grad_output.size(2) = ",
              grad_output.size(2), ". They should be the same.");
  TORCH_CHECK(grad_output.size(3) == argmax.size(3), "argmax.size(3) = ",
              argmax.size(3), ", while grad_output.size(3) = ",
              grad_output.size(3), ". They should be the same.");

  // Check zero element.
  if (grad_output.numel() == 0 || rois.numel() == 0 || argmax.numel() == 0 ||
      grad_input.numel() == 0) {
    // return if zero-element
    return;
  }

  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_output.dim());
  auto grad_output_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(grad_output, memory_format);
  auto argmax_ = torch_mlu::cnnl::ops::cnnl_contiguous(argmax, memory_format);

  int boxes_num = grad_output.size(0);
  int no = grad_input.size(0);
  int channels = grad_input.size(1);
  int height = grad_input.size(2);
  int width = grad_input.size(3);
  auto grad_input_ = at::empty({no, channels, height, width},
                               grad_input.options(), memory_format)
                         .zero_();

  // get tensor impl
  auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output_);
  auto rois_impl = torch_mlu::getMluTensorImpl(rois);
  auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_);
  auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);

  // get compute queue
  auto queue = torch_mlu::getCurQueue();

  // get mlu ptr
  auto grad_output_ptr = grad_output_impl->cnnlMalloc();
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto argmax_ptr = argmax_impl->cnnlMalloc();
  auto grad_input_ptr = grad_input_impl->cnnlMalloc();

  // calculate task dimension
  cnrtDataType_t k_dtype = torch_mlu::toCnrtDtype(grad_input.dtype());
  cnrtDim3_t k_dim;
  cnrtFunctionType_t k_type;
  policyFuncBackward(&k_dim, &k_type);

  CNLOG(INFO) << "Launch Kernel MLUKernelRoiPoolBackward<<<" << k_dim.x
              << ", " << k_dim.y << ", " << k_dim.z << ">>>";
  KernelRoiPoolBackward(k_dim, k_type, queue, k_dtype, grad_output_ptr,
                        rois_ptr, (int *)argmax_ptr, grad_input_ptr,
                        boxes_num, pooled_height, pooled_width, channels, no,
                        height, width, spatial_scale);
  grad_input.copy_(grad_input_);
}

void roi_pool_forward_mlu(Tensor input, Tensor rois, Tensor output,
                          Tensor argmax, int pooled_height, int pooled_width,
                          float spatial_scale) {
  ROIPoolForwardMLUKernelLauncher(input, rois, output, argmax, pooled_height,
                                  pooled_width, spatial_scale);
}

void roi_pool_backward_mlu(Tensor grad_output, Tensor rois, Tensor argmax,
                           Tensor grad_input, int pooled_height,
                           int pooled_width, float spatial_scale) {
  ROIPoolBackwardMLUKernelLauncher(grad_output, rois, argmax, grad_input,
                                   pooled_height, pooled_width,
                                   spatial_scale);
}

void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax, int pooled_height, int pooled_width,
                           float spatial_scale);

void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
                            Tensor grad_input, int pooled_height,
                            int pooled_width, float spatial_scale);

REGISTER_DEVICE_IMPL(roi_pool_forward_impl, MLU, roi_pool_forward_mlu);
REGISTER_DEVICE_IMPL(roi_pool_backward_impl, MLU, roi_pool_backward_mlu);
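policyFuncForward above writes its ceiling division as bin_num / core_num + (bin_num % core_num > 0), while the other launchers in this commit use the (x + d - 1) / d idiom; the two are equivalent for non-negative operands, which a short exhaustive check over small values makes explicit:

#include <cassert>
#include <cstdio>

int main() {
  for (int bin_num = 0; bin_num <= 1000; ++bin_num) {
    for (int core_num = 1; core_num <= 16; ++core_num) {
      const int a = bin_num / core_num + (bin_num % core_num > 0);
      const int b = (bin_num + core_num - 1) / core_num;
      assert(a == b);  // both round the quotient up
    }
  }
  std::printf("ceiling-division idioms agree\n");
  return 0;
}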
mmcv/ops/csrc/pytorch/mlu/tin_shift_mlu.cpp
0 → 100644
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void
KernelTinShiftForward
(
cnrtDim3_t
k_dim
,
cnrtFunctionType_t
k_type
,
cnrtQueue_t
queue
,
const
void
*
input
,
const
void
*
shifts
,
void
*
output
,
const
int
batch_size
,
const
int
time_size
,
const
int
channel_size
,
const
int
hw_size
,
const
int
group_size
,
const
int
group_channel
,
const
cnrtDataType_t
data_dtype
,
const
int
channel_per_core
,
const
int
max_number_hw_per_core
,
const
int
max_length_per_core
);
void
KernelTinShiftBackward
(
cnrtDim3_t
k_dim
,
cnrtFunctionType_t
k_type
,
cnrtQueue_t
queue
,
const
void
*
grad_output
,
const
void
*
shifts
,
void
*
grad_input
,
const
int
batch_size
,
const
int
time_size
,
const
int
channel_size
,
const
int
hw_size
,
const
int
group_size
,
const
int
group_channel
,
const
cnrtDataType_t
data_dtype
,
const
int
channel_per_core
,
const
int
max_number_hw_per_core
,
const
int
max_length_per_core
);
// policy function
static
void
policyFunc
(
const
Tensor
&
input
,
cnrtDim3_t
*
k_dim
,
cnrtFunctionType_t
*
k_type
,
int
*
channel_per_core
,
int
*
max_number_hw_per_core
,
int
*
max_length_per_core
)
{
const
int32_t
cluster_limit
=
torch_mlu
::
getDeviceAttr
(
cnrtAttrClusterCount
);
const
int32_t
core_limit
=
torch_mlu
::
getDeviceAttr
(
cnrtAttrMcorePerCluster
);
auto
nram_size
=
torch_mlu
::
getDeviceAttr
(
cnrtAttrNramSizePerMcore
);
const
int
core_num
=
core_limit
*
cluster_limit
;
const
int
batch_size
=
input
.
size
(
0
);
const
int
time_size
=
input
.
size
(
1
);
const
int
channel_size
=
input
.
size
(
2
);
const
int
hw_size
=
input
.
size
(
3
);
const
size_t
size_per_channel
=
time_size
*
hw_size
*
input
.
itemsize
();
*
channel_per_core
=
nram_size
/
size_per_channel
;
int
task_dim
=
0
;
if
(
*
channel_per_core
==
0
)
{
const
size_t
size_per_hw
=
hw_size
*
input
.
itemsize
();
*
max_number_hw_per_core
=
nram_size
/
size_per_hw
;
if
(
*
max_number_hw_per_core
<=
0
)
{
*
max_length_per_core
=
nram_size
/
input
.
itemsize
();
}
int
tmp_max_number_hw_per_core
=
*
max_number_hw_per_core
>
0
?
*
max_number_hw_per_core
:
1
;
const
int
loop_time
=
(
time_size
/
(
tmp_max_number_hw_per_core
))
+
((
time_size
%
(
tmp_max_number_hw_per_core
))
>
0
?
1
:
0
);
task_dim
=
batch_size
*
channel_size
*
loop_time
<
core_num
?
batch_size
*
channel_size
*
loop_time
:
core_num
;
}
else
{
task_dim
=
batch_size
*
channel_size
<
core_num
?
batch_size
*
channel_size
:
core_num
;
}
k_dim
->
x
=
core_limit
;
k_dim
->
y
=
(
task_dim
/
core_limit
)
>
0
?
(
task_dim
/
core_limit
)
:
1
;
k_dim
->
z
=
1
;
*
k_type
=
CNRT_FUNC_TYPE_UNION1
;
}
void
TINShiftForwardMLUKernelLauncher
(
Tensor
input
,
Tensor
shift
,
Tensor
output
)
{
// params check
TORCH_CHECK
(
input
.
scalar_type
()
==
at
::
kFloat
||
input
.
scalar_type
()
==
at
::
kHalf
,
"input type should be Float or Half, got "
,
input
.
scalar_type
(),
"."
);
TORCH_CHECK
(
input
.
dim
()
==
4
,
"input should be a 4d tensor, got "
,
input
.
dim
(),
"d."
);
TORCH_CHECK
(
shift
.
dim
()
==
2
,
"shift should be a 2d tensor, got "
,
shift
.
dim
(),
"d."
);
TORCH_CHECK
(
input
.
size
(
0
)
==
shift
.
size
(
0
),
"input batch size should be the same as shift's, input batch size is "
,
input
.
size
(
0
),
" and shift batch size is "
,
shift
.
size
(
0
),
"."
);
TORCH_CHECK
(
input
.
size
(
0
)
!=
0
,
"Input batch size should not be zero."
);
TORCH_CHECK
(
input
.
size
(
3
)
!=
0
,
"The last dim size of input should not be zero."
);
if
(
input
.
size
(
1
)
==
0
)
{
return
;
}
cnrtDim3_t
k_dim
;
cnrtFunctionType_t
k_type
;
int
channel_per_core
=
0
;
int
max_number_hw_per_core
=
0
;
int
max_length_per_core
=
0
;
policyFunc
(
input
,
&
k_dim
,
&
k_type
,
&
channel_per_core
,
&
max_number_hw_per_core
,
&
max_length_per_core
);
const
int
batch_size
=
input
.
size
(
0
);
const
int
time_size
=
input
.
size
(
1
);
const
int
channel_size
=
input
.
size
(
2
);
const
int
hw_size
=
input
.
size
(
3
);
const
int
group_size
=
shift
.
size
(
1
);
int
group_channel
=
channel_size
/
group_size
;
// get tensor impl
auto
input_impl
=
torch_mlu
::
getMluTensorImpl
(
input
);
auto
shift_impl
=
torch_mlu::getMluTensorImpl(shift);
  auto output_impl = torch_mlu::getMluTensorImpl(output);
  // get compute queue
  auto queue = torch_mlu::getCurQueue();
  // get the mlu ptr
  auto input_ptr = input_impl->cnnlMalloc();
  auto shift_ptr = shift_impl->cnnlMalloc();
  auto output_ptr = output_impl->cnnlMalloc();
  cnrtDataType_t data_dtype = torch_mlu::toCnrtDtype(input.dtype());
  KernelTinShiftForward(k_dim, k_type, queue, input_ptr, shift_ptr, output_ptr,
                        batch_size, time_size, channel_size, hw_size,
                        group_size, group_channel, data_dtype, channel_per_core,
                        max_number_hw_per_core, max_length_per_core);
}

void TINShiftBackwardMLUKernelLauncher(Tensor grad_output, Tensor shift,
                                       Tensor grad_input) {
  // params check
  TORCH_CHECK(grad_output.scalar_type() == at::kFloat ||
                  grad_output.scalar_type() == at::kHalf,
              "grad_output type should be Float or Half, got ",
              grad_output.scalar_type(), ".");
  TORCH_CHECK(grad_output.dim() == 4,
              "grad_output should be a 4d tensor, got ", grad_output.dim(),
              "d.");
  TORCH_CHECK(shift.dim() == 2, "shift should be a 2d tensor, got ",
              shift.dim(), "d.");
  TORCH_CHECK(grad_output.size(0) == shift.size(0),
              "grad_output batch size should be the same as shift's, "
              "grad_output batch size is ",
              grad_output.size(0), ", shift batch size is ", shift.size(0),
              ".");
  TORCH_CHECK(grad_output.size(0) != 0,
              "grad_output batch size should not be zero.");
  TORCH_CHECK(grad_output.size(3) != 0,
              "The last dim size of grad_output should not be zero.");
  if (grad_output.size(1) == 0) {
    return;
  }
  cnrtDim3_t k_dim;
  cnrtFunctionType_t k_type;
  int channel_per_core = 0;
  int max_number_hw_per_core = 0;
  int max_length_per_core = 0;
  policyFunc(grad_output, &k_dim, &k_type, &channel_per_core,
             &max_number_hw_per_core, &max_length_per_core);
  const int batch_size = grad_output.size(0);
  const int time_size = grad_output.size(1);
  const int channel_size = grad_output.size(2);
  const int hw_size = grad_output.size(3);
  const int group_size = shift.size(1);
  int group_channel = channel_size / group_size;
  // get tensor impl
  auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output);
  auto shift_impl = torch_mlu::getMluTensorImpl(shift);
  auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input);
  // get compute queue
  auto queue = torch_mlu::getCurQueue();
  // get the mlu ptr
  auto grad_output_ptr = grad_output_impl->cnnlMalloc();
  auto shift_ptr = shift_impl->cnnlMalloc();
  auto grad_input_ptr = grad_input_impl->cnnlMalloc();
  cnrtDataType_t data_dtype = torch_mlu::toCnrtDtype(grad_output.dtype());
  KernelTinShiftBackward(k_dim, k_type, queue, grad_output_ptr, shift_ptr,
                         grad_input_ptr, batch_size, time_size, channel_size,
                         hw_size, group_size, group_channel, data_dtype,
                         channel_per_core, max_number_hw_per_core,
                         max_length_per_core);
}

void tin_shift_forward_mlu(Tensor input, Tensor shift, Tensor output) {
  TINShiftForwardMLUKernelLauncher(input, shift, output);
}

void tin_shift_backward_mlu(Tensor grad_output, Tensor shift,
                            Tensor grad_input) {
  TINShiftBackwardMLUKernelLauncher(grad_output, shift, grad_input);
}

void tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output);
void tin_shift_backward_impl(Tensor grad_output, Tensor shift,
                             Tensor grad_input);

REGISTER_DEVICE_IMPL(tin_shift_forward_impl, MLU, tin_shift_forward_mlu);
REGISTER_DEVICE_IMPL(tin_shift_backward_impl, MLU, tin_shift_backward_mlu);
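For reference, the temporal shift that both launchers offload to the MLU kernels can be written as a plain CPU loop. The sketch below is not part of this commit; it is a minimal illustration, assuming the usual TIN-shift semantics in which channel group g = c / group_channel of sample b is shifted along the time axis by shift[b][g] frames and out-of-range positions are zero-filled.

// Minimal CPU sketch of the assumed TIN-shift forward semantics.
// Shapes: input/output are (batch, time, channel, hw), shift is (batch, group).
// Illustration only; KernelTinShiftForward on the MLU is assumed to compute
// the same mapping, but this is not the MLU implementation itself.
#include <cstdint>
#include <vector>

void tin_shift_forward_cpu_sketch(const std::vector<float>& input,
                                  const std::vector<int>& shift,
                                  std::vector<float>& output, int batch_size,
                                  int time_size, int channel_size, int hw_size,
                                  int group_size) {
  const int group_channel = channel_size / group_size;
  for (int b = 0; b < batch_size; ++b) {
    for (int t = 0; t < time_size; ++t) {
      for (int c = 0; c < channel_size; ++c) {
        const int g = c / group_channel;
        const int src_t = t - shift[b * group_size + g];
        for (int p = 0; p < hw_size; ++p) {
          const int64_t dst =
              ((int64_t(b) * time_size + t) * channel_size + c) * hw_size + p;
          if (src_t < 0 || src_t >= time_size) {
            output[dst] = 0.f;  // zero padding outside the clip
          } else {
            const int64_t src =
                ((int64_t(b) * time_size + src_t) * channel_size + c) * hw_size +
                p;
            output[dst] = input[src];
          }
        }
      }
    }
  }
}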
mmcv/ops/csrc/pytorch/mps/bbox_overlaps_mps.mm
0 → 100644
View file @
fdeee889
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "pytorch_device_registry.hpp"
#include "MPSLibrary.h"
#include "MPSStream.h"
#include "MPSUtils.h"
using at::Tensor;

const static std::string kSourceCode = R"(
#include <metal_math>
#include <metal_stdlib>
using namespace metal;
kernel void bbox_overlap_mps_kernel(constant const float4* bboxes1,
constant const float4* bboxes2,
device float* ious,
constant int& num_bbox1,
constant int& num_bbox2,
constant int& mode,
constant bool& aligned,
constant int& offset,
uint index [[thread_position_in_grid]])
{
int base1 = index;
int base2 = index;
if(!aligned){
base1 = index / num_bbox2;
base2 = index % num_bbox2;
}
const float f_offset = float(offset);
const float4 b1 = bboxes1[base1];
const float b1_area = (b1[2]-b1[0]+f_offset)*(b1[3]-b1[1]+f_offset);
const float4 b2 = bboxes2[base2];
const float b2_area = (b2[2]-b2[0]+f_offset)*(b2[3]-b2[1]+f_offset);
const float2 left_top = fmax(b1.xy, b2.xy);
const float2 right_bottom = fmin(b1.zw, b2.zw);
const float2 wh = fmax(right_bottom - left_top + f_offset, 0.0f);
const float interS = wh.x * wh.y;
const float baseS =
fmax(mode == 0 ? b1_area + b2_area - interS : b1_area, f_offset);
ious[index] = interS / baseS;
}
)"
;
void
BBoxOverlapsMPSKernelLauncher
(
const
Tensor
bboxes1
,
const
Tensor
bboxes2
,
Tensor
ious
,
const
int
mode
,
const
bool
aligned
,
const
int
offset
)
{
// get stream
auto
stream
=
at
::
mps
::
getCurrentMPSStream
();
auto
library_manager
=
MPSLibraryManager
::
getInstance
();
MPSLibrary
*
library
;
const
static
std
::
string
kLibraryName
=
"bbox_overlap"
;
if
(
library_manager
->
hasLibrary
(
kLibraryName
))
library
=
library_manager
->
getLibrary
(
kLibraryName
);
else
library
=
library_manager
->
createLibraryFromSouce
(
kLibraryName
,
kSourceCode
);
auto
func_pso
=
library
->
getComputePipelineState
(
"bbox_overlap_mps_kernel"
);
// create command buffer and encoder
MTLCommandBuffer_t
command_buffer
=
stream
->
commandBuffer
();
MTLComputeCommandEncoder_t
compute_encoder
=
[
command_buffer
computeCommandEncoder
];
// set pso and buffer
int
output_size
=
ious
.
numel
();
int
num_bbox1
=
bboxes1
.
size
(
0
);
int
num_bbox2
=
bboxes2
.
size
(
0
);
int
num_elements
=
output_size
;
setMTLArgs
(
compute_encoder
,
func_pso
,
bboxes1
,
bboxes2
,
ious
,
num_bbox1
,
num_bbox2
,
mode
,
aligned
,
offset
);
// set grid size
MTLSize
grid_size
=
MTLSizeMake
(
num_elements
,
1
,
1
);
NSUInteger
thread_group_size_x
=
func_pso
.
maxTotalThreadsPerThreadgroup
;
if
(
thread_group_size_x
>
num_elements
)
{
thread_group_size_x
=
num_elements
;
}
MTLSize
thread_group_size
=
MTLSizeMake
(
thread_group_size_x
,
1
,
1
);
// encoding
[
compute_encoder
dispatchThreads
:
grid_size
threadsPerThreadgroup
:
thread_group_size
];
[
compute_encoder
endEncoding
];
// commit, not sure if flush is required
stream
->
commit
(
false
);
}
void
bbox_overlaps_mps
(
const
Tensor
bboxes1
,
const
Tensor
bboxes2
,
Tensor
ious
,
const
int
mode
,
const
bool
aligned
,
const
int
offset
)
{
BBoxOverlapsMPSKernelLauncher
(
bboxes1
,
bboxes2
,
ious
,
mode
,
aligned
,
offset
);
}
void
bbox_overlaps_impl
(
const
Tensor
bboxes1
,
const
Tensor
bboxes2
,
Tensor
ious
,
const
int
mode
,
const
bool
aligned
,
const
int
offset
);
REGISTER_DEVICE_IMPL
(
bbox_overlaps_impl
,
MPS
,
bbox_overlaps_mps
);
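The Metal source above is compiled at runtime; its per-thread arithmetic can be checked against a scalar transcription on the CPU. The snippet below simply restates the kernel's math for a single box pair (mode == 0 gives IoU, any other mode gives IoF with bboxes1 as the base), so the offset and clamping behaviour are easy to verify; it is an illustration, not code shipped with this commit.

// Scalar C++ transcription of bbox_overlap_mps_kernel for one box pair.
// Boxes are (x1, y1, x2, y2); follows the same formula as the Metal kernel.
#include <algorithm>

float bbox_overlap_scalar(const float b1[4], const float b2[4], int mode,
                          int offset) {
  const float f_offset = static_cast<float>(offset);
  const float b1_area =
      (b1[2] - b1[0] + f_offset) * (b1[3] - b1[1] + f_offset);
  const float b2_area =
      (b2[2] - b2[0] + f_offset) * (b2[3] - b2[1] + f_offset);
  // intersection width/height, clamped at zero exactly as in the kernel
  const float w =
      std::max(std::min(b1[2], b2[2]) - std::max(b1[0], b2[0]) + f_offset, 0.f);
  const float h =
      std::max(std::min(b1[3], b2[3]) - std::max(b1[1], b2[1]) + f_offset, 0.f);
  const float inter = w * h;
  const float base =
      std::max(mode == 0 ? b1_area + b2_area - inter : b1_area, f_offset);
  return inter / base;
}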
mmcv/ops/csrc/pytorch/points_in_polygons.cpp
0 → 100644
View file @
fdeee889
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,
                                     Tensor output, const int rows,
                                     const int cols) {
  DISPATCH_DEVICE_IMPL(points_in_polygons_forward_impl, points, polygons,
                       output, rows, cols);
}

void points_in_polygons_forward(Tensor points, Tensor polygons, Tensor output) {
  int rows = points.size(0);
  int cols = polygons.size(0);
  points_in_polygons_forward_impl(points, polygons, output, rows, cols);
}
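This wrapper only sizes the rows x cols output (one entry per point/polygon pair) and forwards to the registered device kernel. As a rough picture of what such a kernel has to decide per pair, the sketch below is a conventional even-odd (ray-crossing) test. It is hypothetical: it assumes each polygon row stores four corners as (x1, y1, ..., x4, y4), and the actual device implementations dispatched above may use a different convention, e.g. for points exactly on an edge.

// Hypothetical even-odd point-in-quadrilateral test (PNPOLY-style).
// Assumes a flattened 4-corner polygon; illustration only.
#include <array>

bool point_in_quad_sketch(float px, float py, const std::array<float, 8>& poly) {
  bool inside = false;
  for (int i = 0, j = 3; i < 4; j = i++) {
    const float xi = poly[2 * i], yi = poly[2 * i + 1];
    const float xj = poly[2 * j], yj = poly[2 * j + 1];
    // Count crossings of a horizontal ray cast from (px, py) toward +x.
    const bool crosses = ((yi > py) != (yj > py)) &&
                         (px < (xj - xi) * (py - yi) / (yj - yi) + xi);
    if (crosses) inside = !inside;
  }
  return inside;
}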
mmcv/ops/csrc/pytorch/prroi_pool.cpp
0 → 100644
View file @
fdeee889
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                             int pooled_height, int pooled_width,
                             float spatial_scale) {
  DISPATCH_DEVICE_IMPL(prroi_pool_forward_impl, input, rois, output,
                       pooled_height, pooled_width, spatial_scale);
}

void prroi_pool_backward_impl(Tensor grad_output, Tensor rois,
                              Tensor grad_input, int pooled_height,
                              int pooled_width, float spatial_scale) {
  DISPATCH_DEVICE_IMPL(prroi_pool_backward_impl, grad_output, rois, grad_input,
                       pooled_height, pooled_width, spatial_scale);
}

void prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,
                                   Tensor input, Tensor rois, Tensor grad_rois,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale) {
  DISPATCH_DEVICE_IMPL(prroi_pool_coor_backward_impl, output, grad_output,
                       input, rois, grad_rois, pooled_height, pooled_width,
                       spatial_scale);
}

void prroi_pool_forward(Tensor input, Tensor rois, Tensor output,
                        int pooled_height, int pooled_width,
                        float spatial_scale) {
  prroi_pool_forward_impl(input, rois, output, pooled_height, pooled_width,
                          spatial_scale);
}

void prroi_pool_backward(Tensor grad_output, Tensor rois, Tensor grad_input,
                         int pooled_height, int pooled_width,
                         float spatial_scale) {
  prroi_pool_backward_impl(grad_output, rois, grad_input, pooled_height,
                           pooled_width, spatial_scale);
}

void prroi_pool_coor_backward(Tensor output, Tensor grad_output, Tensor input,
                              Tensor rois, Tensor grad_rois, int pooled_height,
                              int pooled_width, float spatial_scale) {
  prroi_pool_coor_backward_impl(output, grad_output, input, rois, grad_rois,
                                pooled_height, pooled_width, spatial_scale);
}
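All three new wrapper files in this part of the diff follow the same pattern: a device-agnostic *_impl declaration forwards through DISPATCH_DEVICE_IMPL, and each backend (CUDA, MLU, MPS, ...) registers its concrete function with REGISTER_DEVICE_IMPL. The sketch below is a stripped-down, hypothetical registry that shows the shape of that mechanism for a single op; the names DeviceType, register_tin_shift, and dispatch_tin_shift are placeholders, and the real macros additionally derive the backend from the device of the tensor arguments rather than taking it explicitly.

// Toy single-op registry illustrating the REGISTER/DISPATCH pattern above.
// Placeholder types only; not the actual mmcv registry.
#include <functional>
#include <map>
#include <stdexcept>

enum class DeviceType { kCPU, kMLU, kMPS };

using TinShiftFn = std::function<void()>;  // stands in for the real signature

std::map<DeviceType, TinShiftFn>& tin_shift_registry() {
  static std::map<DeviceType, TinShiftFn> registry;
  return registry;
}

// Analogue of REGISTER_DEVICE_IMPL: record a backend's implementation.
void register_tin_shift(DeviceType dev, TinShiftFn fn) {
  tin_shift_registry()[dev] = std::move(fn);
}

// Analogue of DISPATCH_DEVICE_IMPL: look up and call the registered backend.
void dispatch_tin_shift(DeviceType dev) {
  auto it = tin_shift_registry().find(dev);
  if (it == tin_shift_registry().end())
    throw std::runtime_error("no implementation registered for this device");
  it->second();
}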