Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
a6abf55d
Commit
a6abf55d
authored
Oct 20, 2021
by
yan.yan
Browse files
Merge branch 'develop'
parents
fad30002
79a3eaf2
Changes
142
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
2612 deletions
+0
-2612
include/paramsgrid.h
include/paramsgrid.h
+0
-65
include/spconv/box_iou.h
include/spconv/box_iou.h
+0
-156
include/spconv/cublas_gemm.h
include/spconv/cublas_gemm.h
+0
-47
include/spconv/fused_spconv_ops.h
include/spconv/fused_spconv_ops.h
+0
-126
include/spconv/geometry.h
include/spconv/geometry.h
+0
-183
include/spconv/indice.cu.h
include/spconv/indice.cu.h
+0
-464
include/spconv/indice.h
include/spconv/indice.h
+0
-58
include/spconv/maxpool.h
include/spconv/maxpool.h
+0
-44
include/spconv/nms.h
include/spconv/nms.h
+0
-202
include/spconv/nms_functor.h
include/spconv/nms_functor.h
+0
-37
include/spconv/nms_gpu.h
include/spconv/nms_gpu.h
+0
-18
include/spconv/nms_ops.h
include/spconv/nms_ops.h
+0
-74
include/spconv/pillar_scatter_functor.h
include/spconv/pillar_scatter_functor.h
+0
-31
include/spconv/pillar_scatter_ops.h
include/spconv/pillar_scatter_ops.h
+0
-56
include/spconv/point2voxel.h
include/spconv/point2voxel.h
+0
-276
include/spconv/pool_ops.h
include/spconv/pool_ops.h
+0
-35
include/spconv/reordering.cu.h
include/spconv/reordering.cu.h
+0
-432
include/spconv/reordering.h
include/spconv/reordering.h
+0
-40
include/spconv/spconv_ops.h
include/spconv/spconv_ops.h
+0
-61
include/tensorrt/inference.h
include/tensorrt/inference.h
+0
-207
No files found.
include/paramsgrid.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file is used for C++ unit tests, but PyTorch JIT ops don't support
// C++ debug builds.
#ifndef PARAMS_GRID_H_
#define PARAMS_GRID_H_
#include <tuple>
#include <vector>
namespace detail {

/// Returns the number of elements of a single parameter list.
/// Base case of the variadic getTotalSize() overload.
/// The list is taken by const reference so that it is not copied.
template <class T> int getTotalSize(const std::vector<T> &arg) {
  return static_cast<int>(arg.size());
}

/// Returns the product of the sizes of all given parameter lists, i.e. the
/// number of combinations in their Cartesian product.
template <class T, class... TArgs>
int getTotalSize(const std::vector<T> &arg,
                 const std::vector<TArgs> &... args) {
  return static_cast<int>(arg.size() * getTotalSize(args...));
}

/// Returns the number of elements of one parameter list as an int.
template <typename T> int getSize(const std::vector<T> &arg) {
  return static_cast<int>(arg.size());
}

/// Stores arg[counter[Idx]] into element Idx of the tuple-like object src.
/// Base case (last parameter list) of the variadic assigner() overload.
///
/// @param src     tuple-like object accessed through std::get<Idx>.
/// @param counter per-list selection indices; only counter[Idx] is read.
/// @param arg     parameter list that supplies element Idx.
template <int Idx, class TT, class T>
void assigner(TT &src, const std::vector<int> &counter,
              const std::vector<T> &arg) {
  std::get<Idx>(src) = arg[counter[Idx]];
}

/// Stores arg[counter[Idx]] into element Idx of src, then recurses with
/// Idx + 1 over the remaining parameter lists.
///
/// counter and the lists are taken by const reference: the previous
/// by-value counter was copied once per recursion level for every generated
/// tuple.
template <int Idx, class TT, class T, class... TArgs>
void assigner(TT &src, const std::vector<int> &counter,
              const std::vector<T> &arg, const std::vector<TArgs> &... args) {
  std::get<Idx>(src) = arg[counter[Idx]];
  assigner<Idx + 1>(src, counter, args...);
}

} // namespace detail
/// Builds every combination (Cartesian product) of the given parameter lists.
///
/// The result holds one tuple per combination. Combinations are generated
/// with a mixed-radix counter in which the LAST list varies fastest.
///
/// @param args one vector of candidate values per tuple element.
/// @return vector of tuples; its length is the product of the list sizes.
template <class... TArgs>
std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
  const int total = detail::getTotalSize(args...);
  const std::vector<int> extents = {detail::getSize(args)...};
  const int rank = extents.size();

  std::vector<std::tuple<TArgs...>> grid(total);
  // digits[d] is the index currently selected in parameter list d.
  std::vector<int> digits(rank);

  for (int row = 0; row < total; ++row) {
    detail::assigner<0>(grid[row], digits, args...);

    // Advance the counter: bump the last digit, then propagate carries
    // towards the front. Digit 0 never carries out.
    ++digits[rank - 1];
    for (int pos = rank - 1; pos > 0; --pos) {
      if (digits[pos] != extents[pos]) {
        continue;
      }
      ++digits[pos - 1];
      digits[pos] = 0;
    }
  }
  return grid;
}
#endif
\ No newline at end of file
include/spconv/box_iou.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef BOX_IOU_H
#define BOX_IOU_H
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
#include <algorithm>
#include <boost/geometry.hpp>
#include <pybind11/numpy.h>
namespace
spconv
{
// #include "voxelnet/core/cc/pybind11_helper.h"
// Short alias for the pybind11 namespace, used by the helpers in this header.
namespace py = pybind11;
// Makes the names declared in pybind11::literals visible here.
// NOTE(review): this using-directive sits in a header, so it also affects
// every file that includes it.
using namespace pybind11::literals;
/// Creates an array of the given shape with every element set to `value`.
///
/// @param shape shape container accepted by the py::array_t constructor.
/// @param value fill value.
/// @return the newly allocated, filled array.
template <typename DType, typename ShapeContainer>
inline py::array_t<DType> constant(ShapeContainer shape, DType value) {
  // create ROWMAJOR array.
  py::array_t<DType> filled(shape);
  DType *first = filled.mutable_data();
  DType *last = filled.mutable_data() + filled.size();
  std::fill(first, last, value);
  return filled;
}
/// Creates an array of the given shape filled with zeros.
///
/// @param shape extent of each dimension.
/// @return array produced by constant() with fill value 0.
template <typename DType>
inline py::array_t<DType> zeros(std::vector<long int> shape) {
  using Shape = std::vector<long int>;
  return constant<DType, Shape>(shape, 0);
}
/// Computes the pairwise intersection-over-union of two sets of quadrilateral
/// boxes given by their corner points.
///
/// @param box_corners    3-D array indexed as (box n, corner 0..3, coord 0..1).
/// @param qbox_corners   3-D array indexed as (query box k, corner 0..3,
///                       coord 0..1).
/// @param standup_iou    2-D array indexed as (n, k); a pair is only evaluated
///                       when its entry is strictly greater than
///                       standup_thresh.
/// @param standup_thresh threshold applied to standup_iou.
/// @return (N, K) array; entries of skipped pairs, of pairs without
///         intersection and of pairs without union polygon stay 0.
///
/// Only the first polygon returned by boost::geometry::intersection / union_
/// is used for the area computation.
template <typename DType>
py::array_t<DType> rbbox_iou(py::array_t<DType> box_corners,
                             py::array_t<DType> qbox_corners,
                             py::array_t<DType> standup_iou,
                             DType standup_thresh) {
  namespace bg = boost::geometry;
  using point_t = bg::model::point<DType, 2, bg::cs::cartesian>;
  using polygon_t = bg::model::polygon<point_t>;

  auto boxes = box_corners.template unchecked<3>();
  auto queries = qbox_corners.template unchecked<3>();
  auto coarse_iou = standup_iou.template unchecked<2>();
  auto N = boxes.shape(0);
  auto K = queries.shape(0);

  py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
  auto result = overlaps.template mutable_unchecked<2>();
  if (N == 0 || K == 0) {
    return overlaps;
  }

  // Scratch objects reused (and cleared) for every evaluated pair.
  polygon_t box_poly, query_poly;
  std::vector<polygon_t> inter_parts, union_parts;

  // Corner order of a ring: the four corners, then the first one again so
  // that the ring is closed.
  const int ring[5] = {0, 1, 2, 3, 0};

  for (int k = 0; k < K; ++k) {
    for (int n = 0; n < N; ++n) {
      if (coarse_iou(n, k) <= standup_thresh) {
        continue;
      }
      for (int corner : ring) {
        bg::append(box_poly,
                   point_t(boxes(n, corner, 0), boxes(n, corner, 1)));
      }
      for (int corner : ring) {
        bg::append(query_poly,
                   point_t(queries(k, corner, 0), queries(k, corner, 1)));
      }

      bg::intersection(box_poly, query_poly, inter_parts);
      if (!inter_parts.empty()) {
        const DType inter_area = bg::area(inter_parts.front());
        bg::union_(box_poly, query_poly, union_parts);
        if (!union_parts.empty()) {
          const DType union_area = bg::area(union_parts.front());
          result(n, k) = inter_area / union_area;
        }
        union_parts.clear();
      }

      box_poly.clear();
      query_poly.clear();
      inter_parts.clear();
    }
  }
  return overlaps;
}
/// Computes the pairwise intersection area of two sets of quadrilateral boxes
/// given by their corner points.
///
/// @param box_corners    3-D array indexed as (box n, corner 0..3, coord 0..1).
/// @param qbox_corners   3-D array indexed as (query box k, corner 0..3,
///                       coord 0..1).
/// @param standup_iou    2-D array indexed as (n, k); a pair is only evaluated
///                       when its entry is strictly greater than
///                       standup_thresh.
/// @param standup_thresh threshold applied to standup_iou.
/// @return (N, K) array of intersection areas; entries of skipped pairs and of
///         pairs without intersection stay 0.
///
/// Only the first polygon returned by boost::geometry::intersection is used.
/// The unused union scratch variables of the earlier version were removed and
/// the repeated corner appends were folded into loops.
template <typename DType>
py::array_t<DType> rbbox_intersection(py::array_t<DType> box_corners,
                                      py::array_t<DType> qbox_corners,
                                      py::array_t<DType> standup_iou,
                                      DType standup_thresh) {
  namespace bg = boost::geometry;
  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
  typedef bg::model::polygon<point_t> polygon_t;

  auto box_corners_r = box_corners.template unchecked<3>();
  auto qbox_corners_r = qbox_corners.template unchecked<3>();
  auto standup_iou_r = standup_iou.template unchecked<2>();
  auto N = box_corners_r.shape(0);
  auto K = qbox_corners_r.shape(0);
  py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
  auto overlaps_rw = overlaps.template mutable_unchecked<2>();
  if (N == 0 || K == 0) {
    return overlaps;
  }

  // Scratch objects reused (and cleared) for every evaluated pair.
  polygon_t poly, qpoly;
  std::vector<polygon_t> poly_inter;

  // Corner order of a ring: the four corners, then the first one again so
  // that the ring is closed.
  const int ring[5] = {0, 1, 2, 3, 0};

  for (int k = 0; k < K; ++k) {
    for (int n = 0; n < N; ++n) {
      if (standup_iou_r(n, k) <= standup_thresh)
        continue;
      for (int corner : ring) {
        bg::append(poly, point_t(box_corners_r(n, corner, 0),
                                 box_corners_r(n, corner, 1)));
      }
      for (int corner : ring) {
        bg::append(qpoly, point_t(qbox_corners_r(k, corner, 0),
                                  qbox_corners_r(k, corner, 1)));
      }
      bg::intersection(poly, qpoly, poly_inter);
      if (!poly_inter.empty()) {
        const DType inter_area = bg::area(poly_inter.front());
        overlaps_rw(n, k) = inter_area;
      }
      poly.clear();
      qpoly.clear();
      poly_inter.clear();
    }
  }
  return overlaps;
}
}
// namespace spconv
#endif
\ No newline at end of file
include/spconv/cublas_gemm.h
deleted
100644 → 0
View file @
fad30002
#pragma once
#include <cublas_v2.h>
#include <tensorview/tensorview.h>
namespace
spconv
{
// Type-generic GEMM entry point. Only the declaration is visible in this
// header; the definitions for concrete T live elsewhere.
// The parameter list follows the cuBLAS gemm calling convention
// (handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc).
// NOTE(review): presumably forwards to cublasSgemm / cublasHgemm / ... per T
// and therefore expects column-major operands -- confirm against the
// definition.
template <class T>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
                           cublasOperation_t transb, int m, int n, int k,
                           const T *alpha, const T *A, int lda, const T *B,
                           int ldb, const T *beta, T *C, int ldc);
/// GEMM wrapper that forwards to cublasTgemm with the two operands exchanged.
///
/// The call passes (transb, transa), (n, m) and (B, ldb, A, lda) in place of
/// (transa, transb), (m, n) and (A, lda, B, ldb); k, alpha, beta, C and ldc
/// are forwarded unchanged.
/// NOTE(review): presumably this lets row-major callers use a column-major
/// GEMM (C^T = B^T * A^T) -- confirm against the cublasTgemm definition.
///
/// @return the status reported by cublasTgemm.
template <class T>
cublasStatus_t cublasTgemmRow(cublasHandle_t handle, cublasOperation_t transa,
                              cublasOperation_t transb, int m, int n, int k,
                              const T *alpha, const T *A, int lda, const T *B,
                              int ldb, const T *beta, T *C, int ldc) {
  const cublasStatus_t status =
      cublasTgemm<T>(handle,
                     /*transa=*/transb, /*transb=*/transa,
                     /*m=*/n, /*n=*/m, k, alpha,
                     /*A=*/B, /*lda=*/ldb,
                     /*B=*/A, /*ldb=*/lda,
                     beta, C, ldc);
  return status;
}
/// Converts a float constant to the scalar type T.
///
/// @param data value to convert.
/// @return `data` converted to T.
template <class T> inline T constant_scalar(float data) {
  return static_cast<T>(data);
}
/// Multiplies two 2-D tensor views, C = op(A) * op(B), through
/// cublasTgemmRow with alpha = 1 and beta = 0.
///
/// @param handle cuBLAS handle forwarded to the GEMM call.
/// @param transa when true, A is used transposed (CUBLAS_OP_T).
/// @param transb when true, B is used transposed (CUBLAS_OP_T).
/// @param A      left operand; must have ndim() == 2.
/// @param B      right operand; must have ndim() == 2.
/// @param C      destination; only C.data() is used here.
/// @return status reported by cublasTgemmRow.
///
/// The leading dimensions are derived from the operand shapes (second
/// extent of each view), which matches densely packed row-major storage.
/// NOTE(review): the shape of C is not validated against (m, n).
template <class T>
cublasStatus_t gemm(cublasHandle_t handle, bool transa, bool transb,
                    const tv::TensorView<T> A, const tv::TensorView<T> B,
                    tv::TensorView<T> C) {
  TV_ASSERT_RT_ERR(A.ndim() == 2, "error");
  TV_ASSERT_RT_ERR(B.ndim() == 2, "error");

  const cublasOperation_t opA = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
  const cublasOperation_t opB = transb ? CUBLAS_OP_T : CUBLAS_OP_N;

  // Result extents (rows x cols) and the shared inner extent as seen from
  // each operand.
  int rows;
  int ka;
  if (transa) {
    rows = A.dim(1);
    ka = A.dim(0);
  } else {
    rows = A.dim(0);
    ka = A.dim(1);
  }
  int cols;
  int kb;
  if (transb) {
    cols = B.dim(0);
    kb = B.dim(1);
  } else {
    cols = B.dim(1);
    kb = B.dim(0);
  }

  const int strideA = transa ? rows : ka;
  const int strideB = transb ? ka : cols;
  const int strideC = cols;
  TV_ASSERT_RT_ERR(ka == kb, "error");

  T alpha = constant_scalar<T>(1);
  T beta = constant_scalar<T>(0);
  return cublasTgemmRow<T>(handle, opA, opB, rows, cols, ka, &alpha, A.data(),
                           strideA, B.data(), strideB, &beta, C.data(),
                           strideC);
}
}
// namespace spconv
include/spconv/fused_spconv_ops.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef FUSED_SPARSE_CONV_OP_H_
#define FUSED_SPARSE_CONV_OP_H_
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace
spconv
{
// torch.jit's doc says only support int64, so we need to convert to int32.
/// Sparse convolution with an additive per-output-channel bias.
///
/// For every kernel offset i the rows listed in indicePairs[i][inverse] are
/// gathered from `features`, multiplied by filters[i] and scatter-added into
/// the rows of the output listed in indicePairs[i][!inverse].
///
/// @param features    input features; size(1) is the input channel count.
/// @param filters     kernel weights; viewed as
///                    (-1, numInPlanes, numOutPlanes) inside this function.
/// @param bias        bias copied into every row of the output before any
///                    accumulation.
/// @param indicePairs index pairs, indexed as [kernel offset][0 or 1].
/// @param indiceNum   int32 tensor with the number of valid pairs per kernel
///                    offset (read through data_ptr<int>()).
/// @param numActOut   number of output rows.
/// @param _inverse    non-zero swaps the roles of the two index-pair rows.
/// @param _subM       non-zero enables the shortcut for the kernel offset
///                    with the largest pair count: it is applied with one
///                    dense matrix product instead of gather/scatter.
/// @return (numActOut, numOutPlanes) tensor.
///
/// Changes relative to the previous version:
///  - the subM shortcut used torch::mm_out(output, ...), which overwrote the
///    bias that had just been copied into `output`; it now accumulates with
///    addmm_ so the bias is kept;
///  - the function is `inline`, because it is defined in a header;
///  - an empty indicePairs (kernelVolume == 0) is rejected instead of
///    dereferencing the end iterator returned by std::max_element;
///  - unused timing variables and commented-out code were removed.
inline torch::Tensor
fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor bias, torch::Tensor indicePairs,
                         torch::Tensor indiceNum, int64_t numActOut,
                         int64_t _inverse, int64_t _subM) {
  const bool subM = _subM != 0;
  const bool inverse = _inverse != 0;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  TV_ASSERT_INVALID_ARG(kernelVolume > 0,
                        "indicePairs must contain at least one kernel offset");

  // Pair counts are needed on the host to size the buffers and drive the loop.
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  const int *pairCounts = indicePairNumCpu.data_ptr<int>();
  const int *maxCountIter =
      std::max_element(pairCounts, pairCounts + kernelVolume);
  const int indicePairMaxOffset = static_cast<int>(maxCountIter - pairCounts);
  const int indicePairMaxSize = *maxCountIter;

  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  torch::Tensor output =
      torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
  // Scratch buffers sized for the largest kernel offset and reused by all.
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});

  if (subM) {
    // the center index of subm conv don't need gather and scatter
    // add. Accumulate onto the bias already stored in `output`.
    output.addmm_(features, filters[indicePairMaxOffset]);
  }

  for (int i = 0; i < kernelVolume; ++i) {
    const int nHot = pairCounts[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    // Views over the first nHot rows of the scratch buffers.
    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr(),
                                            {nHot, numInPlanes}, options);

    if (device == torch::kCPU) {
      sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot);
    }
#ifdef TV_CUDA
    else if (device == torch::kCUDA) {
      sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot);
    }
#endif
    else {
      TV_ASSERT_INVALID_ARG(false, "unknown device type");
    }

    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);

    if (device == torch::kCPU) {
      sparse_scatter_add_cpu(outputBuffer, output, indicePairs[i][!inverse],
                             nHot);
    }
#ifdef TV_CUDA
    else if (device == torch::kCUDA) {
      sparse_scatter_add_cuda(outputBuffer, output, indicePairs[i][!inverse],
                              nHot);
    }
#endif
    else {
      TV_ASSERT_INVALID_ARG(false, "unknown device type");
    }
  }
  return output;
}
}
// namespace spconv
#endif
\ No newline at end of file
include/spconv/geometry.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPCONV_GEOMETRY_H_
#define SPCONV_GEOMETRY_H_
#include <iostream>
#include <limits>
#include <tensorview/tensorview.h>
#include <tsl/robin_map.h>
#include <unordered_map>
namespace
spconv
{
namespace detail {

/// Maps an integer type to the fixed-width unsigned type used for hashing.
/// Unsigned fixed-width types map to themselves so that FNV1a can be
/// instantiated with either a signed or an unsigned element type.
template <typename T> struct ToUnsigned;

template <> struct ToUnsigned<int> { using type = uint32_t; };

template <> struct ToUnsigned<long> { using type = uint64_t; };

template <> struct ToUnsigned<long long> { using type = uint64_t; };

template <> struct ToUnsigned<uint32_t> { using type = uint32_t; };

template <> struct ToUnsigned<uint64_t> { using type = uint64_t; };

/// FNV parameters (offset basis and prime) per hash width.
template <typename T> struct FNVInternal;

template <> struct FNVInternal<uint32_t> {
  constexpr static uint32_t defaultOffsetBasis = 0x811C9DC5;
  constexpr static uint32_t prime = 0x01000193;
};

template <> struct FNVInternal<uint64_t> {
  constexpr static uint64_t defaultOffsetBasis = 0xcbf29ce484222325;
  constexpr static uint64_t prime = 0x100000001b3;
};

} // namespace detail

/// Unsigned hashing type for T (const qualification is ignored).
template <typename T>
using to_unsigned_t =
    typename detail::ToUnsigned<std::remove_const_t<T>>::type;

/// FNV-1a style hash over an array of integers, mixing one whole element
/// (not one byte) per step.
///
/// The earlier version could not be called for any T: the base class needed
/// FNVInternal<T> (defined for unsigned types only) while the body needed
/// ToUnsigned<T> (defined for signed types only). Both now use
/// to_unsigned_t<T>. The mixing order was also multiply-then-xor (FNV-1); it
/// is now xor-then-multiply, as the name FNV-1a requires.
template <typename T> struct FNV1a : detail::FNVInternal<to_unsigned_t<T>> {
  /// @param data pointer to `size` elements (may be null when size == 0).
  /// @param size number of elements to hash.
  /// @return the hash; the offset basis when size == 0.
  std::size_t operator()(const T *data, std::size_t size) const {
    using Word = to_unsigned_t<T>;
    using Params = detail::FNVInternal<Word>;
    Word hash = Params::defaultOffsetBasis;
    for (std::size_t i = 0; i < size; ++i) {
      hash ^= static_cast<Word>(data[i]);
      hash *= Params::prime;
    }
    return hash;
  }
};
/// Enumerates the output positions reachable from one input position and
/// writes the in-bounds ones to `out`.
///
/// Each written record has NDim + 1 entries: the NDim output coordinates
/// followed by the kernel offset computed for that candidate.
/// A candidate record is always written at the current slot first and only
/// kept (slot advanced) when all its coordinates lie in
/// [0, outSpatialShape[j] - 1]; `out` must therefore have room for one record
/// per candidate.
///
/// @param input_pos       NDim input coordinates.
/// @param kernelSize      NDim kernel extents.
/// @param stride          NDim strides.
/// @param padding         NDim paddings.
/// @param dilation        NDim dilations.
/// @param outSpatialShape NDim output extents.
/// @param out             destination buffer for the records.
/// @return number of records kept.
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
                                    const Index *kernelSize,
                                    const Index *stride, const Index *padding,
                                    const Index *dilation,
                                    const Index *outSpatialShape, Index *out) {
  Index lo[NDim];
  Index hi[NDim];
  Index digit[NDim];  // current step per dimension (mixed-radix counter)
  Index extent[NDim]; // number of steps per dimension
  Index candidates = 1;
#pragma unroll
  for (unsigned d = 0; d < NDim; ++d) {
    lo[d] = (input_pos[d] - (kernelSize[d] - 1) * dilation[d] - 1 + stride[d] +
             padding[d]) /
            stride[d];
    hi[d] = (input_pos[d] + padding[d]) / stride[d];
    extent[d] = ((hi[d] - lo[d]) / dilation[d] + 1);
    candidates *= extent[d];
    digit[d] = 0;
  }

  Index kept = 0;
  for (int i = 0; i < candidates; ++i) {
    bool inside = true;
    Index weight = 1;
    Index kernelOffset = 0;
    Index *record = out + kept * (NDim + 1);
#pragma unroll
    for (int j = NDim - 1; j >= 0; --j) {
      const Index pos = hi[j] - digit[j] * dilation[j];
      record[j] = pos;
      if (pos < 0 || (pos > outSpatialShape[j] - 1)) {
        inside = false;
        // no early exit: the offset below is still accumulated
      }
      kernelOffset +=
          weight * (input_pos[j] - pos * stride[j] + padding[j]) / dilation[j];
      weight *= kernelSize[j];
    }
    record[NDim] = kernelOffset;
    if (inside) {
      ++kept;
    }

    // Advance the counter; the last dimension varies fastest and dimension 0
    // never carries out.
    digit[NDim - 1] += 1;
#pragma unroll
    for (int c = NDim - 1; c > 0; --c) {
      if (digit[c] == extent[c]) {
        digit[c - 1] += 1;
        digit[c] = 0;
      }
    }
  }
  return kept;
}
/// Transposed-convolution counterpart of getValidOutPos: enumerates the
/// output positions covered by the kernel placed at
/// input_pos * stride - padding and writes the in-bounds ones to `out`.
///
/// Each written record has NDim + 1 entries: the NDim output coordinates
/// followed by the kernel offset computed for that candidate.
/// A candidate record is always written at the current slot first and only
/// kept (slot advanced) when all its coordinates lie in
/// [0, outSpatialShape[j] - 1]; `out` must therefore have room for one record
/// per candidate.
///
/// @param input_pos       NDim input coordinates.
/// @param kernelSize      NDim kernel extents.
/// @param stride          NDim strides.
/// @param padding         NDim paddings.
/// @param dilation        NDim dilations.
/// @param outSpatialShape NDim output extents.
/// @param out             destination buffer for the records.
/// @return number of records kept.
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPosTranspose(
    const Index *input_pos, const Index *kernelSize, const Index *stride,
    const Index *padding, const Index *dilation, const Index *outSpatialShape,
    Index *out) {
  Index lo[NDim];
  Index hi[NDim];
  Index digit[NDim];  // current step per dimension (mixed-radix counter)
  Index extent[NDim]; // number of steps per dimension
  Index candidates = 1;
#pragma unroll
  for (unsigned d = 0; d < NDim; ++d) {
    lo[d] = input_pos[d] * stride[d] - padding[d];
    hi[d] = lo[d] + (kernelSize[d] - 1) * dilation[d];
    extent[d] = ((hi[d] - lo[d]) / dilation[d] + 1);
    candidates *= extent[d];
    digit[d] = 0;
  }

  Index kept = 0;
  for (int i = 0; i < candidates; ++i) {
    bool inside = true;
    Index weight = 1;
    Index kernelOffset = 0;
    Index *record = out + kept * (NDim + 1);
#pragma unroll
    for (int j = NDim - 1; j >= 0; --j) {
      const Index pos = hi[j] - digit[j] * dilation[j];
      record[j] = pos;
      if (pos < 0 || (pos > outSpatialShape[j] - 1)) {
        inside = false;
        // no early exit: the offset below is still accumulated
      }
      kernelOffset += weight * (pos - lo[j]) / dilation[j];
      weight *= kernelSize[j];
    }
    record[NDim] = kernelOffset;
    if (inside) {
      ++kept;
    }

    // Advance the counter; the last dimension varies fastest and dimension 0
    // never carries out.
    digit[NDim - 1] += 1;
#pragma unroll
    for (int c = NDim - 1; c > 0; --c) {
      if (digit[c] == extent[c]) {
        digit[c - 1] += 1;
        digit[c] = 0;
      }
    }
  }
  return kept;
}
}
// namespace spconv
#endif
\ No newline at end of file
include/spconv/indice.cu.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef INDICE_CU_H_
#define INDICE_CU_H_
#include <cuhash/hash_table.cuh>
#include <spconv/geometry.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/tensorview.h>
namespace
spconv
{
/// For every active input row, records which output positions it contributes
/// to, grouped by kernel offset.
///
/// For each valid candidate returned by getValidOutPos the kernel
///  - atomically reserves a slot `oldNum` in indiceNum[offset],
///  - stores the input row index in indicePairs(0, offset, oldNum),
///  - stores the flattened output index (spatial index plus
///    spatialVolume * batch, where the batch is indicesIn(ix, 0)) in
///    indicePairs(1, offset, oldNum) and in
///    indicePairUnique[offset * indicePairs.dim(2) + oldNum].
///
/// Each row of indicesIn is read as NDim + 1 values: column 0, then NDim
/// spatial coordinates.
/// The per-thread scratch array holds KernelMaxVolume records.
/// NOTE(review): presumably callers guarantee that the number of candidates
/// per input never exceeds KernelMaxVolume -- confirm at the launch site.
///
/// The unused kernelVolume computation of the earlier version was removed.
template <typename Index, unsigned NDim, int KernelMaxVolume = 256,
          typename Index1D = int>
__global__ void prepareIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum, tv::TensorView<Index1D> indicePairUnique,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  auto indicePairsDim2 = indicePairs.dim(2);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    // "+ 1" skips column 0 of the row, leaving the spatial coordinates.
    const Index numValidPoints = getValidOutPos<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      Index *pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(0, offset, oldNum) = ix;
      Index index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
                        pointPtr, outSpatialShape.data(), 0) +
                    spatialVolume * indicesIn(ix, 0);
      indicePairs(1, offset, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}
/// Transposed-convolution variant of prepareIndicePairsKernel: identical
/// bookkeeping, but candidates come from getValidOutPosTranspose.
///
/// For each valid candidate the kernel
///  - atomically reserves a slot `oldNum` in indiceNum[offset],
///  - stores the input row index in indicePairs(0, offset, oldNum),
///  - stores the flattened output index (spatial index plus
///    spatialVolume * batch, where the batch is indicesIn(ix, 0)) in
///    indicePairs(1, offset, oldNum) and in
///    indicePairUnique[offset * indicePairs.dim(2) + oldNum].
///
/// The per-thread scratch array holds KernelMaxVolume records.
/// NOTE(review): presumably callers guarantee that the number of candidates
/// per input never exceeds KernelMaxVolume -- confirm at the launch site.
///
/// The unused kernelVolume computation of the earlier version was removed.
template <typename Index, unsigned NDim, int KernelMaxVolume = 256>
__global__ void prepareDeConvIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  auto indicePairsDim2 = indicePairs.dim(2);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    // "+ 1" skips column 0 of the row, leaving the spatial coordinates.
    const Index numValidPoints = getValidOutPosTranspose<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      Index *pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(0, offset, oldNum) = ix;
      Index index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
                        pointPtr, outSpatialShape.data(), 0) +
                    spatialVolume * indicesIn(ix, 0);
      indicePairs(1, offset, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}
/// For each of the first numAct entries of indicePairUnique: records the
/// entry's rank in gridsOut and decodes the flattened index into a row of
/// indicesOut.
///
/// Row ix of indicesOut receives the decoded coordinates in columns 1..NDim
/// (written by tv::rowArrayIdxInv) and `result % batchSize` in column 0,
/// where `result` is the value returned by tv::rowArrayIdxInv.
/// NOTE(review): presumably that return value is the leftover batch part of
/// the flattened index -- confirm against tensorview.
/// The indicePairs parameter is not used by this kernel.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignGridAndIndiceOutKernel(
    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
    int numAct, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
  auto outBase = indicesOut.data();
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    Index flat = indicePairUnique[ix];
    gridsOut[flat] = ix;
    Index remainder = tv::rowArrayIdxInv<Index, NDim>(
        flat, outBase + ix * (NDim + 1) + 1, outSpatialShape.data());
    indicesOut[ix * (NDim + 1)] = remainder % batchSize;
  }
}
/// Decodes each of the first numAct flattened indices in indicePairUnique
/// into a row of indicesOut.
///
/// Row ix of indicesOut receives the decoded coordinates in columns 1..NDim
/// (written by tv::rowArrayIdxInv) and `result % batchSize` in column 0,
/// where `result` is the value returned by tv::rowArrayIdxInv.
/// NOTE(review): presumably that return value is the leftover batch part of
/// the flattened index -- confirm against tensorview.
/// The kNumHashFunctions template parameter is not used by this kernel.
template <typename Index, unsigned NDim, unsigned kNumHashFunctions = 4>
__global__ void
assignIndiceOutKernel(tv::TensorView<Index> indicesOut, int numAct,
                      tv::TensorView<Index> indicePairUnique,
                      const tv::SimpleVector<Index, NDim> outSpatialShape,
                      int batchSize) {
  auto outBase = indicesOut.data();
  for (unsigned ix : tv::KernelLoopX<unsigned>(numAct)) {
    Index flat = indicePairUnique[ix];
    Index remainder = tv::rowArrayIdxInv<Index, NDim>(
        flat, outBase + ix * (NDim + 1) + 1, outSpatialShape.data());
    indicesOut[ix * (NDim + 1)] = remainder % batchSize;
  }
}
/// Rewrites the output half of indicePairs (subview 1) in place: every entry
/// greater than -1 is replaced by the value stored for it in the cuhash
/// table.
///
/// A lookup that yields cuhash::kNotFound trips the device-side assert.
/// The indicesOut and indicePairUnique parameters are not used by this
/// kernel.
template <typename Index, unsigned NDim, unsigned kNumHashFunctions = 4>
__global__ void assignIndicePairsHashKernel(
    tv::TensorView<Index> indicesOut, int numActIn,
    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indicePairUnique,
    unsigned table_size, const cuhash::Entry *table,
    cuhash::Functions<kNumHashFunctions> constants, uint2 stash_constants,
    unsigned stash_count) {
  const int numOffsets = indicePairs.dim(1);
  auto outputHalf = indicePairs.subview(1);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    for (int i = 0; i < numOffsets; ++i) {
      Index key = outputHalf(i, ix);
      if (!(key > -1)) {
        continue; // slot not filled
      }
      auto val = cuhash::retrieve(static_cast<unsigned>(key), table_size, table,
                                  constants, stash_constants, stash_count);
      assert(val != cuhash::kNotFound);
      outputHalf(i, ix) = static_cast<unsigned>(val);
    }
  }
}
/// Rewrites the output half of indicePairs (subview 1) in place: every entry
/// greater than -1 is replaced by gridsOut[entry].
///
/// The indicesOut, indicePairUnique and outSpatialShape parameters are not
/// used by this kernel.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
                        tv::TensorView<IndexGrid> gridsOut, int numActIn,
                        tv::TensorView<Index> indicePairs,
                        tv::TensorView<Index> indicePairUnique,
                        const tv::SimpleVector<Index, NDim> outSpatialShape) {
  const int numOffsets = indicePairs.dim(1);
  auto outputHalf = indicePairs.subview(1);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    for (int i = 0; i < numOffsets; ++i) {
      Index flat = outputHalf(i, ix);
      if (!(flat > -1)) {
        continue; // slot not filled
      }
      outputHalf(i, ix) = gridsOut[flat];
    }
  }
}
/// Fills gridsOut so that the flattened position of every active input maps
/// to that input's row index.
///
/// The flattened position is the row-major index of the NDim spatial
/// coordinates (columns 1..NDim of the row) plus
/// spatialVolume * indicesIn(ix, 0).
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
prepareSubMGridKernel(tv::TensorView<const Index> indicesIn,
                      tv::TensorView<IndexGrid> gridsOut,
                      const tv::SimpleVector<Index, NDim> outSpatialShape,
                      Index spatialVolume) {
  auto numActIn = indicesIn.dim(0);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    // "+ 1" skips column 0 of the row, leaving the spatial coordinates.
    const Index *coords = indicesIn.data() + ix * (NDim + 1) + 1;
    Index flat = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
                     coords, outSpatialShape.data(), 0) +
                 spatialVolume * indicesIn(ix, 0);
    gridsOut[flat] = ix;
  }
}
/// Produces one (key, value) pair per active input: keys[ix] is the
/// flattened position of the input and values[ix] is its row index ix.
///
/// The flattened position is tv::rowArrayIdx over the NDim spatial
/// coordinates (columns 1..NDim of the row) plus
/// spatialVolume * indicesIn(ix, 0), where spatialVolume is the product of
/// outSpatialShape.
template <typename Index, unsigned NDim>
__global__ void
prepareSubMHashKernel(tv::TensorView<const Index> indicesIn, unsigned *keys,
                      unsigned *values,
                      const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int d = 0; d < NDim; ++d) {
    spatialVolume *= outSpatialShape[d];
  }
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    // "+ 1" skips column 0 of the row, leaving the spatial coordinates.
    Index flat = tv::rowArrayIdx<Index, NDim>(
                     indicesIn.data() + ix * (NDim + 1) + 1,
                     outSpatialShape.data()) +
                 spatialVolume * indicesIn(ix, 0);
    keys[ix] = flat;
    values[ix] = ix;
  }
}
/// Builds the index pairs for a submanifold convolution using a dense lookup
/// grid.
///
/// For every active input and every valid candidate position returned by
/// getValidOutPos, the candidate's flattened index is looked up in gridsOut.
/// When the grid entry is greater than -1 the kernel atomically reserves a
/// slot `oldNum` in indiceNum[offset] and stores the grid entry in
/// indicePairs(1, offset, oldNum) and the input row index in
/// indicePairs(0, offset, oldNum).
///
/// The per-thread scratch array holds KernelMaxVolume records.
/// NOTE(review): presumably callers guarantee that the number of candidates
/// per input never exceeds KernelMaxVolume -- confirm at the launch site.
template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void getSubMIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int d = 0; d < NDim; ++d) {
    spatialVolume *= outSpatialShape[d];
  }
  Index candidates[KernelMaxVolume * (NDim + 1)];
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    // "+ 1" skips column 0 of the row, leaving the spatial coordinates.
    Index numCandidates = getValidOutPos<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        candidates);
    for (int i = 0; i < numCandidates; ++i) {
      Index *record = candidates + i * (NDim + 1);
      auto offset = record[NDim];
      Index flat = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
                       record, outSpatialShape.data(), 0) +
                   spatialVolume * indicesIn(ix, 0);
      if (!(gridsOut[flat] > -1)) {
        continue; // no active output at this position
      }
      Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(1, offset, oldNum) = gridsOut[flat];
      indicePairs(0, offset, oldNum) = ix;
    }
  }
}
// Submanifold index-pair kernel specialised for a compile-time K0 x K1 x K2
// kernel.
//
// Only offsets up to and including the centre (KV / 2) are visited:
//  * the centre offset pairs every row with itself, so it is written directly
//    at slot `ix` and `indiceNum[center]` is set to the number of input rows
//    (every thread running the kernel performs that same store);
//  * for an offset below the centre, a hit in `gridsOut` is written twice --
//    once at `offset` and once, with input and output swapped, at the
//    mirrored offset `KV - offset - 1` -- using the slot obtained from the
//    atomicAdd on `indiceNum[offset]` for both.
//
// `spatialVolume` is supplied by the caller and multiplies column 0 of each
// `indicesIn` row when forming the flat grid index.
template <typename Index, typename IndexGrid, unsigned K0, unsigned K1,
          unsigned K2>
__global__ void getSubMIndicePairsKernel3(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
    const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume) {
  constexpr unsigned KV = K0 * K1 * K2;
  constexpr unsigned center = KV / 2;

  const auto rowCount = indicesIn.dim(0);
  indiceNum.data()[center] = rowCount;

  Index neighbour[3];
  for (int ix : tv::KernelLoopX<int>(rowCount)) {
    // Row layout: [col0, c0, c1, c2].
    const Index *row = indicesIn.data() + ix * (3 + 1);
#pragma unroll
    for (int i = 0; i < K0; ++i) {
#pragma unroll
      for (int j = 0; j < K1; ++j) {
#pragma unroll
        for (int k = 0; k < K2; ++k) {
          const Index offset = i * K1 * K2 + j * K2 + k;
          if (offset > center) {
            continue; // filled through the mirrored write below
          }
          if (offset == center) {
            // center of subm indice pairs dont need atomicadd
            indicePairs(1, offset, ix) = ix;
            indicePairs(0, offset, ix) = ix;
            continue;
          }

          neighbour[0] = row[1] - i + K0 / 2;
          neighbour[1] = row[2] - j + K1 / 2;
          neighbour[2] = row[3] - k + K2 / 2;
          const bool inside =
              neighbour[0] >= 0 && neighbour[0] < outSpatialShape[0] &&
              neighbour[1] >= 0 && neighbour[1] < outSpatialShape[1] &&
              neighbour[2] >= 0 && neighbour[2] < outSpatialShape[2];
          if (!inside) {
            continue;
          }

          const Index cell = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
                                 neighbour, outSpatialShape.data(), 0) +
                             spatialVolume * row[0];
          if (gridsOut[cell] != -1) {
            // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1]
            const auto mirrored = KV - offset - 1;
            const Index slot =
                atomicAdd(indiceNum.data() + offset, Index(1));
            atomicAdd(indiceNum.data() + mirrored, Index(1));
            indicePairs(1, offset, slot) = gridsOut[cell];
            indicePairs(0, offset, slot) = ix;
            indicePairs(1, mirrored, slot) = ix;
            indicePairs(0, mirrored, slot) = gridsOut[cell];
          }
        }
      }
    }
  }
}
// Submanifold index-pair kernel specialised for a compile-time K0 x K1
// kernel; the two-dimensional counterpart of getSubMIndicePairsKernel3.
//
// Offsets above the centre (KV / 2) are skipped. The centre offset pairs each
// row with itself at slot `ix`, and `indiceNum[center]` is set to the number
// of input rows. For an offset below the centre, a grid hit (value > -1) is
// stored at `offset` and, with input and output swapped, at the mirrored
// offset `KV - offset - 1`; both writes use the slot returned by the
// atomicAdd on `indiceNum[offset]`.
template <typename Index, typename IndexGrid, unsigned K0, unsigned K1>
__global__ void getSubMIndicePairsKernel2(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
    const tv::SimpleVector<Index, 2> outSpatialShape, Index spatialVolume) {
  constexpr unsigned KV = K0 * K1;
  constexpr unsigned center = KV / 2;

  const auto rowCount = indicesIn.dim(0);
  indiceNum.data()[center] = rowCount;

  Index neighbour[2];
  for (int ix : tv::KernelLoopX<int>(rowCount)) {
    // Row layout: [col0, c0, c1].
    const Index *row = indicesIn.data() + ix * (2 + 1);
#pragma unroll
    for (int i = 0; i < K0; ++i) {
#pragma unroll
      for (int j = 0; j < K1; ++j) {
        const Index offset = i * K1 + j;
        if (offset > center) {
          continue; // filled through the mirrored write below
        }
        if (offset == center) {
          // center of subm indice pairs dont need atomicadd
          indicePairs(1, offset, ix) = ix;
          indicePairs(0, offset, ix) = ix;
          continue;
        }

        neighbour[0] = row[1] - i + K0 / 2;
        neighbour[1] = row[2] - j + K1 / 2;
        const bool inside =
            neighbour[0] >= 0 && neighbour[0] < outSpatialShape[0] &&
            neighbour[1] >= 0 && neighbour[1] < outSpatialShape[1];
        if (!inside) {
          continue;
        }

        const Index cell = tv::ArrayIndexRowMajor<2, 2>::runPtrs(
                               neighbour, outSpatialShape.data(), 0) +
                           spatialVolume * row[0];
        if (gridsOut[cell] > -1) {
          const auto mirrored = KV - offset - 1;
          const Index slot = atomicAdd(indiceNum.data() + offset, Index(1));
          atomicAdd(indiceNum.data() + mirrored, Index(1));
          indicePairs(1, offset, slot) = gridsOut[cell];
          indicePairs(0, offset, slot) = ix;
          indicePairs(1, mirrored, slot) = ix;
          indicePairs(0, mirrored, slot) = gridsOut[cell];
        }
      }
    }
  }
}
// Hash-table variant of getSubMIndicePairsKernel: instead of reading a dense
// grid, the flat index of each candidate position is looked up with
// cuhash::retrieve().
//
// For every row `ix` handed to this thread by tv::KernelLoopX,
// getValidOutPos() writes candidates of NDim coordinates plus one kernel
// offset into a per-thread scratch buffer. A candidate whose lookup does not
// return cuhash::kNotFound reserves a slot with atomicAdd on
// `indiceNum[offset]` and records
//   indicePairs(0, offset, slot) = ix
//   indicePairs(1, offset, slot) = retrieved value
//
// The flat index is cast to `unsigned` before the lookup.
// `table_size`, `table`, `constants`, `stash_constants` and `stash_count` are
// forwarded unchanged to cuhash::retrieve().
template <typename Index, unsigned NDim, int KernelMaxVolume = 256,
          unsigned kNumHashFunctions = 4>
__global__ void getSubMIndicePairsHashKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape, unsigned table_size,
    const cuhash::Entry *table,
    cuhash::Functions<kNumHashFunctions> constants, uint2 stash_constants,
    unsigned stash_count) {
  const auto rowCount = indicesIn.dim(0);

  // Number of cells in one spatial volume; used to step over column 0.
  Index cellsPerVolume = 1;
#pragma unroll
  for (int d = 0; d < NDim; ++d) {
    cellsPerVolume *= outSpatialShape[d];
  }

  // Scratch buffer: KernelMaxVolume candidates of (NDim coords, offset).
  Index candidates[KernelMaxVolume * (NDim + 1)];

  for (int ix : tv::KernelLoopX<int>(rowCount)) {
    const Index candidateCount = getValidOutPos<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(),
        outSpatialShape.data(), candidates);

    for (int c = 0; c < candidateCount; ++c) {
      Index *candidate = candidates + c * (NDim + 1);
      const auto offset = candidate[NDim];
      const Index key = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
                            candidate, outSpatialShape.data(), 0) +
                        cellsPerVolume * indicesIn(ix, 0);
      const auto found =
          cuhash::retrieve(static_cast<unsigned>(key), table_size, table,
                           constants, stash_constants, stash_count);
      if (found == cuhash::kNotFound) {
        continue;
      }
      // atomicAdd returns the previous count, which is this pair's slot.
      const Index slot = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(1, offset, slot) = found;
      indicePairs(0, offset, slot) = ix;
    }
  }
}
// Writes -1 into `gridsOut` at each flat index listed in `indicePairUnique`.
// Only the first `numAct` entries of `indicePairUnique` are read; the rows
// are distributed over threads by tv::KernelLoopX.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridKernel(const Index *indicePairUnique,
                                tv::TensorView<IndexGrid> gridsOut,
                                int numAct) {
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    const Index cell = indicePairUnique[ix];
    gridsOut[cell] = -1;
  }
}
// Fills `data[0 .. size)` with the element's own position, i.e.
// data[ix] = ix, converted to T by assignment.
template <typename T> __global__ void arangeKernel(T *data, int size) {
  for (int ix : tv::KernelLoopX<int>(size)) {
    T *slot = data + ix;
    *slot = ix;
  }
}
// Writes -1 into the `gridsOut` cell addressed by each of the first `numAct`
// rows of `indices`.
//
// A row holds NDim + 1 values: column 0 is multiplied by the spatial volume
// (the product of `outSpatialShape`) and columns 1..NDim are converted to a
// row-major flat offset inside that volume.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
                    const tv::SimpleVector<Index, NDim> outSpatialShape,
                    int numAct) {
  // Local copy of the shape (passed to runPtrs) and the volume it spans.
  Index shape[NDim];
  Index cellsPerVolume = 1;
#pragma unroll
  for (int d = 0; d < NDim; ++d) {
    shape[d] = outSpatialShape[d];
    cellsPerVolume *= outSpatialShape[d];
  }

  for (int ix : tv::KernelLoopX<int>(numAct)) {
    const Index *row = indices + ix * (NDim + 1);
    const Index inVolume =
        tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(row + 1, shape, 0);
    gridsOut[inVolume + cellsPerVolume * row[0]] = -1;
  }
}
}
// namespace spconv
#undef atomicAdd
#endif
\ No newline at end of file
include/spconv/indice.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
#define SPARSE_CONV_INDICE_FUNCTOR_H_
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {

// Entry points that build the index pairs used by sparse convolution.
// Only declarations live in this header; the notes below are limited to what
// the signatures show.
// NOTE(review): the `_cuda` / `_cpu` suffixes presumably name the device the
// tensors must live on, and the `int` result is presumably a count of active
// outputs -- confirm both against the definitions.

// First CUDA pass for a regular sparse convolution. Takes the input indices,
// the pair/count/unique buffers and the convolution geometry.
int create_conv_indice_pair_p1_cuda(
    torch::Tensor indicesIn, torch::Tensor indicePairs,
    torch::Tensor indiceNum, torch::Tensor indicePairUnique,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outSpatialShape, bool transpose);

// Second CUDA pass for a regular sparse convolution. Additionally takes the
// output indices and the grid buffer, plus the `resetGrid` and `useHash`
// switches.
int create_conv_indice_pair_p2_cuda(
    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
    torch::Tensor indicePairs, torch::Tensor indiceNum,
    torch::Tensor indicePairUnique, std::vector<int64_t> outSpatialShape,
    bool transpose, bool resetGrid, bool useHash);

// CUDA index-pair generation for a submanifold convolution (no separate
// output-indices tensor in the signature).
int create_submconv_indice_pair_cuda(
    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
    std::vector<int64_t> stride, std::vector<int64_t> padding,
    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
    bool transpose, bool resetGrid, bool useHash);

// CPU index-pair generation for a regular sparse convolution, in a single
// call.
int create_conv_indice_pair_cpu(
    torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
    torch::Tensor indicePairs, torch::Tensor indiceNum,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outSpatialShape, bool transpose, bool resetGrid,
    bool useHash);

// CPU index-pair generation for a submanifold convolution.
int create_submconv_indice_pair_cpu(
    torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
    torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
    std::vector<int64_t> stride, std::vector<int64_t> padding,
    std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
    bool transpose, bool resetGrid, bool useHash);

} // namespace spconv
#endif
\ No newline at end of file
include/spconv/maxpool.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
#define SPARSE_MAXPOOL_FUNCTOR_H_
#include <tensorview/mp_helper.h>
#include <tensorview/tensor.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
namespace spconv {

// Sparse max-pooling entry points. Only declarations live in this header.
// NOTE(review): `indicesIn` / `indicesOut` are presumably matching lists of
// input and output row numbers and `size` the number of pairs to process --
// confirm against the definitions.

// Backward pass on the CPU: takes the forward output and input features, the
// incoming gradient `dout` and the gradient buffer `din`.
void maxpool_bwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
                     torch::Tensor dout, torch::Tensor din,
                     torch::Tensor indicesIn, torch::Tensor indicesOut,
                     int size);

// Forward pass on the CPU.
void maxpool_fwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
                     torch::Tensor indicesIn, torch::Tensor indicesOut,
                     int size);

// Backward pass, CUDA counterpart of maxpool_bwd_cpu (same signature).
void maxpool_bwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
                      torch::Tensor dout, torch::Tensor din,
                      torch::Tensor indicesIn, torch::Tensor indicesOut,
                      int size);

// Forward pass, CUDA counterpart of maxpool_fwd_cpu (same signature).
void maxpool_fwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
                      torch::Tensor indicesIn, torch::Tensor indicesOut,
                      int size);

} // namespace spconv
#endif
\ No newline at end of file
include/spconv/nms.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef NMS_CPU_H
#define NMS_CPU_H
#include <pybind11/pybind11.h>
// must include pybind11/stl.h if using containers in STL in arguments.
#include "box_iou.h"
#include "nms_gpu.h"
#include <algorithm>
#include <boost/geometry.hpp>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <vector>
namespace
spconv
{
namespace
py
=
pybind11
;
using
namespace
pybind11
::
literals
;
// Greedy non-maximum suppression for axis-aligned boxes.
//
// boxes  : 2-D array; columns 0..3 are read as (x1, y1, x2, y2).
// order  : 1-D array of box indices; boxes are visited in this order and an
//          earlier box suppresses later ones.
// thresh : a later box is suppressed when its IoU with a kept box is
//          >= thresh.
// eps    : added to every width and height (box areas as well as overlaps).
//
// Returns the indices of the kept boxes, in visiting order.
template <typename DType>
std::vector<int> non_max_suppression_cpu(py::array_t<DType> boxes,
                                         py::array_t<int> order, DType thresh,
                                         DType eps = 0) {
  auto ndets = boxes.shape(0);
  auto boxes_r = boxes.template unchecked<2>();
  auto order_r = order.template unchecked<1>();
  auto suppressed = zeros<int>({int(ndets)});
  auto suppressed_rw = suppressed.template mutable_unchecked<1>();
  auto area = zeros<DType>({int(ndets)});
  auto area_rw = area.template mutable_unchecked<1>();

  // Area of every box, computed once up front.
  for (int b = 0; b < ndets; ++b) {
    area_rw(b) = (boxes_r(b, 2) - boxes_r(b, 0) + eps) *
                 (boxes_r(b, 3) - boxes_r(b, 1) + eps);
  }

  std::vector<int> keep;
  for (int rank = 0; rank < ndets; ++rank) {
    const int winner = order_r(rank);
    if (suppressed_rw(winner) == 1) {
      continue;
    }
    keep.push_back(winner);

    for (int later = rank + 1; later < ndets; ++later) {
      const int rival = order_r(later);
      if (suppressed_rw(rival) == 1) {
        continue;
      }

      // Overlap along x; no overlap means nothing to suppress.
      const DType right = std::min(boxes_r(winner, 2), boxes_r(rival, 2));
      const DType left = std::max(boxes_r(winner, 0), boxes_r(rival, 0));
      const DType w = right - left + eps;
      if (!(w > 0)) {
        continue;
      }

      // Overlap along y.
      const DType bottom = std::min(boxes_r(winner, 3), boxes_r(rival, 3));
      const DType top = std::max(boxes_r(winner, 1), boxes_r(rival, 1));
      const DType h = bottom - top + eps;
      if (!(h > 0)) {
        continue;
      }

      const DType inter = w * h;
      const DType ovr = inter / (area_rw(winner) + area_rw(rival) - inter);
      if (ovr >= thresh) {
        suppressed_rw(rival) = 1;
      }
    }
  }
  return keep;
}
// Greedy non-maximum suppression for rotated boxes given by their corners.
//
// box_corners : 3-D array indexed (box, corner, xy); corners 0..3 are read.
// order       : 1-D array of box indices; boxes are visited in this order and
//               an earlier box suppresses later ones.
// standup_iou : 2-D array indexed (box, box); a pair whose entry is <= 0 is
//               skipped without building polygons.
// thresh      : a later box is suppressed when intersection area / union area
//               is >= thresh.
//
// Each box becomes a closed ring (corner 0 is appended again at the end).
// Only the first polygon of the intersection and of the union is measured.
// If Boost.Geometry throws while the pair is being built or intersected, both
// boxes are printed to std::cout and the pair is skipped.
//
// Returns the indices of the kept boxes, in visiting order.
template <typename DType>
std::vector<int>
rotate_non_max_suppression_cpu(py::array_t<DType> box_corners,
                               py::array_t<int> order,
                               py::array_t<DType> standup_iou, DType thresh) {
  namespace bg = boost::geometry;
  using point_t = bg::model::point<DType, 2, bg::cs::cartesian>;
  using polygon_t = bg::model::polygon<point_t>;

  auto ndets = box_corners.shape(0);
  auto box_corners_r = box_corners.template unchecked<3>();
  auto order_r = order.template unchecked<1>();
  auto suppressed = zeros<int>({int(ndets)});
  auto suppressed_rw = suppressed.template mutable_unchecked<1>();
  auto standup_iou_r = standup_iou.template unchecked<2>();

  std::vector<int> keep;
  // Scratch geometry reused for every pair; must be empty when a pair starts.
  polygon_t poly, qpoly;
  std::vector<polygon_t> poly_inter, poly_union;

  // Appends the four corners of `box`, then corner 0 again to close the ring.
  auto append_box = [&](polygon_t &dst, int box) {
    for (int corner = 0; corner < 4; ++corner) {
      bg::append(dst, point_t(box_corners_r(box, corner, 0),
                              box_corners_r(box, corner, 1)));
    }
    bg::append(dst,
               point_t(box_corners_r(box, 0, 0), box_corners_r(box, 0, 1)));
  };

  // Diagnostic dump of one box, used when geometry construction fails.
  auto print_box = [&](const char *title, int box) {
    std::cout << title << std::endl;
    for (int k = 0; k < 4; ++k) {
      std::cout << box_corners_r(box, k, 0) << " " << box_corners_r(box, k, 1)
                << std::endl;
    }
  };

  auto reset_scratch = [&]() {
    poly.clear();
    qpoly.clear();
    poly_inter.clear();
    poly_union.clear();
  };

  for (int _i = 0; _i < ndets; ++_i) {
    const int i = order_r(_i);
    if (suppressed_rw(i) == 1)
      continue;
    keep.push_back(i);
    for (int _j = _i + 1; _j < ndets; ++_j) {
      const int j = order_r(_j);
      if (suppressed_rw(j) == 1)
        continue;
      if (standup_iou_r(i, j) <= 0.0)
        continue;

      try {
        append_box(poly, i);
        append_box(qpoly, j);
        bg::intersection(poly, qpoly, poly_inter);
      } catch (const std::exception &) {
        print_box("box i corners:", i);
        print_box("box j corners:", j);
        // The scratch polygons may be partially filled at this point. They
        // have to be emptied before skipping the pair, otherwise the next
        // pair would append its corners onto the leftovers.
        reset_scratch();
        continue;
      }

      if (!poly_inter.empty()) {
        const DType inter_area = bg::area(poly_inter.front());
        bg::union_(poly, qpoly, poly_union);
        if (!poly_union.empty()) { // ignore invalid box
          const DType union_area = bg::area(poly_union.front());
          const DType overlap = inter_area / union_area;
          if (overlap >= thresh)
            suppressed_rw(j) = 1;
        }
      }
      reset_scratch();
    }
  }
  return keep;
}
#ifdef TV_CUDA
// 8 * sizeof(unsigned long long); passed to _nms_gpu as its BLOCK_THREADS
// template argument.
constexpr int threadsPerBlock = sizeof(unsigned long long) * 8;

// Thin wrapper that forwards NumPy buffers to _nms_gpu.
//
// The raw data pointers of `boxes` and `keep_out` are obtained through the
// buffer protocol and handed over together with the two leading dimensions of
// `boxes`, the overlap threshold and the device id. Returns whatever
// _nms_gpu returns.
// NOTE(review): only the base pointers are forwarded, so both arrays are
// presumably expected to be C-contiguous -- confirm against _nms_gpu.
template <typename DType>
int non_max_suppression(py::array_t<DType> boxes, py::array_t<int> keep_out,
                        DType nms_overlap_thresh, int device_id) {
  py::buffer_info boxes_buf = boxes.request();
  auto boxes_data = static_cast<DType *>(boxes_buf.ptr);

  py::buffer_info keep_buf = keep_out.request();
  auto keep_data = static_cast<int *>(keep_buf.ptr);

  return _nms_gpu<DType, threadsPerBlock>(keep_data, boxes_data,
                                          boxes.shape(0), boxes.shape(1),
                                          nms_overlap_thresh, device_id);
}
#endif
}
// namespace spconv
#endif
include/spconv/nms_functor.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef NMS_FUNCTOR_H_
#define NMS_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv {
namespace functor {

// Device-dispatched non-maximum suppression for axis-aligned boxes.
// Only the call operator is declared here; implementations are provided
// elsewhere per `Device`.
// NOTE(review): the `Index` result is presumably the number of entries
// written to `keep` -- confirm against the specialisations.
template <typename Device, typename T, typename Index>
struct NonMaxSupressionFunctor {
  Index operator()(const Device &d, tv::TensorView<Index> keep,
                   tv::TensorView<const T> boxes, T threshold, T eps);
};

// Device-dispatched non-maximum suppression for rotated boxes, taking box
// corners plus a stand-up IoU tensor instead of plain boxes.
// Only the call operator is declared here.
template <typename Device, typename T, typename Index>
struct rotateNonMaxSupressionFunctor {
  Index operator()(const Device &d, tv::TensorView<Index> keep,
                   tv::TensorView<const T> boxCorners,
                   tv::TensorView<const T> standupIoU, T threshold);
};

} // namespace functor
} // namespace spconv
#endif
\ No newline at end of file
include/spconv/nms_gpu.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
// GPU non-maximum suppression; declaration only, defined elsewhere.
//
// keep_out           : output buffer of kept box indices.
// boxes_host         : pointer to `boxes_num` x `boxes_dim` box values.
// nms_overlap_thresh : overlap threshold.
// device_id          : id of the device to use.
// NOTE(review): the parameter name suggests `boxes_host` is host memory and
// the `int` result is presumably the number of kept boxes -- confirm against
// the definition.
template <typename DType, int BLOCK_THREADS>
int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
             int boxes_dim, DType nms_overlap_thresh, int device_id);
include/spconv/nms_ops.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef NMS_TORCH_OP_H_
#define NMS_TORCH_OP_H_
#include <spconv/indice.h>
#include <spconv/nms_functor.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace
spconv
{
// Non-maximum suppression exposed as a torch op.
//
// Sizes and the result are int64 because, per the torch.jit documentation
// referenced by the original author, only int64 integers are supported.
//
// boxes       : box tensor of dtype T (checked).
// scores      : one score per box.
// preMaxSize  : if > 0, only the `preMaxSize` highest-scoring boxes are
//               considered (clamped to the number of scores); otherwise all
//               boxes are considered.
// postMaxSize : if > 0, at most this many kept boxes are returned.
// thresh, eps : forwarded to NonMaxSupressionFunctor as T.
//
// Returns an int64 tensor, on the device of `boxes`, holding indices into the
// ORIGINAL `boxes` / `scores` tensors. Only CPU tensors are implemented; any
// other device raises through TV_ASSERT_RT_ERR.
template <typename T>
torch::Tensor nonMaxSuppression(torch::Tensor boxes, torch::Tensor scores,
                                int64_t preMaxSize, int64_t postMaxSize,
                                double thresh, double eps) {
  // auto timer = spconv::CudaContextTimer<>();
  tv::check_torch_dtype<T>(boxes);
  auto resOptions =
      torch::TensorOptions().dtype(torch::kInt64).device(boxes.device());
  if (boxes.size(0) == 0) {
    return torch::zeros({0}, resOptions);
  }

  // `indices` maps a position in the score-ordered boxes back to the
  // caller's numbering. The functor receives no scores, so the order of the
  // boxes is the only priority it can see: both branches must therefore put
  // the highest score first.
  torch::Tensor indices;
  if (preMaxSize > 0) {
    preMaxSize = std::min(scores.size(0), preMaxSize);
    indices = std::get<1>(torch::topk(scores, preMaxSize));
  } else {
    // torch::sort is ascending by default; request descending explicitly.
    indices = std::get<1>(
        torch::sort(scores, /*dim=*/-1, /*descending=*/true));
  }
  boxes = torch::index_select(boxes, 0, indices);
  if (boxes.size(0) == 0) {
    return torch::zeros({0}, resOptions);
  }

  auto keep = torch::zeros({boxes.size(0)}, resOptions);
  int64_t keepNum = 0;
  if (boxes.device().type() == torch::kCPU) {
    auto nmsFunctor = functor::NonMaxSupressionFunctor<tv::CPU, T, int64_t>();
    keepNum = nmsFunctor(tv::CPU(), tv::torch2tv<int64_t>(keep),
                         tv::torch2tv<const T>(boxes), T(thresh), T(eps));
  } else {
    TV_ASSERT_RT_ERR(false, "not implemented");
  }

  if (postMaxSize <= 0) {
    postMaxSize = keepNum;
  }
  keep = keep.slice(0, 0, std::min(keepNum, postMaxSize));

  // `keep` indexes the reordered boxes in both branches, so it is always
  // translated back to the caller's numbering.
  return torch::index_select(indices, 0, keep);
}
}
// namespace spconv
#endif
\ No newline at end of file
include/spconv/pillar_scatter_functor.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef POINTPILLARS_SCATTER_FUNCTOR_H_
#define POINTPILLARS_SCATTER_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv {
namespace functor {

// Device-dispatched scatter of pillar features onto a canvas.
// Only the call operator is declared here; implementations are provided
// elsewhere per `Device`. Note that `coors` is viewed with element type T,
// the same as `features`.
template <typename Device, typename T, typename Index>
struct PointPillarScatter {
  void operator()(const Device &d, tv::TensorView<T> canvas,
                  tv::TensorView<const T> features,
                  tv::TensorView<const T> coors);
};

} // namespace functor
} // namespace spconv
#endif
\ No newline at end of file
include/spconv/pillar_scatter_ops.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PILLAR_SCATTER_OP_H_
#define PILLAR_SCATTER_OP_H_
#include <spconv/pillar_scatter_functor.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace
spconv
{
// Scatters pillar features into a zero-initialised 4-D canvas.
//
// features : CUDA tensor with at least 3 dims, first dim 1; its dim 1 must
//            equal shape[1] and its dim 2 must equal coors' dim 2.
// coors    : tensor of dtype T whose first dim is 1.
// shape    : CPU `int` tensor with exactly 4 entries giving the canvas size.
//            (torch.jit's doc says only support int64, hence the explicit
//            `int` dtype check on this tensor.)
//
// Returns the canvas, created with the options of `features`. The scatter
// itself only runs when TV_CUDA is defined; otherwise the zero canvas is
// returned unchanged.
template <typename T>
torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
                                 torch::Tensor shape) {
  // Argument validation; the first failing check raises.
  TV_ASSERT_RT_ERR(shape.device().type() == torch::kCPU, "error");
  TV_ASSERT_RT_ERR(features.device().type() == torch::kCUDA, "error");
  TV_ASSERT_RT_ERR(shape.dim() == 1, "error");
  TV_ASSERT_RT_ERR(shape.size(0) == 4, "error");
  TV_ASSERT_RT_ERR(features.dim() >= 3, "error");
  TV_ASSERT_RT_ERR(features.size(0) == 1, "feature first dim must be 1");
  TV_ASSERT_RT_ERR(coors.size(0) == 1, "coors first dim must be 1");
  TV_ASSERT_RT_ERR(features.size(2) == coors.size(2), "err");
  tv::check_torch_dtype<int>(shape);
  tv::check_torch_dtype<T>(coors);

  auto shapeData = shape.data_ptr<int>();
  torch::Tensor dense = torch::zeros(
      {shapeData[0], shapeData[1], shapeData[2], shapeData[3]},
      features.options());
  TV_ASSERT_RT_ERR(shapeData[1] == features.size(1), "error");

#ifdef TV_CUDA
  // squeeze() drops every size-1 dimension, including the leading one.
  torch::Tensor flatFeatures = features.squeeze();
  torch::Tensor flatCoors = coors.squeeze();
  functor::PointPillarScatter<tv::GPU, T, int> scatter;
  scatter(tv::TorchGPU(), tv::torch2tv<T>(dense),
          tv::torch2tv<const T>(flatFeatures),
          tv::torch2tv<const T>(flatCoors));
#endif
  return dense;
}
}
// namespace spconv
#endif
\ No newline at end of file
include/spconv/point2voxel.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
// must include pybind11/stl.h if using containers in STL in arguments.
#include <algorithm>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
// #include <vector>
#include <iostream>
#include <math.h>
namespace
spconv
{
namespace
py
=
pybind11
;
using
namespace
pybind11
::
literals
;
// Groups points into voxels, writing into caller-provided NumPy buffers.
//
// points               : (N, num_features); the first NDim columns are the
//                        coordinates used for binning.
// voxels               : indexed (voxel, slot, feature); receives the points.
// voxel_point_mask     : indexed (voxel, slot); set to 1 for filled slots.
// coors                : indexed (voxel, axis); receives the cell of a voxel,
//                        with the axis order reversed relative to `points`.
// num_points_per_voxel : per-voxel fill counter, incremented here.
// coor_to_voxelidx     : dense lookup from cell to voxel index; -1 means "no
//                        voxel yet". Entries touched here are reset to -1
//                        before returning.
// voxel_size           : cell size per axis.
// coors_range          : lower bounds in [0, NDim), upper bounds in
//                        [NDim, 2 * NDim).
// max_points           : capacity of one voxel; extra points are dropped.
// max_voxels           : once reached, points that would open a new voxel are
//                        dropped (points for existing voxels are still added).
//
// Returns the number of voxels created. The lookup is indexed with exactly
// three coordinates, so the function is usable for NDim == 3 only.
template <typename DType, int NDim>
int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
                          py::array_t<DType> voxel_point_mask,
                          py::array_t<int> coors,
                          py::array_t<int> num_points_per_voxel,
                          py::array_t<int> coor_to_voxelidx,
                          std::vector<DType> voxel_size,
                          std::vector<DType> coors_range, int max_points,
                          int max_voxels) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto voxels_rw = voxels.template mutable_unchecked<3>();
  auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
  auto coors_rw = coors.mutable_unchecked<2>();
  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
  auto point_count = points_rw.shape(0);
  auto num_features = points_rw.shape(1);
  constexpr int last_axis = NDim - 1;

  // Number of cells along each axis of the range.
  int cells_per_axis[NDim];
  for (int axis = 0; axis < NDim; ++axis) {
    cells_per_axis[axis] =
        round((coors_range[NDim + axis] - coors_range[axis]) /
              voxel_size[axis]);
  }

  int voxel_num = 0;
  int cell[NDim];
  for (int p = 0; p < point_count; ++p) {
    // Locate the cell of this point; skip points outside the range.
    bool inside = true;
    for (int axis = 0; axis < NDim; ++axis) {
      const int c =
          floor((points_rw(p, axis) - coors_range[axis]) / voxel_size[axis]);
      if (c < 0 || c >= cells_per_axis[axis]) {
        inside = false;
        break;
      }
      cell[last_axis - axis] = c; // stored in reversed axis order
    }
    if (!inside) {
      continue;
    }

    int voxelidx = coor_to_voxelidx_rw(cell[0], cell[1], cell[2]);
    if (voxelidx == -1) {
      // First point in this cell: open a new voxel if there is room.
      if (voxel_num >= max_voxels) {
        continue;
      }
      voxelidx = voxel_num;
      voxel_num += 1;
      coor_to_voxelidx_rw(cell[0], cell[1], cell[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
        coors_rw(voxelidx, k) = cell[k];
      }
    }

    const int filled = num_points_per_voxel_rw(voxelidx);
    if (filled < max_points) {
      voxel_point_mask_rw(voxelidx, filled) = DType(1);
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(voxelidx, filled, k) = points_rw(p, k);
      }
      num_points_per_voxel_rw(voxelidx) += 1;
    }
  }

  // Restore the lookup table entries that were assigned above.
  for (int v = 0; v < voxel_num; ++v) {
    coor_to_voxelidx_rw(coors_rw(v, 0), coors_rw(v, 1), coors_rw(v, 2)) = -1;
  }
  return voxel_num;
}
// Groups points into voxels like points_to_voxel_3d_np and additionally keeps
// a running per-voxel mean, which is used to pad the unused slots.
//
// points               : (N, num_features); the first NDim columns are the
//                        coordinates used for binning.
// voxel_point_mask     : indexed (voxel, slot); set to 1 for filled slots.
// voxels               : indexed (voxel, slot, feature); receives the points,
//                        then the mean in every slot that stayed empty.
// means                : indexed (voxel, feature); updated incrementally as
//                        mean += (x - mean) / (count + 1).
// coors                : indexed (voxel, axis); cell of each voxel, with the
//                        axis order reversed relative to `points`.
// num_points_per_voxel : per-voxel fill counter, incremented here.
// coor_to_voxelidx     : dense lookup from cell to voxel index; -1 means "no
//                        voxel yet". Entries touched here are reset to -1
//                        before returning.
// voxel_size           : cell size per axis.
// coors_range          : lower bounds in [0, NDim), upper bounds in
//                        [NDim, 2 * NDim).
// max_points           : capacity of one voxel; extra points are dropped and
//                        do not contribute to the mean.
// max_voxels           : once reached, points that would open a new voxel are
//                        dropped.
//
// Returns the number of voxels created. The lookup is indexed with exactly
// three coordinates, so the function is usable for NDim == 3 only.
template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(py::array_t<DType> points,
                               py::array_t<DType> voxel_point_mask,
                               py::array_t<DType> voxels,
                               py::array_t<DType> means,
                               py::array_t<int> coors,
                               py::array_t<int> num_points_per_voxel,
                               py::array_t<int> coor_to_voxelidx,
                               std::vector<DType> voxel_size,
                               std::vector<DType> coors_range, int max_points,
                               int max_voxels) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto means_rw = means.template mutable_unchecked<2>();
  auto voxels_rw = voxels.template mutable_unchecked<3>();
  auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
  auto coors_rw = coors.mutable_unchecked<2>();
  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
  auto point_count = points_rw.shape(0);
  auto num_features = points_rw.shape(1);
  constexpr int last_axis = NDim - 1;

  // Number of cells along each axis of the range.
  int cells_per_axis[NDim];
  for (int axis = 0; axis < NDim; ++axis) {
    cells_per_axis[axis] =
        round((coors_range[NDim + axis] - coors_range[axis]) /
              voxel_size[axis]);
  }

  int voxel_num = 0;
  int cell[NDim];
  for (int p = 0; p < point_count; ++p) {
    // Locate the cell of this point; skip points outside the range.
    bool inside = true;
    for (int axis = 0; axis < NDim; ++axis) {
      const int c =
          floor((points_rw(p, axis) - coors_range[axis]) / voxel_size[axis]);
      if (c < 0 || c >= cells_per_axis[axis]) {
        inside = false;
        break;
      }
      cell[last_axis - axis] = c; // stored in reversed axis order
    }
    if (!inside) {
      continue;
    }

    int voxelidx = coor_to_voxelidx_rw(cell[0], cell[1], cell[2]);
    if (voxelidx == -1) {
      // First point in this cell: open a new voxel if there is room.
      if (voxel_num >= max_voxels) {
        continue;
      }
      voxelidx = voxel_num;
      voxel_num += 1;
      coor_to_voxelidx_rw(cell[0], cell[1], cell[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
        coors_rw(voxelidx, k) = cell[k];
      }
    }

    const int filled = num_points_per_voxel_rw(voxelidx);
    if (filled < max_points) {
      voxel_point_mask_rw(voxelidx, filled) = DType(1);
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(voxelidx, filled, k) = points_rw(p, k);
      }
      num_points_per_voxel_rw(voxelidx) += 1;
      // Incremental mean over the points stored in this voxel so far.
      for (int k = 0; k < num_features; ++k) {
        means_rw(voxelidx, k) +=
            (points_rw(p, k) - means_rw(voxelidx, k)) / DType(filled + 1);
      }
    }
  }

  for (int v = 0; v < voxel_num; ++v) {
    // Restore the lookup table entry that was assigned above.
    coor_to_voxelidx_rw(coors_rw(v, 0), coors_rw(v, 1), coors_rw(v, 2)) = -1;
    // Pad the slots that stayed empty with the voxel mean.
    const int filled = num_points_per_voxel_rw(v);
    for (int slot = filled; slot < max_points; ++slot) {
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(v, slot, k) = means_rw(v, k);
      }
    }
  }
  return voxel_num;
}
/// Groups points into voxels and flags voxels by the local height span.
///
/// Pass 1 walks every row of `points`. The first NDim columns are turned into
/// integer cell coordinates (stored reversed in `coor`, so coor[0] is the last
/// axis). Points outside the grid are skipped. A voxel slot is allocated on
/// first use (at most `max_voxels`), the point is appended to it (at most
/// `max_points` per voxel), and the per-block minimum / maximum of column 2 of
/// the point are updated in `mins` / `maxs`.
///
/// Pass 2 restores `coor_to_voxelidx` to -1 for every voxel created here and
/// writes `voxel_mask[v]`: 1 when the (max - min) span over the window of
/// blocks around the voxel's block lies strictly between `height_threshold`
/// and `height_high_threshold`, otherwise 0.
///
/// `coor_to_voxelidx` is indexed with exactly three coordinates, so this is
/// only usable with NDim == 3.
/// NOTE(review): `coor_to_voxelidx` is presumably pre-filled with -1 and
/// `mins` / `maxs` with +/- sentinels by the caller -- confirm.
///
/// @return number of voxels created.
template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(
    py::array_t<DType> points, py::array_t<DType> voxels,
    py::array_t<DType> voxel_point_mask, py::array_t<int> voxel_mask,
    py::array_t<DType> mins, py::array_t<DType> maxs, py::array_t<int> coors,
    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
    std::vector<DType> voxel_size, std::vector<DType> coors_range,
    int max_points, int max_voxels, int block_factor, int block_size,
    DType height_threshold, DType height_high_threshold) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto mins_rw = mins.template mutable_unchecked<2>();
  auto maxs_rw = maxs.template mutable_unchecked<2>();
  auto voxels_rw = voxels.template mutable_unchecked<3>();
  auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
  auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
  auto coors_rw = coors.mutable_unchecked<2>();
  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();

  auto N = points_rw.shape(0);
  auto num_features = points_rw.shape(1);
  constexpr int ndim_minus_1 = NDim - 1;

  // Number of cells along each axis, in the same axis order as the points.
  int grid_size[NDim];
  for (int axis = 0; axis < NDim; ++axis) {
    grid_size[axis] = round((coors_range[NDim + axis] - coors_range[axis]) /
                            voxel_size[axis]);
  }
  // Extent of the block grid used to clamp the neighbourhood window.
  const int block_shape_H = grid_size[1] / block_factor;
  const int block_shape_W = grid_size[0] / block_factor;

  int voxel_num = 0;
  int coor[NDim];

  // ---- pass 1: assign points to voxels, track per-block min / max ----
  for (int pt = 0; pt < N; ++pt) {
    bool in_range = true;
    for (int axis = 0; axis < NDim; ++axis) {
      const int cell =
          floor((points_rw(pt, axis) - coors_range[axis]) / voxel_size[axis]);
      if (cell < 0 || cell >= grid_size[axis]) {
        in_range = false;
        break;
      }
      coor[ndim_minus_1 - axis] = cell;
    }
    if (!in_range) {
      continue;
    }

    int voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      // First point in this cell: claim a new voxel slot if any is left.
      if (voxel_num >= max_voxels) {
        continue;
      }
      voxelidx = voxel_num;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int d = 0; d < NDim; ++d) {
        coors_rw(voxelidx, d) = coor[d];
      }
    }

    const int filled = num_points_per_voxel_rw(voxelidx);
    if (filled >= max_points) {
      continue;
    }
    voxel_point_mask_rw(voxelidx, filled) = DType(1);
    for (int f = 0; f < num_features; ++f) {
      voxels_rw(voxelidx, filled, f) = points_rw(pt, f);
    }
    const int block_row = coor[1] / block_factor;
    const int block_col = coor[2] / block_factor;
    mins_rw(block_row, block_col) =
        std::min(points_rw(pt, 2), mins_rw(block_row, block_col));
    maxs_rw(block_row, block_col) =
        std::max(points_rw(pt, 2), maxs_rw(block_row, block_col));
    num_points_per_voxel_rw(voxelidx) += 1;
  }

  // ---- pass 2: reset the lookup table and compute the height mask ----
  const int half_window = block_size / 2;
  for (int v = 0; v < voxel_num; ++v) {
    coor[1] = coors_rw(v, 1);
    coor[2] = coors_rw(v, 2);
    coor_to_voxelidx_rw(coors_rw(v, 0), coor[1], coor[2]) = -1;

    const int block_row = coor[1] / block_factor;
    const int block_col = coor[2] / block_factor;
    DType min_value = mins_rw(block_row, block_col);
    DType max_value = maxs_rw(block_row, block_col);

    const int row_begin = std::max(0, block_row - half_window);
    const int row_end =
        std::min(block_shape_H, block_row + block_size - half_window);
    const int col_begin = std::max(0, block_col - half_window);
    const int col_end =
        std::min(block_shape_W, block_col + block_size - half_window);
    for (int r = row_begin; r < row_end; ++r) {
      for (int c = col_begin; c < col_end; ++c) {
        min_value = std::min(min_value, mins_rw(r, c));
        max_value = std::max(max_value, maxs_rw(r, c));
      }
    }

    const DType height_span = max_value - min_value;
    voxel_mask_rw(v) = (height_span > height_threshold) &&
                       (height_span < height_high_threshold);
  }
  return voxel_num;
}
}
// namespace spconv
include/spconv/pool_ops.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_POOL_OP_H_
#define SPARSE_POOL_OP_H_
#include <spconv/maxpool.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {

/// Sparse max-pooling entry point (declaration only; defined elsewhere).
/// NOTE(review): `indicePairs` / `indiceNum` are presumably the pair table and
/// per-offset counts produced by the indice-pair generation op, and `numAct`
/// the number of output rows -- confirm against the definition.
torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
                            torch::Tensor indiceNum, int64_t numAct);

/// Backward counterpart of indiceMaxPool (declaration only).
/// NOTE(review): presumably returns the gradient w.r.t. `features` -- confirm.
torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
                                    torch::Tensor outFeatures,
                                    torch::Tensor outGrad,
                                    torch::Tensor indicePairs,
                                    torch::Tensor indiceNum);

} // namespace spconv
#endif
\ No newline at end of file
include/spconv/reordering.cu.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#include <THC/THCAtomics.cuh>
#include <THC/THCNumerics.cuh>
#include <cuda_fp16.h>
#include <tensorview/kernel_utils.h>
#if PYTORCH_VERSION < 10500
#define TH_ATOMIC_ADD atomicAdd
#else
#define TH_ATOMIC_ADD gpuAtomicAdd
#endif
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
namespace
spconv
{
/// Row gather with per-element bounds checks.
///
/// For each row `r` produced by tv::KernelLoopX (and its NumILP companions
/// spaced gridDim.x * blockDim.x apart) and each column `c` produced by
/// tv::KernelLoopY, writes
///   buffer[r * numPlanes + c] = features[indices[r] * numPlanes + c]
/// provided r < size.
///
/// @param buffer     destination, written at row-major offsets r * numPlanes + c
/// @param features   source rows
/// @param indices    source row number for every destination row
/// @param size       number of destination rows
/// @param numPlanes  number of elements per row
/// NumTLP is not referenced by this kernel body.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void gatherGenericKernel(T *buffer, const T *features,
                                    const Index *indices, int size,
                                    int numPlanes) {
  int rowStride[NumILP];
  Index srcBase[NumILP];
#pragma unroll
  for (int u = 0; u < NumILP; ++u) {
    rowStride[u] = u * gridDim.x * blockDim.x;
  }

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
    // Resolve the source row offsets once per row, outside the column loop.
#pragma unroll
    for (int u = 0; u < NumILP; ++u) {
      const int row = ix + rowStride[u];
      if (row < size) {
        srcBase[u] = indices[row] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int u = 0; u < NumILP; ++u) {
        const int row = ix + rowStride[u];
        if (row < size) {
          buffer[row * numPlanes + iy] = features[srcBase[u] + iy];
        }
      }
    }
  }
}
/// Vectorised row gather with per-element bounds checks.
///
/// Same access pattern as a scalar row gather, but `buffer` and `features` are
/// reinterpreted as arrays of `VecType`, so `numPlanes` and the column index
/// are counted in VecType elements, not in T elements.
/// NOTE(review): both pointers must be suitably aligned for VecType -- the
/// launcher is presumably responsible for that; confirm.
///
/// @param buffer     destination rows
/// @param features   source rows
/// @param indices    source row number for every destination row
/// @param size       number of destination rows
/// @param numPlanes  VecType elements per row
/// NumTLP is not referenced by this kernel body.
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void gatherVecKernel(T *buffer, const T *features,
                                const Index *indices, int size,
                                int numPlanes) {
  int rowStride[NumILP];
  Index srcBase[NumILP];
#pragma unroll
  for (int u = 0; u < NumILP; ++u) {
    rowStride[u] = u * gridDim.x * blockDim.x;
  }

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int u = 0; u < NumILP; ++u) {
      const int row = ix + rowStride[u];
      if (row < size) {
        srcBase[u] = indices[row] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int u = 0; u < NumILP; ++u) {
        const int row = ix + rowStride[u];
        if (row < size) {
          reinterpret_cast<VecType *>(buffer)[row * numPlanes + iy] =
              reinterpret_cast<const VecType *>(features)[srcBase[u] + iy];
        }
      }
    }
  }
}
/// Vectorised row gather without bounds checks.
///
/// Both base pointers are first advanced by blockIdx.y * NumTLP elements of T;
/// within that slice threadIdx.y selects the VecType column. No `row < size`
/// test is made for the NumILP unrolled rows.
/// NOTE(review): the launcher presumably only passes a `size` for which every
/// unrolled row is valid -- confirm at the call site.
///
/// @param buffer     destination rows
/// @param features   source rows
/// @param indices    source row number for every destination row
/// @param size       number of destination rows handed to tv::KernelLoopX
/// @param numPlanes  VecType elements per row
template <typename T, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void gatherVecBlockKernel(T *buffer, const T *features,
                                     const Index *indices, int size,
                                     int numPlanes) {
  int rowStride[NumILP];
#pragma unroll
  for (int u = 0; u < NumILP; ++u) {
    rowStride[u] = u * gridDim.x * blockDim.x;
  }

  // Select the column slice owned by this block row.
  features += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int u = 0; u < NumILP; ++u) {
      const int row = ix + rowStride[u];
      reinterpret_cast<VecType *>(buffer)[row * numPlanes + threadIdx.y] =
          reinterpret_cast<const VecType *>(
              features)[indices[row] * numPlanes + threadIdx.y];
    }
  }
}
/// Batched row gather that zero-fills rows whose index is -1.
///
/// The flat row number `e` is mapped into `indices` as
///   indices[(e / feature_batch_stride) * indice_batch_stride
///           + e % feature_batch_stride].
/// A looked-up value of -1 makes the destination row element T(0); any other
/// value selects the source row in `features`.
///
/// @param buffer                destination rows
/// @param features              source rows
/// @param indices               index table (see mapping above)
/// @param size                  number of destination rows
///                              (original note: max indice num * kernel volume)
/// @param numPlanes             elements per row
/// @param indice_batch_stride   stride between batches inside `indices`
/// @param feature_batch_stride  rows per batch in the destination
/// NumTLP is not referenced by this kernel body.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void batchGatherGenericKernel(T *buffer, const T *features,
                                         const Index *indices, int size,
                                         int numPlanes,
                                         int indice_batch_stride,
                                         int feature_batch_stride) {
  // indices layout (original note): [volume, num_elems]
  int rowStride[NumILP];
  Index srcRow[NumILP];
#pragma unroll
  for (int u = 0; u < NumILP; ++u) {
    rowStride[u] = u * gridDim.x * blockDim.x;
  }

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int u = 0; u < NumILP; ++u) {
      if (ix + rowStride[u] < size) {
        const Index elem = ix + rowStride[u];
        srcRow[u] = indices[(elem / feature_batch_stride) * indice_batch_stride +
                            elem % feature_batch_stride];
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int u = 0; u < NumILP; ++u) {
        const int row = ix + rowStride[u];
        if (row >= size) {
          continue;
        }
        if (srcRow[u] == -1) {
          buffer[row * numPlanes + iy] = T(0);
        } else {
          buffer[row * numPlanes + iy] = features[srcRow[u] * numPlanes + iy];
        }
      }
    }
  }
}
/// Vectorised batched row gather that zero-fills rows whose index is -1.
///
/// The flat row number `e = row + feature_offset` is mapped into `indices` as
///   indices[(e / feature_batch_stride) * indice_batch_stride
///           + e % feature_batch_stride].
/// `buffer` and `features` are accessed as arrays of VecType, so `numPlanes`
/// and the column index count VecType elements.
///
/// Fix over the previous version: the zero-fill scratch value was declared as
/// an array of `Index` although it is read back as one VecType worth of `T`.
/// It is now an array of `T` with VecType alignment, so it is always exactly
/// sizeof(VecType) bytes and safe to load as a VecType.
///
/// @param buffer                destination rows
/// @param features              source rows
/// @param indices               index table (see mapping above)
/// @param size                  number of destination rows
/// @param feature_offset        added to the row number before the lookup
/// @param numPlanes             VecType elements per row
/// @param indice_batch_stride   stride between batches inside `indices`
/// @param feature_batch_stride  rows per batch
/// NumTLP is not referenced by this kernel body.
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void batchGatherVecKernel(T *buffer, const T *features,
                                     const Index *indices, int size,
                                     int feature_offset, int numPlanes,
                                     int indice_batch_stride,
                                     int feature_batch_stride) {
  constexpr int kVecLen = sizeof(VecType) / sizeof(T);

  int ILPStrideX[NumILP];
  Index inds[NumILP];

  // One VecType worth of zeros, stored as T so size and type match the data.
  alignas(VecType) T zero[kVecLen];
#pragma unroll
  for (int i = 0; i < kVecLen; ++i) {
    zero[i] = T(0);
  }

#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++) {
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  }

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size) {
        const Index inds_elem = ix + ILPStrideX[ilp] + feature_offset;
        inds[ilp] =
            indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
                    inds_elem % feature_batch_stride];
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size) {
          if (inds[ilp] != -1) {
            reinterpret_cast<VecType *>(
                buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
                reinterpret_cast<const VecType *>(
                    features)[inds[ilp] * numPlanes + iy];
          } else {
            reinterpret_cast<VecType *>(
                buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
                reinterpret_cast<const VecType *>(zero)[0];
          }
        }
      }
    }
  }
}
/// Vectorised batched row gather without bounds checks; -1 rows are zeroed.
///
/// Both base pointers are first advanced by blockIdx.y * NumTLP elements of T
/// and threadIdx.y selects the VecType column. The flat row number `e` is
/// mapped into `indices` as
///   indices[(e / feature_batch_stride) * indice_batch_stride
///           + e % feature_batch_stride].
/// No `row < size` test is made for the NumILP unrolled rows.
/// NOTE(review): the launcher presumably only passes a `size` for which every
/// unrolled row is valid -- confirm at the call site.
///
/// Fix over the previous version: the zero-fill scratch value was declared as
/// an array of `Index` although it is read back as one VecType worth of `T`.
/// It is now an array of `T` with VecType alignment.
///
/// @param buffer                destination rows
/// @param features              source rows
/// @param indices               index table (see mapping above)
/// @param size                  number of destination rows
/// @param numPlanes             VecType elements per row
/// @param indice_batch_stride   stride between batches inside `indices`
/// @param feature_batch_stride  rows per batch
template <typename T, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void batchGatherVecBlockKernel(T *buffer, const T *features,
                                          const Index *indices, int size,
                                          int numPlanes,
                                          int indice_batch_stride,
                                          int feature_batch_stride) {
  constexpr int kVecLen = sizeof(VecType) / sizeof(T);

  int ILPStrideX[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++) {
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  }

  // Select the column slice owned by this block row.
  features += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;

  // One VecType worth of zeros, stored as T so size and type match the data.
  alignas(VecType) T zero[kVecLen];
#pragma unroll
  for (int i = 0; i < kVecLen; ++i) {
    zero[i] = T(0);
  }

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      const Index inds_elem = ix + ILPStrideX[ilp];
      const Index inds =
          indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
                  inds_elem % feature_batch_stride];
      if (inds != -1) {
        reinterpret_cast<VecType *>(
            buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
            reinterpret_cast<const VecType *>(
                features)[inds * numPlanes + threadIdx.y];
      } else {
        reinterpret_cast<VecType *>(
            buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
            reinterpret_cast<const VecType *>(zero)[0];
      }
    }
  }
}
/// Row scatter-add with per-element bounds checks.
///
/// For each row r < size and column c visited by the tv kernel loops:
///   outFeatures[indices[r] * numPlanes + c] += buffer[r * numPlanes + c].
/// The update is a plain read-modify-write, not an atomic.
/// NOTE(review): this is only race-free if no two rows processed by one
/// launch share the same `indices` value -- confirm the caller guarantees it.
///
/// @param outFeatures  accumulation target
/// @param buffer       rows to add
/// @param indices      target row number for every buffer row
/// @param size         number of buffer rows
/// @param numPlanes    elements per row
/// NumTLP is not referenced by this kernel body.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddGenericKernel(T *outFeatures, const T *buffer,
                                        const Index *indices, int size,
                                        int numPlanes) {
  int rowStride[NumILP];
  Index dstBase[NumILP];
#pragma unroll
  for (int u = 0; u < NumILP; ++u) {
    rowStride[u] = u * gridDim.x * blockDim.x;
  }

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int u = 0; u < NumILP; ++u) {
      const int row = ix + rowStride[u];
      if (row < size) {
        dstBase[u] = indices[row] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int u = 0; u < NumILP; ++u) {
        const int row = ix + rowStride[u];
        if (row < size) {
          outFeatures[dstBase[u] + iy] += buffer[row * numPlanes + iy];
        }
      }
    }
  }
}
/// Vectorised row scatter-add without bounds checks.
///
/// Loads one VecType from the target row and one from the buffer row into
/// local arrays of T, adds them element-wise, and stores the sum back as one
/// VecType. When T is at::Half and the device architecture is >= 530 the add
/// is done pair-wise with __hadd2; otherwise a scalar loop is used.
/// Both base pointers are first advanced by blockIdx.y * NumTLP elements of T
/// and threadIdx.y selects the VecType column. The update is not atomic and
/// no `row < size` test is made for the NumILP unrolled rows.
/// NOTE(review): presumably the launcher guarantees aligned sizes and unique
/// target rows per launch -- confirm.
///
/// @param outFeatures  accumulation target
/// @param buffer       rows to add
/// @param indices      target row number for every buffer row
/// @param size         number of buffer rows
/// @param numPlanes    VecType elements per row
template <typename T, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
                                         const Index *indices, int size,
                                         int numPlanes) {
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
  constexpr int vecloadHalf2Factor = sizeof(VecType) / sizeof(__half2);

  int rowStride[NumILP];
#pragma unroll
  for (int u = 0; u < NumILP; ++u) {
    rowStride[u] = u * gridDim.x * blockDim.x;
  }

  // Select the column slice owned by this block row.
  outFeatures += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;

  T acc[vecloadFactor];
  T addend[vecloadFactor];
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int u = 0; u < NumILP; ++u) {
      const int row = ix + rowStride[u];
      const Index idx = indices[row] * numPlanes + threadIdx.y;
      reinterpret_cast<VecType *>(acc)[0] =
          reinterpret_cast<VecType *>(outFeatures)[idx];
      reinterpret_cast<VecType *>(addend)[0] =
          reinterpret_cast<const VecType *>(
              buffer)[row * numPlanes + threadIdx.y];
      if (std::is_same<T, at::Half>::value) {
#if __CUDA_ARCH__ >= 530
#pragma unroll
        for (int i = 0; i < vecloadHalf2Factor; i++) {
          reinterpret_cast<__half2 *>(acc)[i] =
              __hadd2(reinterpret_cast<__half2 *>(acc)[i],
                      reinterpret_cast<__half2 *>(addend)[i]);
        }
#else
#pragma unroll
        for (int i = 0; i < vecloadFactor; i++) {
          acc[i] += addend[i];
        }
#endif
      } else {
#pragma unroll
        for (int i = 0; i < vecloadFactor; i++) {
          acc[i] += addend[i];
        }
      }
      reinterpret_cast<VecType *>(outFeatures)[idx] =
          reinterpret_cast<VecType *>(acc)[0];
    }
  }
}
/// Scalar row scatter-add without bounds checks.
///
/// Both base pointers are first advanced by blockIdx.y * NumTLP elements and
/// threadIdx.y selects the column:
///   outFeatures[indices[r] * numPlanes + threadIdx.y]
///       += buffer[r * numPlanes + threadIdx.y].
/// The update is not atomic and no `row < size` test is made for the NumILP
/// unrolled rows.
/// NOTE(review): presumably the launcher guarantees aligned sizes and unique
/// target rows per launch -- confirm.
///
/// @param outFeatures  accumulation target
/// @param buffer       rows to add
/// @param indices      target row number for every buffer row
/// @param size         number of buffer rows
/// @param numPlanes    elements per row
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer,
                                      const Index *indices, int size,
                                      int numPlanes) {
  int rowStride[NumILP];
#pragma unroll
  for (int u = 0; u < NumILP; ++u) {
    rowStride[u] = u * gridDim.x * blockDim.x;
  }

  // Select the column slice owned by this block row.
  outFeatures += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int u = 0; u < NumILP; ++u) {
      const int row = ix + rowStride[u];
      outFeatures[indices[row] * numPlanes + threadIdx.y] +=
          buffer[row * numPlanes + threadIdx.y];
    }
  }
}
#if __CUDA_ARCH__ >= 530
/// Row scatter-add on packed half pairs, without bounds checks.
///
/// `outFeatures` and `buffer` are reinterpreted as arrays of __half2 and
/// combined with __hadd2, so `numPlanes` and threadIdx.y count __half2
/// elements. Both base pointers are first advanced by blockIdx.y * NumTLP
/// elements of T. The update is not atomic and no `row < size` test is made
/// for the NumILP unrolled rows.
///
/// Fix over the previous version: `buffer` is `const T *`, so casting it to a
/// non-const `__half2 *` with reinterpret_cast was ill-formed (it casts away
/// const) and broke every instantiation. The read side now uses
/// `const __half2 *`.
///
/// NOTE(review): this definition only exists in compilation passes where
/// __CUDA_ARCH__ >= 530, so the host pass never sees it; the CUDA programming
/// guide advises against making __global__ templates depend on __CUDA_ARCH__.
/// Left as is to keep the interface unchanged -- revisit if this kernel is
/// ever launched.
///
/// @param outFeatures  accumulation target
/// @param buffer       rows to add
/// @param indices      target row number for every buffer row
/// @param size         number of buffer rows
/// @param numPlanes    __half2 elements per row
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer,
                                          const Index *indices, int size,
                                          int numPlanes) {
  int ILPStrideX[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++) {
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  }

  // Select the column slice owned by this block row.
  outFeatures += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      const Index idx =
          indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y;
      reinterpret_cast<__half2 *>(outFeatures)[idx] = __hadd2(
          reinterpret_cast<__half2 *>(outFeatures)[idx],
          reinterpret_cast<const __half2 *>(
              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]);
    }
  }
}
#endif
/// Batched row scatter-add using atomic adds; rows whose index is -1 are
/// skipped.
///
/// The flat row number `e = row + feature_offset` is mapped into `indices` as
///   indices[(e / feature_batch_stride) * indice_batch_stride
///           + e % feature_batch_stride],
/// and each element is accumulated with TH_ATOMIC_ADD.
///
/// Original author's observation: batch scatter-add is much slower than the
/// native scatter when the number of points is large (possibly because of
/// atomicAdd) and much faster when the number of points is small.
///
/// @param outFeatures           accumulation target
/// @param buffer                rows to add
/// @param indices               index table (see mapping above)
/// @param size                  number of buffer rows
/// @param feature_offset        added to the row number before the lookup
/// @param numPlanes             elements per row
/// @param indice_batch_stride   stride between batches inside `indices`
/// @param feature_batch_stride  rows per batch
/// NumTLP is not referenced by this kernel body.
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer,
                                             const Index *indices, int size,
                                             int feature_offset, int numPlanes,
                                             int indice_batch_stride,
                                             int feature_batch_stride) {
  int rowStride[NumILP];
  Index dstRow[NumILP];
#pragma unroll
  for (int u = 0; u < NumILP; ++u) {
    rowStride[u] = u * gridDim.x * blockDim.x;
  }

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int u = 0; u < NumILP; ++u) {
      if (ix + rowStride[u] < size) {
        const Index elem = ix + rowStride[u] + feature_offset;
        dstRow[u] = indices[(elem / feature_batch_stride) * indice_batch_stride +
                            elem % feature_batch_stride];
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int u = 0; u < NumILP; ++u) {
        const int row = ix + rowStride[u];
        // dstRow[u] is only read when the row is in range (short-circuit).
        if (row < size && dstRow[u] != -1) {
          TH_ATOMIC_ADD(outFeatures + dstRow[u] * numPlanes + iy,
                        buffer[row * numPlanes + iy]);
        }
      }
    }
  }
}
/// Batched row scatter-add using atomic adds, without bounds checks; rows
/// whose index is -1 are skipped.
///
/// Both base pointers are first advanced by blockIdx.y * NumTLP elements and
/// threadIdx.y selects the column. The flat row number `e` is mapped into
/// `indices` as
///   indices[(e / feature_batch_stride) * indice_batch_stride
///           + e % feature_batch_stride].
/// No `row < size` test is made for the NumILP unrolled rows.
/// NOTE(review): the launcher presumably only passes a `size` for which every
/// unrolled row is valid -- confirm at the call site.
///
/// @param outFeatures           accumulation target
/// @param buffer                rows to add
/// @param indices               index table (see mapping above)
/// @param size                  number of buffer rows
/// @param numPlanes             elements per row
/// @param indice_batch_stride   stride between batches inside `indices`
/// @param feature_batch_stride  rows per batch
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void batchScatterAddBlockKernel(T *outFeatures, const T *buffer,
                                           const Index *indices, int size,
                                           int numPlanes,
                                           int indice_batch_stride,
                                           int feature_batch_stride) {
  int rowStride[NumILP];
#pragma unroll
  for (int u = 0; u < NumILP; ++u) {
    rowStride[u] = u * gridDim.x * blockDim.x;
  }

  // Select the column slice owned by this block row.
  outFeatures += blockIdx.y * NumTLP;
  buffer += blockIdx.y * NumTLP;

  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int u = 0; u < NumILP; ++u) {
      const Index elem = ix + rowStride[u];
      const Index dstRow =
          indices[(elem / feature_batch_stride) * indice_batch_stride +
                  elem % feature_batch_stride];
      if (dstRow == -1) {
        continue;
      }
      TH_ATOMIC_ADD(outFeatures + dstRow * numPlanes + threadIdx.y,
                    buffer[(ix + rowStride[u]) * numPlanes + threadIdx.y]);
    }
  }
}
}
// namespace spconv
#undef TH_ATOMIC_ADD
#endif
\ No newline at end of file
include/spconv/reordering.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {

// Gather / scatter-add entry points (declarations only; defined elsewhere).
// NOTE(review): judging by the names, the *_cuda variants run on the GPU and
// the *_cpu variants on the host, and `size` is the number of rows to
// process -- confirm against the definitions.

/// Batched gather of `features` rows into `buffer`, selected by `indices`.
void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
                              torch::Tensor indices, int size);

/// Batched scatter-add of `buffer` rows into `outFeatures`.
void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
                                   torch::Tensor outFeatures,
                                   torch::Tensor indices, int size);

/// Gather of `features` rows into `buffer`, selected by `indices`.
void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
                        torch::Tensor indices, int size);

/// Scatter-add of `buffer` rows into `outFeatures`.
void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
                             torch::Tensor indices, int size);

/// Host counterpart of sparse_gather_cuda.
void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
                       torch::Tensor indices, int size);

/// Host counterpart of sparse_scatter_add_cuda.
void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
                            torch::Tensor indices, int size);

} // namespace spconv
#endif
\ No newline at end of file
include/spconv/spconv_ops.h
deleted
100644 → 0
View file @
fad30002
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_OP_H_
#define SPARSE_CONV_OP_H_
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {

/// Selects the convolution implementation; passed to the ops below as the
/// integer `algo` argument.
enum ConvAlgo {
  kNative = 0,
  kBatch = 1,
  kBatchGemmGather = 2
};

// Integer arguments are declared as int64_t because, per the torch.jit
// documentation, only int64 is supported for scripted ops; the values are
// converted to int32 where needed.

/// Builds the indice-pair tables for a sparse convolution (declaration only).
/// NOTE(review): `_subM`, `_transpose` and `_useHash` look like boolean flags
/// passed as integers -- confirm against the definition.
std::vector<torch::Tensor>
getIndicePairs(torch::Tensor indices, int64_t batchSize,
               std::vector<int64_t> outSpatialShape,
               std::vector<int64_t> spatialShape,
               std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
               std::vector<int64_t> padding, std::vector<int64_t> dilation,
               std::vector<int64_t> outPadding, int64_t _subM,
               int64_t _transpose, int64_t _useHash);

/// Forward sparse convolution, batched variant (declaration only).
torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
                              torch::Tensor indicePairs,
                              torch::Tensor indiceNum, int64_t numActOut,
                              int64_t _inverse, int64_t _subM,
                              bool batchScatter);

/// Forward sparse convolution (declaration only); `algo` takes a ConvAlgo
/// value.
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor indicePairs, torch::Tensor indiceNum,
                         int64_t numActOut, int64_t _inverse, int64_t _subM,
                         int64_t algo);

/// Backward pass of indiceConv (declaration only).
/// NOTE(review): presumably returns the input and filter gradients -- confirm.
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
                   torch::Tensor outGrad, torch::Tensor indicePairs,
                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
                   int64_t algo);

/// Backward pass of indiceConvBatch (declaration only).
std::vector<torch::Tensor>
indiceConvBackwardBatch(torch::Tensor features, torch::Tensor filters,
                        torch::Tensor outGrad, torch::Tensor indicePairs,
                        torch::Tensor indiceNum, int64_t _inverse,
                        int64_t _subM, bool batchScatter);

} // namespace spconv
#endif
\ No newline at end of file
include/tensorrt/inference.h
deleted
100644 → 0
View file @
fad30002
#include "NvInfer.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <tensorview/tensor.h>
#include <unordered_map>
#include <vector>
namespace
trt
{
/// Maps a TensorRT data type to the matching tensorview dtype.
///
/// Handles kFLOAT, kHALF, kINT32 and kINT8; any other value is reported
/// through TV_THROW_INVALID_ARG with the message "unknown trt dtype".
template <typename T> tv::DType trt_dtype_to_tv(T trt_dtype) {
  if (trt_dtype == nvinfer1::DataType::kFLOAT) {
    return tv::float32;
  }
  if (trt_dtype == nvinfer1::DataType::kHALF) {
    return tv::float16;
  }
  if (trt_dtype == nvinfer1::DataType::kINT32) {
    return tv::int32;
  }
  if (trt_dtype == nvinfer1::DataType::kINT8) {
    return tv::int8;
  }
  TV_THROW_INVALID_ARG("unknown trt dtype");
}
/// Deleter for TensorRT objects: releases them through their destroy()
/// member instead of `delete`. A null pointer is ignored.
struct InferDeleter {
  template <typename T> void operator()(T *obj) const {
    if (!obj) {
      return;
    }
    obj->destroy();
  }
};

/// std::unique_ptr that releases its pointee via InferDeleter.
template <typename T>
using trt_unique_ptr_t = std::unique_ptr<T, InferDeleter>;
/// nvinfer1::ILogger implementation that prints to std::cerr.
///
/// Messages whose severity enum value is greater than `reportableSeverity`
/// are dropped; everything else is written as "<LEVEL>: <msg>" followed by
/// std::endl.
class Logger : public nvinfer1::ILogger {
public:
  Logger(Severity severity = Severity::kWARNING)
      : reportableSeverity(severity) {}

  void log(Severity severity, const char *msg) override {
    // Suppress messages with a severity enum value greater than the
    // reportable one.
    if (severity > reportableSeverity) {
      return;
    }
    const char *prefix = "UNKNOWN: ";
    switch (severity) {
    case Severity::kINTERNAL_ERROR:
      prefix = "INTERNAL_ERROR: ";
      break;
    case Severity::kERROR:
      prefix = "ERROR: ";
      break;
    case Severity::kWARNING:
      prefix = "WARNING: ";
      break;
    case Severity::kINFO:
      prefix = "INFO: ";
      break;
    default:
      break;
    }
    std::cerr << prefix << msg << std::endl;
  }

  /// Highest severity enum value that is still printed.
  Severity reportableSeverity;
};
class
InferenceContext
{
public:
explicit
InferenceContext
(
const
std
::
string
&
engine_bin
,
int
device
)
:
logger_
(
nvinfer1
::
ILogger
::
Severity
::
kINFO
),
device_
(
device
)
{
TV_ASSERT_INVALID_ARG
(
device
>=
0
,
"invalid device id"
);
int
deviceCount
;
cudaGetDeviceCount
(
&
deviceCount
);
if
(
device
>=
deviceCount
)
{
TV_THROW_INVALID_ARG
(
"you provide device "
,
device
,
" but you only have "
,
deviceCount
,
" device."
);
}
cudaSetDevice
(
device
);
auto
runtime
=
trt_unique_ptr_t
<
nvinfer1
::
IRuntime
>
(
nvinfer1
::
createInferRuntime
(
logger_
));
engine_
=
trt_unique_ptr_t
<
nvinfer1
::
ICudaEngine
>
(
runtime
->
deserializeCudaEngine
(
engine_bin
.
c_str
(),
engine_bin
.
size
(),
nullptr
));
ctx_
=
trt_unique_ptr_t
<
nvinfer1
::
IExecutionContext
>
(
engine_
->
createExecutionContext
());
max_batch_size_
=
engine_
->
getMaxBatchSize
();
for
(
int
i
=
0
;
i
<
engine_
->
getNbBindings
();
++
i
)
{
auto
dims
=
engine_
->
getBindingDimensions
(
i
);
std
::
vector
<
int
>
shape_vec
(
dims
.
d
,
dims
.
d
+
dims
.
nbDims
);
shape_vec
.
insert
(
shape_vec
.
begin
(),
{
max_batch_size_
});
tv
::
TensorShape
shape
(
shape_vec
);
std
::
string
name
=
engine_
->
getBindingName
(
i
);
auto
trt_dtype
=
engine_
->
getBindingDataType
(
i
);
auto
tv_dtype
=
trt_dtype_to_tv
(
trt_dtype
);
bool
isInput
=
engine_
->
bindingIsInput
(
i
);
name_to_idx_
[
name
]
=
i
;
idx_to_name_
[
i
]
=
name
;
name_to_host_mem_
.
insert
({
name
,
tv
::
Tensor
(
shape
,
tv_dtype
,
-
1
)});
name_to_dev_mem_
.
insert
({
name
,
tv
::
Tensor
(
shape
,
tv_dtype
,
0
)});
if
(
isInput
)
inp_idxes_
.
push_back
(
i
);
else
out_idxes_
.
push_back
(
i
);
bindings_
.
push_back
(
name_to_dev_mem_
[
name
].
raw_data
());
}
checkCudaErrors
(
cudaStreamCreate
(
&
stream_
));
}
std
::
unordered_map
<
std
::
string
,
tv
::
Tensor
>
operator
()(
std
::
vector
<
tv
::
Tensor
>
inputs
)
{
TV_ASSERT_INVALID_ARG
(
inputs
.
size
()
==
inp_idxes_
.
size
(),
"must provide"
,
inp_idxes_
.
size
(),
"inputs, but got"
,
inputs
.
size
());
// inference batch size
int
bs
=
inputs
[
0
].
dim
(
0
);
for
(
auto
&
inp
:
inputs
)
{
TV_ASSERT_INVALID_ARG
(
inp
.
dim
(
0
)
==
bs
,
"batch sizes of all input must same"
);
}
TV_ASSERT_INVALID_ARG
(
bs
<=
max_batch_size_
,
"your batchsize too large"
,
bs
,
max_batch_size_
);
for
(
int
i
=
0
;
i
<
inputs
.
size
();
++
i
)
{
auto
&
dev_mem
=
name_to_dev_mem_
[
idx_to_name_
[
i
]];
auto
shape_inp
=
inputs
[
i
].
shape
().
subshape
(
1
);
auto
shape_dev
=
dev_mem
.
shape
().
subshape
(
1
);
TV_ASSERT_INVALID_ARG
(
shape_inp
==
shape_dev
,
"shape except batch must same"
,
shape_inp
,
shape_dev
);
dev_mem
.
slice_first_axis
(
0
,
bs
).
copy_
(
inputs
[
i
].
slice_first_axis
(
0
,
bs
),
stream_
);
}
ctx_
->
enqueue
(
bs
,
bindings_
.
data
(),
stream_
,
nullptr
);
for
(
int
i
:
out_idxes_
)
{
name_to_host_mem_
[
idx_to_name_
[
i
]].
slice_first_axis
(
0
,
bs
).
copy_
(
name_to_dev_mem_
[
idx_to_name_
[
i
]].
slice_first_axis
(
0
,
bs
),
stream_
);
}
checkCudaErrors
(
cudaStreamSynchronize
(
stream_
));
std
::
unordered_map
<
std
::
string
,
tv
::
Tensor
>
output_map
;
for
(
int
i
=
0
;
i
<
out_idxes_
.
size
();
++
i
)
{
auto
name
=
idx_to_name_
[
out_idxes_
[
i
]];
output_map
[
name
]
=
name_to_host_mem_
[
name
].
slice_first_axis
(
0
,
bs
);
}
return
output_map
;
}
std
::
unordered_map
<
std
::
string
,
tv
::
Tensor
>
operator
()(
std
::
unordered_map
<
std
::
string
,
tv
::
Tensor
>
inputs
)
{
std
::
vector
<
tv
::
Tensor
>
inputs_vec
(
inp_idxes_
.
size
());
int
count
=
0
;
for
(
auto
&
p
:
inputs
)
{
auto
iter
=
name_to_idx_
.
find
(
p
.
first
);
TV_ASSERT_INVALID_ARG
(
iter
!=
name_to_idx_
.
end
(),
"cant find your name"
,
p
.
first
);
inputs_vec
[
name_to_idx_
[
p
.
first
]]
=
p
.
second
;
}
TV_ASSERT_INVALID_ARG
(
count
==
inp_idxes_
.
size
(),
"your inp not enough"
);
return
(
*
this
)(
inputs_vec
);
}
tv
::
Tensor
operator
[](
std
::
string
name
)
{
auto
iter
=
name_to_host_mem_
.
find
(
name
);
if
(
iter
==
name_to_host_mem_
.
end
())
{
TV_THROW_INVALID_ARG
(
name
,
"not found."
);
}
return
iter
->
second
;
}
std
::
string
repr
()
{
std
::
stringstream
ss
;
ss
<<
"InferenceContext[gpu="
<<
device_
<<
"]"
;
ss
<<
"
\n
Inputs:"
;
std
::
string
name
;
for
(
auto
&
i
:
inp_idxes_
)
{
name
=
idx_to_name_
[
i
];
auto
&
mem
=
name_to_host_mem_
[
name
];
ss
<<
"
\n
"
<<
name
<<
"["
<<
tv
::
detail
::
typeString
(
mem
.
dtype
())
<<
"]: "
<<
mem
.
shape
();
}
ss
<<
"
\n
Outputs:"
;
for
(
auto
&
i
:
out_idxes_
)
{
name
=
idx_to_name_
[
i
];
auto
&
mem
=
name_to_host_mem_
[
name
];
ss
<<
"
\n
"
<<
name
<<
"["
<<
tv
::
detail
::
typeString
(
mem
.
dtype
())
<<
"]: "
<<
mem
.
shape
();
}
return
ss
.
str
();
}
private:
Logger
logger_
;
trt_unique_ptr_t
<
nvinfer1
::
ICudaEngine
>
engine_
;
trt_unique_ptr_t
<
nvinfer1
::
IExecutionContext
>
ctx_
;
std
::
unordered_map
<
std
::
string
,
tv
::
Tensor
>
name_to_dev_mem_
;
std
::
unordered_map
<
std
::
string
,
tv
::
Tensor
>
name_to_host_mem_
;
std
::
unordered_map
<
std
::
string
,
int
>
name_to_idx_
;
std
::
unordered_map
<
int
,
std
::
string
>
idx_to_name_
;
std
::
vector
<
int
>
inp_idxes_
;
std
::
vector
<
int
>
out_idxes_
;
std
::
vector
<
void
*>
bindings_
;
cudaStream_t
stream_
;
int
max_batch_size_
;
int
device_
;
};
}
// namespace trt
Prev
1
2
3
4
5
6
…
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment