Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
3517290c
"platforms/hip/include/HipArray.h" did not exist on "d0426ba9d6b04481f13945cc9f2b1f2eb51166ba"
Commit
3517290c
authored
Jul 09, 2020
by
yanyan
Browse files
format code, add benchmark per layer
parent
540a2209
Changes
29
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
470 additions
and
137 deletions
+470
-137
src/spconv/point2voxel.cu
src/spconv/point2voxel.cu
+63
-67
src/spconv/point2voxel_ops.cc
src/spconv/point2voxel_ops.cc
+10
-14
src/spconv/spconv_ops.cc
src/spconv/spconv_ops.cc
+11
-12
test/benchmark.py
test/benchmark.py
+80
-14
test/benchmark_detail.py
test/benchmark_detail.py
+199
-0
test/benchmark_points_to_voxel.py
test/benchmark_points_to_voxel.py
+15
-9
test/benchmark_points_to_voxel_gpu.py
test/benchmark_points_to_voxel_gpu.py
+90
-19
third_party/cutlass
third_party/cutlass
+1
-1
third_party/mp11
third_party/mp11
+1
-1
No files found.
src/spconv/point2voxel.cu
View file @
3517290c
...
@@ -10,9 +10,7 @@
...
@@ -10,9 +10,7 @@
namespace
spconv
{
namespace
spconv
{
void
scatter_point_to_grid_cuda
(
void
scatter_point_to_grid_cuda
(
torch
::
Tensor
points
,
torch
::
Tensor
indexes
,
torch
::
Tensor
points
,
torch
::
Tensor
indexes
,
torch
::
Tensor
grids
,
torch
::
Tensor
grids
,
torch
::
Tensor
numPointsPerGrid
,
torch
::
Tensor
numPointsPerGrid
,
torch
::
Tensor
pointIndex
,
torch
::
Tensor
pointIndex
,
...
@@ -27,27 +25,25 @@ void scatter_point_to_grid_cuda(
...
@@ -27,27 +25,25 @@ void scatter_point_to_grid_cuda(
constexpr
int
NDim
=
decltype
(
I
)
::
value
;
constexpr
int
NDim
=
decltype
(
I
)
::
value
;
tv
::
SimpleVector
<
Index
,
NDim
>
gs
(
gridShape
.
begin
(),
gridShape
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
gs
(
gridShape
.
begin
(),
gridShape
.
end
());
scatterPointToGridKernel
<
Index
,
NDim
>
scatterPointToGridKernel
<
Index
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
num_points
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
<<<
tv
::
cuda
::
getBlocks
(
num_points
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
0
,
stream
>>>
(
tv
::
torch2tv
<
float
>
(
points
),
stream
>>>
(
tv
::
torch2tv
<
float
>
(
points
),
tv
::
torch2tv
<
Index
>
(
indexes
),
tv
::
torch2tv
<
Index
>
(
indexes
),
tv
::
torch2tv
<
float
>
(
grids
),
tv
::
torch2tv
<
float
>
(
grids
),
tv
::
torch2tv
<
Index
>
(
numPointsPerGrid
),
tv
::
torch2tv
<
Index
>
(
numPointsPerGrid
),
tv
::
torch2tv
<
Index
>
(
pointIndex
),
tv
::
torch2tv
<
Index
>
(
pointIndex
),
gs
);
gs
);
TV_CHECK_CUDA_ERR_V2
(
"scatterPointToGridKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"scatterPointToGridKernel failed"
);
#ifdef TV_LOG_KERNEL_INFO
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes
attr
;
cudaFuncAttributes
attr
;
checkCudaErrors
(
cudaFuncGetAttributes
(
checkCudaErrors
(
&
attr
,
scatterPointToGridKernel
<
Index
,
NDim
>
));
cudaFuncGetAttributes
(
&
attr
,
scatterPointToGridKernel
<
Index
,
NDim
>
));
tv
::
ssprint
(
"scatterPointToGridKernel<"
,
tv
::
type_s
<
Index
>
,
NDim
,
tv
::
ssprint
(
"scatterPointToGridKernel<"
,
tv
::
type_s
<
Index
>
,
NDim
,
">"
,
">"
,
attr
.
numRegs
);
attr
.
numRegs
);
#endif
#endif
});
});
});
});
}
}
void
gather_point_from_grid_cuda
(
void
gather_point_from_grid_cuda
(
torch
::
Tensor
grids
,
torch
::
Tensor
grids
,
torch
::
Tensor
numPointsPerGrid
,
torch
::
Tensor
numPointsPerGrid
,
torch
::
Tensor
pointIndex
,
torch
::
Tensor
pointIndex
,
torch
::
Tensor
pointIndexUnique
,
torch
::
Tensor
pointIndexUnique
,
torch
::
Tensor
voxels
,
torch
::
Tensor
coors
,
torch
::
Tensor
voxels
,
torch
::
Tensor
coors
,
...
@@ -57,15 +53,17 @@ void gather_point_from_grid_cuda(
...
@@ -57,15 +53,17 @@ void gather_point_from_grid_cuda(
auto
num_voxel
=
voxels
.
size
(
0
);
auto
num_voxel
=
voxels
.
size
(
0
);
auto
num_max_points
=
pointIndex
.
size
(
0
)
-
1
;
auto
num_max_points
=
pointIndex
.
size
(
0
)
-
1
;
auto
grid_volume
=
grids
.
size
(
0
);
auto
grid_volume
=
grids
.
size
(
0
);
tv
::
dispatch_torch
<
int32_t
>
(
pointIndexUnique
.
scalar_type
(),
[
&
](
auto
IndexValue
)
{
tv
::
dispatch_torch
<
int32_t
>
(
pointIndexUnique
.
scalar_type
(),
[
&
](
auto
IndexValue
)
{
using
Index
=
decltype
(
IndexValue
);
using
Index
=
decltype
(
IndexValue
);
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
constexpr
int
NDim
=
decltype
(
I
)
::
value
;
constexpr
int
NDim
=
decltype
(
I
)
::
value
;
tv
::
SimpleVector
<
Index
,
NDim
>
gs
(
gridShape
.
begin
(),
gridShape
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
gs
(
gridShape
.
begin
(),
gridShape
.
end
());
resetPointIndexKernel
<
Index
>
resetPointIndexKernel
<
Index
>
<<<
tv
::
cuda
::
getBlocks
(
num_max_points
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
<<<
tv
::
cuda
::
getBlocks
(
num_max_points
),
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
pointIndex
),
grid_volume
);
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
pointIndex
),
grid_volume
);
TV_CHECK_CUDA_ERR_V2
(
"resetPointIndexKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"resetPointIndexKernel failed"
);
#ifdef TV_LOG_KERNEL_INFO
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes
attr0
;
cudaFuncAttributes
attr0
;
...
@@ -76,32 +74,30 @@ void gather_point_from_grid_cuda(
...
@@ -76,32 +74,30 @@ void gather_point_from_grid_cuda(
#endif
#endif
gatherPointFromGridKernel
<
Index
,
NDim
>
gatherPointFromGridKernel
<
Index
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
num_voxel
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
<<<
tv
::
cuda
::
getBlocks
(
num_voxel
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
0
,
stream
>>>
(
tv
::
torch2tv
<
float
>
(
grids
),
stream
>>>
(
tv
::
torch2tv
<
float
>
(
grids
),
tv
::
torch2tv
<
Index
>
(
numPointsPerGrid
),
tv
::
torch2tv
<
Index
>
(
numPointsPerGrid
),
tv
::
torch2tv
<
Index
>
(
pointIndexUnique
),
tv
::
torch2tv
<
Index
>
(
pointIndexUnique
),
tv
::
torch2tv
<
float
>
(
voxels
),
tv
::
torch2tv
<
float
>
(
voxels
),
tv
::
torch2tv
<
Index
>
(
coors
),
tv
::
torch2tv
<
Index
>
(
coors
),
gs
);
gs
);
TV_CHECK_CUDA_ERR_V2
(
"gatherPointFromGridKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"gatherPointFromGridKernel failed"
);
#ifdef TV_LOG_KERNEL_INFO
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes
attr1
;
cudaFuncAttributes
attr1
;
checkCudaErrors
(
cudaFuncGetAttributes
(
checkCudaErrors
(
cudaFuncGetAttributes
(
&
attr1
,
gatherPointFromGridKernel
<
Index
,
NDim
>
));
&
attr1
,
gatherPointFromGridKernel
<
Index
,
NDim
>
));
tv
::
ssprint
(
"gatherPointFromGridKernel<"
,
tv
::
type_s
<
Index
>
,
NDim
,
">"
,
tv
::
ssprint
(
"gatherPointFromGridKernel<"
,
tv
::
type_s
<
Index
>
,
NDim
,
attr1
.
numRegs
);
">"
,
attr1
.
numRegs
);
#endif
#endif
resetGridKernel
<
Index
>
resetGridKernel
<
Index
><<<
tv
::
cuda
::
getBlocks
(
num_voxel
),
<<<
tv
::
cuda
::
getBlocks
(
num_voxel
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
0
,
stream
>>>
(
tv
::
torch2tv
<
float
>
(
grids
),
tv
::
torch2tv
<
float
>
(
grids
),
tv
::
torch2tv
<
Index
>
(
numPointsPerGrid
),
tv
::
torch2tv
<
Index
>
(
numPointsPerGrid
),
tv
::
torch2tv
<
Index
>
(
pointIndexUnique
));
tv
::
torch2tv
<
Index
>
(
pointIndexUnique
));
TV_CHECK_CUDA_ERR_V2
(
"resetGridKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"resetGridKernel failed"
);
#ifdef TV_LOG_KERNEL_INFO
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes
attr2
;
cudaFuncAttributes
attr2
;
checkCudaErrors
(
cudaFuncGetAttributes
(
checkCudaErrors
(
&
attr2
,
resetGridKernel
<
Index
,
NDim
>
));
cudaFuncGetAttributes
(
&
attr2
,
resetGridKernel
<
Index
,
NDim
>
));
tv
::
ssprint
(
"resetGridKernel<"
,
tv
::
type_s
<
Index
>
,
NDim
,
">"
,
tv
::
ssprint
(
"resetGridKernel<"
,
tv
::
type_s
<
Index
>
,
NDim
,
">"
,
attr2
.
numRegs
);
attr2
.
numRegs
);
#endif
#endif
...
...
src/spconv/point2voxel_ops.cc
View file @
3517290c
...
@@ -3,23 +3,18 @@
...
@@ -3,23 +3,18 @@
namespace
spconv
{
namespace
spconv
{
int64_t
int64_t
pointsToVoxel
(
torch
::
Tensor
points
,
torch
::
Tensor
indexes
,
pointsToVoxel
(
torch
::
Tensor
points
,
torch
::
Tensor
pointIndex
,
torch
::
Tensor
grids
,
torch
::
Tensor
indexes
,
torch
::
Tensor
numPointsPerGrid
,
torch
::
Tensor
voxels
,
torch
::
Tensor
pointIndex
,
torch
::
Tensor
coors
,
std
::
vector
<
int64_t
>
gridShape
,
torch
::
Tensor
grids
,
torch
::
Tensor
numPointsPerGrid
,
torch
::
Tensor
voxels
,
torch
::
Tensor
coors
,
std
::
vector
<
int64_t
>
gridShape
,
const
int64_t
ndim
)
{
const
int64_t
ndim
)
{
if
(
points
.
device
().
type
()
==
torch
::
kCPU
)
{
if
(
points
.
device
().
type
()
==
torch
::
kCPU
)
{
TV_THROW_INVALID_ARG
(
"not support cpu currently"
);
TV_THROW_INVALID_ARG
(
"not support cpu currently"
);
}
}
#ifdef TV_CUDA
#ifdef TV_CUDA
else
if
(
points
.
device
().
type
()
==
torch
::
kCUDA
)
{
else
if
(
points
.
device
().
type
()
==
torch
::
kCUDA
)
{
scatter_point_to_grid_cuda
(
points
,
indexes
,
grids
,
scatter_point_to_grid_cuda
(
points
,
indexes
,
grids
,
numPointsPerGrid
,
numPointsPerGrid
,
pointIndex
,
gridShape
,
ndim
);
pointIndex
,
gridShape
,
ndim
);
}
}
#endif
#endif
else
{
else
{
...
@@ -33,8 +28,9 @@ pointsToVoxel(torch::Tensor points,
...
@@ -33,8 +28,9 @@ pointsToVoxel(torch::Tensor points,
}
}
#ifdef TV_CUDA
#ifdef TV_CUDA
else
if
(
points
.
device
().
type
()
==
torch
::
kCUDA
)
{
else
if
(
points
.
device
().
type
()
==
torch
::
kCUDA
)
{
gather_point_from_grid_cuda
(
grids
,
numPointsPerGrid
,
gather_point_from_grid_cuda
(
grids
,
numPointsPerGrid
,
pointIndex
,
pointIndex
,
pointIndexUnique
,
voxels
,
coors
,
gridShape
,
ndim
);
pointIndexUnique
,
voxels
,
coors
,
gridShape
,
ndim
);
}
}
#endif
#endif
else
{
else
{
...
...
src/spconv/spconv_ops.cc
View file @
3517290c
...
@@ -247,10 +247,10 @@ torch::Tensor indiceConvNative(torch::Tensor features, torch::Tensor filters,
...
@@ -247,10 +247,10 @@ torch::Tensor indiceConvNative(torch::Tensor features, torch::Tensor filters,
}
}
template
<
int
Algo
>
template
<
int
Algo
>
torch
::
Tensor
torch
::
Tensor
indiceConvFused
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
indiceConvFused
(
torch
::
Tensor
features
,
torch
::
Tensor
filte
rs
,
torch
::
Tensor
indicePai
rs
,
torch
::
Tensor
indice
Pairs
,
torch
::
Tensor
indiceNum
,
torch
::
Tensor
indice
Num
,
int64_t
numActOut
,
int64_t
numActOut
,
int64_t
_inverse
,
int64_t
_subM
)
{
int64_t
_inverse
,
int64_t
_subM
)
{
auto
kernelVolume
=
indiceNum
.
size
(
0
);
auto
kernelVolume
=
indiceNum
.
size
(
0
);
// auto timer = spconv::CudaContextTimer<>();
// auto timer = spconv::CudaContextTimer<>();
bool
subM
=
_subM
!=
0
;
bool
subM
=
_subM
!=
0
;
...
@@ -282,7 +282,8 @@ indiceConvFused(torch::Tensor features, torch::Tensor filters,
...
@@ -282,7 +282,8 @@ indiceConvFused(torch::Tensor features, torch::Tensor filters,
}
}
#ifdef TV_CUDA
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
else
if
(
device
==
torch
::
kCUDA
)
{
FusedConvDispatch
<
Algo
>::
fwd
(
output
,
features
,
filters
[
i
],
indicePairs
[
inverse
][
i
],
FusedConvDispatch
<
Algo
>::
fwd
(
output
,
features
,
filters
[
i
],
indicePairs
[
inverse
][
i
],
indicePairs
[
!
inverse
][
i
],
nHot
);
indicePairs
[
!
inverse
][
i
],
nHot
);
}
}
#endif
#endif
...
@@ -518,8 +519,7 @@ template <int Algo>
...
@@ -518,8 +519,7 @@ template <int Algo>
std
::
vector
<
torch
::
Tensor
>
std
::
vector
<
torch
::
Tensor
>
indiceConvBwFused
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
indiceConvBwFused
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
outGrad
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
outGrad
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
int64_t
_inverse
,
torch
::
Tensor
indiceNum
,
int64_t
_inverse
,
int64_t
_subM
)
{
int64_t
_subM
)
{
auto
kernelVolume
=
indiceNum
.
size
(
0
);
auto
kernelVolume
=
indiceNum
.
size
(
0
);
bool
subM
=
_subM
!=
0
;
bool
subM
=
_subM
!=
0
;
bool
inverse
=
_inverse
!=
0
;
bool
inverse
=
_inverse
!=
0
;
...
@@ -723,7 +723,6 @@ template <> struct ConvDispatch<kMinkowskiEngine> {
...
@@ -723,7 +723,6 @@ template <> struct ConvDispatch<kMinkowskiEngine> {
constexpr
static
auto
*
bwd
=
indiceConvBwFused
<
kFMinkowskiEngine
>
;
constexpr
static
auto
*
bwd
=
indiceConvBwFused
<
kFMinkowskiEngine
>
;
};
};
torch
::
Tensor
indiceConv
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indiceConv
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
int64_t
numActOut
,
int64_t
_inverse
,
int64_t
_subM
,
int64_t
numActOut
,
int64_t
_inverse
,
int64_t
_subM
,
...
...
test/benchmark.py
View file @
3517290c
...
@@ -26,38 +26,104 @@ class Net(nn.Module):
...
@@ -26,38 +26,104 @@ class Net(nn.Module):
def
__init__
(
self
,
shape
,
algo
):
def
__init__
(
self
,
shape
,
algo
):
super
().
__init__
()
super
().
__init__
()
self
.
net
=
spconv
.
SparseSequential
(
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SubMConv3d
(
3
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
3
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
spconv
.
SubMConv3d
(
64
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
),
algo
=
algo
),
spconv
.
SubMConv3d
(
64
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
),
# nn.BatchNorm1d(32),
# nn.BatchNorm1d(32),
# nn.ReLU(),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
64
,
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
64
,
spconv
.
SubMConv3d
(
96
,
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
,
algo
=
algo
),
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
96
,
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
,
algo
=
algo
),
# nn.BatchNorm1d(64),
# nn.BatchNorm1d(64),
# nn.ReLU(),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
96
,
128
,
3
,
bias
=
False
,
indice_key
=
"c2"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
96
,
spconv
.
SubMConv3d
(
128
,
128
,
3
,
bias
=
False
,
indice_key
=
"c2"
,
algo
=
algo
),
128
,
3
,
bias
=
False
,
indice_key
=
"c2"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
128
,
128
,
3
,
bias
=
False
,
indice_key
=
"c2"
,
algo
=
algo
),
# nn.BatchNorm1d(128),
# nn.BatchNorm1d(128),
# nn.ReLU(),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
128
,
160
,
3
,
bias
=
False
,
indice_key
=
"c3"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
128
,
spconv
.
SubMConv3d
(
160
,
160
,
3
,
bias
=
False
,
indice_key
=
"c3"
,
algo
=
algo
),
160
,
3
,
bias
=
False
,
indice_key
=
"c3"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
160
,
160
,
3
,
bias
=
False
,
indice_key
=
"c3"
,
algo
=
algo
),
# nn.BatchNorm1d(128),
# nn.BatchNorm1d(128),
# nn.ReLU(),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
160
,
192
,
3
,
bias
=
False
,
indice_key
=
"c4"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
160
,
spconv
.
SubMConv3d
(
192
,
192
,
3
,
bias
=
False
,
indice_key
=
"c4"
,
algo
=
algo
),
192
,
3
,
bias
=
False
,
indice_key
=
"c4"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
192
,
192
,
3
,
bias
=
False
,
indice_key
=
"c4"
,
algo
=
algo
),
# nn.BatchNorm1d(128),
# nn.BatchNorm1d(128),
# nn.ReLU(),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
192
,
224
,
3
,
bias
=
False
,
indice_key
=
"c5"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
192
,
spconv
.
SubMConv3d
(
224
,
224
,
3
,
bias
=
False
,
indice_key
=
"c5"
,
algo
=
algo
),
224
,
3
,
bias
=
False
,
indice_key
=
"c5"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
224
,
224
,
3
,
bias
=
False
,
indice_key
=
"c5"
,
algo
=
algo
),
# nn.BatchNorm1d(128),
# nn.BatchNorm1d(128),
# nn.ReLU(),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
224
,
256
,
3
,
bias
=
False
,
indice_key
=
"c6"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
224
,
spconv
.
SubMConv3d
(
256
,
256
,
3
,
bias
=
False
,
indice_key
=
"c6"
,
algo
=
algo
),
256
,
3
,
bias
=
False
,
indice_key
=
"c6"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
256
,
256
,
3
,
bias
=
False
,
indice_key
=
"c6"
,
algo
=
algo
),
)
)
max_batch_size
=
1
max_batch_size
=
1
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
...
...
test/benchmark_detail.py
0 → 100644
View file @
3517290c
import
time
from
pathlib
import
Path
import
numpy
as
np
import
torch
from
torch
import
nn
import
spconv
from
spconv.utils
import
VoxelGeneratorV2
def
waymo_data
(
batch_size
=
1
):
gen
=
VoxelGeneratorV2
([
0.1
,
0.1
,
0.1
],
[
-
80
,
-
80
,
-
2
,
80
,
80
,
6
],
1
,
150000
)
data
=
np
.
load
(
Path
(
__file__
).
parent
/
"data"
/
"benchmark-pc.npz"
)
pc
=
data
[
"pc"
]
data
=
gen
.
generate
(
pc
)
voxels
=
data
[
"voxels"
].
reshape
(
-
1
,
3
)
coors
=
data
[
"coordinates"
]
N
=
coors
.
shape
[
0
]
coors
=
np
.
concatenate
([
np
.
full
([
N
,
1
],
0
,
coors
.
dtype
),
coors
],
axis
=
1
)
return
voxels
,
coors
,
gen
.
grid_size
class
Net
(
nn
.
Module
):
def
__init__
(
self
,
shape
,
algo
):
super
().
__init__
()
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SubMConv3d
(
3
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
,
name
=
"subm-0-0"
),
spconv
.
SubMConv3d
(
64
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
,
name
=
"subm-0-1"
),
# nn.BatchNorm1d(32),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
,
name
=
"pool-0"
),
spconv
.
SubMConv3d
(
64
,
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
,
algo
=
algo
,
name
=
"subm-1-0"
),
spconv
.
SubMConv3d
(
96
,
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
,
algo
=
algo
,
name
=
"subm-1-1"
),
# nn.BatchNorm1d(64),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
,
name
=
"pool-1"
),
spconv
.
SubMConv3d
(
96
,
128
,
3
,
bias
=
False
,
indice_key
=
"c2"
,
algo
=
algo
,
name
=
"subm-2-0"
),
spconv
.
SubMConv3d
(
128
,
128
,
3
,
bias
=
False
,
indice_key
=
"c2"
,
algo
=
algo
,
name
=
"subm-2-1"
),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
,
name
=
"pool-2"
),
spconv
.
SubMConv3d
(
128
,
160
,
3
,
bias
=
False
,
indice_key
=
"c3"
,
algo
=
algo
,
name
=
"subm-3-0"
),
spconv
.
SubMConv3d
(
160
,
160
,
3
,
bias
=
False
,
indice_key
=
"c3"
,
algo
=
algo
,
name
=
"subm-3-1"
),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
,
name
=
"pool-3"
),
spconv
.
SubMConv3d
(
160
,
192
,
3
,
bias
=
False
,
indice_key
=
"c4"
,
algo
=
algo
,
name
=
"subm-4-0"
),
spconv
.
SubMConv3d
(
192
,
192
,
3
,
bias
=
False
,
indice_key
=
"c4"
,
algo
=
algo
,
name
=
"subm-4-1"
),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
,
name
=
"pool-4"
),
spconv
.
SubMConv3d
(
192
,
224
,
3
,
bias
=
False
,
indice_key
=
"c5"
,
algo
=
algo
,
name
=
"subm-5-0"
),
spconv
.
SubMConv3d
(
224
,
224
,
3
,
bias
=
False
,
indice_key
=
"c5"
,
algo
=
algo
,
name
=
"subm-5-1"
),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
,
name
=
"pool-5"
),
spconv
.
SubMConv3d
(
224
,
256
,
3
,
bias
=
False
,
indice_key
=
"c6"
,
algo
=
algo
,
name
=
"subm-6-0"
),
spconv
.
SubMConv3d
(
256
,
256
,
3
,
bias
=
False
,
indice_key
=
"c6"
,
algo
=
algo
,
name
=
"subm-6-1"
),
)
max_batch_size
=
1
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
self
.
grid
=
torch
.
full
([
max_batch_size
,
*
shape
],
-
1
,
dtype
=
torch
.
int32
).
cuda
()
# self.grid = None
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
,
self
.
grid
,
benchmark
=
True
)
return
self
.
net
(
x
)
def
main
():
dtype
=
torch
.
float32
voxels
,
coors
,
spatial_shape
=
waymo_data
()
voxels_th
=
torch
.
from_numpy
(
voxels
).
cuda
().
to
(
dtype
)
coors_th
=
torch
.
from_numpy
(
coors
).
cuda
().
int
()
algo
=
spconv
.
ConvAlgo
.
Minkowski
net
=
Net
(
spatial_shape
[::
-
1
],
algo
).
cuda
().
eval
().
to
(
dtype
)
print
(
coors_th
.
shape
)
out
=
net
(
voxels_th
,
coors_th
,
1
)
print
(
out
.
spatial_shape
)
times
=
[]
detail_bench
=
{}
detail_ind_gen_bench
=
{}
with
torch
.
no_grad
():
for
i
in
range
(
20
):
torch
.
cuda
.
synchronize
()
t
=
time
.
time
()
out
=
net
(
voxels_th
,
coors_th
,
1
)
for
k
,
v
in
out
.
benchmark_record
.
items
():
if
k
not
in
detail_bench
:
detail_bench
[
k
]
=
[]
detail_ind_gen_bench
[
k
]
=
[]
detail_bench
[
k
].
extend
(
v
[
"time"
])
detail_ind_gen_bench
[
k
].
extend
(
v
[
"indice_gen_time"
])
torch
.
cuda
.
synchronize
()
times
.
append
(
time
.
time
()
-
t
)
# print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t)
print
(
"spconv time"
,
np
.
mean
(
times
[
10
:]))
print
(
detail_bench
[
"subm-6-0"
])
print
(
detail_ind_gen_bench
[
"subm-6-0"
])
if
__name__
==
"__main__"
:
main
()
test/benchmark_points_to_voxel.py
View file @
3517290c
...
@@ -13,10 +13,14 @@ def waymo_data_gpu(batch_size=1):
...
@@ -13,10 +13,14 @@ def waymo_data_gpu(batch_size=1):
print
(
'gpu with total points available per voxel'
)
print
(
'gpu with total points available per voxel'
)
data
=
np
.
load
(
Path
(
__file__
).
parent
/
"data"
/
"benchmark-pc.npz"
)
data
=
np
.
load
(
Path
(
__file__
).
parent
/
"data"
/
"benchmark-pc.npz"
)
points
=
torch
.
from_numpy
(
data
[
'pc'
]).
cuda
().
float
()
points
=
torch
.
from_numpy
(
data
[
'pc'
]).
cuda
().
float
()
voxel_size
=
torch
.
Tensor
([
0.1
,
0.1
,
0.1
]).
to
(
points
.
dtype
).
to
(
points
.
device
)
voxel_size
=
torch
.
Tensor
([
0.1
,
0.1
,
coors_range
=
torch
.
Tensor
([
-
80
,
-
80
,
-
2
,
80
,
80
,
6
]).
to
(
points
.
dtype
).
to
(
points
.
device
)
0.1
]).
to
(
points
.
dtype
).
to
(
points
.
device
)
coors_range
=
torch
.
Tensor
([
-
80
,
-
80
,
-
2
,
80
,
80
,
gen
=
VoxelGeneratorV3
(
voxel_size
,
coors_range
,
max_points
=
200000
,
6
]).
to
(
points
.
dtype
).
to
(
points
.
device
)
gen
=
VoxelGeneratorV3
(
voxel_size
,
coors_range
,
max_points
=
200000
,
num_features
=
points
.
shape
[
1
],
num_features
=
points
.
shape
[
1
],
dtype
=
points
.
dtype
,
dtype
=
points
.
dtype
,
device
=
points
.
device
)
device
=
points
.
device
)
...
@@ -40,8 +44,8 @@ def waymo_data_gpu(batch_size=1):
...
@@ -40,8 +44,8 @@ def waymo_data_gpu(batch_size=1):
def
waymo_data_cpu
(
max_points_per_voxel
=
1
,
batch_size
=
1
):
def
waymo_data_cpu
(
max_points_per_voxel
=
1
,
batch_size
=
1
):
print
(
'cpu with %d max points per voxel'
%
max_points_per_voxel
)
print
(
'cpu with %d max points per voxel'
%
max_points_per_voxel
)
gen
=
VoxelGeneratorV2
([
0.1
,
0.1
,
0.1
],
[
-
80
,
-
80
,
-
2
,
80
,
80
,
6
],
max_points_per_voxel
,
gen
=
VoxelGeneratorV2
([
0.1
,
0.1
,
0.1
],
[
-
80
,
-
80
,
-
2
,
80
,
80
,
6
],
150000
)
max_points_per_voxel
,
150000
)
data
=
np
.
load
(
Path
(
__file__
).
parent
/
"data"
/
"benchmark-pc.npz"
)
data
=
np
.
load
(
Path
(
__file__
).
parent
/
"data"
/
"benchmark-pc.npz"
)
pc
=
data
[
"pc"
]
pc
=
data
[
"pc"
]
data
=
gen
.
generate
(
pc
)
data
=
gen
.
generate
(
pc
)
...
@@ -62,6 +66,7 @@ def waymo_data_cpu(max_points_per_voxel=1, batch_size=1):
...
@@ -62,6 +66,7 @@ def waymo_data_cpu(max_points_per_voxel=1, batch_size=1):
coors
=
np
.
concatenate
([
np
.
full
([
N
,
1
],
0
,
coors
.
dtype
),
coors
],
axis
=
1
)
coors
=
np
.
concatenate
([
np
.
full
([
N
,
1
],
0
,
coors
.
dtype
),
coors
],
axis
=
1
)
return
voxels
,
coors
,
gen
.
grid_size
return
voxels
,
coors
,
gen
.
grid_size
def
get_index
(
coor
,
grid_size
):
def
get_index
(
coor
,
grid_size
):
index
=
coor
[
0
]
index
=
coor
[
0
]
for
c
,
g
in
zip
(
coor
[
1
:],
grid_size
):
for
c
,
g
in
zip
(
coor
[
1
:],
grid_size
):
...
@@ -100,5 +105,6 @@ def main():
...
@@ -100,5 +105,6 @@ def main():
print
(
'Perfect GPU Voxelization!!!'
)
print
(
'Perfect GPU Voxelization!!!'
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
main
()
main
()
test/benchmark_points_to_voxel_gpu.py
View file @
3517290c
...
@@ -12,8 +12,10 @@ from spconv.utils import VoxelGeneratorV3
...
@@ -12,8 +12,10 @@ from spconv.utils import VoxelGeneratorV3
def
waymo_data
(
batch_size
=
1
):
def
waymo_data
(
batch_size
=
1
):
data
=
np
.
load
(
Path
(
__file__
).
parent
/
"data"
/
"benchmark-pc.npz"
)
data
=
np
.
load
(
Path
(
__file__
).
parent
/
"data"
/
"benchmark-pc.npz"
)
points
=
torch
.
from_numpy
(
data
[
'pc'
]).
cuda
().
float
()
points
=
torch
.
from_numpy
(
data
[
'pc'
]).
cuda
().
float
()
voxel_size
=
torch
.
Tensor
([
0.1
,
0.1
,
0.1
]).
to
(
points
.
dtype
).
to
(
points
.
device
)
voxel_size
=
torch
.
Tensor
([
0.1
,
0.1
,
coors_range
=
torch
.
Tensor
([
-
80
,
-
80
,
-
2
,
80
,
80
,
6
]).
to
(
points
.
dtype
).
to
(
points
.
device
)
0.1
]).
to
(
points
.
dtype
).
to
(
points
.
device
)
coors_range
=
torch
.
Tensor
([
-
80
,
-
80
,
-
2
,
80
,
80
,
6
]).
to
(
points
.
dtype
).
to
(
points
.
device
)
gen
=
VoxelGeneratorV3
(
voxel_size
,
coors_range
)
gen
=
VoxelGeneratorV3
(
voxel_size
,
coors_range
)
voxels
,
coors
=
gen
.
generate
(
points
)
voxels
,
coors
=
gen
.
generate
(
points
)
...
@@ -28,43 +30,111 @@ class Net(nn.Module):
...
@@ -28,43 +30,111 @@ class Net(nn.Module):
super
().
__init__
()
super
().
__init__
()
self
.
device
=
device
self
.
device
=
device
self
.
net
=
spconv
.
SparseSequential
(
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SubMConv3d
(
3
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
3
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
spconv
.
SubMConv3d
(
64
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
),
algo
=
algo
),
spconv
.
SubMConv3d
(
64
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
,
algo
=
algo
),
# nn.BatchNorm1d(32),
# nn.BatchNorm1d(32),
# nn.ReLU(),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
64
,
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
64
,
spconv
.
SubMConv3d
(
96
,
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
,
algo
=
algo
),
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
96
,
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
,
algo
=
algo
),
# nn.BatchNorm1d(64),
# nn.BatchNorm1d(64),
# nn.ReLU(),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
96
,
128
,
3
,
bias
=
False
,
indice_key
=
"c2"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
96
,
spconv
.
SubMConv3d
(
128
,
128
,
3
,
bias
=
False
,
indice_key
=
"c2"
,
algo
=
algo
),
128
,
3
,
bias
=
False
,
indice_key
=
"c2"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
128
,
128
,
3
,
bias
=
False
,
indice_key
=
"c2"
,
algo
=
algo
),
# nn.BatchNorm1d(128),
# nn.BatchNorm1d(128),
# nn.ReLU(),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
128
,
160
,
3
,
bias
=
False
,
indice_key
=
"c3"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
128
,
spconv
.
SubMConv3d
(
160
,
160
,
3
,
bias
=
False
,
indice_key
=
"c3"
,
algo
=
algo
),
160
,
3
,
bias
=
False
,
indice_key
=
"c3"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
160
,
160
,
3
,
bias
=
False
,
indice_key
=
"c3"
,
algo
=
algo
),
# nn.BatchNorm1d(128),
# nn.BatchNorm1d(128),
# nn.ReLU(),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
160
,
192
,
3
,
bias
=
False
,
indice_key
=
"c4"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
160
,
spconv
.
SubMConv3d
(
192
,
192
,
3
,
bias
=
False
,
indice_key
=
"c4"
,
algo
=
algo
),
192
,
3
,
bias
=
False
,
indice_key
=
"c4"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
192
,
192
,
3
,
bias
=
False
,
indice_key
=
"c4"
,
algo
=
algo
),
# nn.BatchNorm1d(128),
# nn.BatchNorm1d(128),
# nn.ReLU(),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
192
,
224
,
3
,
bias
=
False
,
indice_key
=
"c5"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
192
,
spconv
.
SubMConv3d
(
224
,
224
,
3
,
bias
=
False
,
indice_key
=
"c5"
,
algo
=
algo
),
224
,
3
,
bias
=
False
,
indice_key
=
"c5"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
224
,
224
,
3
,
bias
=
False
,
indice_key
=
"c5"
,
algo
=
algo
),
# nn.BatchNorm1d(128),
# nn.BatchNorm1d(128),
# nn.ReLU(),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
224
,
256
,
3
,
bias
=
False
,
indice_key
=
"c6"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
224
,
spconv
.
SubMConv3d
(
256
,
256
,
3
,
bias
=
False
,
indice_key
=
"c6"
,
algo
=
algo
),
256
,
3
,
bias
=
False
,
indice_key
=
"c6"
,
algo
=
algo
),
spconv
.
SubMConv3d
(
256
,
256
,
3
,
bias
=
False
,
indice_key
=
"c6"
,
algo
=
algo
),
)
)
max_batch_size
=
1
max_batch_size
=
1
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
self
.
grid
=
torch
.
full
([
max_batch_size
,
*
shape
],
-
1
,
self
.
grid
=
torch
.
full
([
max_batch_size
,
*
shape
],
dtype
=
torch
.
int32
,
device
=
self
.
device
)
-
1
,
dtype
=
torch
.
int32
,
device
=
self
.
device
)
# self.grid = None
# self.grid = None
self
.
shape
=
shape
self
.
shape
=
shape
...
@@ -78,7 +148,8 @@ def main():
...
@@ -78,7 +148,8 @@ def main():
voxels
,
coors
,
spatial_shape
=
waymo_data
()
voxels
,
coors
,
spatial_shape
=
waymo_data
()
voxels_th
,
coors_th
=
voxels
,
coors
voxels_th
,
coors_th
=
voxels
,
coors
algo
=
spconv
.
ConvAlgo
.
Native
algo
=
spconv
.
ConvAlgo
.
Native
net
=
Net
(
spatial_shape
[::
-
1
],
algo
,
voxels_th
.
device
).
cuda
(
device
=
voxels_th
.
device
).
eval
().
float
()
net
=
Net
(
spatial_shape
[::
-
1
],
algo
,
voxels_th
.
device
).
cuda
(
device
=
voxels_th
.
device
).
eval
().
float
()
print
(
coors_th
.
shape
)
print
(
coors_th
.
shape
)
out
=
net
(
voxels_th
,
coors_th
,
1
)
out
=
net
(
voxels_th
,
coors_th
,
1
)
print
(
out
.
spatial_shape
)
print
(
out
.
spatial_shape
)
...
...
cutlass
@
fd7e058d
Compare
86931fef
...
fd7e058d
Subproject commit
86931fef8538008a1a92036732b3eb7fe47b25d0
Subproject commit
fd7e058d0cb3e4bf743edc530c7778a210cb168b
mp11
@
29764aad
Compare
10ba80ac
...
29764aad
Subproject commit
10ba80acb91f138170b7a22bb86523cb07d6f942
Subproject commit
29764aad4881fde809af6a025c12012e47a55515
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment