Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
19e73bbe
Commit
19e73bbe
authored
May 20, 2020
by
Yan Yan
Browse files
format code with clang-format, better c++ code
parent
c336139f
Changes
77
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
905 additions
and
618 deletions
+905
-618
src/cuhash/main.cc
src/cuhash/main.cc
+40
-41
src/spconv/all.cc
src/spconv/all.cc
+17
-12
src/spconv/indice.cc
src/spconv/indice.cc
+47
-51
src/spconv/indice.cu
src/spconv/indice.cu
+293
-64
src/spconv/maxpool.cu
src/spconv/maxpool.cu
+35
-31
src/spconv/nms.cc
src/spconv/nms.cc
+2
-2
src/spconv/nms.cu
src/spconv/nms.cu
+17
-27
src/spconv/pillar_scatter.cu
src/spconv/pillar_scatter.cu
+5
-5
src/spconv/reordering.cc
src/spconv/reordering.cc
+15
-16
src/spconv/reordering.cu
src/spconv/reordering.cu
+22
-21
src/spconv/spconv_ops.cc
src/spconv/spconv_ops.cc
+6
-6
src/utils/all.cc
src/utils/all.cc
+1
-1
src/utils/nms.cu
src/utils/nms.cu
+46
-63
test/fake_dist_train.py
test/fake_dist_train.py
+35
-12
test/fake_train.py
test/fake_train.py
+29
-7
test/test_conv.py
test/test_conv.py
+294
-258
third_party/pybind11
third_party/pybind11
+1
-1
No files found.
src/cuhash/main.cc
View file @
19e73bbe
#include <cuhash/hash_table.h>
#include <cuda.h>
#include <cuhash/hash_table.h>
int
main
(){
auto
table
=
cuhash
::
HashTable
();
table
.
Initialize
(
10
,
2.0
);
const
int
N
=
10
;
// ハッシュテーブルに格納するデータ
int
keys
[
N
]
=
{
1
,
6
,
4
,
9
,
0
,
3
,
7
,
2
,
5
,
8
};
int
vals
[
N
]
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
// デバイスメモリにコピー
int
*
d_keys
,
*
d_vals
;
cudaMalloc
((
void
**
)
&
d_keys
,
sizeof
(
int
)
*
N
);
cudaMemcpy
(
d_keys
,
keys
,
sizeof
(
int
)
*
N
,
cudaMemcpyHostToDevice
);
cudaMalloc
((
void
**
)
&
d_vals
,
sizeof
(
int
)
*
N
);
cudaMemcpy
(
d_vals
,
vals
,
sizeof
(
int
)
*
N
,
cudaMemcpyHostToDevice
);
// ハッシュテーブルにクエリするデータ
int
input
[
N
]
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
int
output
[
N
];
// デバイスメモリにコピー
int
*
d_input
,
*
d_output
;
cudaMalloc
((
void
**
)
&
d_input
,
sizeof
(
int
)
*
N
);
cudaMemcpy
(
d_input
,
input
,
sizeof
(
int
)
*
N
,
cudaMemcpyHostToDevice
);
cudaMalloc
((
void
**
)
&
d_output
,
sizeof
(
int
)
*
N
);
cudaMemset
(
d_output
,
0
,
sizeof
(
int
)
*
N
);
bool
s
=
table
.
Build
(
N
,
(
const
unsigned
int
*
)
d_keys
,
(
const
unsigned
int
*
)
d_vals
);
std
::
cout
<<
s
<<
std
::
endl
;
table
.
Retrieve
(
N
,
(
const
unsigned
int
*
)
d_input
,
(
unsigned
int
*
)
d_output
);
std
::
cout
<<
s
<<
std
::
endl
;
cudaMemcpy
(
output
,
d_output
,
sizeof
(
int
)
*
N
,
cudaMemcpyDeviceToHost
);
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
printf
(
"%d
\n
"
,
output
[
i
]);
}
return
0
;
int
main
()
{
auto
table
=
cuhash
::
HashTable
();
table
.
Initialize
(
10
,
2.0
);
const
int
N
=
10
;
// ハッシュテーブルに格納するデータ
int
keys
[
N
]
=
{
1
,
6
,
4
,
9
,
0
,
3
,
7
,
2
,
5
,
8
};
int
vals
[
N
]
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
// デバイスメモリにコピー
int
*
d_keys
,
*
d_vals
;
cudaMalloc
((
void
**
)
&
d_keys
,
sizeof
(
int
)
*
N
);
cudaMemcpy
(
d_keys
,
keys
,
sizeof
(
int
)
*
N
,
cudaMemcpyHostToDevice
);
cudaMalloc
((
void
**
)
&
d_vals
,
sizeof
(
int
)
*
N
);
cudaMemcpy
(
d_vals
,
vals
,
sizeof
(
int
)
*
N
,
cudaMemcpyHostToDevice
);
// ハッシュテーブルにクエリするデータ
int
input
[
N
]
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
int
output
[
N
];
// デバイスメモリにコピー
int
*
d_input
,
*
d_output
;
cudaMalloc
((
void
**
)
&
d_input
,
sizeof
(
int
)
*
N
);
cudaMemcpy
(
d_input
,
input
,
sizeof
(
int
)
*
N
,
cudaMemcpyHostToDevice
);
cudaMalloc
((
void
**
)
&
d_output
,
sizeof
(
int
)
*
N
);
cudaMemset
(
d_output
,
0
,
sizeof
(
int
)
*
N
);
bool
s
=
table
.
Build
(
N
,
(
const
unsigned
int
*
)
d_keys
,
(
const
unsigned
int
*
)
d_vals
);
std
::
cout
<<
s
<<
std
::
endl
;
table
.
Retrieve
(
N
,
(
const
unsigned
int
*
)
d_input
,
(
unsigned
int
*
)
d_output
);
std
::
cout
<<
s
<<
std
::
endl
;
cudaMemcpy
(
output
,
d_output
,
sizeof
(
int
)
*
N
,
cudaMemcpyDeviceToHost
);
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
printf
(
"%d
\n
"
,
output
[
i
]);
}
return
0
;
}
\ No newline at end of file
src/spconv/all.cc
View file @
19e73bbe
// Copyright 2019 Yan Yan
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <torch/script.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <spconv/pillar_scatter_ops.h>
#include <spconv/fused_spconv_ops.h>
#include <spconv/nms_ops.h>
#include <spconv/pillar_scatter_ops.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <torch/script.h>
static
auto
registry
=
torch
::
RegisterOperators
()
.
op
(
"spconv::get_indice_pairs_2d"
,
&
spconv
::
getIndicePair
<
2
>
)
.
op
(
"spconv::get_indice_pairs_3d"
,
&
spconv
::
getIndicePair
<
3
>
)
.
op
(
"spconv::get_indice_pairs_4d"
,
&
spconv
::
getIndicePair
<
4
>
)
.
op
(
"spconv::get_indice_pairs_grid_2d"
,
&
spconv
::
getIndicePairPreGrid
<
2
>
)
.
op
(
"spconv::get_indice_pairs_grid_3d"
,
&
spconv
::
getIndicePairPreGrid
<
3
>
)
.
op
(
"spconv::get_indice_pairs_grid_2d"
,
&
spconv
::
getIndicePairPreGrid
<
2
>
)
.
op
(
"spconv::get_indice_pairs_grid_3d"
,
&
spconv
::
getIndicePairPreGrid
<
3
>
)
.
op
(
"spconv::indice_conv"
,
&
spconv
::
indiceConv
)
.
op
(
"spconv::indice_conv_backward"
,
&
spconv
::
indiceConvBackward
)
.
op
(
"spconv::fused_indice_conv_fp32"
,
&
spconv
::
fusedIndiceConvBatchNorm
<
float
>
)
.
op
(
"spconv::fused_indice_conv_half"
,
&
spconv
::
fusedIndiceConvBatchNorm
<
at
::
Half
>
)
.
op
(
"spconv::fused_indice_conv_fp32"
,
&
spconv
::
fusedIndiceConvBatchNorm
<
float
>
)
.
op
(
"spconv::fused_indice_conv_half"
,
&
spconv
::
fusedIndiceConvBatchNorm
<
at
::
Half
>
)
.
op
(
"spconv::indice_maxpool_fp32"
,
&
spconv
::
indiceMaxPool
<
float
>
)
.
op
(
"spconv::indice_maxpool_backward_fp32"
,
&
spconv
::
indiceMaxPoolBackward
<
float
>
)
...
...
@@ -38,4 +42,5 @@ static auto registry =
&
spconv
::
indiceMaxPoolBackward
<
at
::
Half
>
)
.
op
(
"spconv::nms"
,
&
spconv
::
nonMaxSuppression
<
float
>
)
.
op
(
"spconv::pillar_scatter_float"
,
&
spconv
::
pointPillarScatter
<
float
>
)
.
op
(
"spconv::pillar_scatter_half"
,
&
spconv
::
pointPillarScatter
<
at
::
Half
>
);
.
op
(
"spconv::pillar_scatter_half"
,
&
spconv
::
pointPillarScatter
<
at
::
Half
>
);
src/spconv/indice.cc
View file @
19e73bbe
// Copyright 2019 Yan Yan
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/Parallel.h>
#include <spconv/geometry.h>
#include <spconv/indice.h>
#include <spconv/spconv_ops.h>
#include <torch/script.h>
#include <ATen/Parallel.h>
namespace
spconv
{
...
...
@@ -45,7 +45,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
}
Index
numValidPoints
=
0
;
std
::
vector
<
Index
>
validPoints_
(
kernelVolume
*
(
NDim
+
1
));
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
pointPtr
=
nullptr
;
Index
hashval
;
tsl
::
robin_map
<
Index
,
Index
>
hash
;
...
...
@@ -67,7 +67,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
indicesOut
(
numAct
,
0
)
=
batchIdx
;
hashval
=
numAct
++
;
hash
[
index
]
=
hashval
;
}
else
{
}
else
{
hashval
=
iter
->
second
;
}
// indicePairs: [K, 2, L]
...
...
@@ -102,7 +102,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
}
Index
numValidPoints
=
0
;
std
::
vector
<
Index
>
validPoints_
(
kernelVolume
*
(
NDim
+
1
));
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
pointPtr
=
nullptr
;
Index
hashval
;
tsl
::
robin_map
<
Index
,
Index
>
hash
;
...
...
@@ -125,7 +125,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
indicesOut
(
numAct
,
0
)
=
batchIdx
;
hashval
=
numAct
++
;
hash
[
index
]
=
hashval
;
}
else
{
}
else
{
hashval
=
iter
->
second
;
}
// indicePairs: [K, 2, L]
...
...
@@ -136,7 +136,6 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
return
numAct
;
}
#ifndef TV_WINDOWS
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
Index
getIndicePairsSubM
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
...
...
@@ -145,7 +144,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv
::
TensorView
<
Index
>
indiceNum
,
const
Index
*
const
kernelSize
,
const
Index
*
const
stride
,
const
Index
*
const
padding
,
const
Index
*
dilation
,
const
Index
*
const
outSpatialShape
)
{
const
Index
*
dilation
,
const
Index
*
const
outSpatialShape
)
{
Index
numAct
=
0
;
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
batchIdx
=
0
;
...
...
@@ -167,12 +167,12 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
spatialVolume
*
indicesIn
(
j
,
0
);
hash
[
index
]
=
j
;
}
at
::
parallel_for
(
0
,
numActIn
,
0
,
[
&
](
int64_t
begin
,
int64_t
end
){
at
::
parallel_for
(
0
,
numActIn
,
0
,
[
&
](
int64_t
begin
,
int64_t
end
)
{
Index
index
=
0
;
Index
numValidPoints
=
0
;
std
::
vector
<
Index
>
validPoints_
(
kernelVolume
*
(
NDim
+
1
));
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
pointPtr
=
nullptr
;
Index
oldOffset
=
0
;
for
(
int
j
=
begin
;
j
<
end
;
++
j
)
{
...
...
@@ -186,7 +186,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
spatialVolume
*
indicesIn
(
j
,
0
);
auto
iter
=
hash
.
find
(
index
);
if
(
iter
!=
hash
.
end
())
{
#pragma omp atomic capture
#pragma omp atomic capture
oldOffset
=
indiceNum
[
offset
]
++
;
indicePairs
(
offset
,
0
,
oldOffset
)
=
j
;
indicePairs
(
offset
,
1
,
oldOffset
)
=
iter
->
second
;
...
...
@@ -196,7 +196,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
});
return
numActIn
;
}
#else
#else
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
Index
getIndicePairsSubM
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
...
...
@@ -204,7 +204,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv
::
TensorView
<
Index
>
indiceNum
,
const
Index
*
const
kernelSize
,
const
Index
*
const
stride
,
const
Index
*
const
padding
,
const
Index
*
dilation
,
const
Index
*
const
outSpatialShape
)
{
const
Index
*
dilation
,
const
Index
*
const
outSpatialShape
)
{
Index
numAct
=
0
;
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
batchIdx
=
0
;
...
...
@@ -221,7 +222,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
Index
numValidPoints
=
0
;
// Index validPoints[kernelVolume * (NDim + 1)];
std
::
vector
<
Index
>
validPoints_
(
kernelVolume
*
(
NDim
+
1
));
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
pointPtr
=
nullptr
;
tsl
::
robin_map
<
Index
,
Index
>
hash
;
for
(
int
j
=
0
;
j
<
numActIn
;
++
j
)
{
...
...
@@ -255,57 +256,53 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
namespace
functor
{
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
struct
CreateConvIndicePairFunctor
<
tv
::
CPU
,
Index
,
IndexGrid
,
NDim
>
{
Index
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
Index
>
indicesOut
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
kernelSize
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
stride
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
padding
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
dilation
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
)
{
Index
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
Index
>
indicesOut
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
kernelSize
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
stride
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
padding
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
dilation
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
)
{
if
(
transpose
)
return
getIndicePairsDeConv
<
Index
,
IndexGrid
,
NDim
>
(
indicesIn
,
indicesOut
,
gridsOut
,
indicePairs
,
indiceNum
,
indicesIn
,
indicesOut
,
gridsOut
,
indicePairs
,
indiceNum
,
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
outSpatialShape
.
data
());
else
return
getIndicePairsConv
<
Index
,
IndexGrid
,
NDim
>
(
indicesIn
,
indicesOut
,
gridsOut
,
indicePairs
,
indiceNum
,
indicesIn
,
indicesOut
,
gridsOut
,
indicePairs
,
indiceNum
,
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
outSpatialShape
.
data
());
}
};
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
struct
CreateSubMIndicePairFunctor
<
tv
::
CPU
,
Index
,
IndexGrid
,
NDim
>
{
Index
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
kernelSize
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
stride
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
padding
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
dilation
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
)
{
Index
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
kernelSize
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
stride
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
padding
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
dilation
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
)
{
return
getIndicePairsSubM
<
Index
,
IndexGrid
,
NDim
>
(
indicesIn
,
gridsOut
,
indicePairs
,
indiceNum
,
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
outSpatialShape
.
data
());
indicesIn
,
gridsOut
,
indicePairs
,
indiceNum
,
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
outSpatialShape
.
data
());
}
};
}
// namespace functor
#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int,
NDIM>;
\
template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int,
\
NDIM>;
template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, \
NDIM>;
\
template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
NDIM>;
#define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
...
...
@@ -320,4 +317,3 @@ DECLARE_CPU_INDEX(long);
#undef DECLARE_CPU_SPECS_INDEX_NDIM
}
// namespace spconv
src/spconv/indice.cu
View file @
19e73bbe
...
...
@@ -14,17 +14,240 @@
#include <ATen/ATen.h>
#include <chrono>
#include <cuhash/hash_table.h>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/indice.h>
#include <spconv/indice.cu.h>
#include <tensorview/helper_launch.h>
#include <spconv/indice.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/torch_utils.h>
#include <tensorview/tensor.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
#include <cuhash/hash_table.h>
namespace
spconv
{
int
create_conv_indice_pair_p1_cuda
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
torch
::
Tensor
indicePairUnique
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
)
{
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
auto
ndim
=
kernelSize
.
size
();
auto
numActIn
=
indicesIn
.
size
(
0
);
if
(
numActIn
==
0
)
return
0
;
// dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8.
tv
::
dispatch_torch
<
int32_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
V
)
{
using
Index
=
decltype
(
V
);
using
IndexGrid
=
int32_t
;
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
constexpr
int
NDim
=
I
;
tv
::
SimpleVector
<
Index
,
NDim
>
ks
(
kernelSize
.
begin
(),
kernelSize
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
st
(
stride
.
begin
(),
stride
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
pa
(
padding
.
begin
(),
padding
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
di
(
dilation
.
begin
(),
dilation
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
ou
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
if
(
transpose
)
{
prepareDeConvIndicePairsKernel
<
Index
,
NDim
,
4096
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ks
,
st
,
pa
,
di
,
ou
);
}
else
{
prepareIndicePairsKernel
<
Index
,
NDim
,
4096
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ks
,
st
,
pa
,
di
,
ou
);
}
});
});
return
1
;
}
int
create_conv_indice_pair_p2_cuda
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicesOut
,
torch
::
Tensor
gridsOut
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
torch
::
Tensor
indicePairUnique
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
)
{
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
auto
ndim
=
outSpatialShape
.
size
();
auto
numActIn
=
indicesIn
.
size
(
0
);
int
batchSize
=
gridsOut
.
size
(
0
);
int
numAct
=
indicePairUnique
.
size
(
0
)
-
1
;
auto
kernelVolume
=
indicePairs
.
size
(
0
);
if
(
numActIn
==
0
)
return
0
;
// dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8.
tv
::
dispatch_torch
<
int32_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
V
)
{
using
Index
=
decltype
(
V
);
using
IndexGrid
=
int32_t
;
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
constexpr
int
NDim
=
I
;
using
IndexGrid
=
int32_t
;
tv
::
SimpleVector
<
Index
,
NDim
>
ou
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
if
(
useHash
)
{
auto
table
=
cuhash
::
HashTable
();
// std::cout << "create " << numAct << " size table..." << std::endl;
table
.
Initialize
(
numAct
,
2.0
,
4
);
unsigned
*
d_values
=
nullptr
;
cudaMalloc
((
void
**
)
&
d_values
,
sizeof
(
unsigned
)
*
numAct
);
TV_CHECK_CUDA_ERR_V2
(
"cudaMalloc failed"
);
arangeKernel
<
unsigned
>
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
d_values
,
numAct
);
TV_CHECK_CUDA_ERR_V2
(
"arangeKernel failed"
);
bool
res
=
table
.
Build
(
numAct
,
reinterpret_cast
<
unsigned
*>
(
tv
::
torch2tv
<
Index
>
(
indicePairUnique
).
data
()),
d_values
);
cudaFree
(
d_values
);
TV_CHECK_CUDA_ERR_V2
(
"cudaFree failed"
);
if
(
!
res
)
{
return
-
1
;
// use -1 to tell outside use CPU implementation
}
assignIndiceOutKernel
<
Index
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesOut
),
numAct
,
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ou
,
batchSize
);
auto
tableSize
=
table
.
get_table_size
();
auto
tableData
=
table
.
data
();
auto
constants
=
table
.
get_constants_4
();
auto
stash_constants
=
table
.
get_stash_constants
();
auto
stash_count
=
table
.
get_stash_count
();
assignIndicePairsHashKernel
<
Index
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesOut
),
numActIn
,
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
}
else
{
assignGridAndIndiceOutKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesOut
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
numAct
,
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ou
,
batchSize
);
TV_CHECK_CUDA_ERR_V2
(
"assignGridAndIndiceOutKernel failed"
);
assignIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesOut
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
numAct
,
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ou
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndicePairsKernel failed"
);
}
if
(
resetGrid
&&
(
!
useHash
))
{
resetGridKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
indicePairUnique
.
data_ptr
<
Index
>
(),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
numAct
);
TV_CHECK_CUDA_ERR_V2
(
"resetGridKernel failed"
);
}
});
});
return
numAct
;
}
int
create_submconv_indice_pair_cuda
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
gridsOut
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
)
{
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
auto
ndim
=
outSpatialShape
.
size
();
auto
numActIn
=
indicesIn
.
size
(
0
);
int
batchSize
=
gridsOut
.
size
(
0
);
auto
kernelVolume
=
indicePairs
.
size
(
0
);
if
(
numActIn
==
0
)
return
0
;
tv
::
dispatch_torch
<
int32_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
V
)
{
using
Index
=
decltype
(
V
);
using
IndexGrid
=
int32_t
;
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
constexpr
int
NDim
=
I
;
tv
::
SimpleVector
<
Index
,
NDim
>
ks
(
kernelSize
.
begin
(),
kernelSize
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
st
(
stride
.
begin
(),
stride
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
pa
(
padding
.
begin
(),
padding
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
di
(
dilation
.
begin
(),
dilation
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
ou
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
if
(
useHash
)
{
auto
table
=
cuhash
::
HashTable
();
// std::cout << "create " << numAct << " size table..." << std::endl;
table
.
Initialize
(
numActIn
,
2.0
,
4
);
unsigned
*
d_keyvalues
=
nullptr
;
cudaMalloc
((
void
**
)
&
d_keyvalues
,
sizeof
(
unsigned
)
*
numActIn
*
2
);
unsigned
*
d_values
=
d_keyvalues
+
numActIn
;
TV_CHECK_CUDA_ERR_V2
(
"cudaMalloc failed"
);
prepareSubMHashKernel
<
Index
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
d_keyvalues
,
d_values
,
ou
);
TV_CHECK_CUDA_ERR_V2
(
"prepareSubMHashKernel failed"
);
bool
res
=
table
.
Build
(
numActIn
,
reinterpret_cast
<
unsigned
*>
(
d_keyvalues
),
reinterpret_cast
<
unsigned
*>
(
d_values
));
cudaFree
(
d_values
);
TV_CHECK_CUDA_ERR_V2
(
"cudaFree failed"
);
if
(
!
res
)
{
return
-
1
;
// use -1 to tell outside use CPU implementation
}
auto
tableSize
=
table
.
get_table_size
();
auto
tableData
=
table
.
data
();
auto
constants
=
table
.
get_constants_4
();
auto
stash_constants
=
table
.
get_stash_constants
();
auto
stash_count
=
table
.
get_stash_count
();
getSubMIndicePairsHashKernel
<
Index
,
NDim
,
4096
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ks
,
st
,
pa
,
di
,
ou
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
}
else
{
prepareSubMGridKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
ou
);
TV_CHECK_CUDA_ERR_V2
(
"prepareSubMGridKernel failed"
);
getSubMIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
,
4096
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ks
,
st
,
pa
,
di
,
ou
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndicePairsKernel failed"
);
}
if
(
resetGrid
&&
(
!
useHash
))
{
resetGridSubMKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
indicesIn
.
data_ptr
<
Index
>
(),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
ou
,
numActIn
);
TV_CHECK_CUDA_ERR_V2
(
"resetGridKernel failed"
);
}
});
});
return
numActIn
;
}
namespace
functor
{
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
struct
CreateConvIndicePairFunctorP1
<
tv
::
GPU
,
Index
,
IndexGrid
,
NDim
>
{
...
...
@@ -46,17 +269,17 @@ struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
return
0
;
// auto timer = spconv::CudaContextTimer<>();
if
(
transpose
)
prepareDeConvIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
,
4096
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
indice
sOut
,
gridsOut
,
indice
Pairs
,
indiceNum
,
indicePairUnique
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
);
prepareDeConvIndicePairsKernel
<
Index
,
NDim
,
4096
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
indice
Pairs
,
indice
Num
,
indicePairUnique
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
);
else
prepareIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
,
4096
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
indice
sOut
,
gridsOut
,
indice
Pairs
,
indiceNum
,
indicePairUnique
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
);
prepareIndicePairsKernel
<
Index
,
NDim
,
4096
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
indice
Pairs
,
indice
Num
,
indicePairUnique
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
);
TV_CHECK_CUDA_ERR
();
// std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
return
1
;
...
...
@@ -78,57 +301,58 @@ struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
auto
numActIn
=
indicesIn
.
dim
(
0
);
if
(
numActIn
==
0
)
return
0
;
// after unique, there is a std::numeric_limits<int>::max() in the end of indicePairUnique
Index
numAct
=
indicePairUnique
.
dim
(
0
)
-
1
;
if
(
useHash
){
// after unique, there is a std::numeric_limits<int>::max() in the end of
// indicePairUnique
Index
numAct
=
indicePairUnique
.
dim
(
0
)
-
1
;
if
(
useHash
)
{
auto
table
=
cuhash
::
HashTable
();
// std::cout << "create " << numAct << " size table..." << std::endl;
table
.
Initialize
(
numAct
,
2.0
,
4
);
unsigned
*
d_values
=
nullptr
;
cudaMalloc
((
void
**
)
&
d_values
,
sizeof
(
unsigned
)
*
numAct
);
cudaMalloc
((
void
**
)
&
d_values
,
sizeof
(
unsigned
)
*
numAct
);
TV_CHECK_CUDA_ERR_V2
(
"cudaMalloc failed"
);
arangeKernel
<
unsigned
><<<
tv
::
launch
::
getBlocks
(
numAct
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
d_values
,
numAct
);
bool
res
=
table
.
Build
(
numAct
,
reinterpret_cast
<
unsigned
*>
(
indicePairUnique
.
data
()),
d_values
);
arangeKernel
<
unsigned
>
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
d_values
,
numAct
);
bool
res
=
table
.
Build
(
numAct
,
reinterpret_cast
<
unsigned
*>
(
indicePairUnique
.
data
()),
d_values
);
cudaFree
(
d_values
);
if
(
!
res
){
return
-
1
;
//use -1 to tell outside use CPU implementation
if
(
!
res
)
{
return
-
1
;
//
use -1 to tell outside use CPU implementation
}
assignIndiceOutKernel
<
Index
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numAct
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesOut
,
numAct
,
indicePairUnique
,
outSpatialShape
,
batchSize
);
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesOut
,
numAct
,
indicePairUnique
,
outSpatialShape
,
batchSize
);
TV_CHECK_CUDA_ERR_V2
(
"assignGridAndIndiceOutKernel failed"
);
auto
tableSize
=
table
.
get_table_size
();
auto
tableData
=
table
.
data
();
auto
constants
=
table
.
get_constants_4
();
auto
stash_constants
=
table
.
get_stash_constants
();
auto
stash_count
=
table
.
get_stash_count
();
assignIndicePairsHashKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesOut
,
numActIn
,
indicePairs
,
indicePairUnique
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
assignIndicePairsHashKernel
<
Index
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesOut
,
numActIn
,
indicePairs
,
indicePairUnique
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndicePairsKernel failed"
);
}
else
{
}
else
{
assignGridAndIndiceOutKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numAct
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesOut
,
gridsOut
,
numAct
,
indicePairs
,
indicePairUnique
,
outSpatialShape
,
batchSize
);
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesOut
,
gridsOut
,
numAct
,
indicePairs
,
indicePairUnique
,
outSpatialShape
,
batchSize
);
TV_CHECK_CUDA_ERR_V2
(
"assignGridAndIndiceOutKernel failed"
);
assignIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesOut
,
gridsOut
,
numActIn
,
indicePairs
,
indicePairUnique
,
outSpatialShape
);
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesOut
,
gridsOut
,
numActIn
,
indicePairs
,
indicePairUnique
,
outSpatialShape
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndicePairsKernel failed"
);
}
if
(
resetGrid
&&
(
!
useHash
))
{
resetGridKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numAct
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicePairUnique
.
data
(),
gridsOut
,
numAct
);
TV_CHECK_CUDA_ERR_V2
(
"resetGridKernel failed"
);
}
...
...
@@ -152,22 +376,25 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
if
(
numActIn
==
0
)
return
0
;
// auto timer = spconv::CudaContextTimer<>();
if
(
useHash
){
if
(
useHash
)
{
auto
table
=
cuhash
::
HashTable
();
// std::cout << "subm create " << numActIn << " size table..." << std::endl;
// std::cout << "subm create " << numActIn << " size table..." <<
// std::endl;
table
.
Initialize
(
numActIn
,
2.0
,
4
);
unsigned
*
d_keyvalues
=
nullptr
;
cudaMalloc
((
void
**
)
&
d_keyvalues
,
sizeof
(
unsigned
)
*
numActIn
*
2
);
cudaMalloc
((
void
**
)
&
d_keyvalues
,
sizeof
(
unsigned
)
*
numActIn
*
2
);
unsigned
*
d_values
=
d_keyvalues
+
numActIn
;
prepareSubMHashKernel
<
Index
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
d_keyvalues
,
d_values
,
outSpatialShape
);
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
d_keyvalues
,
d_values
,
outSpatialShape
);
TV_CHECK_CUDA_ERR_V2
(
"prepareSubMHashKernel failed"
);
bool
res
=
table
.
Build
(
numActIn
,
reinterpret_cast
<
unsigned
*>
(
d_keyvalues
),
reinterpret_cast
<
unsigned
*>
(
d_values
));
bool
res
=
table
.
Build
(
numActIn
,
reinterpret_cast
<
unsigned
*>
(
d_keyvalues
),
reinterpret_cast
<
unsigned
*>
(
d_values
));
cudaFree
(
d_keyvalues
);
if
(
!
res
){
return
-
1
;
//use -1 to tell outside use CPU implementation
if
(
!
res
)
{
return
-
1
;
//
use -1 to tell outside use CPU implementation
}
auto
tableSize
=
table
.
get_table_size
();
auto
tableData
=
table
.
data
();
...
...
@@ -175,28 +402,30 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
auto
stash_constants
=
table
.
get_stash_constants
();
auto
stash_count
=
table
.
get_stash_count
();
getSubMIndicePairsHashKernel
<
Index
,
NDim
,
4096
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
indicePairs
,
indiceNum
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
indicePairs
,
indiceNum
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
TV_CHECK_CUDA_ERR_V2
(
"getSubMIndicePairsHashKernel failed"
);
}
else
{
}
else
{
prepareSubMGridKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
gridsOut
,
outSpatialShape
);
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
gridsOut
,
outSpatialShape
);
TV_CHECK_CUDA_ERR
();
getSubMIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
,
4096
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
gridsOut
,
indicePairs
,
indiceNum
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
);
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
gridsOut
,
indicePairs
,
indiceNum
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
);
TV_CHECK_CUDA_ERR
();
}
// std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
if
(
resetGrid
&&
(
!
useHash
))
{
resetGridSubMKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
.
data
(),
gridsOut
,
outSpatialShape
,
numActIn
);
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
.
data
(),
gridsOut
,
outSpatialShape
,
numActIn
);
TV_CHECK_CUDA_ERR
();
}
return
numActIn
;
...
...
src/spconv/maxpool.cu
View file @
19e73bbe
...
...
@@ -16,9 +16,9 @@
#include <chrono>
#include <limits>
#include <spconv/maxpool.h>
#include <
spconv/mp_helper
.h>
#include <tensorview/
helper_kernel.cu
.h>
#include <tensorview/helper
_launch
.h>
#include <
tensorview/cuda_utils
.h>
#include <tensorview/
kernel_utils
.h>
#include <tensorview/
mp_
helper.h>
#include <tensorview/tensorview.h>
#include <type_traits>
...
...
@@ -255,7 +255,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
reinterpret_cast
<
const
VecType
*>
(
inFeatures
)[
idxi
];
reinterpret_cast
<
VecType
*>
(
bufdo
)[
0
]
=
reinterpret_cast
<
const
VecType
*>
(
dout
)[
idxo
];
reinterpret_cast
<
VecType
*>
(
bufdi
)[
0
]
=
reinterpret_cast
<
VecType
*>
(
din
)[
idxi
];
reinterpret_cast
<
VecType
*>
(
bufdi
)[
0
]
=
reinterpret_cast
<
VecType
*>
(
din
)[
idxi
];
#pragma unroll
for
(
int
i
=
0
;
i
<
vecloadFactor
;
i
++
)
{
...
...
@@ -263,7 +264,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
bufdi
[
i
]
+=
bufdo
[
i
];
}
}
reinterpret_cast
<
VecType
*>
(
din
)[
idxi
]
=
reinterpret_cast
<
VecType
*>
(
bufdi
)[
0
];
reinterpret_cast
<
VecType
*>
(
din
)[
idxi
]
=
reinterpret_cast
<
VecType
*>
(
bufdi
)[
0
];
}
}
}
...
...
@@ -309,7 +311,7 @@ template <typename T, typename Index>
struct
SparseMaxPoolForwardFunctor
<
tv
::
GPU
,
T
,
Index
>
{
using
vecload_type_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
using
kernel_block_t
=
mp_list_c
<
int
,
64
,
32
,
16
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
>
;
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
T
>
outFeatures
,
tv
::
TensorView
<
const
T
>
inFeatures
,
tv
::
TensorView
<
const
Index
>
indices
,
int
size
)
{
...
...
@@ -318,21 +320,22 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
int
numPlanes
=
inFeatures
.
dim
(
1
);
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
mp_for_each
<
kernel_block_t
>
([
=
,
&
outFeatures
,
&
inFeatures
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
outFeatures
,
&
inFeatures
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
constexpr
int
NumILP
=
NumTLP
/
4
;
int
numHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
if
(
notFound
)
{
if
(
numPlanes
%
NumTLP
==
0
)
{
if
(
numHotBlock
>=
NumTLP
)
{
maxPoolFwdVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
maxPoolFwdVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
<<<
dim3
(
std
::
min
(
size
/
NumTLP
,
512
),
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
indices
.
subview
(
0
).
data
(),
indices
.
subview
(
1
).
data
(),
numHotBlock
,
numPlanes
/
vecloadFactor
);
indices
.
subview
(
0
).
data
(),
indices
.
subview
(
1
).
data
(),
numHotBlock
,
numPlanes
/
vecloadFactor
);
TV_CHECK_CUDA_ERR
();
}
...
...
@@ -340,9 +343,9 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
maxPoolFwdGenericKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
>
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
indices
.
subview
(
0
).
data
()
+
numHotBlock
,
indices
.
subview
(
1
).
data
()
+
numHotBlock
,
size
-
numHotBlock
,
numPlanes
);
indices
.
subview
(
0
).
data
()
+
numHotBlock
,
indices
.
subview
(
1
).
data
()
+
numHotBlock
,
size
-
numHotBlock
,
numPlanes
);
TV_CHECK_CUDA_ERR
();
}
notFound
=
false
;
...
...
@@ -356,7 +359,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
int
numHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
if
(
numHotBlock
>=
NumTLP
)
{
maxPoolFwdGenericBlockKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
size
/
NumTLP
,
tv
::
launch
::
DivUp
(
numPlanes
,
NumTLP
)),
<<<
dim3
(
size
/
NumTLP
,
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
indices
.
subview
(
0
).
data
(),
indices
.
subview
(
1
).
data
(),
...
...
@@ -366,7 +369,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
if
(
size
>
numHotBlock
)
{
maxPoolFwdGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
1
,
tv
::
launch
::
DivUp
(
numPlanes
,
NumTLP
)),
<<<
dim3
(
1
,
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
indices
.
subview
(
0
).
data
()
+
numHotBlock
,
...
...
@@ -382,7 +385,7 @@ template <typename T, typename Index>
struct
SparseMaxPoolBackwardFunctor
<
tv
::
GPU
,
T
,
Index
>
{
using
vecload_type_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
using
kernel_block_t
=
mp_list_c
<
int
,
64
,
32
,
16
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
>
;
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
const
T
>
outFeatures
,
tv
::
TensorView
<
const
T
>
inFeatures
,
tv
::
TensorView
<
const
T
>
dout
,
tv
::
TensorView
<
T
>
din
,
...
...
@@ -392,22 +395,23 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
int
numPlanes
=
inFeatures
.
dim
(
1
);
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
mp_for_each
<
kernel_block_t
>
([
=
,
&
outFeatures
,
&
inFeatures
,
&
dout
,
&
din
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
outFeatures
,
&
inFeatures
,
&
dout
,
&
din
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
constexpr
int
NumILP
=
NumTLP
/
4
;
int
numHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
if
(
notFound
)
{
if
(
numPlanes
%
NumTLP
==
0
)
{
if
(
numHotBlock
>=
NumTLP
)
{
maxPoolBwdVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
maxPoolBwdVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
<<<
dim3
(
std
::
min
(
size
/
NumTLP
,
512
),
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
dout
.
data
(),
din
.
data
(),
indices
.
subview
(
0
).
data
(),
indices
.
subview
(
1
).
data
(),
numHotBlock
,
numPlanes
/
vecloadFactor
);
dout
.
data
(),
din
.
data
(),
indices
.
subview
(
0
).
data
(),
indices
.
subview
(
1
).
data
(),
numHotBlock
,
numPlanes
/
vecloadFactor
);
TV_CHECK_CUDA_ERR
();
}
...
...
@@ -415,10 +419,10 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
maxPoolBwdGenericKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
>
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
dout
.
data
(),
din
.
data
(),
indices
.
subview
(
0
).
data
()
+
numHotBlock
,
indices
.
subview
(
1
).
data
()
+
numHotBlock
,
size
-
numHotBlock
,
numPlanes
);
dout
.
data
(),
din
.
data
(),
indices
.
subview
(
0
).
data
()
+
numHotBlock
,
indices
.
subview
(
1
).
data
()
+
numHotBlock
,
size
-
numHotBlock
,
numPlanes
);
TV_CHECK_CUDA_ERR
();
}
notFound
=
false
;
...
...
@@ -432,7 +436,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
int
numHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
if
(
numHotBlock
>=
NumTLP
)
{
maxPoolBwdGenericBlockKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
size
/
NumTLP
,
tv
::
launch
::
DivUp
(
numPlanes
,
NumTLP
)),
<<<
dim3
(
size
/
NumTLP
,
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
dout
.
data
(),
din
.
data
(),
indices
.
subview
(
0
).
data
(),
indices
.
subview
(
1
).
data
(),
...
...
@@ -442,7 +446,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
if
(
size
>
numHotBlock
)
{
maxPoolBwdGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
1
,
tv
::
launch
::
DivUp
(
numPlanes
,
NumTLP
)),
<<<
dim3
(
1
,
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
dout
.
data
(),
din
.
data
(),
indices
.
subview
(
0
).
data
()
+
numHotBlock
,
...
...
src/spconv/nms.cc
View file @
19e73bbe
...
...
@@ -21,7 +21,7 @@ namespace spconv {
namespace
functor
{
template
<
typename
T
,
typename
Index
>
struct
NonMaxSupressionFunctor
<
tv
::
CPU
,
T
,
Index
>
{
struct
NonMaxSupressionFunctor
<
tv
::
CPU
,
T
,
Index
>
{
Index
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
Index
>
keep
,
tv
::
TensorView
<
const
T
>
boxes
,
T
threshold
,
T
eps
)
{
auto
ndets
=
boxes
.
dim
(
0
);
...
...
@@ -131,7 +131,7 @@ struct rotateNonMaxSupressionFunctor<tv::CPU, T, Index> {
#define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_T_INDEX(float, Index); \
DECLARE_CPU_T_INDEX(double, Index);
DECLARE_CPU_T_INDEX(double, Index);
DECLARE_CPU_INDEX
(
int
);
DECLARE_CPU_INDEX
(
long
);
...
...
src/spconv/nms.cu
View file @
19e73bbe
...
...
@@ -2,18 +2,18 @@
// Deformable Convolutional Networks
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License
// Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn)
// Modified from MATLAB Faster R-CNN
// (https://github.com/shaoqingren/faster_rcnn)
// ------------------------------------------------------------------
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/reordering.h>
#include <spconv/reordering.cu.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <spconv/reordering.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
...
...
@@ -22,8 +22,7 @@
int
const
threadsPerBlock
=
sizeof
(
unsigned
long
long
)
*
8
;
template
<
typename
DType
>
__device__
inline
DType
devIoU
(
DType
const
*
const
a
,
DType
const
*
const
b
)
{
__device__
inline
DType
devIoU
(
DType
const
*
const
a
,
DType
const
*
const
b
)
{
DType
left
=
max
(
a
[
0
],
b
[
0
]),
right
=
min
(
a
[
2
],
b
[
2
]);
DType
top
=
max
(
a
[
1
],
b
[
1
]),
bottom
=
min
(
a
[
3
],
b
[
3
]);
DType
width
=
max
(
right
-
left
+
1
,
0.
f
),
height
=
max
(
bottom
-
top
+
1
,
0.
f
);
...
...
@@ -35,44 +34,36 @@ __device__ inline DType devIoU(DType const *const a, DType const *const b)
template
<
typename
DType
,
int
BLOCK_THREADS
>
__global__
void
nms_kernel
(
const
int
n_boxes
,
const
DType
nms_overlap_thresh
,
const
DType
*
dev_boxes
,
unsigned
long
long
*
dev_mask
)
{
const
DType
*
dev_boxes
,
unsigned
long
long
*
dev_mask
)
{
const
int
row_start
=
blockIdx
.
y
;
const
int
col_start
=
blockIdx
.
x
;
// if (row_start > col_start) return;
const
int
row_size
=
min
(
n_boxes
-
row_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
const
int
col_size
=
min
(
n_boxes
-
col_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
const
int
row_size
=
min
(
n_boxes
-
row_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
const
int
col_size
=
min
(
n_boxes
-
col_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
__shared__
DType
block_boxes
[
BLOCK_THREADS
*
5
];
if
(
threadIdx
.
x
<
col_size
)
{
if
(
threadIdx
.
x
<
col_size
)
{
#pragma unroll
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
block_boxes
[
threadIdx
.
x
*
5
+
i
]
=
dev_boxes
[(
BLOCK_THREADS
*
col_start
+
threadIdx
.
x
)
*
5
+
i
];
}
}
__syncthreads
();
if
(
threadIdx
.
x
<
row_size
)
{
if
(
threadIdx
.
x
<
row_size
)
{
const
int
cur_box_idx
=
BLOCK_THREADS
*
row_start
+
threadIdx
.
x
;
const
DType
*
cur_box
=
dev_boxes
+
cur_box_idx
*
5
;
unsigned
long
long
t
=
0
;
int
start
=
0
;
if
(
row_start
==
col_start
)
{
if
(
row_start
==
col_start
)
{
start
=
threadIdx
.
x
+
1
;
}
for
(
int
i
=
start
;
i
<
col_size
;
i
++
)
{
if
(
devIoU
(
cur_box
,
block_boxes
+
i
*
5
)
>
nms_overlap_thresh
)
{
for
(
int
i
=
start
;
i
<
col_size
;
i
++
)
{
if
(
devIoU
(
cur_box
,
block_boxes
+
i
*
5
)
>
nms_overlap_thresh
)
{
t
|=
1ULL
<<
i
;
}
}
...
...
@@ -80,4 +71,3 @@ __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
dev_mask
[
cur_box_idx
*
col_blocks
+
col_start
]
=
t
;
}
}
src/spconv/pillar_scatter.cu
View file @
19e73bbe
...
...
@@ -15,10 +15,10 @@
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/pillar_scatter_functor.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
...
...
@@ -43,8 +43,8 @@ struct PointPillarScatter<tv::GPU, T, Index> {
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
T
>
canvas
,
tv
::
TensorView
<
const
T
>
features
,
tv
::
TensorView
<
const
T
>
coors
)
{
auto
grid
=
dim3
(
tv
::
launch
::
DivUp
(
features
.
dim
(
1
),
32
),
tv
::
launch
::
DivUp
(
features
.
dim
(
0
),
32
));
auto
grid
=
dim3
(
tv
::
cuda
::
DivUp
(
features
.
dim
(
1
),
32
),
tv
::
cuda
::
DivUp
(
features
.
dim
(
0
),
32
));
pointPillarsScatterKernel
<
T
,
Index
>
<<<
grid
,
dim3
(
32
,
32
),
0
,
d
.
getStream
()
>>>
(
canvas
,
features
,
coors
);
TV_CHECK_CUDA_ERR
();
...
...
src/spconv/reordering.cc
View file @
19e73bbe
// Copyright 2019 Yan Yan
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/Parallel.h>
#include <spconv/reordering.h>
#include <torch/script.h>
#include <ATen/Parallel.h>
namespace
spconv
{
namespace
functor
{
template
<
typename
T
,
typename
Index
>
struct
SparseGatherFunctor
<
tv
::
CPU
,
T
,
Index
>
{
void
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
T
>
buffer
,
tv
::
TensorView
<
const
T
>
features
,
void
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
T
>
buffer
,
tv
::
TensorView
<
const
T
>
features
,
tv
::
TensorView
<
const
Index
>
indices
,
int
size
)
{
int
numPlanes
=
features
.
dim
(
1
);
at
::
parallel_for
(
0
,
size
,
0
,
[
&
](
int64_t
begin
,
int64_t
end
){
at
::
parallel_for
(
0
,
size
,
0
,
[
&
](
int64_t
begin
,
int64_t
end
)
{
for
(
int
i
=
begin
;
i
<
end
;
++
i
)
{
std
::
memcpy
(
buffer
.
data
()
+
i
*
numPlanes
,
features
.
data
()
+
indices
[
i
]
*
numPlanes
,
...
...
@@ -35,16 +36,16 @@ struct SparseGatherFunctor<tv::CPU, T, Index> {
template
<
typename
T
,
typename
Index
>
struct
SparseScatterAddFunctor
<
tv
::
CPU
,
T
,
Index
>
{
void
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
T
>
outFeatures
,
tv
::
TensorView
<
const
T
>
buffer
,
tv
::
TensorView
<
const
Index
>
indices
,
int
size
,
bool
stable
)
{
void
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
T
>
outFeatures
,
tv
::
TensorView
<
const
T
>
buffer
,
tv
::
TensorView
<
const
Index
>
indices
,
int
size
,
bool
stable
)
{
int
numPlanes
=
outFeatures
.
dim
(
1
);
const
T
*
buf
=
buffer
.
data
();
T
*
out
=
outFeatures
.
data
();
const
T
*
buf
=
buffer
.
data
();
T
*
out
=
outFeatures
.
data
();
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
buf
=
buffer
.
data
()
+
i
*
numPlanes
;
out
=
outFeatures
.
data
()
+
indices
[
i
]
*
numPlanes
;
for
(
int
j
=
0
;
j
<
numPlanes
;
++
j
){
for
(
int
j
=
0
;
j
<
numPlanes
;
++
j
)
{
out
[
j
]
+=
buf
[
j
];
}
}
...
...
@@ -53,9 +54,8 @@ struct SparseScatterAddFunctor<tv::CPU, T, Index> {
}
// namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
...
...
@@ -70,4 +70,3 @@ DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS_T_INDEX
}
// namespace spconv
src/spconv/reordering.cu
View file @
19e73bbe
...
...
@@ -15,11 +15,11 @@
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/reordering.h>
#include <spconv/reordering.cu.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <spconv/reordering.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
...
...
@@ -30,7 +30,7 @@ template <typename T, typename Index>
struct
SparseGatherFunctor
<
tv
::
GPU
,
T
,
Index
>
{
using
vecload_type_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
using
kernel_block_t
=
mp_list_c
<
int
,
64
,
32
,
16
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
>
;
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
T
>
buffer
,
tv
::
TensorView
<
const
T
>
features
,
tv
::
TensorView
<
const
Index
>
indices
,
int
size
)
{
...
...
@@ -39,8 +39,8 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
int
numPlanes
=
features
.
dim
(
1
);
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
mp_for_each
<
kernel_block_t
>
([
=
,
&
buffer
,
&
features
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
buffer
,
&
features
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
constexpr
int
NumILP
=
NumTLP
/
4
;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int
nHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
...
...
@@ -50,8 +50,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
gatherVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
d
.
getStream
()
>>>
(
buffer
.
data
(),
features
.
data
(),
indices
.
data
(),
nHotBlock
,
numPlanes
/
vecloadFactor
);
d
.
getStream
()
>>>
(
buffer
.
data
(),
features
.
data
(),
indices
.
data
(),
nHotBlock
,
numPlanes
/
vecloadFactor
);
TV_CHECK_CUDA_ERR
();
}
...
...
@@ -60,8 +61,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
/
vecloadFactor
),
0
,
d
.
getStream
()
>>>
(
buffer
.
data
()
+
nHotBlock
*
numPlanes
,
features
.
data
(),
indices
.
data
()
+
nHotBlock
,
size
-
nHotBlock
,
numPlanes
/
vecloadFactor
);
features
.
data
(),
indices
.
data
()
+
nHotBlock
,
size
-
nHotBlock
,
numPlanes
/
vecloadFactor
);
TV_CHECK_CUDA_ERR
();
}
notFound
=
false
;
...
...
@@ -73,8 +75,8 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
constexpr
int
NumTLP
=
64
;
constexpr
int
NumILP
=
NumTLP
/
4
;
gatherGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
tv
::
launch
::
DivUp
(
size
,
NumTLP
),
tv
::
launch
::
DivUp
(
numPlanes
,
NumTLP
)),
<<<
dim3
(
tv
::
cuda
::
DivUp
(
size
,
NumTLP
),
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
buffer
.
data
(),
features
.
data
(),
indices
.
data
(),
size
,
numPlanes
);
TV_CHECK_CUDA_ERR
();
...
...
@@ -85,7 +87,7 @@ template <typename T, typename Index>
struct
SparseScatterAddFunctor
<
tv
::
GPU
,
T
,
Index
>
{
using
vecload_type_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
using
kernel_block_t
=
mp_list_c
<
int
,
64
,
32
,
16
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
>
;
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
T
>
outFeatures
,
tv
::
TensorView
<
const
T
>
buffer
,
tv
::
TensorView
<
const
Index
>
indices
,
int
size
,
bool
stable
)
{
...
...
@@ -95,8 +97,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
// important for half.
mp_for_each
<
kernel_block_t
>
([
=
,
&
d
,
&
outFeatures
,
&
buffer
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
d
,
&
outFeatures
,
&
buffer
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
constexpr
int
NumILP
=
NumTLP
/
4
;
int
nHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
...
...
@@ -108,8 +110,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
buffer
.
data
(),
indices
.
data
(),
nHotBlock
,
numPlanes
/
vecloadFactor
);
indices
.
data
(),
nHotBlock
,
numPlanes
/
vecloadFactor
);
TV_CHECK_CUDA_ERR
();
}
if
(
size
-
nHotBlock
>
0
)
{
...
...
@@ -128,8 +130,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
constexpr
int
NumTLP
=
64
;
constexpr
int
NumILP
=
NumTLP
/
4
;
scatterAddGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
tv
::
launch
::
DivUp
(
size
,
NumTLP
),
tv
::
launch
::
DivUp
(
numPlanes
,
NumTLP
)),
<<<
dim3
(
tv
::
cuda
::
DivUp
(
size
,
NumTLP
),
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
buffer
.
data
(),
indices
.
data
(),
size
,
numPlanes
);
...
...
@@ -139,7 +141,6 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
};
}
// namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
...
...
src/spconv/spconv_ops.cc
View file @
19e73bbe
...
...
@@ -47,7 +47,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double
totalGatherTime
=
0
;
double
totalGEMMTime
=
0
;
double
totalSAddTime
=
0
;
tv
::
torch_
dispatch
<
float
,
double
,
at
::
Half
>
(
tv
::
dispat
ch_tor
ch
<
float
,
double
,
at
::
Half
>
(
features
.
scalar_type
(),
[
&
](
auto
I
)
{
using
T
=
decltype
(
I
);
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
...
...
@@ -68,7 +68,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
inverse
),
nHot
);
}
#ifdef
SPCON
V_CUDA
#ifdef
T
V_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
functor
::
SparseGatherFunctor
<
tv
::
GPU
,
T
,
int
>
gatherFtor
;
gatherFtor
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
T
>
(
inputBuffer
),
...
...
@@ -99,7 +99,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
!
inverse
),
nHot
,
true
);
}
#ifdef
SPCON
V_CUDA
#ifdef
T
V_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
functor
::
SparseScatterAddFunctor
<
tv
::
GPU
,
T
,
int
>
scatterFtor
;
scatterFtor
(
...
...
@@ -158,7 +158,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch
::
mm_out
(
filterGradSub
,
features
.
t
(),
outGrad
);
torch
::
mm_out
(
inputGrad
,
outGrad
,
filters
[
indicePairMaxOffset
].
t
());
}
tv
::
torch_
dispatch
<
float
,
double
,
tv
::
dispat
ch_tor
ch
<
float
,
double
,
at
::
Half
>
(
features
.
scalar_type
(),
[
&
](
auto
I
)
{
using
T
=
decltype
(
I
);
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
...
...
@@ -178,7 +178,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
!
inverse
),
nHot
);
}
#ifdef
SPCON
V_CUDA
#ifdef
T
V_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
functor
::
SparseGatherFunctor
<
tv
::
GPU
,
T
,
int
>
gatherFtor
;
functor
::
SparseGatherFunctor
<
tv
::
GPU
,
T
,
int
>
gatherFtorOut
;
...
...
@@ -213,7 +213,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
inverse
),
nHot
);
}
#ifdef
SPCON
V_CUDA
#ifdef
T
V_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
functor
::
SparseScatterAddFunctor
<
tv
::
GPU
,
T
,
int
>
scatterFtor
;
scatterFtor
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
T
>
(
inputGrad
),
...
...
src/utils/all.cc
View file @
19e73bbe
...
...
@@ -20,7 +20,7 @@ using namespace pybind11::literals;
PYBIND11_MODULE
(
spconv_utils
,
m
)
{
m
.
doc
()
=
"util pybind11 functions for spconv"
;
#ifdef
SPCON
V_CUDA
#ifdef
T
V_CUDA
m
.
def
(
"non_max_suppression"
,
&
spconv
::
non_max_suppression
<
double
>
,
py
::
return_value_policy
::
reference_internal
,
"bbox iou"
,
"boxes"
_a
=
1
,
"keep_out"
_a
=
2
,
"nms_overlap_thresh"
_a
=
3
,
"device_id"
_a
=
4
);
...
...
src/utils/nms.cu
View file @
19e73bbe
...
...
@@ -2,30 +2,28 @@
// Deformable Convolutional Networks
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License
// Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn)
// Modified from MATLAB Faster R-CNN
// (https://github.com/shaoqingren/faster_rcnn)
// ------------------------------------------------------------------
#include <vector>
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <spconv/nms_gpu.h>
#include <vector>
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */
\
do \
{ \
cudaError_t error = condition; \
if (error != cudaSuccess) \
{ \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */
\
do { \
cudaError_t error = condition; \
if (error != cudaSuccess) { \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
} while (0)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
int
const
threadsPerBlock
=
sizeof
(
unsigned
long
long
)
*
8
;
template
<
typename
DType
>
__device__
inline
DType
devIoU
(
DType
const
*
const
a
,
DType
const
*
const
b
)
{
__device__
inline
DType
devIoU
(
DType
const
*
const
a
,
DType
const
*
const
b
)
{
DType
left
=
max
(
a
[
0
],
b
[
0
]),
right
=
min
(
a
[
2
],
b
[
2
]);
DType
top
=
max
(
a
[
1
],
b
[
1
]),
bottom
=
min
(
a
[
3
],
b
[
3
]);
DType
width
=
max
(
right
-
left
+
1
,
0.
f
),
height
=
max
(
bottom
-
top
+
1
,
0.
f
);
...
...
@@ -37,44 +35,36 @@ __device__ inline DType devIoU(DType const *const a, DType const *const b)
template
<
typename
DType
,
int
BLOCK_THREADS
>
__global__
void
nms_kernel
(
const
int
n_boxes
,
const
DType
nms_overlap_thresh
,
const
DType
*
dev_boxes
,
unsigned
long
long
*
dev_mask
)
{
const
DType
*
dev_boxes
,
unsigned
long
long
*
dev_mask
)
{
const
int
row_start
=
blockIdx
.
y
;
const
int
col_start
=
blockIdx
.
x
;
// if (row_start > col_start) return;
const
int
row_size
=
min
(
n_boxes
-
row_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
const
int
col_size
=
min
(
n_boxes
-
col_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
const
int
row_size
=
min
(
n_boxes
-
row_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
const
int
col_size
=
min
(
n_boxes
-
col_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
__shared__
DType
block_boxes
[
BLOCK_THREADS
*
5
];
if
(
threadIdx
.
x
<
col_size
)
{
if
(
threadIdx
.
x
<
col_size
)
{
#pragma unroll
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
block_boxes
[
threadIdx
.
x
*
5
+
i
]
=
dev_boxes
[(
BLOCK_THREADS
*
col_start
+
threadIdx
.
x
)
*
5
+
i
];
}
}
__syncthreads
();
if
(
threadIdx
.
x
<
row_size
)
{
if
(
threadIdx
.
x
<
row_size
)
{
const
int
cur_box_idx
=
BLOCK_THREADS
*
row_start
+
threadIdx
.
x
;
const
DType
*
cur_box
=
dev_boxes
+
cur_box_idx
*
5
;
unsigned
long
long
t
=
0
;
int
start
=
0
;
if
(
row_start
==
col_start
)
{
if
(
row_start
==
col_start
)
{
start
=
threadIdx
.
x
+
1
;
}
for
(
int
i
=
start
;
i
<
col_size
;
i
++
)
{
if
(
devIoU
(
cur_box
,
block_boxes
+
i
*
5
)
>
nms_overlap_thresh
)
{
for
(
int
i
=
start
;
i
<
col_size
;
i
++
)
{
if
(
devIoU
(
cur_box
,
block_boxes
+
i
*
5
)
>
nms_overlap_thresh
)
{
t
|=
1ULL
<<
i
;
}
}
...
...
@@ -83,12 +73,10 @@ __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
}
}
void
_set_device
(
int
device_id
)
{
void
_set_device
(
int
device_id
)
{
int
current_device
;
CUDA_CHECK
(
cudaGetDevice
(
&
current_device
));
if
(
current_device
==
device_id
)
{
if
(
current_device
==
device_id
)
{
return
;
}
// The call to cudaSetDevice must come before any calls to Get, which
...
...
@@ -98,8 +86,7 @@ void _set_device(int device_id)
template
<
typename
DType
,
int
BLOCK_THREADS
>
int
_nms_gpu
(
int
*
keep_out
,
const
DType
*
boxes_host
,
int
boxes_num
,
int
boxes_dim
,
DType
nms_overlap_thresh
,
int
device_id
)
{
int
boxes_dim
,
DType
nms_overlap_thresh
,
int
device_id
)
{
_set_device
(
device_id
);
DType
*
boxes_dev
=
NULL
;
...
...
@@ -107,27 +94,21 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
const
int
col_blocks
=
DIVUP
(
boxes_num
,
BLOCK_THREADS
);
CUDA_CHECK
(
cudaMalloc
(
&
boxes_dev
,
boxes_num
*
boxes_dim
*
sizeof
(
DType
)));
CUDA_CHECK
(
cudaMemcpy
(
boxes_dev
,
boxes_host
,
CUDA_CHECK
(
cudaMalloc
(
&
boxes_dev
,
boxes_num
*
boxes_dim
*
sizeof
(
DType
)));
CUDA_CHECK
(
cudaMemcpy
(
boxes_dev
,
boxes_host
,
boxes_num
*
boxes_dim
*
sizeof
(
DType
),
cudaMemcpyHostToDevice
));
CUDA_CHECK
(
cudaMalloc
(
&
mask_dev
,
boxes_num
*
col_blocks
*
sizeof
(
unsigned
long
long
)));
dim3
blocks
(
DIVUP
(
boxes_num
,
BLOCK_THREADS
),
DIVUP
(
boxes_num
,
BLOCK_THREADS
));
dim3
blocks
(
DIVUP
(
boxes_num
,
BLOCK_THREADS
),
DIVUP
(
boxes_num
,
BLOCK_THREADS
));
dim3
threads
(
BLOCK_THREADS
);
nms_kernel
<
DType
,
BLOCK_THREADS
><<<
blocks
,
threads
>>>
(
boxes_num
,
nms_overlap_thresh
,
boxes_dev
,
mask_dev
);
nms_kernel
<
DType
,
BLOCK_THREADS
>
<<<
blocks
,
threads
>>>
(
boxes_num
,
nms_overlap_thresh
,
boxes_dev
,
mask_dev
);
std
::
vector
<
unsigned
long
long
>
mask_host
(
boxes_num
*
col_blocks
);
CUDA_CHECK
(
cudaMemcpy
(
&
mask_host
[
0
],
mask_dev
,
CUDA_CHECK
(
cudaMemcpy
(
&
mask_host
[
0
],
mask_dev
,
sizeof
(
unsigned
long
long
)
*
boxes_num
*
col_blocks
,
cudaMemcpyDeviceToHost
));
...
...
@@ -135,17 +116,14 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
memset
(
&
remv
[
0
],
0
,
sizeof
(
unsigned
long
long
)
*
col_blocks
);
int
num_to_keep
=
0
;
for
(
int
i
=
0
;
i
<
boxes_num
;
i
++
)
{
for
(
int
i
=
0
;
i
<
boxes_num
;
i
++
)
{
int
nblock
=
i
/
BLOCK_THREADS
;
int
inblock
=
i
%
BLOCK_THREADS
;
if
(
!
(
remv
[
nblock
]
&
(
1ULL
<<
inblock
)))
{
if
(
!
(
remv
[
nblock
]
&
(
1ULL
<<
inblock
)))
{
keep_out
[
num_to_keep
++
]
=
i
;
unsigned
long
long
*
p
=
&
mask_host
[
0
]
+
i
*
col_blocks
;
for
(
int
j
=
nblock
;
j
<
col_blocks
;
j
++
)
{
for
(
int
j
=
nblock
;
j
<
col_blocks
;
j
++
)
{
remv
[
j
]
|=
p
[
j
];
}
}
...
...
@@ -156,10 +134,15 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
return
num_to_keep
;
}
//template<>
template
int
_nms_gpu
<
float
,
threadsPerBlock
>(
int
*
keep_out
,
const
float
*
boxes_host
,
int
boxes_num
,
int
boxes_dim
,
float
nms_overlap_thresh
,
int
device_id
);
//template<>
template
int
_nms_gpu
<
double
,
threadsPerBlock
>(
int
*
keep_out
,
const
double
*
boxes_host
,
int
boxes_num
,
int
boxes_dim
,
double
nms_overlap_thresh
,
int
device_id
);
\ No newline at end of file
// template<>
template
int
_nms_gpu
<
float
,
threadsPerBlock
>(
int
*
keep_out
,
const
float
*
boxes_host
,
int
boxes_num
,
int
boxes_dim
,
float
nms_overlap_thresh
,
int
device_id
);
// template<>
template
int
_nms_gpu
<
double
,
threadsPerBlock
>(
int
*
keep_out
,
const
double
*
boxes_host
,
int
boxes_num
,
int
boxes_dim
,
double
nms_overlap_thresh
,
int
device_id
);
\ No newline at end of file
test/fake_dist_train.py
View file @
19e73bbe
import
horovod.torch
as
hvd
import
time
from
pathlib
import
Path
...
...
@@ -12,6 +11,7 @@ from torch.utils import data
from
torch.utils.data
import
DataLoader
,
Dataset
from
torchvision
import
datasets
,
transforms
import
horovod.torch
as
hvd
import
spconv
from
spconv.test_utils
import
generate_sparse_data
...
...
@@ -53,25 +53,47 @@ class FakeClassifier(nn.Module):
def
__init__
(
self
):
super
().
__init__
()
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SubMConv3d
(
3
,
8
,
3
,
indice_key
=
"subm1"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
3
,
8
,
3
,
indice_key
=
"subm1"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
8
),
nn
.
ReLU
(),
spconv
.
SparseConv3d
(
8
,
16
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
16
),
nn
.
ReLU
(),
spconv
.
SubMConv3d
(
16
,
16
,
3
,
indice_key
=
"subm2"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
16
,
16
,
3
,
indice_key
=
"subm2"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
16
),
nn
.
ReLU
(),
spconv
.
SparseConv3d
(
16
,
32
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SparseConv3d
(
16
,
32
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
32
),
nn
.
ReLU
(),
spconv
.
SubMConv3d
(
32
,
32
,
3
,
indice_key
=
"subm3"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
32
,
32
,
3
,
indice_key
=
"subm3"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
32
),
nn
.
ReLU
(),
spconv
.
SparseConv3d
(
32
,
64
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SparseConv3d
(
32
,
64
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
64
),
nn
.
ReLU
(),
spconv
.
SubMConv3d
(
64
,
64
,
3
,
indice_key
=
"subm4"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
64
,
64
,
3
,
indice_key
=
"subm4"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
64
),
nn
.
ReLU
(),
spconv
.
ToDense
()
# [64, 2, 8, 8]
...
...
@@ -100,15 +122,16 @@ def run():
hvd
.
broadcast_optimizer_state
(
optimizer
,
root_rank
=
0
)
compression
=
hvd
.
Compression
.
none
optimizer
=
hvd
.
DistributedOptimizer
(
optimizer
,
named_parameters
=
model
.
named_parameters
(),
compression
=
compression
,
op
=
hvd
.
Average
)
optimizer
=
hvd
.
DistributedOptimizer
(
optimizer
,
named_parameters
=
model
.
named_parameters
(),
compression
=
compression
,
op
=
hvd
.
Average
)
for
i
in
tqdm
.
tqdm
(
list
(
range
(
100
))):
# for j in range(4):
# features, indices, label = ds[(i * 4 + j) % len(ds)]
features
,
indices
,
label
=
ds
[
i
%
len
(
ds
)]
features_t
=
torch
.
from_numpy
(
features
)
indices_t
=
torch
.
from_numpy
(
indices
)
...
...
test/fake_train.py
View file @
19e73bbe
...
...
@@ -52,25 +52,47 @@ class FakeClassifier(nn.Module):
def
__init__
(
self
):
super
().
__init__
()
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SubMConv3d
(
3
,
8
,
3
,
indice_key
=
"subm1"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
3
,
8
,
3
,
indice_key
=
"subm1"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
8
),
nn
.
ReLU
(),
spconv
.
SparseConv3d
(
8
,
16
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
16
),
nn
.
ReLU
(),
spconv
.
SubMConv3d
(
16
,
16
,
3
,
indice_key
=
"subm2"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
16
,
16
,
3
,
indice_key
=
"subm2"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
16
),
nn
.
ReLU
(),
spconv
.
SparseConv3d
(
16
,
32
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SparseConv3d
(
16
,
32
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
32
),
nn
.
ReLU
(),
spconv
.
SubMConv3d
(
32
,
32
,
3
,
indice_key
=
"subm3"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
32
,
32
,
3
,
indice_key
=
"subm3"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
32
),
nn
.
ReLU
(),
spconv
.
SparseConv3d
(
32
,
64
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SparseConv3d
(
32
,
64
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
64
),
nn
.
ReLU
(),
spconv
.
SubMConv3d
(
64
,
64
,
3
,
indice_key
=
"subm4"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
64
,
64
,
3
,
indice_key
=
"subm4"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
64
),
nn
.
ReLU
(),
spconv
.
ToDense
()
# [64, 2, 8, 8]
...
...
@@ -97,7 +119,7 @@ def run():
for
i
in
tqdm
.
tqdm
(
list
(
range
(
100
))):
# for j in range(4):
# features, indices, label = ds[(i * 4 + j) % len(ds)]
features
,
indices
,
label
=
ds
[
i
%
len
(
ds
)]
features_t
=
torch
.
from_numpy
(
features
)
indices_t
=
torch
.
from_numpy
(
indices
)
...
...
test/test_conv.py
View file @
19e73bbe
# Copyright 2019 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
time
import
unittest
from
pathlib
import
Path
import
spconv
import
numpy
as
np
import
torch
from
torch
import
nn
import
numpy
as
np
import
time
from
spconv.test_utils
import
params_grid
,
generate_sparse_data
,
TestCase
import
unittest
# import sparseconvnet as scn
from
torch
import
nn
import
spconv
from
spconv.test_utils
import
TestCase
,
generate_sparse_data
,
params_grid
# import sparseconvnet as scn
class
SparseConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
):
super
().
__init__
()
layers
=
[
spconv
.
SparseConv3d
(
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
,
use_hash
=
True
)]
layers
=
[
spconv
.
SparseConv3d
(
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
,
use_hash
=
True
)
]
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
spconv
.
SparseConv3d
(
out_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
))
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
layers
.
append
(
spconv
.
SparseConv3d
(
out_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
))
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
# self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
self
.
grid
=
None
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
,
self
.
grid
)
return
self
.
net
(
x
)
# .dense()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
,
self
.
grid
)
return
self
.
net
(
x
)
# .dense()
class
SubMConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
):
super
().
__init__
()
layers
=
[
spconv
.
SubMConv3d
(
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
)]
layers
=
[
spconv
.
SubMConv3d
(
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
)
]
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
spconv
.
SubMConv3d
(
out_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
))
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
layers
.
append
(
spconv
.
SubMConv3d
(
out_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
))
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
self
.
grid
=
torch
.
full
([
3
,
*
shape
],
-
1
,
dtype
=
torch
.
int32
).
cuda
()
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
,
self
.
grid
)
return
self
.
net
(
x
)
# .dense()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
,
self
.
grid
)
return
self
.
net
(
x
)
# .dense()
class
Conv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
):
super
().
__init__
()
layers
=
[
nn
.
Conv3d
(
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
)]
layers
=
[
nn
.
Conv3d
(
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
)
]
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
nn
.
Conv3d
(
out_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
))
self
.
net
=
nn
.
Sequential
(
*
layers
,
)
layers
.
append
(
nn
.
Conv3d
(
out_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
))
self
.
net
=
nn
.
Sequential
(
*
layers
,
)
self
.
shape
=
shape
def
forward
(
self
,
x
):
return
self
.
net
(
x
)
# .dense()
return
self
.
net
(
x
)
# .dense()
class
SparseDeConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
):
super
().
__init__
()
layers
=
[
spconv
.
SparseConvTranspose3d
(
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
)]
layers
=
[
spconv
.
SparseConvTranspose3d
(
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
)
]
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
spconv
.
SparseConvTranspose3d
(
out_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
))
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
layers
.
append
(
spconv
.
SparseConvTranspose3d
(
out_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
))
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
return
self
.
net
(
x
)
# .dense()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
return
self
.
net
(
x
)
# .dense()
class
DeConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
,
dilation
):
super
().
__init__
()
layers
=
[
nn
.
ConvTranspose3d
(
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
)]
layers
=
[
nn
.
ConvTranspose3d
(
in_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
)
]
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
nn
.
ConvTranspose3d
(
out_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
))
self
.
net
=
nn
.
Sequential
(
*
layers
,
)
layers
.
append
(
nn
.
ConvTranspose3d
(
out_channels
,
out_channels
,
kernel_size
,
stride
,
padding
=
padding
,
dilation
=
dilation
,
bias
=
False
))
self
.
net
=
nn
.
Sequential
(
*
layers
,
)
self
.
shape
=
shape
def
forward
(
self
,
x
):
return
self
.
net
(
x
)
# .dense()
return
self
.
net
(
x
)
# .dense()
class
SparseMaxPoolTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
kernel_size
,
stride
,
padding
,
dilation
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
kernel_size
,
stride
,
padding
,
dilation
):
super
().
__init__
()
layers
=
[
spconv
.
SparseMaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
)
]
layers
=
[
spconv
.
SparseMaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
)
]
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
spconv
.
SparseMaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
))
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
layers
.
append
(
spconv
.
SparseMaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
))
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
return
self
.
net
(
x
)
# .dense()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
return
self
.
net
(
x
)
# .dense()
class
MaxPool3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
kernel_size
,
stride
,
padding
,
dilation
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
kernel_size
,
stride
,
padding
,
dilation
):
super
().
__init__
()
layers
=
[
nn
.
MaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
)]
layers
=
[
nn
.
MaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
)]
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
nn
.
MaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
))
self
.
net
=
nn
.
Sequential
(
*
layers
,
)
layers
.
append
(
nn
.
MaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
))
self
.
net
=
nn
.
Sequential
(
*
layers
,
)
self
.
shape
=
shape
def
forward
(
self
,
x
):
return
self
.
net
(
x
)
# .dense()
return
self
.
net
(
x
)
# .dense()
class
SubmanifoldConvTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
):
super
().
__init__
()
layers
=
[
spconv
.
SubMConv3d
(
in_channels
,
out_channels
,
kernel_size
,
bias
=
False
,
indice_key
=
"subm0"
)]
layers
=
[
spconv
.
SubMConv3d
(
in_channels
,
out_channels
,
kernel_size
,
bias
=
False
,
indice_key
=
"subm0"
)
]
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
spconv
.
SubMConv3d
(
out_channels
,
out_channels
,
kernel_size
,
bias
=
False
))
self
.
net
=
nn
.
Sequential
(
*
layers
,
)
layers
.
append
(
spconv
.
SubMConv3d
(
out_channels
,
out_channels
,
kernel_size
,
bias
=
False
))
self
.
net
=
nn
.
Sequential
(
*
layers
,
)
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
return
self
.
net
(
x
)
class
SCNCoupleDeConvTest
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
):
super
().
__init__
()
self
.
scn_input
=
scn
.
InputLayer
(
ndim
,
shape
,
mode
=
0
)
self
.
net
=
nn
.
Sequential
(
scn
.
Convolution
(
ndim
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
bias
=
False
),
scn
.
Deconvolution
(
ndim
,
out_channels
,
in_channels
,
kernel_size
,
stride
,
bias
=
False
),
scn
.
Convolution
(
ndim
,
in_channels
,
out_channels
,
kernel_size
,
stride
,
bias
=
False
),
scn
.
Deconvolution
(
ndim
,
out_channels
,
in_channels
,
kernel_size
,
stride
,
bias
=
False
),
scn
.
SparseToDense
(
ndim
,
in_channels
),
)
...
...
@@ -267,44 +268,44 @@ class SCNCoupleDeConvTest(nn.Module):
x
=
self
.
scn_input
((
coors
,
features
))
return
self
.
net
(
x
)
class
SparseCoupleDeConvTest
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
):
super
().
__init__
()
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SparseConv3d
(
in_channels
,
out_channels
,
kernel_size
,
stride
,
indice_key
=
"cp0"
,
bias
=
False
),
spconv
.
SparseInverseConv3d
(
out_channels
,
in_channels
,
kernel_size
,
indice_key
=
"cp0"
,
bias
=
False
),
spconv
.
SparseConv3d
(
in_channels
,
out_channels
,
kernel_size
,
stride
,
indice_key
=
"cp0"
,
bias
=
False
),
spconv
.
SparseInverseConv3d
(
out_channels
,
in_channels
,
kernel_size
,
indice_key
=
"cp0"
,
bias
=
False
),
)
self
.
todense
=
spconv
.
ToDense
()
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
return
self
.
todense
(
self
.
net
(
x
))
# .dense()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
return
self
.
todense
(
self
.
net
(
x
))
# .dense()
def
gather_nd
(
params
,
indices
):
# this function has a limit that MAX_ADVINDEX_CALC_DIMS=5
ndim
=
indices
.
shape
[
-
1
]
output_shape
=
list
(
indices
.
shape
[:
-
1
])
+
list
(
params
.
shape
[
indices
.
shape
[
-
1
]:])
output_shape
=
list
(
indices
.
shape
[:
-
1
])
+
list
(
params
.
shape
[
indices
.
shape
[
-
1
]:])
flatted_indices
=
indices
.
view
(
-
1
,
ndim
)
slices
=
[
flatted_indices
[:,
i
]
for
i
in
range
(
ndim
)]
slices
+=
[
Ellipsis
]
return
params
[
slices
].
view
(
*
output_shape
)
def
scatter_nd
(
indices
,
updates
,
shape
):
"""pytorch edition of tensorflow scatter_nd.
this function don't contain except handle code. so use this carefully
...
...
@@ -322,7 +323,6 @@ def scatter_nd(indices, updates, shape):
class
TestSpConv
(
TestCase
):
def
testSpConv3d
(
self
):
np
.
random
.
seed
(
484
)
devices
=
[
"cpu:0"
]
...
...
@@ -337,36 +337,44 @@ class TestSpConv(TestCase):
dilations
=
[
1
,
2
,
3
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
if
all
([
s
>
1
,
d
>
1
]):
continue
# don't support this.
continue
# don't support this.
device
=
torch
.
device
(
dev
)
num_points
=
[
1000
]
*
bs
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
)
features_dense_t
.
requires_grad
=
True
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
...
...
@@ -381,7 +389,7 @@ class TestSpConv(TestCase):
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
def
testSpDeConv3d
(
self
):
np
.
random
.
seed
(
484
)
devices
=
[
"cuda:0"
,
"cpu:0"
]
...
...
@@ -396,36 +404,44 @@ class TestSpConv(TestCase):
dilations
=
[
1
,
2
,
3
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
if
all
([
s
>
1
,
d
>
1
]):
continue
# don't support this.
continue
# don't support this.
device
=
torch
.
device
(
dev
)
num_points
=
[
1000
]
*
bs
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
)
features_dense_t
.
requires_grad
=
True
net
=
SparseDeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
net_ref
=
DeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
net
=
SparseDeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
net_ref
=
DeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
3
,
4
,
0
,
1
,
2
).
contiguous
()
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
3
,
4
,
0
,
1
,
2
).
contiguous
()
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
...
...
@@ -440,7 +456,7 @@ class TestSpConv(TestCase):
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
def
testSpCpConv3d
(
self
):
np
.
random
.
seed
(
484
)
devices
=
[
"cuda:0"
,
"cpu:0"
]
...
...
@@ -455,19 +471,23 @@ class TestSpConv(TestCase):
dilations
=
[
1
,
2
,
3
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
):
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
):
device
=
torch
.
device
(
dev
)
num_points
=
[
1000
]
*
bs
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
indices_scn_t
=
torch
.
from_numpy
(
indices
[:,
[
1
,
2
,
3
,
0
]]).
int
().
to
(
device
)
indices_scn_t
=
torch
.
from_numpy
(
indices
[:,
[
1
,
2
,
3
,
0
]]).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
features_ref_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
...
...
@@ -475,11 +495,14 @@ class TestSpConv(TestCase):
net_ref
=
SCNCoupleDeConvTest
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
).
to
(
device
)
net
=
SparseCoupleDeConvTest
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
).
to
(
device
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
net
.
net
[
0
].
weight
.
data
[:].
view
(
*
net_ref
.
net
[
0
].
weight
.
shape
)
net_ref
.
net
[
1
].
weight
.
data
[:]
=
net
.
net
[
1
].
weight
.
data
[:].
view
(
*
net_ref
.
net
[
1
].
weight
.
shape
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
net
.
net
[
0
].
weight
.
data
[:].
view
(
*
net_ref
.
net
[
0
].
weight
.
shape
)
net_ref
.
net
[
1
].
weight
.
data
[:]
=
net
.
net
[
1
].
weight
.
data
[:].
view
(
*
net_ref
.
net
[
1
].
weight
.
shape
)
out_ref
=
net_ref
(
features_ref_t
,
indices_scn_t
,
bs
)
out
=
net
(
features_t
,
indices_t
,
bs
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
...
...
@@ -490,15 +513,14 @@ class TestSpConv(TestCase):
self
.
assertAllClose
(
din_ref_np
,
din_np
,
atol
=
1e-4
)
for
layer
,
layer_ref
in
zip
(
net
.
net
,
net_ref
.
net
):
dw
=
layer
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
dw_ref
=
layer_ref
.
weight
.
grad
.
detach
().
cpu
().
view
(
*
dw
.
shape
).
numpy
()
dw_ref
=
layer_ref
.
weight
.
grad
.
detach
().
cpu
().
view
(
*
dw
.
shape
).
numpy
()
self
.
assertAllClose
(
dw
,
dw_ref
,
atol
=
1e-4
)
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
def
testSpMaxPool3d
(
self
):
np
.
random
.
seed
(
485
)
devices
=
[
"cuda:0"
,
"cpu:0"
]
...
...
@@ -513,19 +535,25 @@ class TestSpConv(TestCase):
dilations
=
[
1
,
2
,
3
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
if
all
([
s
>
1
,
d
>
1
]):
continue
# don't support this.
continue
# don't support this.
device
=
torch
.
device
(
dev
)
num_points
=
[
1000
]
*
bs
# when data contains negative, sparse maxpool is not equal to dense maxpool.
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
,
data_range
=
[
0.1
,
1
])
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
,
data_range
=
[
0.1
,
1
])
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
...
...
@@ -540,24 +568,27 @@ class TestSpConv(TestCase):
outfeatures
=
out
.
features
out_dense
=
out
.
dense
(
channels_first
=
False
)
out
=
out_dense
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
dout_sparse
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
outfeatures
.
shape
).
astype
(
features
.
dtype
)
dout_sparse
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
outfeatures
.
shape
).
astype
(
features
.
dtype
)
dout_sparse_t
=
torch
.
from_numpy
(
dout_sparse
).
to
(
device
)
dout_t
=
scatter_nd
(
outids
.
long
(),
dout_sparse_t
,
list
(
out_dense
.
shape
))
dout_t
=
scatter_nd
(
outids
.
long
(),
dout_sparse_t
,
list
(
out_dense
.
shape
))
dout_t
=
dout_t
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
din_sparse_np
=
din_sparse
.
cpu
().
numpy
()
self
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
def
main
():
# function for develop.
...
...
@@ -567,7 +598,6 @@ def main():
shapes
=
[[
50
,
30
,
30
]]
batchsizes
=
[
2
]
in_channels
=
[
256
]
out_channels
=
[
256
]
ksizes
=
[(
3
,
1
,
1
)]
...
...
@@ -576,8 +606,8 @@ def main():
dilations
=
[
1
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
if
all
([
s
>
1
,
d
>
1
]):
continue
device
=
torch
.
device
(
dev
)
...
...
@@ -585,19 +615,25 @@ def main():
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
[
0
],
1
,
1
,
IC
,
OC
]).
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
[
0
],
1
,
1
,
IC
,
OC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
).
float
()
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
).
float
()
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
).
float
()
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
float
()
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
float
()
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
float
()
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
float
()
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
float
()
net_ref
.
net
[
0
].
weight
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net_ref
.
net
[
0
].
weight
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net
.
net
[
0
].
weight
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
times
=
[]
...
...
@@ -607,16 +643,16 @@ def main():
torch
.
cuda
.
synchronize
()
times
.
append
(
time
.
time
()
-
t
)
# print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t)
# print("spconv time", time.time() - t)
print
(
"spconv time"
,
np
.
mean
(
times
[
2
:]))
out
=
net
(
features_t
,
indices_t
,
bs
)
# print(out.indices)
out
=
out
.
dense
()
print
(
np
.
linalg
.
norm
(
out
.
detach
().
cpu
().
numpy
()
-
out_ref
.
detach
().
cpu
().
numpy
()))
print
(
np
.
linalg
.
norm
(
out
.
detach
().
cpu
().
numpy
()
-
out_ref
.
detach
().
cpu
().
numpy
()))
if
__name__
==
'__main__'
:
main
()
# unittest.main()
pybind11
@
3b1dbeba
Compare
085a2943
...
3b1dbeba
Subproject commit
085a29436a8c472caaaf7157aa644b571079bcaa
Subproject commit
3b1dbebabc801c9cf6f0953a4c20b904d444f879
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment