Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
19e73bbe
Commit
19e73bbe
authored
May 20, 2020
by
Yan Yan
Browse files
format code with clang-format, better c++ code
parent
c336139f
Changes
77
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
905 additions
and
618 deletions
+905
-618
src/cuhash/main.cc
src/cuhash/main.cc
+40
-41
src/spconv/all.cc
src/spconv/all.cc
+17
-12
src/spconv/indice.cc
src/spconv/indice.cc
+47
-51
src/spconv/indice.cu
src/spconv/indice.cu
+293
-64
src/spconv/maxpool.cu
src/spconv/maxpool.cu
+35
-31
src/spconv/nms.cc
src/spconv/nms.cc
+2
-2
src/spconv/nms.cu
src/spconv/nms.cu
+17
-27
src/spconv/pillar_scatter.cu
src/spconv/pillar_scatter.cu
+5
-5
src/spconv/reordering.cc
src/spconv/reordering.cc
+15
-16
src/spconv/reordering.cu
src/spconv/reordering.cu
+22
-21
src/spconv/spconv_ops.cc
src/spconv/spconv_ops.cc
+6
-6
src/utils/all.cc
src/utils/all.cc
+1
-1
src/utils/nms.cu
src/utils/nms.cu
+46
-63
test/fake_dist_train.py
test/fake_dist_train.py
+35
-12
test/fake_train.py
test/fake_train.py
+29
-7
test/test_conv.py
test/test_conv.py
+294
-258
third_party/pybind11
third_party/pybind11
+1
-1
No files found.
src/cuhash/main.cc
View file @
19e73bbe
#include <cuhash/hash_table.h>
#include <cuda.h>
#include <cuda.h>
#include <cuhash/hash_table.h>
int
main
(){
int
main
()
{
auto
table
=
cuhash
::
HashTable
();
auto
table
=
cuhash
::
HashTable
();
table
.
Initialize
(
10
,
2.0
);
table
.
Initialize
(
10
,
2.0
);
const
int
N
=
10
;
const
int
N
=
10
;
// ハッシュテーブルに格納するデータ
// ハッシュテーブルに格納するデータ
int
keys
[
N
]
=
{
1
,
6
,
4
,
9
,
0
,
3
,
7
,
2
,
5
,
8
};
int
keys
[
N
]
=
{
1
,
6
,
4
,
9
,
0
,
3
,
7
,
2
,
5
,
8
};
int
vals
[
N
]
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
int
vals
[
N
]
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
// デバイスメモリにコピー
// デバイスメモリにコピー
int
*
d_keys
,
*
d_vals
;
int
*
d_keys
,
*
d_vals
;
cudaMalloc
((
void
**
)
&
d_keys
,
sizeof
(
int
)
*
N
);
cudaMalloc
((
void
**
)
&
d_keys
,
sizeof
(
int
)
*
N
);
cudaMemcpy
(
d_keys
,
keys
,
sizeof
(
int
)
*
N
,
cudaMemcpyHostToDevice
);
cudaMemcpy
(
d_keys
,
keys
,
sizeof
(
int
)
*
N
,
cudaMemcpyHostToDevice
);
cudaMalloc
((
void
**
)
&
d_vals
,
sizeof
(
int
)
*
N
);
cudaMalloc
((
void
**
)
&
d_vals
,
sizeof
(
int
)
*
N
);
cudaMemcpy
(
d_vals
,
vals
,
sizeof
(
int
)
*
N
,
cudaMemcpyHostToDevice
);
cudaMemcpy
(
d_vals
,
vals
,
sizeof
(
int
)
*
N
,
cudaMemcpyHostToDevice
);
// ハッシュテーブルにクエリするデータ
// ハッシュテーブルにクエリするデータ
int
input
[
N
]
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
int
input
[
N
]
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
int
output
[
N
];
int
output
[
N
];
// デバイスメモリにコピー
// デバイスメモリにコピー
int
*
d_input
,
*
d_output
;
int
*
d_input
,
*
d_output
;
cudaMalloc
((
void
**
)
&
d_input
,
sizeof
(
int
)
*
N
);
cudaMalloc
((
void
**
)
&
d_input
,
sizeof
(
int
)
*
N
);
cudaMemcpy
(
d_input
,
input
,
sizeof
(
int
)
*
N
,
cudaMemcpyHostToDevice
);
cudaMemcpy
(
d_input
,
input
,
sizeof
(
int
)
*
N
,
cudaMemcpyHostToDevice
);
cudaMalloc
((
void
**
)
&
d_output
,
sizeof
(
int
)
*
N
);
cudaMalloc
((
void
**
)
&
d_output
,
sizeof
(
int
)
*
N
);
cudaMemset
(
d_output
,
0
,
sizeof
(
int
)
*
N
);
cudaMemset
(
d_output
,
0
,
sizeof
(
int
)
*
N
);
bool
s
=
table
.
Build
(
N
,
(
const
unsigned
int
*
)
d_keys
,
bool
s
=
table
.
Build
(
N
,
(
const
unsigned
int
*
)
d_keys
,
(
const
unsigned
int
*
)
d_vals
);
(
const
unsigned
int
*
)
d_vals
);
std
::
cout
<<
s
<<
std
::
endl
;
std
::
cout
<<
s
<<
std
::
endl
;
table
.
Retrieve
(
N
,
(
const
unsigned
int
*
)
d_input
,
table
.
Retrieve
(
N
,
(
const
unsigned
int
*
)
d_input
,
(
unsigned
int
*
)
d_output
);
(
unsigned
int
*
)
d_output
);
std
::
cout
<<
s
<<
std
::
endl
;
std
::
cout
<<
s
<<
std
::
endl
;
cudaMemcpy
(
output
,
d_output
,
sizeof
(
int
)
*
N
,
cudaMemcpyDeviceToHost
);
cudaMemcpy
(
output
,
d_output
,
sizeof
(
int
)
*
N
,
cudaMemcpyDeviceToHost
);
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
printf
(
"%d
\n
"
,
output
[
i
]);
printf
(
"%d
\n
"
,
output
[
i
]);
}
}
return
0
;
return
0
;
}
}
\ No newline at end of file
src/spconv/all.cc
View file @
19e73bbe
// Copyright 2019 Yan Yan
// Copyright 2019 Yan Yan
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#include <torch/script.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <spconv/pillar_scatter_ops.h>
#include <spconv/fused_spconv_ops.h>
#include <spconv/fused_spconv_ops.h>
#include <spconv/nms_ops.h>
#include <spconv/nms_ops.h>
#include <spconv/pillar_scatter_ops.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <torch/script.h>
static
auto
registry
=
static
auto
registry
=
torch
::
RegisterOperators
()
torch
::
RegisterOperators
()
.
op
(
"spconv::get_indice_pairs_2d"
,
&
spconv
::
getIndicePair
<
2
>
)
.
op
(
"spconv::get_indice_pairs_2d"
,
&
spconv
::
getIndicePair
<
2
>
)
.
op
(
"spconv::get_indice_pairs_3d"
,
&
spconv
::
getIndicePair
<
3
>
)
.
op
(
"spconv::get_indice_pairs_3d"
,
&
spconv
::
getIndicePair
<
3
>
)
.
op
(
"spconv::get_indice_pairs_4d"
,
&
spconv
::
getIndicePair
<
4
>
)
.
op
(
"spconv::get_indice_pairs_4d"
,
&
spconv
::
getIndicePair
<
4
>
)
.
op
(
"spconv::get_indice_pairs_grid_2d"
,
&
spconv
::
getIndicePairPreGrid
<
2
>
)
.
op
(
"spconv::get_indice_pairs_grid_2d"
,
.
op
(
"spconv::get_indice_pairs_grid_3d"
,
&
spconv
::
getIndicePairPreGrid
<
3
>
)
&
spconv
::
getIndicePairPreGrid
<
2
>
)
.
op
(
"spconv::get_indice_pairs_grid_3d"
,
&
spconv
::
getIndicePairPreGrid
<
3
>
)
.
op
(
"spconv::indice_conv"
,
&
spconv
::
indiceConv
)
.
op
(
"spconv::indice_conv"
,
&
spconv
::
indiceConv
)
.
op
(
"spconv::indice_conv_backward"
,
&
spconv
::
indiceConvBackward
)
.
op
(
"spconv::indice_conv_backward"
,
&
spconv
::
indiceConvBackward
)
.
op
(
"spconv::fused_indice_conv_fp32"
,
&
spconv
::
fusedIndiceConvBatchNorm
<
float
>
)
.
op
(
"spconv::fused_indice_conv_fp32"
,
.
op
(
"spconv::fused_indice_conv_half"
,
&
spconv
::
fusedIndiceConvBatchNorm
<
at
::
Half
>
)
&
spconv
::
fusedIndiceConvBatchNorm
<
float
>
)
.
op
(
"spconv::fused_indice_conv_half"
,
&
spconv
::
fusedIndiceConvBatchNorm
<
at
::
Half
>
)
.
op
(
"spconv::indice_maxpool_fp32"
,
&
spconv
::
indiceMaxPool
<
float
>
)
.
op
(
"spconv::indice_maxpool_fp32"
,
&
spconv
::
indiceMaxPool
<
float
>
)
.
op
(
"spconv::indice_maxpool_backward_fp32"
,
.
op
(
"spconv::indice_maxpool_backward_fp32"
,
&
spconv
::
indiceMaxPoolBackward
<
float
>
)
&
spconv
::
indiceMaxPoolBackward
<
float
>
)
...
@@ -38,4 +42,5 @@ static auto registry =
...
@@ -38,4 +42,5 @@ static auto registry =
&
spconv
::
indiceMaxPoolBackward
<
at
::
Half
>
)
&
spconv
::
indiceMaxPoolBackward
<
at
::
Half
>
)
.
op
(
"spconv::nms"
,
&
spconv
::
nonMaxSuppression
<
float
>
)
.
op
(
"spconv::nms"
,
&
spconv
::
nonMaxSuppression
<
float
>
)
.
op
(
"spconv::pillar_scatter_float"
,
&
spconv
::
pointPillarScatter
<
float
>
)
.
op
(
"spconv::pillar_scatter_float"
,
&
spconv
::
pointPillarScatter
<
float
>
)
.
op
(
"spconv::pillar_scatter_half"
,
&
spconv
::
pointPillarScatter
<
at
::
Half
>
);
.
op
(
"spconv::pillar_scatter_half"
,
&
spconv
::
pointPillarScatter
<
at
::
Half
>
);
src/spconv/indice.cc
View file @
19e73bbe
// Copyright 2019 Yan Yan
// Copyright 2019 Yan Yan
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#include <ATen/Parallel.h>
#include <spconv/geometry.h>
#include <spconv/geometry.h>
#include <spconv/indice.h>
#include <spconv/indice.h>
#include <spconv/spconv_ops.h>
#include <spconv/spconv_ops.h>
#include <torch/script.h>
#include <torch/script.h>
#include <ATen/Parallel.h>
namespace
spconv
{
namespace
spconv
{
...
@@ -45,7 +45,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
...
@@ -45,7 +45,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
}
}
Index
numValidPoints
=
0
;
Index
numValidPoints
=
0
;
std
::
vector
<
Index
>
validPoints_
(
kernelVolume
*
(
NDim
+
1
));
std
::
vector
<
Index
>
validPoints_
(
kernelVolume
*
(
NDim
+
1
));
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
pointPtr
=
nullptr
;
Index
*
pointPtr
=
nullptr
;
Index
hashval
;
Index
hashval
;
tsl
::
robin_map
<
Index
,
Index
>
hash
;
tsl
::
robin_map
<
Index
,
Index
>
hash
;
...
@@ -67,7 +67,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
...
@@ -67,7 +67,7 @@ Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
indicesOut
(
numAct
,
0
)
=
batchIdx
;
indicesOut
(
numAct
,
0
)
=
batchIdx
;
hashval
=
numAct
++
;
hashval
=
numAct
++
;
hash
[
index
]
=
hashval
;
hash
[
index
]
=
hashval
;
}
else
{
}
else
{
hashval
=
iter
->
second
;
hashval
=
iter
->
second
;
}
}
// indicePairs: [K, 2, L]
// indicePairs: [K, 2, L]
...
@@ -102,7 +102,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
...
@@ -102,7 +102,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
}
}
Index
numValidPoints
=
0
;
Index
numValidPoints
=
0
;
std
::
vector
<
Index
>
validPoints_
(
kernelVolume
*
(
NDim
+
1
));
std
::
vector
<
Index
>
validPoints_
(
kernelVolume
*
(
NDim
+
1
));
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
pointPtr
=
nullptr
;
Index
*
pointPtr
=
nullptr
;
Index
hashval
;
Index
hashval
;
tsl
::
robin_map
<
Index
,
Index
>
hash
;
tsl
::
robin_map
<
Index
,
Index
>
hash
;
...
@@ -125,7 +125,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
...
@@ -125,7 +125,7 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
indicesOut
(
numAct
,
0
)
=
batchIdx
;
indicesOut
(
numAct
,
0
)
=
batchIdx
;
hashval
=
numAct
++
;
hashval
=
numAct
++
;
hash
[
index
]
=
hashval
;
hash
[
index
]
=
hashval
;
}
else
{
}
else
{
hashval
=
iter
->
second
;
hashval
=
iter
->
second
;
}
}
// indicePairs: [K, 2, L]
// indicePairs: [K, 2, L]
...
@@ -136,7 +136,6 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
...
@@ -136,7 +136,6 @@ Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
return
numAct
;
return
numAct
;
}
}
#ifndef TV_WINDOWS
#ifndef TV_WINDOWS
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
Index
getIndicePairsSubM
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
Index
getIndicePairsSubM
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
...
@@ -145,7 +144,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
...
@@ -145,7 +144,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv
::
TensorView
<
Index
>
indiceNum
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
Index
*
const
kernelSize
,
const
Index
*
const
kernelSize
,
const
Index
*
const
stride
,
const
Index
*
const
padding
,
const
Index
*
const
stride
,
const
Index
*
const
padding
,
const
Index
*
dilation
,
const
Index
*
const
outSpatialShape
)
{
const
Index
*
dilation
,
const
Index
*
const
outSpatialShape
)
{
Index
numAct
=
0
;
Index
numAct
=
0
;
auto
numActIn
=
indicesIn
.
dim
(
0
);
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
batchIdx
=
0
;
Index
batchIdx
=
0
;
...
@@ -167,12 +167,12 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
...
@@ -167,12 +167,12 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
spatialVolume
*
indicesIn
(
j
,
0
);
spatialVolume
*
indicesIn
(
j
,
0
);
hash
[
index
]
=
j
;
hash
[
index
]
=
j
;
}
}
at
::
parallel_for
(
0
,
numActIn
,
0
,
[
&
](
int64_t
begin
,
int64_t
end
){
at
::
parallel_for
(
0
,
numActIn
,
0
,
[
&
](
int64_t
begin
,
int64_t
end
)
{
Index
index
=
0
;
Index
index
=
0
;
Index
numValidPoints
=
0
;
Index
numValidPoints
=
0
;
std
::
vector
<
Index
>
validPoints_
(
kernelVolume
*
(
NDim
+
1
));
std
::
vector
<
Index
>
validPoints_
(
kernelVolume
*
(
NDim
+
1
));
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
pointPtr
=
nullptr
;
Index
*
pointPtr
=
nullptr
;
Index
oldOffset
=
0
;
Index
oldOffset
=
0
;
for
(
int
j
=
begin
;
j
<
end
;
++
j
)
{
for
(
int
j
=
begin
;
j
<
end
;
++
j
)
{
...
@@ -186,7 +186,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
...
@@ -186,7 +186,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
spatialVolume
*
indicesIn
(
j
,
0
);
spatialVolume
*
indicesIn
(
j
,
0
);
auto
iter
=
hash
.
find
(
index
);
auto
iter
=
hash
.
find
(
index
);
if
(
iter
!=
hash
.
end
())
{
if
(
iter
!=
hash
.
end
())
{
#pragma omp atomic capture
#pragma omp atomic capture
oldOffset
=
indiceNum
[
offset
]
++
;
oldOffset
=
indiceNum
[
offset
]
++
;
indicePairs
(
offset
,
0
,
oldOffset
)
=
j
;
indicePairs
(
offset
,
0
,
oldOffset
)
=
j
;
indicePairs
(
offset
,
1
,
oldOffset
)
=
iter
->
second
;
indicePairs
(
offset
,
1
,
oldOffset
)
=
iter
->
second
;
...
@@ -196,7 +196,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
...
@@ -196,7 +196,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
});
});
return
numActIn
;
return
numActIn
;
}
}
#else
#else
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
Index
getIndicePairsSubM
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
Index
getIndicePairsSubM
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
...
@@ -204,7 +204,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
...
@@ -204,7 +204,8 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv
::
TensorView
<
Index
>
indiceNum
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
Index
*
const
kernelSize
,
const
Index
*
const
kernelSize
,
const
Index
*
const
stride
,
const
Index
*
const
padding
,
const
Index
*
const
stride
,
const
Index
*
const
padding
,
const
Index
*
dilation
,
const
Index
*
const
outSpatialShape
)
{
const
Index
*
dilation
,
const
Index
*
const
outSpatialShape
)
{
Index
numAct
=
0
;
Index
numAct
=
0
;
auto
numActIn
=
indicesIn
.
dim
(
0
);
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
batchIdx
=
0
;
Index
batchIdx
=
0
;
...
@@ -221,7 +222,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
...
@@ -221,7 +222,7 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
Index
numValidPoints
=
0
;
Index
numValidPoints
=
0
;
// Index validPoints[kernelVolume * (NDim + 1)];
// Index validPoints[kernelVolume * (NDim + 1)];
std
::
vector
<
Index
>
validPoints_
(
kernelVolume
*
(
NDim
+
1
));
std
::
vector
<
Index
>
validPoints_
(
kernelVolume
*
(
NDim
+
1
));
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
validPoints
=
validPoints_
.
data
();
Index
*
pointPtr
=
nullptr
;
Index
*
pointPtr
=
nullptr
;
tsl
::
robin_map
<
Index
,
Index
>
hash
;
tsl
::
robin_map
<
Index
,
Index
>
hash
;
for
(
int
j
=
0
;
j
<
numActIn
;
++
j
)
{
for
(
int
j
=
0
;
j
<
numActIn
;
++
j
)
{
...
@@ -255,57 +256,53 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
...
@@ -255,57 +256,53 @@ Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
namespace
functor
{
namespace
functor
{
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
struct
CreateConvIndicePairFunctor
<
tv
::
CPU
,
Index
,
IndexGrid
,
NDim
>
{
struct
CreateConvIndicePairFunctor
<
tv
::
CPU
,
Index
,
IndexGrid
,
NDim
>
{
Index
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
const
Index
>
indicesIn
,
Index
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
Index
>
indicesOut
,
tv
::
TensorView
<
Index
>
indicesOut
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
kernelSize
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
kernelSize
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
stride
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
stride
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
padding
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
padding
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
dilation
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
dilation
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
)
{
bool
transpose
,
bool
resetGrid
,
bool
useHash
)
{
if
(
transpose
)
if
(
transpose
)
return
getIndicePairsDeConv
<
Index
,
IndexGrid
,
NDim
>
(
return
getIndicePairsDeConv
<
Index
,
IndexGrid
,
NDim
>
(
indicesIn
,
indicesOut
,
indicesIn
,
indicesOut
,
gridsOut
,
indicePairs
,
indiceNum
,
gridsOut
,
indicePairs
,
indiceNum
,
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
outSpatialShape
.
data
());
outSpatialShape
.
data
());
else
else
return
getIndicePairsConv
<
Index
,
IndexGrid
,
NDim
>
(
return
getIndicePairsConv
<
Index
,
IndexGrid
,
NDim
>
(
indicesIn
,
indicesOut
,
indicesIn
,
indicesOut
,
gridsOut
,
indicePairs
,
indiceNum
,
gridsOut
,
indicePairs
,
indiceNum
,
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
outSpatialShape
.
data
());
outSpatialShape
.
data
());
}
}
};
};
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
struct
CreateSubMIndicePairFunctor
<
tv
::
CPU
,
Index
,
IndexGrid
,
NDim
>
{
struct
CreateSubMIndicePairFunctor
<
tv
::
CPU
,
Index
,
IndexGrid
,
NDim
>
{
Index
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
const
Index
>
indicesIn
,
Index
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
kernelSize
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
kernelSize
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
stride
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
stride
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
padding
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
padding
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
dilation
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
dilation
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
)
{
bool
transpose
,
bool
resetGrid
,
bool
useHash
)
{
return
getIndicePairsSubM
<
Index
,
IndexGrid
,
NDim
>
(
return
getIndicePairsSubM
<
Index
,
IndexGrid
,
NDim
>
(
indicesIn
,
indicesIn
,
gridsOut
,
indicePairs
,
indiceNum
,
kernelSize
.
data
(),
gridsOut
,
indicePairs
,
indiceNum
,
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
outSpatialShape
.
data
());
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
outSpatialShape
.
data
());
}
}
};
};
}
// namespace functor
}
// namespace functor
#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM) \
#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int,
NDIM>;
\
template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, \
template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int,
\
NDIM>;
\
NDIM>;
template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
NDIM>;
#define DECLARE_CPU_INDEX(Index) \
#define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
...
@@ -320,4 +317,3 @@ DECLARE_CPU_INDEX(long);
...
@@ -320,4 +317,3 @@ DECLARE_CPU_INDEX(long);
#undef DECLARE_CPU_SPECS_INDEX_NDIM
#undef DECLARE_CPU_SPECS_INDEX_NDIM
}
// namespace spconv
}
// namespace spconv
src/spconv/indice.cu
View file @
19e73bbe
...
@@ -14,17 +14,240 @@
...
@@ -14,17 +14,240 @@
#include <ATen/ATen.h>
#include <ATen/ATen.h>
#include <chrono>
#include <chrono>
#include <cuhash/hash_table.h>
#include <limits>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/indice.h>
#include <spconv/indice.cu.h>
#include <spconv/indice.cu.h>
#include <tensorview/helper_launch.h>
#include <spconv/indice.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/torch_utils.h>
#include <tensorview/tensor.h>
#include <tensorview/tensorview.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <type_traits>
#include <utility/timer.h>
#include <utility/timer.h>
#include <cuhash/hash_table.h>
namespace
spconv
{
namespace
spconv
{
int
create_conv_indice_pair_p1_cuda
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
torch
::
Tensor
indicePairUnique
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
)
{
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
auto
ndim
=
kernelSize
.
size
();
auto
numActIn
=
indicesIn
.
size
(
0
);
if
(
numActIn
==
0
)
return
0
;
// dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8.
tv
::
dispatch_torch
<
int32_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
V
)
{
using
Index
=
decltype
(
V
);
using
IndexGrid
=
int32_t
;
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
constexpr
int
NDim
=
I
;
tv
::
SimpleVector
<
Index
,
NDim
>
ks
(
kernelSize
.
begin
(),
kernelSize
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
st
(
stride
.
begin
(),
stride
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
pa
(
padding
.
begin
(),
padding
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
di
(
dilation
.
begin
(),
dilation
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
ou
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
if
(
transpose
)
{
prepareDeConvIndicePairsKernel
<
Index
,
NDim
,
4096
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ks
,
st
,
pa
,
di
,
ou
);
}
else
{
prepareIndicePairsKernel
<
Index
,
NDim
,
4096
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ks
,
st
,
pa
,
di
,
ou
);
}
});
});
return
1
;
}
int
create_conv_indice_pair_p2_cuda
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicesOut
,
torch
::
Tensor
gridsOut
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
torch
::
Tensor
indicePairUnique
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
)
{
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
auto
ndim
=
outSpatialShape
.
size
();
auto
numActIn
=
indicesIn
.
size
(
0
);
int
batchSize
=
gridsOut
.
size
(
0
);
int
numAct
=
indicePairUnique
.
size
(
0
)
-
1
;
auto
kernelVolume
=
indicePairs
.
size
(
0
);
if
(
numActIn
==
0
)
return
0
;
// dispatch_torch must be in outside, this is a gcc bug, fixed in gcc 8.
tv
::
dispatch_torch
<
int32_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
V
)
{
using
Index
=
decltype
(
V
);
using
IndexGrid
=
int32_t
;
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
constexpr
int
NDim
=
I
;
using
IndexGrid
=
int32_t
;
tv
::
SimpleVector
<
Index
,
NDim
>
ou
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
if
(
useHash
)
{
auto
table
=
cuhash
::
HashTable
();
// std::cout << "create " << numAct << " size table..." << std::endl;
table
.
Initialize
(
numAct
,
2.0
,
4
);
unsigned
*
d_values
=
nullptr
;
cudaMalloc
((
void
**
)
&
d_values
,
sizeof
(
unsigned
)
*
numAct
);
TV_CHECK_CUDA_ERR_V2
(
"cudaMalloc failed"
);
arangeKernel
<
unsigned
>
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
d_values
,
numAct
);
TV_CHECK_CUDA_ERR_V2
(
"arangeKernel failed"
);
bool
res
=
table
.
Build
(
numAct
,
reinterpret_cast
<
unsigned
*>
(
tv
::
torch2tv
<
Index
>
(
indicePairUnique
).
data
()),
d_values
);
cudaFree
(
d_values
);
TV_CHECK_CUDA_ERR_V2
(
"cudaFree failed"
);
if
(
!
res
)
{
return
-
1
;
// use -1 to tell outside use CPU implementation
}
assignIndiceOutKernel
<
Index
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesOut
),
numAct
,
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ou
,
batchSize
);
auto
tableSize
=
table
.
get_table_size
();
auto
tableData
=
table
.
data
();
auto
constants
=
table
.
get_constants_4
();
auto
stash_constants
=
table
.
get_stash_constants
();
auto
stash_count
=
table
.
get_stash_count
();
assignIndicePairsHashKernel
<
Index
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesOut
),
numActIn
,
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
}
else
{
assignGridAndIndiceOutKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesOut
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
numAct
,
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ou
,
batchSize
);
TV_CHECK_CUDA_ERR_V2
(
"assignGridAndIndiceOutKernel failed"
);
assignIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesOut
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
numAct
,
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indicePairUnique
),
ou
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndicePairsKernel failed"
);
}
if
(
resetGrid
&&
(
!
useHash
))
{
resetGridKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
indicePairUnique
.
data_ptr
<
Index
>
(),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
numAct
);
TV_CHECK_CUDA_ERR_V2
(
"resetGridKernel failed"
);
}
});
});
return
numAct
;
}
int
create_submconv_indice_pair_cuda
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
gridsOut
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
)
{
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
auto
ndim
=
outSpatialShape
.
size
();
auto
numActIn
=
indicesIn
.
size
(
0
);
int
batchSize
=
gridsOut
.
size
(
0
);
auto
kernelVolume
=
indicePairs
.
size
(
0
);
if
(
numActIn
==
0
)
return
0
;
tv
::
dispatch_torch
<
int32_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
V
)
{
using
Index
=
decltype
(
V
);
using
IndexGrid
=
int32_t
;
tv
::
dispatch_int
<
2
,
3
,
4
>
(
ndim
,
[
&
](
auto
I
)
{
constexpr
int
NDim
=
I
;
tv
::
SimpleVector
<
Index
,
NDim
>
ks
(
kernelSize
.
begin
(),
kernelSize
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
st
(
stride
.
begin
(),
stride
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
pa
(
padding
.
begin
(),
padding
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
di
(
dilation
.
begin
(),
dilation
.
end
());
tv
::
SimpleVector
<
Index
,
NDim
>
ou
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
if
(
useHash
)
{
auto
table
=
cuhash
::
HashTable
();
// std::cout << "create " << numAct << " size table..." << std::endl;
table
.
Initialize
(
numActIn
,
2.0
,
4
);
unsigned
*
d_keyvalues
=
nullptr
;
cudaMalloc
((
void
**
)
&
d_keyvalues
,
sizeof
(
unsigned
)
*
numActIn
*
2
);
unsigned
*
d_values
=
d_keyvalues
+
numActIn
;
TV_CHECK_CUDA_ERR_V2
(
"cudaMalloc failed"
);
prepareSubMHashKernel
<
Index
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
d_keyvalues
,
d_values
,
ou
);
TV_CHECK_CUDA_ERR_V2
(
"prepareSubMHashKernel failed"
);
bool
res
=
table
.
Build
(
numActIn
,
reinterpret_cast
<
unsigned
*>
(
d_keyvalues
),
reinterpret_cast
<
unsigned
*>
(
d_values
));
cudaFree
(
d_values
);
TV_CHECK_CUDA_ERR_V2
(
"cudaFree failed"
);
if
(
!
res
)
{
return
-
1
;
// use -1 to tell outside use CPU implementation
}
auto
tableSize
=
table
.
get_table_size
();
auto
tableData
=
table
.
data
();
auto
constants
=
table
.
get_constants_4
();
auto
stash_constants
=
table
.
get_stash_constants
();
auto
stash_count
=
table
.
get_stash_count
();
getSubMIndicePairsHashKernel
<
Index
,
NDim
,
4096
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ks
,
st
,
pa
,
di
,
ou
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
}
else
{
prepareSubMGridKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
ou
);
TV_CHECK_CUDA_ERR_V2
(
"prepareSubMGridKernel failed"
);
getSubMIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
,
4096
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ks
,
st
,
pa
,
di
,
ou
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndicePairsKernel failed"
);
}
if
(
resetGrid
&&
(
!
useHash
))
{
resetGridSubMKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
indicesIn
.
data_ptr
<
Index
>
(),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
ou
,
numActIn
);
TV_CHECK_CUDA_ERR_V2
(
"resetGridKernel failed"
);
}
});
});
return
numActIn
;
}
namespace
functor
{
namespace
functor
{
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
struct
CreateConvIndicePairFunctorP1
<
tv
::
GPU
,
Index
,
IndexGrid
,
NDim
>
{
struct
CreateConvIndicePairFunctorP1
<
tv
::
GPU
,
Index
,
IndexGrid
,
NDim
>
{
...
@@ -46,17 +269,17 @@ struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
...
@@ -46,17 +269,17 @@ struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
return
0
;
return
0
;
// auto timer = spconv::CudaContextTimer<>();
// auto timer = spconv::CudaContextTimer<>();
if
(
transpose
)
if
(
transpose
)
prepareDeConvIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
,
4096
>
prepareDeConvIndicePairsKernel
<
Index
,
NDim
,
4096
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
indice
sOut
,
gridsOut
,
indice
Pairs
,
d
.
getStream
()
>>>
(
indicesIn
,
indice
Pairs
,
indice
Num
,
indiceNum
,
indicePairUnique
,
kernelSize
,
stride
,
indicePairUnique
,
kernelSize
,
stride
,
padding
,
padding
,
dilation
,
outSpatialShape
);
dilation
,
outSpatialShape
);
else
else
prepareIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
,
4096
>
prepareIndicePairsKernel
<
Index
,
NDim
,
4096
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
indice
sOut
,
gridsOut
,
indice
Pairs
,
d
.
getStream
()
>>>
(
indicesIn
,
indice
Pairs
,
indice
Num
,
indiceNum
,
indicePairUnique
,
kernelSize
,
stride
,
indicePairUnique
,
kernelSize
,
stride
,
padding
,
padding
,
dilation
,
outSpatialShape
);
dilation
,
outSpatialShape
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
// std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
// std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
return
1
;
return
1
;
...
@@ -78,57 +301,58 @@ struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
...
@@ -78,57 +301,58 @@ struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
auto
numActIn
=
indicesIn
.
dim
(
0
);
auto
numActIn
=
indicesIn
.
dim
(
0
);
if
(
numActIn
==
0
)
if
(
numActIn
==
0
)
return
0
;
return
0
;
// after unique, there is a std::numeric_limits<int>::max() in the end of indicePairUnique
// after unique, there is a std::numeric_limits<int>::max() in the end of
Index
numAct
=
indicePairUnique
.
dim
(
0
)
-
1
;
// indicePairUnique
if
(
useHash
){
Index
numAct
=
indicePairUnique
.
dim
(
0
)
-
1
;
if
(
useHash
)
{
auto
table
=
cuhash
::
HashTable
();
auto
table
=
cuhash
::
HashTable
();
// std::cout << "create " << numAct << " size table..." << std::endl;
// std::cout << "create " << numAct << " size table..." << std::endl;
table
.
Initialize
(
numAct
,
2.0
,
4
);
table
.
Initialize
(
numAct
,
2.0
,
4
);
unsigned
*
d_values
=
nullptr
;
unsigned
*
d_values
=
nullptr
;
cudaMalloc
((
void
**
)
&
d_values
,
sizeof
(
unsigned
)
*
numAct
);
cudaMalloc
((
void
**
)
&
d_values
,
sizeof
(
unsigned
)
*
numAct
);
TV_CHECK_CUDA_ERR_V2
(
"cudaMalloc failed"
);
TV_CHECK_CUDA_ERR_V2
(
"cudaMalloc failed"
);
arangeKernel
<
unsigned
><<<
tv
::
launch
::
getBlocks
(
numAct
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
arangeKernel
<
unsigned
>
d
.
getStream
()
>>>
(
d_values
,
numAct
);
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
bool
res
=
table
.
Build
(
numAct
,
reinterpret_cast
<
unsigned
*>
(
indicePairUnique
.
data
()),
d
.
getStream
()
>>>
(
d_values
,
numAct
);
d_values
);
bool
res
=
table
.
Build
(
numAct
,
reinterpret_cast
<
unsigned
*>
(
indicePairUnique
.
data
()),
d_values
);
cudaFree
(
d_values
);
cudaFree
(
d_values
);
if
(
!
res
){
if
(
!
res
)
{
return
-
1
;
//use -1 to tell outside use CPU implementation
return
-
1
;
//
use -1 to tell outside use CPU implementation
}
}
assignIndiceOutKernel
<
Index
,
NDim
>
assignIndiceOutKernel
<
Index
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numAct
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesOut
,
numAct
,
d
.
getStream
()
>>>
(
indicesOut
,
numAct
,
indicePairUnique
,
indicePairUnique
,
outSpatialShape
,
batchSize
);
outSpatialShape
,
batchSize
);
TV_CHECK_CUDA_ERR_V2
(
"assignGridAndIndiceOutKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"assignGridAndIndiceOutKernel failed"
);
auto
tableSize
=
table
.
get_table_size
();
auto
tableSize
=
table
.
get_table_size
();
auto
tableData
=
table
.
data
();
auto
tableData
=
table
.
data
();
auto
constants
=
table
.
get_constants_4
();
auto
constants
=
table
.
get_constants_4
();
auto
stash_constants
=
table
.
get_stash_constants
();
auto
stash_constants
=
table
.
get_stash_constants
();
auto
stash_count
=
table
.
get_stash_count
();
auto
stash_count
=
table
.
get_stash_count
();
assignIndicePairsHashKernel
<
Index
,
IndexGrid
,
NDim
>
assignIndicePairsHashKernel
<
Index
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesOut
,
numActIn
,
indicePairs
,
d
.
getStream
()
>>>
(
indicesOut
,
numActIn
,
indicePairs
,
indicePairUnique
,
indicePairUnique
,
tableSize
,
tableData
,
constants
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_constants
,
stash_count
);
stash_count
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndicePairsKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndicePairsKernel failed"
);
}
else
{
}
else
{
assignGridAndIndiceOutKernel
<
Index
,
IndexGrid
,
NDim
>
assignGridAndIndiceOutKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numAct
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesOut
,
gridsOut
,
numAct
,
indicePairs
,
d
.
getStream
()
>>>
(
indicesOut
,
gridsOut
,
numAct
,
indicePairs
,
indicePairUnique
,
outSpatialShape
,
batchSize
);
indicePairUnique
,
outSpatialShape
,
batchSize
);
TV_CHECK_CUDA_ERR_V2
(
"assignGridAndIndiceOutKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"assignGridAndIndiceOutKernel failed"
);
assignIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
>
assignIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesOut
,
gridsOut
,
numActIn
,
indicePairs
,
d
.
getStream
()
>>>
(
indicesOut
,
gridsOut
,
numActIn
,
indicePairs
,
indicePairUnique
,
outSpatialShape
);
indicePairUnique
,
outSpatialShape
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndicePairsKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"assignIndicePairsKernel failed"
);
}
}
if
(
resetGrid
&&
(
!
useHash
))
{
if
(
resetGrid
&&
(
!
useHash
))
{
resetGridKernel
<
Index
,
IndexGrid
,
NDim
>
resetGridKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numAct
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numAct
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicePairUnique
.
data
(),
gridsOut
,
numAct
);
d
.
getStream
()
>>>
(
indicePairUnique
.
data
(),
gridsOut
,
numAct
);
TV_CHECK_CUDA_ERR_V2
(
"resetGridKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"resetGridKernel failed"
);
}
}
...
@@ -152,22 +376,25 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
...
@@ -152,22 +376,25 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
if
(
numActIn
==
0
)
if
(
numActIn
==
0
)
return
0
;
return
0
;
// auto timer = spconv::CudaContextTimer<>();
// auto timer = spconv::CudaContextTimer<>();
if
(
useHash
){
if
(
useHash
)
{
auto
table
=
cuhash
::
HashTable
();
auto
table
=
cuhash
::
HashTable
();
// std::cout << "subm create " << numActIn << " size table..." << std::endl;
// std::cout << "subm create " << numActIn << " size table..." <<
// std::endl;
table
.
Initialize
(
numActIn
,
2.0
,
4
);
table
.
Initialize
(
numActIn
,
2.0
,
4
);
unsigned
*
d_keyvalues
=
nullptr
;
unsigned
*
d_keyvalues
=
nullptr
;
cudaMalloc
((
void
**
)
&
d_keyvalues
,
sizeof
(
unsigned
)
*
numActIn
*
2
);
cudaMalloc
((
void
**
)
&
d_keyvalues
,
sizeof
(
unsigned
)
*
numActIn
*
2
);
unsigned
*
d_values
=
d_keyvalues
+
numActIn
;
unsigned
*
d_values
=
d_keyvalues
+
numActIn
;
prepareSubMHashKernel
<
Index
,
NDim
>
prepareSubMHashKernel
<
Index
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
d_keyvalues
,
d_values
,
outSpatialShape
);
d
.
getStream
()
>>>
(
indicesIn
,
d_keyvalues
,
d_values
,
outSpatialShape
);
TV_CHECK_CUDA_ERR_V2
(
"prepareSubMHashKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"prepareSubMHashKernel failed"
);
bool
res
=
table
.
Build
(
numActIn
,
reinterpret_cast
<
unsigned
*>
(
d_keyvalues
),
bool
res
=
reinterpret_cast
<
unsigned
*>
(
d_values
));
table
.
Build
(
numActIn
,
reinterpret_cast
<
unsigned
*>
(
d_keyvalues
),
reinterpret_cast
<
unsigned
*>
(
d_values
));
cudaFree
(
d_keyvalues
);
cudaFree
(
d_keyvalues
);
if
(
!
res
){
if
(
!
res
)
{
return
-
1
;
//use -1 to tell outside use CPU implementation
return
-
1
;
//
use -1 to tell outside use CPU implementation
}
}
auto
tableSize
=
table
.
get_table_size
();
auto
tableSize
=
table
.
get_table_size
();
auto
tableData
=
table
.
data
();
auto
tableData
=
table
.
data
();
...
@@ -175,28 +402,30 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
...
@@ -175,28 +402,30 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
auto
stash_constants
=
table
.
get_stash_constants
();
auto
stash_constants
=
table
.
get_stash_constants
();
auto
stash_count
=
table
.
get_stash_count
();
auto
stash_count
=
table
.
get_stash_count
();
getSubMIndicePairsHashKernel
<
Index
,
NDim
,
4096
>
getSubMIndicePairsHashKernel
<
Index
,
NDim
,
4096
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
indicePairs
,
indiceNum
,
d
.
getStream
()
>>>
(
indicesIn
,
indicePairs
,
indiceNum
,
kernelSize
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
,
stride
,
padding
,
dilation
,
outSpatialShape
,
tableSize
,
tableData
,
constants
,
stash_constants
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
stash_count
);
TV_CHECK_CUDA_ERR_V2
(
"getSubMIndicePairsHashKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"getSubMIndicePairsHashKernel failed"
);
}
else
{
}
else
{
prepareSubMGridKernel
<
Index
,
IndexGrid
,
NDim
>
prepareSubMGridKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
gridsOut
,
outSpatialShape
);
d
.
getStream
()
>>>
(
indicesIn
,
gridsOut
,
outSpatialShape
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
getSubMIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
,
4096
>
getSubMIndicePairsKernel
<
Index
,
IndexGrid
,
NDim
,
4096
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
,
gridsOut
,
indicePairs
,
indiceNum
,
d
.
getStream
()
>>>
(
indicesIn
,
gridsOut
,
indicePairs
,
indiceNum
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
);
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
}
}
// std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
// std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
if
(
resetGrid
&&
(
!
useHash
))
{
if
(
resetGrid
&&
(
!
useHash
))
{
resetGridSubMKernel
<
Index
,
IndexGrid
,
NDim
>
resetGridSubMKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
launch
::
getBlocks
(
numActIn
),
tv
::
launch
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
d
.
getStream
()
>>>
(
indicesIn
.
data
(),
gridsOut
,
outSpatialShape
,
numActIn
);
d
.
getStream
()
>>>
(
indicesIn
.
data
(),
gridsOut
,
outSpatialShape
,
numActIn
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
}
}
return
numActIn
;
return
numActIn
;
...
...
src/spconv/maxpool.cu
View file @
19e73bbe
...
@@ -16,9 +16,9 @@
...
@@ -16,9 +16,9 @@
#include <chrono>
#include <chrono>
#include <limits>
#include <limits>
#include <spconv/maxpool.h>
#include <spconv/maxpool.h>
#include <
spconv/mp_helper
.h>
#include <
tensorview/cuda_utils
.h>
#include <tensorview/
helper_kernel.cu
.h>
#include <tensorview/
kernel_utils
.h>
#include <tensorview/helper
_launch
.h>
#include <tensorview/
mp_
helper.h>
#include <tensorview/tensorview.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <type_traits>
...
@@ -255,7 +255,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
...
@@ -255,7 +255,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
reinterpret_cast
<
const
VecType
*>
(
inFeatures
)[
idxi
];
reinterpret_cast
<
const
VecType
*>
(
inFeatures
)[
idxi
];
reinterpret_cast
<
VecType
*>
(
bufdo
)[
0
]
=
reinterpret_cast
<
VecType
*>
(
bufdo
)[
0
]
=
reinterpret_cast
<
const
VecType
*>
(
dout
)[
idxo
];
reinterpret_cast
<
const
VecType
*>
(
dout
)[
idxo
];
reinterpret_cast
<
VecType
*>
(
bufdi
)[
0
]
=
reinterpret_cast
<
VecType
*>
(
din
)[
idxi
];
reinterpret_cast
<
VecType
*>
(
bufdi
)[
0
]
=
reinterpret_cast
<
VecType
*>
(
din
)[
idxi
];
#pragma unroll
#pragma unroll
for
(
int
i
=
0
;
i
<
vecloadFactor
;
i
++
)
{
for
(
int
i
=
0
;
i
<
vecloadFactor
;
i
++
)
{
...
@@ -263,7 +264,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
...
@@ -263,7 +264,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
bufdi
[
i
]
+=
bufdo
[
i
];
bufdi
[
i
]
+=
bufdo
[
i
];
}
}
}
}
reinterpret_cast
<
VecType
*>
(
din
)[
idxi
]
=
reinterpret_cast
<
VecType
*>
(
bufdi
)[
0
];
reinterpret_cast
<
VecType
*>
(
din
)[
idxi
]
=
reinterpret_cast
<
VecType
*>
(
bufdi
)[
0
];
}
}
}
}
}
}
...
@@ -309,7 +311,7 @@ template <typename T, typename Index>
...
@@ -309,7 +311,7 @@ template <typename T, typename Index>
struct
SparseMaxPoolForwardFunctor
<
tv
::
GPU
,
T
,
Index
>
{
struct
SparseMaxPoolForwardFunctor
<
tv
::
GPU
,
T
,
Index
>
{
using
vecload_type_t
=
using
vecload_type_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
using
kernel_block_t
=
mp_list_c
<
int
,
64
,
32
,
16
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
>
;
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
T
>
outFeatures
,
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
T
>
outFeatures
,
tv
::
TensorView
<
const
T
>
inFeatures
,
tv
::
TensorView
<
const
T
>
inFeatures
,
tv
::
TensorView
<
const
Index
>
indices
,
int
size
)
{
tv
::
TensorView
<
const
Index
>
indices
,
int
size
)
{
...
@@ -318,21 +320,22 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
...
@@ -318,21 +320,22 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
int
numPlanes
=
inFeatures
.
dim
(
1
);
int
numPlanes
=
inFeatures
.
dim
(
1
);
bool
notFound
=
true
;
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
mp_for_each
<
kernel_block_t
>
([
=
,
&
outFeatures
,
&
inFeatures
,
&
indices
,
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
outFeatures
,
&
inFeatures
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
&
notFound
](
auto
NumTLP
)
{
constexpr
int
NumILP
=
NumTLP
/
4
;
constexpr
int
NumILP
=
NumTLP
/
4
;
int
numHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
int
numHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
if
(
notFound
)
{
if
(
notFound
)
{
if
(
numPlanes
%
NumTLP
==
0
)
{
if
(
numPlanes
%
NumTLP
==
0
)
{
if
(
numHotBlock
>=
NumTLP
)
{
if
(
numHotBlock
>=
NumTLP
)
{
maxPoolFwdVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
maxPoolFwdVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
<<<
dim3
(
std
::
min
(
size
/
NumTLP
,
512
),
numPlanes
/
NumTLP
),
<<<
dim3
(
std
::
min
(
size
/
NumTLP
,
512
),
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
indices
.
subview
(
0
).
data
(),
indices
.
subview
(
0
).
data
(),
indices
.
subview
(
1
).
data
(),
numHotBlock
,
indices
.
subview
(
1
).
data
(),
numHotBlock
,
numPlanes
/
vecloadFactor
);
numPlanes
/
vecloadFactor
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
}
}
...
@@ -340,9 +343,9 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
...
@@ -340,9 +343,9 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
maxPoolFwdGenericKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
>
maxPoolFwdGenericKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
>
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
indices
.
subview
(
0
).
data
()
+
numHotBlock
,
indices
.
subview
(
0
).
data
()
+
numHotBlock
,
indices
.
subview
(
1
).
data
()
+
numHotBlock
,
indices
.
subview
(
1
).
data
()
+
numHotBlock
,
size
-
numHotBlock
,
numPlanes
);
size
-
numHotBlock
,
numPlanes
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
}
}
notFound
=
false
;
notFound
=
false
;
...
@@ -356,7 +359,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
...
@@ -356,7 +359,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
int
numHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
int
numHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
if
(
numHotBlock
>=
NumTLP
)
{
if
(
numHotBlock
>=
NumTLP
)
{
maxPoolFwdGenericBlockKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
maxPoolFwdGenericBlockKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
size
/
NumTLP
,
tv
::
launch
::
DivUp
(
numPlanes
,
NumTLP
)),
<<<
dim3
(
size
/
NumTLP
,
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
outFeatures
.
data
(),
inFeatures
.
data
(),
indices
.
subview
(
0
).
data
(),
indices
.
subview
(
1
).
data
(),
indices
.
subview
(
0
).
data
(),
indices
.
subview
(
1
).
data
(),
...
@@ -366,7 +369,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
...
@@ -366,7 +369,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
if
(
size
>
numHotBlock
)
{
if
(
size
>
numHotBlock
)
{
maxPoolFwdGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
maxPoolFwdGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
1
,
tv
::
launch
::
DivUp
(
numPlanes
,
NumTLP
)),
<<<
dim3
(
1
,
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
outFeatures
.
data
(),
inFeatures
.
data
(),
indices
.
subview
(
0
).
data
()
+
numHotBlock
,
indices
.
subview
(
0
).
data
()
+
numHotBlock
,
...
@@ -382,7 +385,7 @@ template <typename T, typename Index>
...
@@ -382,7 +385,7 @@ template <typename T, typename Index>
struct
SparseMaxPoolBackwardFunctor
<
tv
::
GPU
,
T
,
Index
>
{
struct
SparseMaxPoolBackwardFunctor
<
tv
::
GPU
,
T
,
Index
>
{
using
vecload_type_t
=
using
vecload_type_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
using
kernel_block_t
=
mp_list_c
<
int
,
64
,
32
,
16
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
>
;
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
const
T
>
outFeatures
,
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
const
T
>
outFeatures
,
tv
::
TensorView
<
const
T
>
inFeatures
,
tv
::
TensorView
<
const
T
>
inFeatures
,
tv
::
TensorView
<
const
T
>
dout
,
tv
::
TensorView
<
T
>
din
,
tv
::
TensorView
<
const
T
>
dout
,
tv
::
TensorView
<
T
>
din
,
...
@@ -392,22 +395,23 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
...
@@ -392,22 +395,23 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
int
numPlanes
=
inFeatures
.
dim
(
1
);
int
numPlanes
=
inFeatures
.
dim
(
1
);
bool
notFound
=
true
;
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
mp_for_each
<
kernel_block_t
>
([
=
,
&
outFeatures
,
&
inFeatures
,
&
dout
,
&
din
,
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
outFeatures
,
&
inFeatures
,
&
dout
,
&
din
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
&
indices
,
&
notFound
](
auto
NumTLP
)
{
constexpr
int
NumILP
=
NumTLP
/
4
;
constexpr
int
NumILP
=
NumTLP
/
4
;
int
numHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
int
numHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
if
(
notFound
)
{
if
(
notFound
)
{
if
(
numPlanes
%
NumTLP
==
0
)
{
if
(
numPlanes
%
NumTLP
==
0
)
{
if
(
numHotBlock
>=
NumTLP
)
{
if
(
numHotBlock
>=
NumTLP
)
{
maxPoolBwdVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
maxPoolBwdVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
<<<
dim3
(
std
::
min
(
size
/
NumTLP
,
512
),
numPlanes
/
NumTLP
),
<<<
dim3
(
std
::
min
(
size
/
NumTLP
,
512
),
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
dout
.
data
(),
din
.
data
(),
dout
.
data
(),
din
.
data
(),
indices
.
subview
(
0
).
data
(),
indices
.
subview
(
0
).
data
(),
indices
.
subview
(
1
).
data
(),
numHotBlock
,
indices
.
subview
(
1
).
data
(),
numHotBlock
,
numPlanes
/
vecloadFactor
);
numPlanes
/
vecloadFactor
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
}
}
...
@@ -415,10 +419,10 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
...
@@ -415,10 +419,10 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
maxPoolBwdGenericKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
>
maxPoolBwdGenericKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
>
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
dout
.
data
(),
din
.
data
(),
dout
.
data
(),
din
.
data
(),
indices
.
subview
(
0
).
data
()
+
numHotBlock
,
indices
.
subview
(
0
).
data
()
+
numHotBlock
,
indices
.
subview
(
1
).
data
()
+
numHotBlock
,
indices
.
subview
(
1
).
data
()
+
numHotBlock
,
size
-
numHotBlock
,
numPlanes
);
size
-
numHotBlock
,
numPlanes
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
}
}
notFound
=
false
;
notFound
=
false
;
...
@@ -432,7 +436,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
...
@@ -432,7 +436,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
int
numHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
int
numHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
if
(
numHotBlock
>=
NumTLP
)
{
if
(
numHotBlock
>=
NumTLP
)
{
maxPoolBwdGenericBlockKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
maxPoolBwdGenericBlockKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
size
/
NumTLP
,
tv
::
launch
::
DivUp
(
numPlanes
,
NumTLP
)),
<<<
dim3
(
size
/
NumTLP
,
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
dout
.
data
(),
din
.
data
(),
outFeatures
.
data
(),
inFeatures
.
data
(),
dout
.
data
(),
din
.
data
(),
indices
.
subview
(
0
).
data
(),
indices
.
subview
(
1
).
data
(),
indices
.
subview
(
0
).
data
(),
indices
.
subview
(
1
).
data
(),
...
@@ -442,7 +446,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
...
@@ -442,7 +446,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
if
(
size
>
numHotBlock
)
{
if
(
size
>
numHotBlock
)
{
maxPoolBwdGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
maxPoolBwdGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
1
,
tv
::
launch
::
DivUp
(
numPlanes
,
NumTLP
)),
<<<
dim3
(
1
,
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
inFeatures
.
data
(),
dout
.
data
(),
din
.
data
(),
outFeatures
.
data
(),
inFeatures
.
data
(),
dout
.
data
(),
din
.
data
(),
indices
.
subview
(
0
).
data
()
+
numHotBlock
,
indices
.
subview
(
0
).
data
()
+
numHotBlock
,
...
...
src/spconv/nms.cc
View file @
19e73bbe
...
@@ -21,7 +21,7 @@ namespace spconv {
...
@@ -21,7 +21,7 @@ namespace spconv {
namespace
functor
{
namespace
functor
{
template
<
typename
T
,
typename
Index
>
template
<
typename
T
,
typename
Index
>
struct
NonMaxSupressionFunctor
<
tv
::
CPU
,
T
,
Index
>
{
struct
NonMaxSupressionFunctor
<
tv
::
CPU
,
T
,
Index
>
{
Index
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
Index
>
keep
,
Index
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
Index
>
keep
,
tv
::
TensorView
<
const
T
>
boxes
,
T
threshold
,
T
eps
)
{
tv
::
TensorView
<
const
T
>
boxes
,
T
threshold
,
T
eps
)
{
auto
ndets
=
boxes
.
dim
(
0
);
auto
ndets
=
boxes
.
dim
(
0
);
...
@@ -131,7 +131,7 @@ struct rotateNonMaxSupressionFunctor<tv::CPU, T, Index> {
...
@@ -131,7 +131,7 @@ struct rotateNonMaxSupressionFunctor<tv::CPU, T, Index> {
#define DECLARE_CPU_INDEX(Index) \
#define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_T_INDEX(float, Index); \
DECLARE_CPU_T_INDEX(float, Index); \
DECLARE_CPU_T_INDEX(double, Index);
DECLARE_CPU_T_INDEX(double, Index);
DECLARE_CPU_INDEX
(
int
);
DECLARE_CPU_INDEX
(
int
);
DECLARE_CPU_INDEX
(
long
);
DECLARE_CPU_INDEX
(
long
);
...
...
src/spconv/nms.cu
View file @
19e73bbe
...
@@ -2,18 +2,18 @@
...
@@ -2,18 +2,18 @@
// Deformable Convolutional Networks
// Deformable Convolutional Networks
// Copyright (c) 2015 Microsoft
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License
// Licensed under The MIT License
// Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn)
// Modified from MATLAB Faster R-CNN
// (https://github.com/shaoqingren/faster_rcnn)
// ------------------------------------------------------------------
// ------------------------------------------------------------------
#include <ATen/ATen.h>
#include <ATen/ATen.h>
#include <chrono>
#include <chrono>
#include <limits>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/reordering.h>
#include <spconv/reordering.cu.h>
#include <spconv/reordering.cu.h>
#include <tensorview/helper_kernel.cu.h>
#include <spconv/reordering.h>
#include <tensorview/helper_launch.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <type_traits>
#include <utility/timer.h>
#include <utility/timer.h>
...
@@ -22,8 +22,7 @@
...
@@ -22,8 +22,7 @@
int
const
threadsPerBlock
=
sizeof
(
unsigned
long
long
)
*
8
;
int
const
threadsPerBlock
=
sizeof
(
unsigned
long
long
)
*
8
;
template
<
typename
DType
>
template
<
typename
DType
>
__device__
inline
DType
devIoU
(
DType
const
*
const
a
,
DType
const
*
const
b
)
__device__
inline
DType
devIoU
(
DType
const
*
const
a
,
DType
const
*
const
b
)
{
{
DType
left
=
max
(
a
[
0
],
b
[
0
]),
right
=
min
(
a
[
2
],
b
[
2
]);
DType
left
=
max
(
a
[
0
],
b
[
0
]),
right
=
min
(
a
[
2
],
b
[
2
]);
DType
top
=
max
(
a
[
1
],
b
[
1
]),
bottom
=
min
(
a
[
3
],
b
[
3
]);
DType
top
=
max
(
a
[
1
],
b
[
1
]),
bottom
=
min
(
a
[
3
],
b
[
3
]);
DType
width
=
max
(
right
-
left
+
1
,
0.
f
),
height
=
max
(
bottom
-
top
+
1
,
0.
f
);
DType
width
=
max
(
right
-
left
+
1
,
0.
f
),
height
=
max
(
bottom
-
top
+
1
,
0.
f
);
...
@@ -35,44 +34,36 @@ __device__ inline DType devIoU(DType const *const a, DType const *const b)
...
@@ -35,44 +34,36 @@ __device__ inline DType devIoU(DType const *const a, DType const *const b)
template
<
typename
DType
,
int
BLOCK_THREADS
>
template
<
typename
DType
,
int
BLOCK_THREADS
>
__global__
void
nms_kernel
(
const
int
n_boxes
,
const
DType
nms_overlap_thresh
,
__global__
void
nms_kernel
(
const
int
n_boxes
,
const
DType
nms_overlap_thresh
,
const
DType
*
dev_boxes
,
unsigned
long
long
*
dev_mask
)
const
DType
*
dev_boxes
,
{
unsigned
long
long
*
dev_mask
)
{
const
int
row_start
=
blockIdx
.
y
;
const
int
row_start
=
blockIdx
.
y
;
const
int
col_start
=
blockIdx
.
x
;
const
int
col_start
=
blockIdx
.
x
;
// if (row_start > col_start) return;
// if (row_start > col_start) return;
const
int
row_size
=
const
int
row_size
=
min
(
n_boxes
-
row_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
min
(
n_boxes
-
row_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
const
int
col_size
=
min
(
n_boxes
-
col_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
const
int
col_size
=
min
(
n_boxes
-
col_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
__shared__
DType
block_boxes
[
BLOCK_THREADS
*
5
];
__shared__
DType
block_boxes
[
BLOCK_THREADS
*
5
];
if
(
threadIdx
.
x
<
col_size
)
if
(
threadIdx
.
x
<
col_size
)
{
{
#pragma unroll
#pragma unroll
for
(
int
i
=
0
;
i
<
5
;
++
i
)
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
{
block_boxes
[
threadIdx
.
x
*
5
+
i
]
=
block_boxes
[
threadIdx
.
x
*
5
+
i
]
=
dev_boxes
[(
BLOCK_THREADS
*
col_start
+
threadIdx
.
x
)
*
5
+
i
];
dev_boxes
[(
BLOCK_THREADS
*
col_start
+
threadIdx
.
x
)
*
5
+
i
];
}
}
}
}
__syncthreads
();
__syncthreads
();
if
(
threadIdx
.
x
<
row_size
)
if
(
threadIdx
.
x
<
row_size
)
{
{
const
int
cur_box_idx
=
BLOCK_THREADS
*
row_start
+
threadIdx
.
x
;
const
int
cur_box_idx
=
BLOCK_THREADS
*
row_start
+
threadIdx
.
x
;
const
DType
*
cur_box
=
dev_boxes
+
cur_box_idx
*
5
;
const
DType
*
cur_box
=
dev_boxes
+
cur_box_idx
*
5
;
unsigned
long
long
t
=
0
;
unsigned
long
long
t
=
0
;
int
start
=
0
;
int
start
=
0
;
if
(
row_start
==
col_start
)
if
(
row_start
==
col_start
)
{
{
start
=
threadIdx
.
x
+
1
;
start
=
threadIdx
.
x
+
1
;
}
}
for
(
int
i
=
start
;
i
<
col_size
;
i
++
)
for
(
int
i
=
start
;
i
<
col_size
;
i
++
)
{
{
if
(
devIoU
(
cur_box
,
block_boxes
+
i
*
5
)
>
nms_overlap_thresh
)
{
if
(
devIoU
(
cur_box
,
block_boxes
+
i
*
5
)
>
nms_overlap_thresh
)
{
t
|=
1ULL
<<
i
;
t
|=
1ULL
<<
i
;
}
}
}
}
...
@@ -80,4 +71,3 @@ __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
...
@@ -80,4 +71,3 @@ __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
dev_mask
[
cur_box_idx
*
col_blocks
+
col_start
]
=
t
;
dev_mask
[
cur_box_idx
*
col_blocks
+
col_start
]
=
t
;
}
}
}
}
src/spconv/pillar_scatter.cu
View file @
19e73bbe
...
@@ -15,10 +15,10 @@
...
@@ -15,10 +15,10 @@
#include <ATen/ATen.h>
#include <ATen/ATen.h>
#include <chrono>
#include <chrono>
#include <limits>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/pillar_scatter_functor.h>
#include <spconv/pillar_scatter_functor.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/helper_launch.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <type_traits>
#include <utility/timer.h>
#include <utility/timer.h>
...
@@ -43,8 +43,8 @@ struct PointPillarScatter<tv::GPU, T, Index> {
...
@@ -43,8 +43,8 @@ struct PointPillarScatter<tv::GPU, T, Index> {
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
T
>
canvas
,
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
T
>
canvas
,
tv
::
TensorView
<
const
T
>
features
,
tv
::
TensorView
<
const
T
>
features
,
tv
::
TensorView
<
const
T
>
coors
)
{
tv
::
TensorView
<
const
T
>
coors
)
{
auto
grid
=
dim3
(
tv
::
launch
::
DivUp
(
features
.
dim
(
1
),
32
),
auto
grid
=
dim3
(
tv
::
cuda
::
DivUp
(
features
.
dim
(
1
),
32
),
tv
::
launch
::
DivUp
(
features
.
dim
(
0
),
32
));
tv
::
cuda
::
DivUp
(
features
.
dim
(
0
),
32
));
pointPillarsScatterKernel
<
T
,
Index
>
pointPillarsScatterKernel
<
T
,
Index
>
<<<
grid
,
dim3
(
32
,
32
),
0
,
d
.
getStream
()
>>>
(
canvas
,
features
,
coors
);
<<<
grid
,
dim3
(
32
,
32
),
0
,
d
.
getStream
()
>>>
(
canvas
,
features
,
coors
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
...
...
src/spconv/reordering.cc
View file @
19e73bbe
// Copyright 2019 Yan Yan
// Copyright 2019 Yan Yan
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// See the License for the specific language governing permissions and
// limitations under the License.
// limitations under the License.
#include <ATen/Parallel.h>
#include <spconv/reordering.h>
#include <spconv/reordering.h>
#include <torch/script.h>
#include <torch/script.h>
#include <ATen/Parallel.h>
namespace
spconv
{
namespace
spconv
{
namespace
functor
{
namespace
functor
{
template
<
typename
T
,
typename
Index
>
template
<
typename
T
,
typename
Index
>
struct
SparseGatherFunctor
<
tv
::
CPU
,
T
,
Index
>
{
struct
SparseGatherFunctor
<
tv
::
CPU
,
T
,
Index
>
{
void
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
T
>
buffer
,
tv
::
TensorView
<
const
T
>
features
,
void
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
T
>
buffer
,
tv
::
TensorView
<
const
T
>
features
,
tv
::
TensorView
<
const
Index
>
indices
,
int
size
)
{
tv
::
TensorView
<
const
Index
>
indices
,
int
size
)
{
int
numPlanes
=
features
.
dim
(
1
);
int
numPlanes
=
features
.
dim
(
1
);
at
::
parallel_for
(
0
,
size
,
0
,
[
&
](
int64_t
begin
,
int64_t
end
){
at
::
parallel_for
(
0
,
size
,
0
,
[
&
](
int64_t
begin
,
int64_t
end
)
{
for
(
int
i
=
begin
;
i
<
end
;
++
i
)
{
for
(
int
i
=
begin
;
i
<
end
;
++
i
)
{
std
::
memcpy
(
buffer
.
data
()
+
i
*
numPlanes
,
std
::
memcpy
(
buffer
.
data
()
+
i
*
numPlanes
,
features
.
data
()
+
indices
[
i
]
*
numPlanes
,
features
.
data
()
+
indices
[
i
]
*
numPlanes
,
...
@@ -35,16 +36,16 @@ struct SparseGatherFunctor<tv::CPU, T, Index> {
...
@@ -35,16 +36,16 @@ struct SparseGatherFunctor<tv::CPU, T, Index> {
template
<
typename
T
,
typename
Index
>
template
<
typename
T
,
typename
Index
>
struct
SparseScatterAddFunctor
<
tv
::
CPU
,
T
,
Index
>
{
struct
SparseScatterAddFunctor
<
tv
::
CPU
,
T
,
Index
>
{
void
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
T
>
outFeatures
,
void
operator
()(
const
tv
::
CPU
&
d
,
tv
::
TensorView
<
T
>
outFeatures
,
tv
::
TensorView
<
const
T
>
buffer
,
tv
::
TensorView
<
const
Index
>
indices
,
tv
::
TensorView
<
const
T
>
buffer
,
int
size
,
bool
stable
)
{
tv
::
TensorView
<
const
Index
>
indices
,
int
size
,
bool
stable
)
{
int
numPlanes
=
outFeatures
.
dim
(
1
);
int
numPlanes
=
outFeatures
.
dim
(
1
);
const
T
*
buf
=
buffer
.
data
();
const
T
*
buf
=
buffer
.
data
();
T
*
out
=
outFeatures
.
data
();
T
*
out
=
outFeatures
.
data
();
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
buf
=
buffer
.
data
()
+
i
*
numPlanes
;
buf
=
buffer
.
data
()
+
i
*
numPlanes
;
out
=
outFeatures
.
data
()
+
indices
[
i
]
*
numPlanes
;
out
=
outFeatures
.
data
()
+
indices
[
i
]
*
numPlanes
;
for
(
int
j
=
0
;
j
<
numPlanes
;
++
j
){
for
(
int
j
=
0
;
j
<
numPlanes
;
++
j
)
{
out
[
j
]
+=
buf
[
j
];
out
[
j
]
+=
buf
[
j
];
}
}
}
}
...
@@ -53,9 +54,8 @@ struct SparseScatterAddFunctor<tv::CPU, T, Index> {
...
@@ -53,9 +54,8 @@ struct SparseScatterAddFunctor<tv::CPU, T, Index> {
}
// namespace functor
}
// namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \
#define DECLARE_CPU_SPECS(T) \
...
@@ -70,4 +70,3 @@ DECLARE_CPU_SPECS(at::Half);
...
@@ -70,4 +70,3 @@ DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS_T_INDEX
#undef DECLARE_CPU_SPECS_T_INDEX
}
// namespace spconv
}
// namespace spconv
src/spconv/reordering.cu
View file @
19e73bbe
...
@@ -15,11 +15,11 @@
...
@@ -15,11 +15,11 @@
#include <ATen/ATen.h>
#include <ATen/ATen.h>
#include <chrono>
#include <chrono>
#include <limits>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/reordering.h>
#include <spconv/reordering.cu.h>
#include <spconv/reordering.cu.h>
#include <tensorview/helper_kernel.cu.h>
#include <spconv/reordering.h>
#include <tensorview/helper_launch.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <type_traits>
#include <utility/timer.h>
#include <utility/timer.h>
...
@@ -30,7 +30,7 @@ template <typename T, typename Index>
...
@@ -30,7 +30,7 @@ template <typename T, typename Index>
struct
SparseGatherFunctor
<
tv
::
GPU
,
T
,
Index
>
{
struct
SparseGatherFunctor
<
tv
::
GPU
,
T
,
Index
>
{
using
vecload_type_t
=
using
vecload_type_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
using
kernel_block_t
=
mp_list_c
<
int
,
64
,
32
,
16
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
>
;
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
T
>
buffer
,
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
T
>
buffer
,
tv
::
TensorView
<
const
T
>
features
,
tv
::
TensorView
<
const
T
>
features
,
tv
::
TensorView
<
const
Index
>
indices
,
int
size
)
{
tv
::
TensorView
<
const
Index
>
indices
,
int
size
)
{
...
@@ -39,8 +39,8 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
...
@@ -39,8 +39,8 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
int
numPlanes
=
features
.
dim
(
1
);
int
numPlanes
=
features
.
dim
(
1
);
bool
notFound
=
true
;
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
mp_for_each
<
kernel_block_t
>
([
=
,
&
buffer
,
&
features
,
&
indices
,
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
buffer
,
&
features
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
&
notFound
](
auto
NumTLP
)
{
constexpr
int
NumILP
=
NumTLP
/
4
;
constexpr
int
NumILP
=
NumTLP
/
4
;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int
nHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
int
nHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
...
@@ -50,8 +50,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
...
@@ -50,8 +50,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
gatherVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
gatherVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
d
.
getStream
()
>>>
(
buffer
.
data
(),
features
.
data
(),
indices
.
data
(),
d
.
getStream
()
>>>
(
buffer
.
data
(),
features
.
data
(),
nHotBlock
,
numPlanes
/
vecloadFactor
);
indices
.
data
(),
nHotBlock
,
numPlanes
/
vecloadFactor
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
}
}
...
@@ -60,8 +61,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
...
@@ -60,8 +61,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
/
vecloadFactor
),
0
,
dim3
(
NumTLP
/
NumILP
,
NumTLP
/
vecloadFactor
),
0
,
d
.
getStream
()
>>>
(
buffer
.
data
()
+
nHotBlock
*
numPlanes
,
d
.
getStream
()
>>>
(
buffer
.
data
()
+
nHotBlock
*
numPlanes
,
features
.
data
(),
indices
.
data
()
+
nHotBlock
,
features
.
data
(),
indices
.
data
()
+
nHotBlock
,
size
-
nHotBlock
,
numPlanes
/
vecloadFactor
);
size
-
nHotBlock
,
numPlanes
/
vecloadFactor
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
}
}
notFound
=
false
;
notFound
=
false
;
...
@@ -73,8 +75,8 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
...
@@ -73,8 +75,8 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
constexpr
int
NumTLP
=
64
;
constexpr
int
NumTLP
=
64
;
constexpr
int
NumILP
=
NumTLP
/
4
;
constexpr
int
NumILP
=
NumTLP
/
4
;
gatherGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
gatherGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
tv
::
launch
::
DivUp
(
size
,
NumTLP
),
<<<
dim3
(
tv
::
cuda
::
DivUp
(
size
,
NumTLP
),
tv
::
launch
::
DivUp
(
numPlanes
,
NumTLP
)),
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
buffer
.
data
(),
features
.
data
(),
indices
.
data
(),
size
,
numPlanes
);
buffer
.
data
(),
features
.
data
(),
indices
.
data
(),
size
,
numPlanes
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
...
@@ -85,7 +87,7 @@ template <typename T, typename Index>
...
@@ -85,7 +87,7 @@ template <typename T, typename Index>
struct
SparseScatterAddFunctor
<
tv
::
GPU
,
T
,
Index
>
{
struct
SparseScatterAddFunctor
<
tv
::
GPU
,
T
,
Index
>
{
using
vecload_type_t
=
using
vecload_type_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
using
kernel_block_t
=
mp_list_c
<
int
,
64
,
32
,
16
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
>
;
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
T
>
outFeatures
,
void
operator
()(
const
tv
::
GPU
&
d
,
tv
::
TensorView
<
T
>
outFeatures
,
tv
::
TensorView
<
const
T
>
buffer
,
tv
::
TensorView
<
const
T
>
buffer
,
tv
::
TensorView
<
const
Index
>
indices
,
int
size
,
bool
stable
)
{
tv
::
TensorView
<
const
Index
>
indices
,
int
size
,
bool
stable
)
{
...
@@ -95,8 +97,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
...
@@ -95,8 +97,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
bool
notFound
=
true
;
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
// important for half.
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
// important for half.
mp_for_each
<
kernel_block_t
>
([
=
,
&
d
,
&
outFeatures
,
&
buffer
,
&
indices
,
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
d
,
&
outFeatures
,
&
buffer
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
&
notFound
](
auto
NumTLP
)
{
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
constexpr
int
NumILP
=
NumTLP
/
4
;
constexpr
int
NumILP
=
NumTLP
/
4
;
int
nHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
int
nHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
...
@@ -108,8 +110,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
...
@@ -108,8 +110,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
buffer
.
data
(),
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
buffer
.
data
(),
indices
.
data
(),
nHotBlock
,
indices
.
data
(),
nHotBlock
,
numPlanes
/
vecloadFactor
);
numPlanes
/
vecloadFactor
);
TV_CHECK_CUDA_ERR
();
TV_CHECK_CUDA_ERR
();
}
}
if
(
size
-
nHotBlock
>
0
)
{
if
(
size
-
nHotBlock
>
0
)
{
...
@@ -128,8 +130,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
...
@@ -128,8 +130,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
constexpr
int
NumTLP
=
64
;
constexpr
int
NumTLP
=
64
;
constexpr
int
NumILP
=
NumTLP
/
4
;
constexpr
int
NumILP
=
NumTLP
/
4
;
scatterAddGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
scatterAddGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
tv
::
launch
::
DivUp
(
size
,
NumTLP
),
<<<
dim3
(
tv
::
cuda
::
DivUp
(
size
,
NumTLP
),
tv
::
launch
::
DivUp
(
numPlanes
,
NumTLP
)),
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
d
.
getStream
()
>>>
(
outFeatures
.
data
(),
buffer
.
data
(),
indices
.
data
(),
size
,
outFeatures
.
data
(),
buffer
.
data
(),
indices
.
data
(),
size
,
numPlanes
);
numPlanes
);
...
@@ -139,7 +141,6 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
...
@@ -139,7 +141,6 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
};
};
}
// namespace functor
}
// namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \
template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
...
...
src/spconv/spconv_ops.cc
View file @
19e73bbe
...
@@ -47,7 +47,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
...
@@ -47,7 +47,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double
totalGatherTime
=
0
;
double
totalGatherTime
=
0
;
double
totalGEMMTime
=
0
;
double
totalGEMMTime
=
0
;
double
totalSAddTime
=
0
;
double
totalSAddTime
=
0
;
tv
::
torch_
dispatch
<
float
,
double
,
at
::
Half
>
(
tv
::
dispat
ch_tor
ch
<
float
,
double
,
at
::
Half
>
(
features
.
scalar_type
(),
[
&
](
auto
I
)
{
features
.
scalar_type
(),
[
&
](
auto
I
)
{
using
T
=
decltype
(
I
);
using
T
=
decltype
(
I
);
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
...
@@ -68,7 +68,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
...
@@ -68,7 +68,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
inverse
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
inverse
),
nHot
);
nHot
);
}
}
#ifdef
SPCON
V_CUDA
#ifdef
T
V_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
else
if
(
device
==
torch
::
kCUDA
)
{
functor
::
SparseGatherFunctor
<
tv
::
GPU
,
T
,
int
>
gatherFtor
;
functor
::
SparseGatherFunctor
<
tv
::
GPU
,
T
,
int
>
gatherFtor
;
gatherFtor
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
T
>
(
inputBuffer
),
gatherFtor
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
T
>
(
inputBuffer
),
...
@@ -99,7 +99,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
...
@@ -99,7 +99,7 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
!
inverse
),
nHot
,
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
!
inverse
),
nHot
,
true
);
true
);
}
}
#ifdef
SPCON
V_CUDA
#ifdef
T
V_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
else
if
(
device
==
torch
::
kCUDA
)
{
functor
::
SparseScatterAddFunctor
<
tv
::
GPU
,
T
,
int
>
scatterFtor
;
functor
::
SparseScatterAddFunctor
<
tv
::
GPU
,
T
,
int
>
scatterFtor
;
scatterFtor
(
scatterFtor
(
...
@@ -158,7 +158,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
...
@@ -158,7 +158,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch
::
mm_out
(
filterGradSub
,
features
.
t
(),
outGrad
);
torch
::
mm_out
(
filterGradSub
,
features
.
t
(),
outGrad
);
torch
::
mm_out
(
inputGrad
,
outGrad
,
filters
[
indicePairMaxOffset
].
t
());
torch
::
mm_out
(
inputGrad
,
outGrad
,
filters
[
indicePairMaxOffset
].
t
());
}
}
tv
::
torch_
dispatch
<
float
,
double
,
tv
::
dispat
ch_tor
ch
<
float
,
double
,
at
::
Half
>
(
features
.
scalar_type
(),
[
&
](
auto
I
)
{
at
::
Half
>
(
features
.
scalar_type
(),
[
&
](
auto
I
)
{
using
T
=
decltype
(
I
);
using
T
=
decltype
(
I
);
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
...
@@ -178,7 +178,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
...
@@ -178,7 +178,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
!
inverse
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
!
inverse
),
nHot
);
nHot
);
}
}
#ifdef
SPCON
V_CUDA
#ifdef
T
V_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
else
if
(
device
==
torch
::
kCUDA
)
{
functor
::
SparseGatherFunctor
<
tv
::
GPU
,
T
,
int
>
gatherFtor
;
functor
::
SparseGatherFunctor
<
tv
::
GPU
,
T
,
int
>
gatherFtor
;
functor
::
SparseGatherFunctor
<
tv
::
GPU
,
T
,
int
>
gatherFtorOut
;
functor
::
SparseGatherFunctor
<
tv
::
GPU
,
T
,
int
>
gatherFtorOut
;
...
@@ -213,7 +213,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
...
@@ -213,7 +213,7 @@ indiceConvBackward(torch::Tensor features, torch::Tensor filters,
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
inverse
),
tv
::
torch2tv
<
const
int
>
(
indicePairs
).
subview
(
i
,
inverse
),
nHot
);
nHot
);
}
}
#ifdef
SPCON
V_CUDA
#ifdef
T
V_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
else
if
(
device
==
torch
::
kCUDA
)
{
functor
::
SparseScatterAddFunctor
<
tv
::
GPU
,
T
,
int
>
scatterFtor
;
functor
::
SparseScatterAddFunctor
<
tv
::
GPU
,
T
,
int
>
scatterFtor
;
scatterFtor
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
T
>
(
inputGrad
),
scatterFtor
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
T
>
(
inputGrad
),
...
...
src/utils/all.cc
View file @
19e73bbe
...
@@ -20,7 +20,7 @@ using namespace pybind11::literals;
...
@@ -20,7 +20,7 @@ using namespace pybind11::literals;
PYBIND11_MODULE
(
spconv_utils
,
m
)
{
PYBIND11_MODULE
(
spconv_utils
,
m
)
{
m
.
doc
()
=
"util pybind11 functions for spconv"
;
m
.
doc
()
=
"util pybind11 functions for spconv"
;
#ifdef
SPCON
V_CUDA
#ifdef
T
V_CUDA
m
.
def
(
"non_max_suppression"
,
&
spconv
::
non_max_suppression
<
double
>
,
m
.
def
(
"non_max_suppression"
,
&
spconv
::
non_max_suppression
<
double
>
,
py
::
return_value_policy
::
reference_internal
,
"bbox iou"
,
"boxes"
_a
=
1
,
py
::
return_value_policy
::
reference_internal
,
"bbox iou"
,
"boxes"
_a
=
1
,
"keep_out"
_a
=
2
,
"nms_overlap_thresh"
_a
=
3
,
"device_id"
_a
=
4
);
"keep_out"
_a
=
2
,
"nms_overlap_thresh"
_a
=
3
,
"device_id"
_a
=
4
);
...
...
src/utils/nms.cu
View file @
19e73bbe
...
@@ -2,30 +2,28 @@
...
@@ -2,30 +2,28 @@
// Deformable Convolutional Networks
// Deformable Convolutional Networks
// Copyright (c) 2015 Microsoft
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License
// Licensed under The MIT License
// Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn)
// Modified from MATLAB Faster R-CNN
// (https://github.com/shaoqingren/faster_rcnn)
// ------------------------------------------------------------------
// ------------------------------------------------------------------
#include <vector>
#include <iostream>
#include <cuda_runtime.h>
#include <cuda_runtime.h>
#include <iostream>
#include <spconv/nms_gpu.h>
#include <spconv/nms_gpu.h>
#include <vector>
#define CUDA_CHECK(condition) \
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */
\
/* Code block avoids redefinition of cudaError_t error */
\
do \
do { \
{ \
cudaError_t error = condition; \
cudaError_t error = condition; \
if (error != cudaSuccess) { \
if (error != cudaSuccess) \
std::cout << cudaGetErrorString(error) << std::endl; \
{ \
} \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
} while (0)
} while (0)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
int
const
threadsPerBlock
=
sizeof
(
unsigned
long
long
)
*
8
;
int
const
threadsPerBlock
=
sizeof
(
unsigned
long
long
)
*
8
;
template
<
typename
DType
>
template
<
typename
DType
>
__device__
inline
DType
devIoU
(
DType
const
*
const
a
,
DType
const
*
const
b
)
__device__
inline
DType
devIoU
(
DType
const
*
const
a
,
DType
const
*
const
b
)
{
{
DType
left
=
max
(
a
[
0
],
b
[
0
]),
right
=
min
(
a
[
2
],
b
[
2
]);
DType
left
=
max
(
a
[
0
],
b
[
0
]),
right
=
min
(
a
[
2
],
b
[
2
]);
DType
top
=
max
(
a
[
1
],
b
[
1
]),
bottom
=
min
(
a
[
3
],
b
[
3
]);
DType
top
=
max
(
a
[
1
],
b
[
1
]),
bottom
=
min
(
a
[
3
],
b
[
3
]);
DType
width
=
max
(
right
-
left
+
1
,
0.
f
),
height
=
max
(
bottom
-
top
+
1
,
0.
f
);
DType
width
=
max
(
right
-
left
+
1
,
0.
f
),
height
=
max
(
bottom
-
top
+
1
,
0.
f
);
...
@@ -37,44 +35,36 @@ __device__ inline DType devIoU(DType const *const a, DType const *const b)
...
@@ -37,44 +35,36 @@ __device__ inline DType devIoU(DType const *const a, DType const *const b)
template
<
typename
DType
,
int
BLOCK_THREADS
>
template
<
typename
DType
,
int
BLOCK_THREADS
>
__global__
void
nms_kernel
(
const
int
n_boxes
,
const
DType
nms_overlap_thresh
,
__global__
void
nms_kernel
(
const
int
n_boxes
,
const
DType
nms_overlap_thresh
,
const
DType
*
dev_boxes
,
unsigned
long
long
*
dev_mask
)
const
DType
*
dev_boxes
,
{
unsigned
long
long
*
dev_mask
)
{
const
int
row_start
=
blockIdx
.
y
;
const
int
row_start
=
blockIdx
.
y
;
const
int
col_start
=
blockIdx
.
x
;
const
int
col_start
=
blockIdx
.
x
;
// if (row_start > col_start) return;
// if (row_start > col_start) return;
const
int
row_size
=
const
int
row_size
=
min
(
n_boxes
-
row_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
min
(
n_boxes
-
row_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
const
int
col_size
=
min
(
n_boxes
-
col_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
const
int
col_size
=
min
(
n_boxes
-
col_start
*
BLOCK_THREADS
,
BLOCK_THREADS
);
__shared__
DType
block_boxes
[
BLOCK_THREADS
*
5
];
__shared__
DType
block_boxes
[
BLOCK_THREADS
*
5
];
if
(
threadIdx
.
x
<
col_size
)
if
(
threadIdx
.
x
<
col_size
)
{
{
#pragma unroll
#pragma unroll
for
(
int
i
=
0
;
i
<
5
;
++
i
)
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
{
block_boxes
[
threadIdx
.
x
*
5
+
i
]
=
block_boxes
[
threadIdx
.
x
*
5
+
i
]
=
dev_boxes
[(
BLOCK_THREADS
*
col_start
+
threadIdx
.
x
)
*
5
+
i
];
dev_boxes
[(
BLOCK_THREADS
*
col_start
+
threadIdx
.
x
)
*
5
+
i
];
}
}
}
}
__syncthreads
();
__syncthreads
();
if
(
threadIdx
.
x
<
row_size
)
if
(
threadIdx
.
x
<
row_size
)
{
{
const
int
cur_box_idx
=
BLOCK_THREADS
*
row_start
+
threadIdx
.
x
;
const
int
cur_box_idx
=
BLOCK_THREADS
*
row_start
+
threadIdx
.
x
;
const
DType
*
cur_box
=
dev_boxes
+
cur_box_idx
*
5
;
const
DType
*
cur_box
=
dev_boxes
+
cur_box_idx
*
5
;
unsigned
long
long
t
=
0
;
unsigned
long
long
t
=
0
;
int
start
=
0
;
int
start
=
0
;
if
(
row_start
==
col_start
)
if
(
row_start
==
col_start
)
{
{
start
=
threadIdx
.
x
+
1
;
start
=
threadIdx
.
x
+
1
;
}
}
for
(
int
i
=
start
;
i
<
col_size
;
i
++
)
for
(
int
i
=
start
;
i
<
col_size
;
i
++
)
{
{
if
(
devIoU
(
cur_box
,
block_boxes
+
i
*
5
)
>
nms_overlap_thresh
)
{
if
(
devIoU
(
cur_box
,
block_boxes
+
i
*
5
)
>
nms_overlap_thresh
)
{
t
|=
1ULL
<<
i
;
t
|=
1ULL
<<
i
;
}
}
}
}
...
@@ -83,12 +73,10 @@ __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
...
@@ -83,12 +73,10 @@ __global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
}
}
}
}
void
_set_device
(
int
device_id
)
void
_set_device
(
int
device_id
)
{
{
int
current_device
;
int
current_device
;
CUDA_CHECK
(
cudaGetDevice
(
&
current_device
));
CUDA_CHECK
(
cudaGetDevice
(
&
current_device
));
if
(
current_device
==
device_id
)
if
(
current_device
==
device_id
)
{
{
return
;
return
;
}
}
// The call to cudaSetDevice must come before any calls to Get, which
// The call to cudaSetDevice must come before any calls to Get, which
...
@@ -98,8 +86,7 @@ void _set_device(int device_id)
...
@@ -98,8 +86,7 @@ void _set_device(int device_id)
template
<
typename
DType
,
int
BLOCK_THREADS
>
template
<
typename
DType
,
int
BLOCK_THREADS
>
int
_nms_gpu
(
int
*
keep_out
,
const
DType
*
boxes_host
,
int
boxes_num
,
int
_nms_gpu
(
int
*
keep_out
,
const
DType
*
boxes_host
,
int
boxes_num
,
int
boxes_dim
,
DType
nms_overlap_thresh
,
int
device_id
)
int
boxes_dim
,
DType
nms_overlap_thresh
,
int
device_id
)
{
{
_set_device
(
device_id
);
_set_device
(
device_id
);
DType
*
boxes_dev
=
NULL
;
DType
*
boxes_dev
=
NULL
;
...
@@ -107,27 +94,21 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
...
@@ -107,27 +94,21 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
const
int
col_blocks
=
DIVUP
(
boxes_num
,
BLOCK_THREADS
);
const
int
col_blocks
=
DIVUP
(
boxes_num
,
BLOCK_THREADS
);
CUDA_CHECK
(
cudaMalloc
(
&
boxes_dev
,
CUDA_CHECK
(
cudaMalloc
(
&
boxes_dev
,
boxes_num
*
boxes_dim
*
sizeof
(
DType
)));
boxes_num
*
boxes_dim
*
sizeof
(
DType
)));
CUDA_CHECK
(
cudaMemcpy
(
boxes_dev
,
boxes_host
,
CUDA_CHECK
(
cudaMemcpy
(
boxes_dev
,
boxes_host
,
boxes_num
*
boxes_dim
*
sizeof
(
DType
),
boxes_num
*
boxes_dim
*
sizeof
(
DType
),
cudaMemcpyHostToDevice
));
cudaMemcpyHostToDevice
));
CUDA_CHECK
(
cudaMalloc
(
&
mask_dev
,
CUDA_CHECK
(
cudaMalloc
(
&
mask_dev
,
boxes_num
*
col_blocks
*
sizeof
(
unsigned
long
long
)));
boxes_num
*
col_blocks
*
sizeof
(
unsigned
long
long
)));
dim3
blocks
(
DIVUP
(
boxes_num
,
BLOCK_THREADS
),
dim3
blocks
(
DIVUP
(
boxes_num
,
BLOCK_THREADS
),
DIVUP
(
boxes_num
,
BLOCK_THREADS
));
DIVUP
(
boxes_num
,
BLOCK_THREADS
));
dim3
threads
(
BLOCK_THREADS
);
dim3
threads
(
BLOCK_THREADS
);
nms_kernel
<
DType
,
BLOCK_THREADS
><<<
blocks
,
threads
>>>
(
boxes_num
,
nms_kernel
<
DType
,
BLOCK_THREADS
>
nms_overlap_thresh
,
<<<
blocks
,
threads
>>>
(
boxes_num
,
nms_overlap_thresh
,
boxes_dev
,
mask_dev
);
boxes_dev
,
mask_dev
);
std
::
vector
<
unsigned
long
long
>
mask_host
(
boxes_num
*
col_blocks
);
std
::
vector
<
unsigned
long
long
>
mask_host
(
boxes_num
*
col_blocks
);
CUDA_CHECK
(
cudaMemcpy
(
&
mask_host
[
0
],
CUDA_CHECK
(
cudaMemcpy
(
&
mask_host
[
0
],
mask_dev
,
mask_dev
,
sizeof
(
unsigned
long
long
)
*
boxes_num
*
col_blocks
,
sizeof
(
unsigned
long
long
)
*
boxes_num
*
col_blocks
,
cudaMemcpyDeviceToHost
));
cudaMemcpyDeviceToHost
));
...
@@ -135,17 +116,14 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
...
@@ -135,17 +116,14 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
memset
(
&
remv
[
0
],
0
,
sizeof
(
unsigned
long
long
)
*
col_blocks
);
memset
(
&
remv
[
0
],
0
,
sizeof
(
unsigned
long
long
)
*
col_blocks
);
int
num_to_keep
=
0
;
int
num_to_keep
=
0
;
for
(
int
i
=
0
;
i
<
boxes_num
;
i
++
)
for
(
int
i
=
0
;
i
<
boxes_num
;
i
++
)
{
{
int
nblock
=
i
/
BLOCK_THREADS
;
int
nblock
=
i
/
BLOCK_THREADS
;
int
inblock
=
i
%
BLOCK_THREADS
;
int
inblock
=
i
%
BLOCK_THREADS
;
if
(
!
(
remv
[
nblock
]
&
(
1ULL
<<
inblock
)))
if
(
!
(
remv
[
nblock
]
&
(
1ULL
<<
inblock
)))
{
{
keep_out
[
num_to_keep
++
]
=
i
;
keep_out
[
num_to_keep
++
]
=
i
;
unsigned
long
long
*
p
=
&
mask_host
[
0
]
+
i
*
col_blocks
;
unsigned
long
long
*
p
=
&
mask_host
[
0
]
+
i
*
col_blocks
;
for
(
int
j
=
nblock
;
j
<
col_blocks
;
j
++
)
for
(
int
j
=
nblock
;
j
<
col_blocks
;
j
++
)
{
{
remv
[
j
]
|=
p
[
j
];
remv
[
j
]
|=
p
[
j
];
}
}
}
}
...
@@ -156,10 +134,15 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
...
@@ -156,10 +134,15 @@ int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
return
num_to_keep
;
return
num_to_keep
;
}
}
// template<>
//template<>
template
int
_nms_gpu
<
float
,
threadsPerBlock
>(
int
*
keep_out
,
template
int
_nms_gpu
<
float
,
threadsPerBlock
>(
int
*
keep_out
,
const
float
*
boxes_host
,
int
boxes_num
,
const
float
*
boxes_host
,
int
boxes_dim
,
float
nms_overlap_thresh
,
int
device_id
);
int
boxes_num
,
int
boxes_dim
,
//template<>
float
nms_overlap_thresh
,
template
int
_nms_gpu
<
double
,
threadsPerBlock
>(
int
*
keep_out
,
const
double
*
boxes_host
,
int
boxes_num
,
int
device_id
);
int
boxes_dim
,
double
nms_overlap_thresh
,
int
device_id
);
// template<>
\ No newline at end of file
template
int
_nms_gpu
<
double
,
threadsPerBlock
>(
int
*
keep_out
,
const
double
*
boxes_host
,
int
boxes_num
,
int
boxes_dim
,
double
nms_overlap_thresh
,
int
device_id
);
\ No newline at end of file
test/fake_dist_train.py
View file @
19e73bbe
import
horovod.torch
as
hvd
import
time
import
time
from
pathlib
import
Path
from
pathlib
import
Path
...
@@ -12,6 +11,7 @@ from torch.utils import data
...
@@ -12,6 +11,7 @@ from torch.utils import data
from
torch.utils.data
import
DataLoader
,
Dataset
from
torch.utils.data
import
DataLoader
,
Dataset
from
torchvision
import
datasets
,
transforms
from
torchvision
import
datasets
,
transforms
import
horovod.torch
as
hvd
import
spconv
import
spconv
from
spconv.test_utils
import
generate_sparse_data
from
spconv.test_utils
import
generate_sparse_data
...
@@ -53,25 +53,47 @@ class FakeClassifier(nn.Module):
...
@@ -53,25 +53,47 @@ class FakeClassifier(nn.Module):
def
__init__
(
self
):
def
__init__
(
self
):
super
().
__init__
()
super
().
__init__
()
self
.
net
=
spconv
.
SparseSequential
(
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SubMConv3d
(
3
,
8
,
3
,
indice_key
=
"subm1"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
3
,
8
,
3
,
indice_key
=
"subm1"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
8
),
nn
.
BatchNorm1d
(
8
),
nn
.
ReLU
(),
nn
.
ReLU
(),
spconv
.
SparseConv3d
(
8
,
16
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SparseConv3d
(
8
,
16
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
16
),
nn
.
BatchNorm1d
(
16
),
nn
.
ReLU
(),
nn
.
ReLU
(),
spconv
.
SubMConv3d
(
16
,
16
,
3
,
indice_key
=
"subm2"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
16
,
16
,
3
,
indice_key
=
"subm2"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
16
),
nn
.
BatchNorm1d
(
16
),
nn
.
ReLU
(),
nn
.
ReLU
(),
spconv
.
SparseConv3d
(
16
,
32
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SparseConv3d
(
16
,
32
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
32
),
nn
.
BatchNorm1d
(
32
),
nn
.
ReLU
(),
nn
.
ReLU
(),
spconv
.
SubMConv3d
(
32
,
32
,
3
,
indice_key
=
"subm3"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
32
,
32
,
3
,
indice_key
=
"subm3"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
32
),
nn
.
BatchNorm1d
(
32
),
nn
.
ReLU
(),
nn
.
ReLU
(),
spconv
.
SparseConv3d
(
32
,
64
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SparseConv3d
(
32
,
64
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
64
),
nn
.
BatchNorm1d
(
64
),
nn
.
ReLU
(),
nn
.
ReLU
(),
spconv
.
SubMConv3d
(
64
,
64
,
3
,
indice_key
=
"subm4"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
64
,
64
,
3
,
indice_key
=
"subm4"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
64
),
nn
.
BatchNorm1d
(
64
),
nn
.
ReLU
(),
nn
.
ReLU
(),
spconv
.
ToDense
()
# [64, 2, 8, 8]
spconv
.
ToDense
()
# [64, 2, 8, 8]
...
@@ -100,15 +122,16 @@ def run():
...
@@ -100,15 +122,16 @@ def run():
hvd
.
broadcast_optimizer_state
(
optimizer
,
root_rank
=
0
)
hvd
.
broadcast_optimizer_state
(
optimizer
,
root_rank
=
0
)
compression
=
hvd
.
Compression
.
none
compression
=
hvd
.
Compression
.
none
optimizer
=
hvd
.
DistributedOptimizer
(
optimizer
,
optimizer
=
hvd
.
DistributedOptimizer
(
named_parameters
=
model
.
named_parameters
(),
optimizer
,
compression
=
compression
,
named_parameters
=
model
.
named_parameters
(),
op
=
hvd
.
Average
)
compression
=
compression
,
op
=
hvd
.
Average
)
for
i
in
tqdm
.
tqdm
(
list
(
range
(
100
))):
for
i
in
tqdm
.
tqdm
(
list
(
range
(
100
))):
# for j in range(4):
# for j in range(4):
# features, indices, label = ds[(i * 4 + j) % len(ds)]
# features, indices, label = ds[(i * 4 + j) % len(ds)]
features
,
indices
,
label
=
ds
[
i
%
len
(
ds
)]
features
,
indices
,
label
=
ds
[
i
%
len
(
ds
)]
features_t
=
torch
.
from_numpy
(
features
)
features_t
=
torch
.
from_numpy
(
features
)
indices_t
=
torch
.
from_numpy
(
indices
)
indices_t
=
torch
.
from_numpy
(
indices
)
...
...
test/fake_train.py
View file @
19e73bbe
...
@@ -52,25 +52,47 @@ class FakeClassifier(nn.Module):
...
@@ -52,25 +52,47 @@ class FakeClassifier(nn.Module):
def
__init__
(
self
):
def
__init__
(
self
):
super
().
__init__
()
super
().
__init__
()
self
.
net
=
spconv
.
SparseSequential
(
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SubMConv3d
(
3
,
8
,
3
,
indice_key
=
"subm1"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
3
,
8
,
3
,
indice_key
=
"subm1"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
8
),
nn
.
BatchNorm1d
(
8
),
nn
.
ReLU
(),
nn
.
ReLU
(),
spconv
.
SparseConv3d
(
8
,
16
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SparseConv3d
(
8
,
16
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
16
),
nn
.
BatchNorm1d
(
16
),
nn
.
ReLU
(),
nn
.
ReLU
(),
spconv
.
SubMConv3d
(
16
,
16
,
3
,
indice_key
=
"subm2"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
16
,
16
,
3
,
indice_key
=
"subm2"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
16
),
nn
.
BatchNorm1d
(
16
),
nn
.
ReLU
(),
nn
.
ReLU
(),
spconv
.
SparseConv3d
(
16
,
32
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SparseConv3d
(
16
,
32
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
32
),
nn
.
BatchNorm1d
(
32
),
nn
.
ReLU
(),
nn
.
ReLU
(),
spconv
.
SubMConv3d
(
32
,
32
,
3
,
indice_key
=
"subm3"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
32
,
32
,
3
,
indice_key
=
"subm3"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
32
),
nn
.
BatchNorm1d
(
32
),
nn
.
ReLU
(),
nn
.
ReLU
(),
spconv
.
SparseConv3d
(
32
,
64
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SparseConv3d
(
32
,
64
,
3
,
stride
=
2
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
64
),
nn
.
BatchNorm1d
(
64
),
nn
.
ReLU
(),
nn
.
ReLU
(),
spconv
.
SubMConv3d
(
64
,
64
,
3
,
indice_key
=
"subm4"
,
padding
=
1
,
use_hash
=
False
),
spconv
.
SubMConv3d
(
64
,
64
,
3
,
indice_key
=
"subm4"
,
padding
=
1
,
use_hash
=
False
),
nn
.
BatchNorm1d
(
64
),
nn
.
BatchNorm1d
(
64
),
nn
.
ReLU
(),
nn
.
ReLU
(),
spconv
.
ToDense
()
# [64, 2, 8, 8]
spconv
.
ToDense
()
# [64, 2, 8, 8]
...
@@ -97,7 +119,7 @@ def run():
...
@@ -97,7 +119,7 @@ def run():
for
i
in
tqdm
.
tqdm
(
list
(
range
(
100
))):
for
i
in
tqdm
.
tqdm
(
list
(
range
(
100
))):
# for j in range(4):
# for j in range(4):
# features, indices, label = ds[(i * 4 + j) % len(ds)]
# features, indices, label = ds[(i * 4 + j) % len(ds)]
features
,
indices
,
label
=
ds
[
i
%
len
(
ds
)]
features
,
indices
,
label
=
ds
[
i
%
len
(
ds
)]
features_t
=
torch
.
from_numpy
(
features
)
features_t
=
torch
.
from_numpy
(
features
)
indices_t
=
torch
.
from_numpy
(
indices
)
indices_t
=
torch
.
from_numpy
(
indices
)
...
...
test/test_conv.py
View file @
19e73bbe
# Copyright 2019 Yan Yan
# Copyright 2019 Yan Yan
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
import
time
import
unittest
from
pathlib
import
Path
from
pathlib
import
Path
import
spconv
import
numpy
as
np
import
torch
import
torch
from
torch
import
nn
from
torch
import
nn
import
numpy
as
np
import
time
import
spconv
from
spconv.test_utils
import
params_grid
,
generate_sparse_data
,
TestCase
from
spconv.test_utils
import
TestCase
,
generate_sparse_data
,
params_grid
import
unittest
# import sparseconvnet as scn
# import sparseconvnet as scn
class
SparseConv3dTestTorch
(
nn
.
Module
):
class
SparseConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
stride
,
padding
,
dilation
):
kernel_size
,
stride
,
padding
,
dilation
):
super
().
__init__
()
super
().
__init__
()
layers
=
[
spconv
.
SparseConv3d
(
layers
=
[
in_channels
,
spconv
.
SparseConv3d
(
in_channels
,
out_channels
,
out_channels
,
kernel_size
,
kernel_size
,
stride
,
stride
,
padding
=
padding
,
padding
=
padding
,
dilation
=
dilation
,
dilation
=
dilation
,
bias
=
False
,
bias
=
False
,
use_hash
=
True
)]
use_hash
=
True
)
]
for
i
in
range
(
1
,
num_layers
):
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
spconv
.
SparseConv3d
(
layers
.
append
(
out_channels
,
spconv
.
SparseConv3d
(
out_channels
,
out_channels
,
out_channels
,
kernel_size
,
kernel_size
,
stride
,
stride
,
padding
=
padding
,
padding
=
padding
,
dilation
=
dilation
,
dilation
=
dilation
,
bias
=
False
))
bias
=
False
))
self
.
net
=
spconv
.
SparseSequential
(
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
*
layers
,
)
# self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
# self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
self
.
grid
=
None
self
.
grid
=
None
self
.
shape
=
shape
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
coors
=
coors
.
int
()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
,
self
.
grid
)
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
,
return
self
.
net
(
x
)
# .dense()
self
.
grid
)
return
self
.
net
(
x
)
# .dense()
class
SubMConv3dTestTorch
(
nn
.
Module
):
class
SubMConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
stride
,
padding
,
dilation
):
kernel_size
,
stride
,
padding
,
dilation
):
super
().
__init__
()
super
().
__init__
()
layers
=
[
spconv
.
SubMConv3d
(
layers
=
[
in_channels
,
spconv
.
SubMConv3d
(
in_channels
,
out_channels
,
out_channels
,
kernel_size
,
kernel_size
,
stride
,
stride
,
padding
=
padding
,
padding
=
padding
,
dilation
=
dilation
,
dilation
=
dilation
,
bias
=
False
)]
bias
=
False
)
]
for
i
in
range
(
1
,
num_layers
):
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
spconv
.
SubMConv3d
(
layers
.
append
(
out_channels
,
spconv
.
SubMConv3d
(
out_channels
,
out_channels
,
out_channels
,
kernel_size
,
kernel_size
,
stride
,
stride
,
padding
=
padding
,
padding
=
padding
,
dilation
=
dilation
,
dilation
=
dilation
,
bias
=
False
))
bias
=
False
))
self
.
net
=
spconv
.
SparseSequential
(
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
*
layers
,
)
self
.
grid
=
torch
.
full
([
3
,
*
shape
],
-
1
,
dtype
=
torch
.
int32
).
cuda
()
self
.
grid
=
torch
.
full
([
3
,
*
shape
],
-
1
,
dtype
=
torch
.
int32
).
cuda
()
self
.
shape
=
shape
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
coors
=
coors
.
int
()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
,
self
.
grid
)
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
,
return
self
.
net
(
x
)
# .dense()
self
.
grid
)
return
self
.
net
(
x
)
# .dense()
class
Conv3dTestTorch
(
nn
.
Module
):
class
Conv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
stride
,
padding
,
dilation
):
kernel_size
,
stride
,
padding
,
dilation
):
super
().
__init__
()
super
().
__init__
()
layers
=
[
nn
.
Conv3d
(
layers
=
[
in_channels
,
nn
.
Conv3d
(
in_channels
,
out_channels
,
out_channels
,
kernel_size
,
kernel_size
,
stride
,
stride
,
padding
=
padding
,
padding
=
padding
,
dilation
=
dilation
,
dilation
=
dilation
,
bias
=
False
)]
bias
=
False
)
]
for
i
in
range
(
1
,
num_layers
):
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
nn
.
Conv3d
(
layers
.
append
(
out_channels
,
nn
.
Conv3d
(
out_channels
,
out_channels
,
out_channels
,
kernel_size
,
kernel_size
,
stride
,
stride
,
padding
=
padding
,
padding
=
padding
,
dilation
=
dilation
,
dilation
=
dilation
,
bias
=
False
))
bias
=
False
))
self
.
net
=
nn
.
Sequential
(
self
.
net
=
nn
.
Sequential
(
*
layers
,
)
*
layers
,
)
self
.
shape
=
shape
self
.
shape
=
shape
def
forward
(
self
,
x
):
def
forward
(
self
,
x
):
return
self
.
net
(
x
)
# .dense()
return
self
.
net
(
x
)
# .dense()
class
SparseDeConv3dTestTorch
(
nn
.
Module
):
class
SparseDeConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
stride
,
padding
,
dilation
):
kernel_size
,
stride
,
padding
,
dilation
):
super
().
__init__
()
super
().
__init__
()
layers
=
[
spconv
.
SparseConvTranspose3d
(
layers
=
[
in_channels
,
spconv
.
SparseConvTranspose3d
(
in_channels
,
out_channels
,
out_channels
,
kernel_size
,
kernel_size
,
stride
,
stride
,
padding
=
padding
,
padding
=
padding
,
dilation
=
dilation
,
dilation
=
dilation
,
bias
=
False
)]
bias
=
False
)
]
for
i
in
range
(
1
,
num_layers
):
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
spconv
.
SparseConvTranspose3d
(
layers
.
append
(
out_channels
,
spconv
.
SparseConvTranspose3d
(
out_channels
,
out_channels
,
out_channels
,
kernel_size
,
kernel_size
,
stride
,
stride
,
padding
=
padding
,
padding
=
padding
,
dilation
=
dilation
,
dilation
=
dilation
,
bias
=
False
))
bias
=
False
))
self
.
net
=
spconv
.
SparseSequential
(
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
*
layers
,
)
self
.
shape
=
shape
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
coors
=
coors
.
int
()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
return
self
.
net
(
x
)
# .dense()
return
self
.
net
(
x
)
# .dense()
class
DeConv3dTestTorch
(
nn
.
Module
):
class
DeConv3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
stride
,
padding
,
dilation
):
kernel_size
,
stride
,
padding
,
dilation
):
super
().
__init__
()
super
().
__init__
()
layers
=
[
nn
.
ConvTranspose3d
(
layers
=
[
in_channels
,
nn
.
ConvTranspose3d
(
in_channels
,
out_channels
,
out_channels
,
kernel_size
,
kernel_size
,
stride
,
stride
,
padding
=
padding
,
padding
=
padding
,
dilation
=
dilation
,
dilation
=
dilation
,
bias
=
False
)]
bias
=
False
)
]
for
i
in
range
(
1
,
num_layers
):
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
nn
.
ConvTranspose3d
(
layers
.
append
(
out_channels
,
nn
.
ConvTranspose3d
(
out_channels
,
out_channels
,
out_channels
,
kernel_size
,
kernel_size
,
stride
,
stride
,
padding
=
padding
,
padding
=
padding
,
dilation
=
dilation
,
dilation
=
dilation
,
bias
=
False
))
bias
=
False
))
self
.
net
=
nn
.
Sequential
(
self
.
net
=
nn
.
Sequential
(
*
layers
,
)
*
layers
,
)
self
.
shape
=
shape
self
.
shape
=
shape
def
forward
(
self
,
x
):
def
forward
(
self
,
x
):
return
self
.
net
(
x
)
# .dense()
return
self
.
net
(
x
)
# .dense()
class
SparseMaxPoolTestTorch
(
nn
.
Module
):
class
SparseMaxPoolTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
kernel_size
,
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
kernel_size
,
stride
,
padding
,
stride
,
padding
,
dilation
):
dilation
):
super
().
__init__
()
super
().
__init__
()
layers
=
[
spconv
.
SparseMaxPool3d
(
layers
=
[
kernel_size
,
spconv
.
SparseMaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
)
stride
,
padding
,
dilation
)
]
]
for
i
in
range
(
1
,
num_layers
):
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
spconv
.
SparseMaxPool3d
(
layers
.
append
(
kernel_size
,
spconv
.
SparseMaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
))
stride
,
padding
,
dilation
))
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
self
.
net
=
spconv
.
SparseSequential
(
*
layers
,
)
self
.
shape
=
shape
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
coors
=
coors
.
int
()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
return
self
.
net
(
x
)
# .dense()
return
self
.
net
(
x
)
# .dense()
class
MaxPool3dTestTorch
(
nn
.
Module
):
class
MaxPool3dTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
kernel_size
,
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
kernel_size
,
stride
,
padding
,
stride
,
padding
,
dilation
):
dilation
):
super
().
__init__
()
super
().
__init__
()
layers
=
[
nn
.
MaxPool3d
(
layers
=
[
nn
.
MaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
)]
kernel_size
,
stride
,
padding
,
dilation
)]
for
i
in
range
(
1
,
num_layers
):
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
nn
.
MaxPool3d
(
layers
.
append
(
nn
.
MaxPool3d
(
kernel_size
,
stride
,
padding
,
dilation
))
kernel_size
,
self
.
net
=
nn
.
Sequential
(
*
layers
,
)
stride
,
padding
,
dilation
))
self
.
net
=
nn
.
Sequential
(
*
layers
,
)
self
.
shape
=
shape
self
.
shape
=
shape
def
forward
(
self
,
x
):
def
forward
(
self
,
x
):
return
self
.
net
(
x
)
# .dense()
return
self
.
net
(
x
)
# .dense()
class
SubmanifoldConvTestTorch
(
nn
.
Module
):
class
SubmanifoldConvTestTorch
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
stride
):
super
().
__init__
()
super
().
__init__
()
layers
=
[
spconv
.
SubMConv3d
(
layers
=
[
in_channels
,
out_channels
,
kernel_size
,
bias
=
False
,
indice_key
=
"subm0"
)]
spconv
.
SubMConv3d
(
in_channels
,
out_channels
,
kernel_size
,
bias
=
False
,
indice_key
=
"subm0"
)
]
for
i
in
range
(
1
,
num_layers
):
for
i
in
range
(
1
,
num_layers
):
layers
.
append
(
spconv
.
SubMConv3d
(
layers
.
append
(
out_channels
,
out_channels
,
kernel_size
,
bias
=
False
))
spconv
.
SubMConv3d
(
out_channels
,
self
.
net
=
nn
.
Sequential
(
out_channels
,
*
layers
,
kernel_size
,
)
bias
=
False
))
self
.
net
=
nn
.
Sequential
(
*
layers
,
)
self
.
shape
=
shape
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
coors
=
coors
.
int
()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
return
self
.
net
(
x
)
return
self
.
net
(
x
)
class
SCNCoupleDeConvTest
(
nn
.
Module
):
class
SCNCoupleDeConvTest
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
stride
):
kernel_size
,
stride
):
super
().
__init__
()
super
().
__init__
()
self
.
scn_input
=
scn
.
InputLayer
(
ndim
,
shape
,
mode
=
0
)
self
.
scn_input
=
scn
.
InputLayer
(
ndim
,
shape
,
mode
=
0
)
self
.
net
=
nn
.
Sequential
(
self
.
net
=
nn
.
Sequential
(
scn
.
Convolution
(
scn
.
Convolution
(
ndim
,
ndim
,
in_channels
,
in_channels
,
out_channels
,
out_channels
,
kernel_size
,
kernel_size
,
stride
,
stride
,
bias
=
False
),
bias
=
False
),
scn
.
Deconvolution
(
ndim
,
scn
.
Deconvolution
(
out_channels
,
ndim
,
in_channels
,
out_channels
,
kernel_size
,
in_channels
,
stride
,
kernel_size
,
bias
=
False
),
stride
,
bias
=
False
),
scn
.
SparseToDense
(
ndim
,
in_channels
),
scn
.
SparseToDense
(
ndim
,
in_channels
),
)
)
...
@@ -267,44 +268,44 @@ class SCNCoupleDeConvTest(nn.Module):
...
@@ -267,44 +268,44 @@ class SCNCoupleDeConvTest(nn.Module):
x
=
self
.
scn_input
((
coors
,
features
))
x
=
self
.
scn_input
((
coors
,
features
))
return
self
.
net
(
x
)
return
self
.
net
(
x
)
class
SparseCoupleDeConvTest
(
nn
.
Module
):
class
SparseCoupleDeConvTest
(
nn
.
Module
):
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
kernel_size
,
def
__init__
(
self
,
num_layers
,
ndim
,
shape
,
in_channels
,
out_channels
,
stride
):
kernel_size
,
stride
):
super
().
__init__
()
super
().
__init__
()
self
.
net
=
spconv
.
SparseSequential
(
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SparseConv3d
(
spconv
.
SparseConv3d
(
in_channels
,
in_channels
,
out_channels
,
out_channels
,
kernel_size
,
kernel_size
,
stride
,
stride
,
indice_key
=
"cp0"
,
indice_key
=
"cp0"
,
bias
=
False
),
bias
=
False
),
spconv
.
SparseInverseConv3d
(
out_channels
,
spconv
.
SparseInverseConv3d
(
in_channels
,
out_channels
,
kernel_size
,
in_channels
,
indice_key
=
"cp0"
,
kernel_size
,
bias
=
False
),
indice_key
=
"cp0"
,
bias
=
False
),
)
)
self
.
todense
=
spconv
.
ToDense
()
self
.
todense
=
spconv
.
ToDense
()
self
.
shape
=
shape
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
def
forward
(
self
,
features
,
coors
,
batch_size
):
coors
=
coors
.
int
()
coors
=
coors
.
int
()
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
)
return
self
.
todense
(
self
.
net
(
x
))
# .dense()
return
self
.
todense
(
self
.
net
(
x
))
# .dense()
def
gather_nd
(
params
,
indices
):
def
gather_nd
(
params
,
indices
):
# this function has a limit that MAX_ADVINDEX_CALC_DIMS=5
# this function has a limit that MAX_ADVINDEX_CALC_DIMS=5
ndim
=
indices
.
shape
[
-
1
]
ndim
=
indices
.
shape
[
-
1
]
output_shape
=
list
(
indices
.
shape
[:
-
1
])
+
list
(
params
.
shape
[
indices
.
shape
[
-
1
]:])
output_shape
=
list
(
indices
.
shape
[:
-
1
])
+
list
(
params
.
shape
[
indices
.
shape
[
-
1
]:])
flatted_indices
=
indices
.
view
(
-
1
,
ndim
)
flatted_indices
=
indices
.
view
(
-
1
,
ndim
)
slices
=
[
flatted_indices
[:,
i
]
for
i
in
range
(
ndim
)]
slices
=
[
flatted_indices
[:,
i
]
for
i
in
range
(
ndim
)]
slices
+=
[
Ellipsis
]
slices
+=
[
Ellipsis
]
return
params
[
slices
].
view
(
*
output_shape
)
return
params
[
slices
].
view
(
*
output_shape
)
def
scatter_nd
(
indices
,
updates
,
shape
):
def
scatter_nd
(
indices
,
updates
,
shape
):
"""pytorch edition of tensorflow scatter_nd.
"""pytorch edition of tensorflow scatter_nd.
this function don't contain except handle code. so use this carefully
this function don't contain except handle code. so use this carefully
...
@@ -322,7 +323,6 @@ def scatter_nd(indices, updates, shape):
...
@@ -322,7 +323,6 @@ def scatter_nd(indices, updates, shape):
class
TestSpConv
(
TestCase
):
class
TestSpConv
(
TestCase
):
def
testSpConv3d
(
self
):
def
testSpConv3d
(
self
):
np
.
random
.
seed
(
484
)
np
.
random
.
seed
(
484
)
devices
=
[
"cpu:0"
]
devices
=
[
"cpu:0"
]
...
@@ -337,36 +337,44 @@ class TestSpConv(TestCase):
...
@@ -337,36 +337,44 @@ class TestSpConv(TestCase):
dilations
=
[
1
,
2
,
3
]
dilations
=
[
1
,
2
,
3
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
strides
,
paddings
,
dilations
):
if
all
([
s
>
1
,
d
>
1
]):
if
all
([
s
>
1
,
d
>
1
]):
continue
# don't support this.
continue
# don't support this.
device
=
torch
.
device
(
dev
)
device
=
torch
.
device
(
dev
)
num_points
=
[
1000
]
*
bs
num_points
=
[
1000
]
*
bs
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
features_t
.
requires_grad
=
True
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
)
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
)
features_dense_t
.
requires_grad
=
True
features_dense_t
.
requires_grad
=
True
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
d
).
to
(
device
)
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
out_ref
=
net_ref
(
features_dense_t
)
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
out
.
backward
(
dout_t
)
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
din
=
features_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
din_np
=
din
.
cpu
().
numpy
()
...
@@ -381,7 +389,7 @@ class TestSpConv(TestCase):
...
@@ -381,7 +389,7 @@ class TestSpConv(TestCase):
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
def
testSpDeConv3d
(
self
):
def
testSpDeConv3d
(
self
):
np
.
random
.
seed
(
484
)
np
.
random
.
seed
(
484
)
devices
=
[
"cuda:0"
,
"cpu:0"
]
devices
=
[
"cuda:0"
,
"cpu:0"
]
...
@@ -396,36 +404,44 @@ class TestSpConv(TestCase):
...
@@ -396,36 +404,44 @@ class TestSpConv(TestCase):
dilations
=
[
1
,
2
,
3
]
dilations
=
[
1
,
2
,
3
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
strides
,
paddings
,
dilations
):
if
all
([
s
>
1
,
d
>
1
]):
if
all
([
s
>
1
,
d
>
1
]):
continue
# don't support this.
continue
# don't support this.
device
=
torch
.
device
(
dev
)
device
=
torch
.
device
(
dev
)
num_points
=
[
1000
]
*
bs
num_points
=
[
1000
]
*
bs
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
features_t
.
requires_grad
=
True
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
)
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
)
features_dense_t
.
requires_grad
=
True
features_dense_t
.
requires_grad
=
True
net
=
SparseDeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
net
=
SparseDeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
net_ref
=
DeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
d
).
to
(
device
)
net_ref
=
DeConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
)
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
3
,
4
,
0
,
1
,
2
).
contiguous
()
net_ref
.
net
[
0
].
weight
.
data
[:]
=
filters_t
.
permute
(
3
,
4
,
0
,
1
,
2
).
contiguous
()
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
net
.
net
[
0
].
weight
.
data
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
out_ref
=
net_ref
(
features_dense_t
)
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
out
=
net
(
features_t
,
indices_t
,
bs
).
dense
()
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
out
.
backward
(
dout_t
)
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
din
=
features_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
din_np
=
din
.
cpu
().
numpy
()
...
@@ -440,7 +456,7 @@ class TestSpConv(TestCase):
...
@@ -440,7 +456,7 @@ class TestSpConv(TestCase):
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
def
testSpCpConv3d
(
self
):
def
testSpCpConv3d
(
self
):
np
.
random
.
seed
(
484
)
np
.
random
.
seed
(
484
)
devices
=
[
"cuda:0"
,
"cpu:0"
]
devices
=
[
"cuda:0"
,
"cpu:0"
]
...
@@ -455,19 +471,23 @@ class TestSpConv(TestCase):
...
@@ -455,19 +471,23 @@ class TestSpConv(TestCase):
dilations
=
[
1
,
2
,
3
]
dilations
=
[
1
,
2
,
3
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
in
params_grid
(
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
):
strides
):
device
=
torch
.
device
(
dev
)
device
=
torch
.
device
(
dev
)
num_points
=
[
1000
]
*
bs
num_points
=
[
1000
]
*
bs
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
indices_scn_t
=
torch
.
from_numpy
(
indices
[:,
[
1
,
2
,
3
,
0
]]).
int
().
to
(
device
)
indices_scn_t
=
torch
.
from_numpy
(
indices
[:,
[
1
,
2
,
3
,
0
]]).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
features_t
.
requires_grad
=
True
features_ref_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_ref_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
...
@@ -475,11 +495,14 @@ class TestSpConv(TestCase):
...
@@ -475,11 +495,14 @@ class TestSpConv(TestCase):
net_ref
=
SCNCoupleDeConvTest
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
).
to
(
device
)
net_ref
=
SCNCoupleDeConvTest
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
).
to
(
device
)
net
=
SparseCoupleDeConvTest
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
).
to
(
device
)
net
=
SparseCoupleDeConvTest
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
).
to
(
device
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
net
.
net
[
0
].
weight
.
data
[:].
view
(
*
net_ref
.
net
[
0
].
weight
.
shape
)
net_ref
.
net
[
0
].
weight
.
data
[:]
=
net
.
net
[
0
].
weight
.
data
[:].
view
(
net_ref
.
net
[
1
].
weight
.
data
[:]
=
net
.
net
[
1
].
weight
.
data
[:].
view
(
*
net_ref
.
net
[
1
].
weight
.
shape
)
*
net_ref
.
net
[
0
].
weight
.
shape
)
net_ref
.
net
[
1
].
weight
.
data
[:]
=
net
.
net
[
1
].
weight
.
data
[:].
view
(
*
net_ref
.
net
[
1
].
weight
.
shape
)
out_ref
=
net_ref
(
features_ref_t
,
indices_scn_t
,
bs
)
out_ref
=
net_ref
(
features_ref_t
,
indices_scn_t
,
bs
)
out
=
net
(
features_t
,
indices_t
,
bs
)
out
=
net
(
features_t
,
indices_t
,
bs
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
out_ref
.
shape
).
astype
(
features
.
dtype
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
dout_t
=
torch
.
from_numpy
(
dout
).
to
(
device
)
out
.
backward
(
dout_t
)
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
...
@@ -490,15 +513,14 @@ class TestSpConv(TestCase):
...
@@ -490,15 +513,14 @@ class TestSpConv(TestCase):
self
.
assertAllClose
(
din_ref_np
,
din_np
,
atol
=
1e-4
)
self
.
assertAllClose
(
din_ref_np
,
din_np
,
atol
=
1e-4
)
for
layer
,
layer_ref
in
zip
(
net
.
net
,
net_ref
.
net
):
for
layer
,
layer_ref
in
zip
(
net
.
net
,
net_ref
.
net
):
dw
=
layer
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
dw
=
layer
.
weight
.
grad
.
detach
().
cpu
().
numpy
()
dw_ref
=
layer_ref
.
weight
.
grad
.
detach
().
cpu
().
view
(
*
dw
.
shape
).
numpy
()
dw_ref
=
layer_ref
.
weight
.
grad
.
detach
().
cpu
().
view
(
*
dw
.
shape
).
numpy
()
self
.
assertAllClose
(
dw
,
dw_ref
,
atol
=
1e-4
)
self
.
assertAllClose
(
dw
,
dw_ref
,
atol
=
1e-4
)
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
def
testSpMaxPool3d
(
self
):
def
testSpMaxPool3d
(
self
):
np
.
random
.
seed
(
485
)
np
.
random
.
seed
(
485
)
devices
=
[
"cuda:0"
,
"cpu:0"
]
devices
=
[
"cuda:0"
,
"cpu:0"
]
...
@@ -513,19 +535,25 @@ class TestSpConv(TestCase):
...
@@ -513,19 +535,25 @@ class TestSpConv(TestCase):
dilations
=
[
1
,
2
,
3
]
dilations
=
[
1
,
2
,
3
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
strides
,
paddings
,
dilations
):
if
all
([
s
>
1
,
d
>
1
]):
if
all
([
s
>
1
,
d
>
1
]):
continue
# don't support this.
continue
# don't support this.
device
=
torch
.
device
(
dev
)
device
=
torch
.
device
(
dev
)
num_points
=
[
1000
]
*
bs
num_points
=
[
1000
]
*
bs
# when data contains negative, sparse maxpool is not equal to dense maxpool.
# when data contains negative, sparse maxpool is not equal to dense maxpool.
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
,
data_range
=
[
0.1
,
1
])
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
IC
,
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
data_range
=
[
0.1
,
1
])
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
,
k
,
k
,
IC
,
OC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
)
features_t
.
requires_grad
=
True
features_t
.
requires_grad
=
True
...
@@ -540,24 +568,27 @@ class TestSpConv(TestCase):
...
@@ -540,24 +568,27 @@ class TestSpConv(TestCase):
outfeatures
=
out
.
features
outfeatures
=
out
.
features
out_dense
=
out
.
dense
(
channels_first
=
False
)
out_dense
=
out
.
dense
(
channels_first
=
False
)
out
=
out_dense
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
out
=
out_dense
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
dout_sparse
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
outfeatures
.
shape
).
astype
(
features
.
dtype
)
dout_sparse
=
np
.
random
.
uniform
(
-
0.2
,
0.2
,
outfeatures
.
shape
).
astype
(
features
.
dtype
)
dout_sparse_t
=
torch
.
from_numpy
(
dout_sparse
).
to
(
device
)
dout_sparse_t
=
torch
.
from_numpy
(
dout_sparse
).
to
(
device
)
dout_t
=
scatter_nd
(
outids
.
long
(),
dout_sparse_t
,
list
(
out_dense
.
shape
))
dout_t
=
scatter_nd
(
outids
.
long
(),
dout_sparse_t
,
list
(
out_dense
.
shape
))
dout_t
=
dout_t
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
dout_t
=
dout_t
.
permute
(
0
,
4
,
1
,
2
,
3
).
contiguous
()
out
.
backward
(
dout_t
)
out
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
out_ref
.
backward
(
dout_t
)
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_dense
=
features_dense_t
.
grad
.
detach
().
permute
(
0
,
2
,
3
,
4
,
1
).
contiguous
()
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din_sparse
=
gather_nd
(
din_dense
,
indices_t
.
long
())
din
=
features_t
.
grad
.
detach
()
din
=
features_t
.
grad
.
detach
()
din_np
=
din
.
cpu
().
numpy
()
din_np
=
din
.
cpu
().
numpy
()
din_sparse_np
=
din_sparse
.
cpu
().
numpy
()
din_sparse_np
=
din_sparse
.
cpu
().
numpy
()
self
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
self
.
assertAllClose
(
din_np
,
din_sparse_np
,
atol
=
1e-4
)
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_np
=
out
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
out_ref_np
=
out_ref
.
detach
().
cpu
().
numpy
()
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
self
.
assertAllClose
(
out_np
,
out_ref_np
,
atol
=
1e-4
)
def
main
():
def
main
():
# function for develop.
# function for develop.
...
@@ -567,7 +598,6 @@ def main():
...
@@ -567,7 +598,6 @@ def main():
shapes
=
[[
50
,
30
,
30
]]
shapes
=
[[
50
,
30
,
30
]]
batchsizes
=
[
2
]
batchsizes
=
[
2
]
in_channels
=
[
256
]
in_channels
=
[
256
]
out_channels
=
[
256
]
out_channels
=
[
256
]
ksizes
=
[(
3
,
1
,
1
)]
ksizes
=
[(
3
,
1
,
1
)]
...
@@ -576,8 +606,8 @@ def main():
...
@@ -576,8 +606,8 @@ def main():
dilations
=
[
1
]
dilations
=
[
1
]
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
for
dev
,
shape
,
bs
,
IC
,
OC
,
k
,
s
,
p
,
d
in
params_grid
(
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
devices
,
shapes
,
batchsizes
,
in_channels
,
out_channels
,
ksizes
,
strides
,
paddings
,
dilations
):
strides
,
paddings
,
dilations
):
if
all
([
s
>
1
,
d
>
1
]):
if
all
([
s
>
1
,
d
>
1
]):
continue
continue
device
=
torch
.
device
(
dev
)
device
=
torch
.
device
(
dev
)
...
@@ -585,19 +615,25 @@ def main():
...
@@ -585,19 +615,25 @@ def main():
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
sparse_dict
=
generate_sparse_data
(
shape
,
num_points
,
IC
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
np
.
float32
)
features
=
np
.
ascontiguousarray
(
sparse_dict
[
"features"
]).
astype
(
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
np
.
float32
)
indices
=
np
.
ascontiguousarray
(
sparse_dict
[
"indices"
][:,
[
3
,
0
,
1
,
2
]]).
astype
(
np
.
int32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
features_dense
=
sparse_dict
[
"features_dense"
].
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
)
indices_t
=
torch
.
from_numpy
(
indices
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
[
0
],
1
,
1
,
IC
,
OC
]).
astype
(
np
.
float32
)
filters
=
np
.
random
.
uniform
(
0
,
1
,
size
=
[
k
[
0
],
1
,
1
,
IC
,
OC
]).
astype
(
np
.
float32
)
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
).
float
()
indices_t
=
torch
.
from_numpy
(
indices
).
int
().
to
(
device
).
float
()
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
).
float
()
features_t
=
torch
.
from_numpy
(
features
).
to
(
device
).
float
()
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
).
float
()
features_dense_t
=
torch
.
from_numpy
(
features_dense
).
to
(
device
).
float
()
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
float
()
net
=
SparseConv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
float
()
d
).
to
(
device
).
float
()
net_ref
=
Conv3dTestTorch
(
1
,
3
,
shape
,
IC
,
OC
,
k
,
s
,
p
,
d
).
to
(
device
).
float
()
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
float
()
filters_t
=
torch
.
from_numpy
(
filters
).
to
(
device
).
float
()
net_ref
.
net
[
0
].
weight
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net_ref
.
net
[
0
].
weight
[:]
=
filters_t
.
permute
(
4
,
3
,
0
,
1
,
2
).
contiguous
()
net
.
net
[
0
].
weight
[:]
=
filters_t
net
.
net
[
0
].
weight
[:]
=
filters_t
out_ref
=
net_ref
(
features_dense_t
)
out_ref
=
net_ref
(
features_dense_t
)
times
=
[]
times
=
[]
...
@@ -607,16 +643,16 @@ def main():
...
@@ -607,16 +643,16 @@ def main():
torch
.
cuda
.
synchronize
()
torch
.
cuda
.
synchronize
()
times
.
append
(
time
.
time
()
-
t
)
times
.
append
(
time
.
time
()
-
t
)
# print((net.grid == -1).float().sum(), net.grid.numel())
# print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t)
# print("spconv time", time.time() - t)
print
(
"spconv time"
,
np
.
mean
(
times
[
2
:]))
print
(
"spconv time"
,
np
.
mean
(
times
[
2
:]))
out
=
net
(
features_t
,
indices_t
,
bs
)
out
=
net
(
features_t
,
indices_t
,
bs
)
# print(out.indices)
# print(out.indices)
out
=
out
.
dense
()
out
=
out
.
dense
()
print
(
np
.
linalg
.
norm
(
out
.
detach
().
cpu
().
numpy
()
-
out_ref
.
detach
().
cpu
().
numpy
()))
print
(
np
.
linalg
.
norm
(
out
.
detach
().
cpu
().
numpy
()
-
out_ref
.
detach
().
cpu
().
numpy
()))
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
main
()
main
()
# unittest.main()
# unittest.main()
pybind11
@
3b1dbeba
Compare
085a2943
...
3b1dbeba
Subproject commit
085a29436a8c472caaaf7157aa644b571079bcaa
Subproject commit
3b1dbebabc801c9cf6f0953a4c20b904d444f879
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment