Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
d17a00e0
Commit
d17a00e0
authored
Jun 05, 2020
by
yanyan
Browse files
sync some code
parent
11bcbbf6
Changes
13
Show whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
389 additions
and
97 deletions
+389
-97
include/spconv/indice.cu.h
include/spconv/indice.cu.h
+145
-29
include/spconv/spconv_ops.h
include/spconv/spconv_ops.h
+1
-1
include/sphash/hashmap.h
include/sphash/hashmap.h
+19
-0
include/tensorview/tensor.h
include/tensorview/tensor.h
+14
-0
spconv/__init__.py
spconv/__init__.py
+2
-0
spconv/ops.py
spconv/ops.py
+7
-17
spconv/pool.py
spconv/pool.py
+6
-2
src/spconv/all.cc
src/spconv/all.cc
+0
-1
src/spconv/indice.cu
src/spconv/indice.cu
+89
-37
src/spconv/spconv_ops.cc
src/spconv/spconv_ops.cc
+14
-8
test/benchmark.py
test/benchmark.py
+90
-0
test/data/benchmark-pc.npz
test/data/benchmark-pc.npz
+0
-0
test/test_conv.py
test/test_conv.py
+2
-2
No files found.
include/spconv/indice.cu.h
View file @
d17a00e0
...
...
@@ -269,7 +269,7 @@ __global__ void getSubMIndicePairsKernel(
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
K0
,
unsigned
K1
,
unsigned
K2
>
__global__
void
getSubMIndicePairsKernel3
(
__global__
void
getSubMIndicePairs
Unroll
Kernel3
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
3
>
outSpatialShape
,
Index
spatialVolume
)
{
...
...
@@ -290,25 +290,26 @@ __global__ void getSubMIndicePairsKernel3(
#pragma unroll
for
(
int
k
=
0
;
k
<
K2
;
++
k
)
{
offset
=
i
*
K1
*
K2
+
j
*
K2
+
k
;
if
(
offset
>
center
){
if
(
offset
>
center
)
{
continue
;
}
if
(
center
==
offset
){
if
(
center
==
offset
)
{
// center of subm indice pairs dont need atomicadd
indicePairs
(
1
,
offset
,
ix
)
=
ix
;
indicePairs
(
0
,
offset
,
ix
)
=
ix
;
}
else
{
}
else
{
point
[
2
]
=
indice_data
[
3
]
-
k
+
K2
/
2
;
point
[
1
]
=
indice_data
[
2
]
-
j
+
K1
/
2
;
point
[
0
]
=
indice_data
[
1
]
-
i
+
K0
/
2
;
if
(
point
[
1
]
>=
0
&&
point
[
1
]
<
outSpatialShape
[
1
]
&&
point
[
2
]
>=
0
&&
point
[
2
]
<
outSpatialShape
[
2
]
&&
point
[
0
]
>=
0
&&
point
[
0
]
<
outSpatialShape
[
0
])
{
if
(
point
[
1
]
>=
0
&&
point
[
1
]
<
outSpatialShape
[
1
]
&&
point
[
2
]
>=
0
&&
point
[
2
]
<
outSpatialShape
[
2
]
&&
point
[
0
]
>=
0
&&
point
[
0
]
<
outSpatialShape
[
0
])
{
index
=
tv
::
ArrayIndexRowMajor
<
3
,
3
>::
runPtrs
(
point
,
outSpatialShape
.
data
(),
0
)
+
spatialVolume
*
indice_data
[
0
];
if
(
gridsOut
[
index
]
!=
-
1
)
{
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1]
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i
// - 1]
Index
oldNum
=
atomicAdd
(
indiceNum
.
data
()
+
offset
,
Index
(
1
));
atomicAdd
(
indiceNum
.
data
()
+
KV
-
offset
-
1
,
Index
(
1
));
indicePairs
(
1
,
offset
,
oldNum
)
=
gridsOut
[
index
];
...
...
@@ -325,7 +326,7 @@ __global__ void getSubMIndicePairsKernel3(
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
K0
,
unsigned
K1
>
__global__
void
getSubMIndicePairsKernel2
(
__global__
void
getSubMIndicePairs
Unroll
Kernel2
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
2
>
outSpatialShape
,
Index
spatialVolume
)
{
...
...
@@ -344,14 +345,14 @@ __global__ void getSubMIndicePairsKernel2(
#pragma unroll
for
(
int
j
=
0
;
j
<
K1
;
++
j
)
{
offset
=
i
*
K1
+
j
;
if
(
offset
>
center
){
if
(
offset
>
center
)
{
continue
;
}
if
(
center
==
offset
){
if
(
center
==
offset
)
{
// center of subm indice pairs dont need atomicadd
indicePairs
(
1
,
offset
,
ix
)
=
ix
;
indicePairs
(
0
,
offset
,
ix
)
=
ix
;
}
else
{
}
else
{
point
[
1
]
=
indice_data
[
2
]
-
j
+
K1
/
2
;
point
[
0
]
=
indice_data
[
1
]
-
i
+
K0
/
2
;
if
(
point
[
1
]
>=
0
&&
point
[
1
]
<
outSpatialShape
[
1
]
&&
point
[
0
]
>=
0
&&
...
...
@@ -418,6 +419,130 @@ __global__ void getSubMIndicePairsHashKernel(
}
}
/// Builds submanifold-convolution indice pairs for a 3-D kernel whose extents
/// K0 x K1 x K2 are compile-time constants, looking neighbours up in a cuhash
/// table.
///
/// Only kernel offsets up to and including the centre (KV / 2) are visited.
/// The centre offset pairs every active index with itself. For each neighbour
/// found at an offset below the centre, one slot is reserved with atomicAdd
/// and the pair is written both at `offset` and, with the two entries swapped,
/// at the mirrored offset `KV - offset - 1`.
///
/// @param indicesIn        Active indices; each entry is read as 3 + 1
///                         consecutive values. Values [1..3] are used as
///                         spatial coordinates, value [0] is multiplied by
///                         spatialVolume (presumably the batch index - confirm
///                         against the caller).
/// @param indicePairs      Output, written as indicePairs(0|1, offset, slot).
/// @param indiceNum        Per-offset pair counters, advanced with atomicAdd;
///                         the centre counter is set to the number of active
///                         indices.
/// @param outSpatialShape  Bounds used to reject out-of-range neighbours.
/// @param spatialVolume    Stride applied to indice_data[0] in the lookup key.
/// @param table_size, table, constants, stash_constants, stash_count
///                         Forwarded unchanged to cuhash::retrieve.
template <typename Index, unsigned K0, unsigned K1, unsigned K2,
          unsigned kNumHashFunctions = 4>
__global__ void getSubMIndicePairsHashUnrollKernel3(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum,
    const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume,
    unsigned table_size, const cuhash::Entry *table,
    cuhash::Functions<kNumHashFunctions> constants, uint2 stash_constants,
    unsigned stash_count) {
  constexpr unsigned KV = K0 * K1 * K2;
  constexpr unsigned center = KV / 2;

  auto numActIn = indicesIn.dim(0);
  auto pairCounts = indiceNum.data();
  // Executed unconditionally by every thread: the centre offset always holds
  // exactly one pair per active index.
  pairCounts[center] = numActIn;

  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    const Index *indice_data = indicesIn.data() + ix * (3 + 1);
#pragma unroll
    for (int i = 0; i < K0; ++i) {
#pragma unroll
      for (int j = 0; j < K1; ++j) {
#pragma unroll
        for (int k = 0; k < K2; ++k) {
          const Index offset = i * K1 * K2 + j * K2 + k;
          // The upper half of the kernel is filled through mirroring below.
          if (offset > center) {
            continue;
          }
          if (offset == center) {
            // center of subm indice pairs doesn't need atomicAdd
            indicePairs(1, offset, ix) = ix;
            indicePairs(0, offset, ix) = ix;
            continue;
          }

          Index point[3];
          point[0] = indice_data[1] - i + K0 / 2;
          point[1] = indice_data[2] - j + K1 / 2;
          point[2] = indice_data[3] - k + K2 / 2;
          // Skip neighbours that fall outside the spatial bounds.
          if (point[0] < 0 || point[0] >= outSpatialShape[0] ||
              point[1] < 0 || point[1] >= outSpatialShape[1] ||
              point[2] < 0 || point[2] >= outSpatialShape[2]) {
            continue;
          }

          const Index index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
                                  point, outSpatialShape.data(), 0) +
                              spatialVolume * indice_data[0];
          auto val = cuhash::retrieve((unsigned)(index), table_size, table,
                                      constants, stash_constants, stash_count);
          if (val == cuhash::kNotFound) {
            continue;
          }

          // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1]
          const unsigned mirrored = KV - offset - 1;
          // Both counters advance together, so the slot reserved for `offset`
          // is reused for the mirrored offset.
          Index oldNum = atomicAdd(pairCounts + offset, Index(1));
          atomicAdd(pairCounts + mirrored, Index(1));
          indicePairs(1, offset, oldNum) = val;
          indicePairs(0, offset, oldNum) = ix;
          indicePairs(1, mirrored, oldNum) = ix;
          indicePairs(0, mirrored, oldNum) = val;
        }
      }
    }
  }
}
/// Builds submanifold-convolution indice pairs for a 2-D kernel whose extents
/// K0 x K1 are compile-time constants, looking neighbours up in a cuhash
/// table.
///
/// Only kernel offsets up to and including the centre (KV / 2) are visited.
/// The centre offset pairs every active index with itself. For each neighbour
/// found at an offset below the centre, one slot is reserved with atomicAdd
/// and the pair is written both at `offset` and, with the two entries swapped,
/// at the mirrored offset `KV - offset - 1`.
///
/// @param indicesIn        Active indices; each entry is read as 2 + 1
///                         consecutive values. Values [1..2] are used as
///                         spatial coordinates, value [0] is multiplied by
///                         spatialVolume (presumably the batch index - confirm
///                         against the caller).
/// @param indicePairs      Output, written as indicePairs(0|1, offset, slot).
/// @param indiceNum        Per-offset pair counters, advanced with atomicAdd;
///                         the centre counter is set to the number of active
///                         indices.
/// @param outSpatialShape  Bounds used to reject out-of-range neighbours.
/// @param spatialVolume    Stride applied to indice_data[0] in the lookup key.
/// @param table_size, table, constants, stash_constants, stash_count
///                         Forwarded unchanged to cuhash::retrieve.
template <typename Index, unsigned K0, unsigned K1,
          unsigned kNumHashFunctions = 4>
__global__ void getSubMIndicePairsHashUnrollKernel2(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum,
    const tv::SimpleVector<Index, 2> outSpatialShape, Index spatialVolume,
    unsigned table_size, const cuhash::Entry *table,
    cuhash::Functions<kNumHashFunctions> constants, uint2 stash_constants,
    unsigned stash_count) {
  constexpr unsigned KV = K0 * K1;
  constexpr unsigned center = KV / 2;

  auto numActIn = indicesIn.dim(0);
  auto pairCounts = indiceNum.data();
  // Executed unconditionally by every thread: the centre offset always holds
  // exactly one pair per active index.
  pairCounts[center] = numActIn;

  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    const Index *indice_data = indicesIn.data() + ix * (2 + 1);
#pragma unroll
    for (int i = 0; i < K0; ++i) {
#pragma unroll
      for (int j = 0; j < K1; ++j) {
        const Index offset = i * K1 + j;
        // The upper half of the kernel is filled through mirroring below.
        if (offset > center) {
          continue;
        }
        if (offset == center) {
          // center of subm indice pairs doesn't need atomicAdd
          indicePairs(1, offset, ix) = ix;
          indicePairs(0, offset, ix) = ix;
          continue;
        }

        Index point[2];
        point[0] = indice_data[1] - i + K0 / 2;
        point[1] = indice_data[2] - j + K1 / 2;
        // Skip neighbours that fall outside the spatial bounds.
        if (point[0] < 0 || point[0] >= outSpatialShape[0] ||
            point[1] < 0 || point[1] >= outSpatialShape[1]) {
          continue;
        }

        const Index index = tv::ArrayIndexRowMajor<2, 2>::runPtrs(
                                point, outSpatialShape.data(), 0) +
                            spatialVolume * indice_data[0];
        auto val = cuhash::retrieve((unsigned)(index), table_size, table,
                                    constants, stash_constants, stash_count);
        if (val == cuhash::kNotFound) {
          continue;
        }

        // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1]
        const unsigned mirrored = KV - offset - 1;
        // Both counters advance together, so the slot reserved for `offset`
        // is reused for the mirrored offset.
        Index oldNum = atomicAdd(pairCounts + offset, Index(1));
        atomicAdd(pairCounts + mirrored, Index(1));
        indicePairs(1, offset, oldNum) = val;
        indicePairs(0, offset, oldNum) = ix;
        indicePairs(1, mirrored, oldNum) = ix;
        indicePairs(0, mirrored, oldNum) = val;
      }
    }
  }
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
__global__
void
resetGridKernel
(
const
Index
*
indicePairUnique
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
...
...
@@ -437,22 +562,13 @@ template <typename Index, typename IndexGrid, unsigned NDim>
__global__
void
resetGridSubMKernel
(
const
Index
*
indices
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
int
numAct
)
{
Index
outSpatialShapeReg
[
NDim
];
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
outSpatialShapeReg
[
i
]
=
outSpatialShape
[
i
];
}
Index
spatialVolume
=
1
;
int
numAct
,
Index
spatialVolume
)
{
auto
indsPtr
=
indices
;
#pragma unroll
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
spatialVolume
*=
outSpatialShape
[
i
];
}
Index
index
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numAct
))
{
indsPtr
=
indices
+
ix
*
(
NDim
+
1
);
index
=
tv
::
ArrayIndexRowMajor
<
NDim
,
NDim
>::
runPtrs
(
indsPtr
+
1
,
outSpatialShape
Reg
,
0
);
outSpatialShape
.
data
()
,
0
);
gridsOut
[
index
+
spatialVolume
*
indsPtr
[
0
]]
=
-
1
;
}
}
...
...
include/spconv/spconv_ops.h
View file @
d17a00e0
...
...
@@ -27,7 +27,7 @@ enum ConvAlgo { kNative = 0, kBatch = 1, kBatchGemmGather = 2 };
// torch.jit's doc says only support int64, so we need to convert to int32.
std
::
vector
<
torch
::
Tensor
>
getIndicePairs
(
torch
::
Tensor
indices
,
int64_t
batchSize
,
getIndicePairs
(
torch
::
Tensor
indices
,
torch
::
Tensor
gridOut
,
int64_t
batchSize
,
std
::
vector
<
int64_t
>
outSpatialShape
,
std
::
vector
<
int64_t
>
spatialShape
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
...
...
include/sphash/hashmap.h
0 → 100644
View file @
d17a00e0
// Include guard: this header defines an enum and an explicit specialization,
// so including it twice in one translation unit would otherwise be a
// redefinition error.
#pragma once

#include <tensorview/tensor.h>

namespace spconv {

/// Identifiers for the available hash-map implementations; used as the
/// non-type template argument of HashMap.
enum HashTypes { kDenseMap = 0, kCUDPPHash = 1 };

/// Hash map selected at compile time by a HashTypes value.
/// The primary template is declaration-only; each implementation provides its
/// own specialization.
template <int Impl> struct HashMap;

/// Specialization for the dense-map implementation (currently an empty
/// placeholder).
template <> struct HashMap<kDenseMap> {};

} // namespace spconv
\ No newline at end of file
include/tensorview/tensor.h
View file @
d17a00e0
...
...
@@ -510,6 +510,20 @@ struct DispatchInt<T<Args...>> {
}
};
/// Functor form of dispatch_int_noexcept whose candidate integers come from a
/// type list. The primary template is declaration-only; it is usable only
/// through the type-list specialization below.
template <class TypeList> struct DispatchIntNoexcept;

/// Specialization for any `List<Candidates...>` where every candidate type
/// exposes a static `value`; those values become the template arguments of
/// dispatch_int_noexcept.
template <template <class...> class List, class... Candidates>
struct DispatchIntNoexcept<List<Candidates...>> {
  /// Forwards `t` and the callable to dispatch_int_noexcept and returns its
  /// result (presumably whether `t` matched one of the candidates - confirm
  /// against dispatch_int_noexcept).
  template <typename F> inline bool operator()(int t, F &&f) {
    return dispatch_int_noexcept<Candidates::value...>(t, std::forward<F>(f));
  }

  /// Same as above, additionally passing the binary predicate `p` through to
  /// dispatch_int_noexcept.
  template <typename F, typename BinaryPredicate>
  inline bool operator()(int t, BinaryPredicate p, F &&f) {
    return dispatch_int_noexcept<Candidates::value...>(t, p,
                                                       std::forward<F>(f));
  }
};
constexpr
size_t
kTensorMaxDim
=
10
;
using
TensorShape
=
ShapeBase
<
kTensorMaxDim
,
int64_t
>
;
...
...
spconv/__init__.py
View file @
d17a00e0
...
...
@@ -68,6 +68,8 @@ class SparseConvTensor(object):
self
.
spatial_shape
=
spatial_shape
self
.
batch_size
=
batch_size
self
.
indice_dict
=
{}
if
grid
is
None
:
grid
=
torch
.
Tensor
()
# empty tensor
self
.
grid
=
grid
@
classmethod
...
...
spconv/ops.py
View file @
d17a00e0
...
...
@@ -88,23 +88,13 @@ def get_indice_pairs(indices,
else
:
out_shape
=
spatial_shape
if
grid
is
None
:
res
=
torch
.
ops
.
spconv
.
get_indice_pairs
(
indices
,
batch_size
,
out_shape
,
grid
=
torch
.
Tensor
()
res
=
torch
.
ops
.
spconv
.
get_indice_pairs
(
indices
,
grid
,
batch_size
,
out_shape
,
spatial_shape
,
ksize
,
stride
,
padding
,
dilation
,
out_padding
,
int
(
subm
),
int
(
transpose
),
int
(
use_hash
))
return
res
else
:
if
ndim
==
2
:
get_indice_pairs_func
=
torch
.
ops
.
spconv
.
get_indice_pairs_grid_2d
elif
ndim
==
3
:
get_indice_pairs_func
=
torch
.
ops
.
spconv
.
get_indice_pairs_grid_3d
else
:
raise
NotImplementedError
return
get_indice_pairs_func
(
indices
,
grid
,
batch_size
,
out_shape
,
spatial_shape
,
ksize
,
stride
,
padding
,
dilation
,
out_padding
,
int
(
subm
),
int
(
transpose
),
int
(
use_hash
))
def
indice_conv
(
features
,
...
...
spconv/pool.py
View file @
d17a00e0
...
...
@@ -46,7 +46,6 @@ class SparseMaxPool(SparseModule):
padding
=
[
padding
]
*
ndim
if
not
isinstance
(
dilation
,
(
list
,
tuple
)):
dilation
=
[
dilation
]
*
ndim
self
.
ndim
=
ndim
self
.
kernel_size
=
kernel_size
self
.
stride
=
stride
...
...
@@ -61,6 +60,8 @@ class SparseMaxPool(SparseModule):
indices
=
input
.
indices
spatial_shape
=
input
.
spatial_shape
batch_size
=
input
.
batch_size
torch
.
cuda
.
synchronize
()
t
=
time
.
time
()
if
not
self
.
subm
:
out_spatial_shape
=
ops
.
get_conv_output_size
(
spatial_shape
,
self
.
kernel_size
,
self
.
stride
,
self
.
padding
,
...
...
@@ -69,11 +70,14 @@ class SparseMaxPool(SparseModule):
out_spatial_shape
=
spatial_shape
outids
,
indice_pairs
,
indice_pairs_num
=
ops
.
get_indice_pairs
(
indices
,
batch_size
,
spatial_shape
,
self
.
kernel_size
,
self
.
stride
,
self
.
padding
,
self
.
dilation
,
0
,
self
.
subm
)
self
.
padding
,
self
.
dilation
,
0
,
self
.
subm
,
grid
=
input
.
grid
)
out_features
=
Fsp
.
indice_maxpool
(
features
,
indice_pairs
.
to
(
device
),
indice_pairs_num
.
to
(
device
),
outids
.
shape
[
0
])
torch
.
cuda
.
synchronize
()
print
(
"maxpool"
,
spatial_shape
,
time
.
time
()
-
t
)
out_tensor
=
spconv
.
SparseConvTensor
(
out_features
,
outids
,
out_spatial_shape
,
batch_size
)
out_tensor
.
indice_dict
=
input
.
indice_dict
...
...
src/spconv/all.cc
View file @
d17a00e0
...
...
@@ -23,7 +23,6 @@ static auto registry =
torch
::
RegisterOperators
()
.
op
(
"spconv::get_indice_pairs"
,
&
spconv
::
getIndicePairs
)
.
op
(
"spconv::indice_conv"
,
&
spconv
::
indiceConv
)
.
op
(
"spconv::indice_conv_batch"
,
&
spconv
::
indiceConvBatch
)
.
op
(
"spconv::indice_conv_backward"
,
&
spconv
::
indiceConvBackward
)
.
op
(
"spconv::fused_indice_conv_bn"
,
&
spconv
::
fusedIndiceConvBatchNorm
)
.
op
(
"spconv::indice_maxpool"
,
&
spconv
::
indiceMaxPool
)
...
...
src/spconv/indice.cu
View file @
d17a00e0
...
...
@@ -42,6 +42,8 @@ int create_conv_indice_pair_p1_cuda(
auto
ndim
=
kernelSize
.
size
();
auto
numActIn
=
indicesIn
.
size
(
0
);
auto
kernelVolume
=
indiceNum
.
size
(
0
);
// auto timer = spconv::CudaContextTimer<>();
if
(
numActIn
==
0
)
return
0
;
tv
::
dispatch_torch
<
int32_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
IndexValue
)
{
...
...
@@ -77,6 +79,7 @@ int create_conv_indice_pair_p1_cuda(
pa
,
di
,
ou
);
TV_CHECK_CUDA_ERR_V2
(
"prepareIndicePairsKernel failed"
);
}
// tv::ssprint("prepareIndicePairsKernel", timer.report() / 1000.0);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes
attr
;
checkCudaErrors
(
cudaFuncGetAttributes
(
...
...
@@ -193,10 +196,6 @@ int create_conv_indice_pair_p2_cuda(
return
numAct
;
}
template
<
typename
T
>
struct
is_valid
{
__device__
__forceinline__
bool
operator
()(
const
T
x
)
{
return
x
!=
-
1
;
}
};
int
create_submconv_indice_pair_cuda
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
gridsOut
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
std
::
vector
<
int64_t
>
kernelSize
,
...
...
@@ -226,6 +225,8 @@ int create_submconv_indice_pair_cuda(
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
spatialVolume
*=
outSpatialShape
[
i
];
}
auto
dispatcher
=
tv
::
DispatchIntNoexcept
<
tv
::
mp_list_c
<
int
,
1
,
3
,
5
>>
();
if
(
useHash
)
{
auto
table
=
cuhash
::
HashTable
();
// std::cout << "create " << numAct << " size table..." << std::endl;
...
...
@@ -252,18 +253,71 @@ int create_submconv_indice_pair_cuda(
auto
constants
=
table
.
get_constants_4
();
auto
stash_constants
=
table
.
get_stash_constants
();
auto
stash_count
=
table
.
get_stash_count
();
bool
dilation_one
=
true
;
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
dilation_one
&=
di
[
i
]
==
1
;
}
auto
found
=
false
;
if
(
dilation_one
&&
(
NDim
==
2
||
NDim
==
3
))
{
auto
indiceNumCpu
=
indiceNum
.
cpu
();
if
(
NDim
==
2
)
{
tv
::
SimpleVector
<
Index
,
2
>
ou_
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
dispatcher
(
kernelSize
[
0
],
[
&
](
auto
K0C
)
{
dispatcher
(
kernelSize
[
1
],
[
&
](
auto
K1C
)
{
constexpr
int
K0
=
decltype
(
K0C
)
::
value
;
constexpr
int
K1
=
decltype
(
K1C
)
::
value
;
found
=
true
;
getSubMIndicePairsHashUnrollKernel2
<
Index
,
K0
,
K1
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ou_
,
spatialVolume
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
});
});
}
else
if
(
NDim
==
3
)
{
tv
::
SimpleVector
<
Index
,
3
>
ou_
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
dispatcher
(
kernelSize
[
0
],
[
&
](
auto
K0C
)
{
dispatcher
(
kernelSize
[
1
],
[
&
](
auto
K1C
)
{
dispatcher
(
kernelSize
[
2
],
[
&
](
auto
K2C
)
{
constexpr
int
K0
=
decltype
(
K0C
)
::
value
;
constexpr
int
K1
=
decltype
(
K1C
)
::
value
;
constexpr
int
K2
=
decltype
(
K2C
)
::
value
;
found
=
true
;
getSubMIndicePairsHashUnrollKernel3
<
Index
,
K0
,
K1
,
K2
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ou_
,
spatialVolume
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
});
});
});
}
}
if
(
!
found
)
{
tv
::
DispatchInt
<
max_kernel_vol_t
>
()(
kernelVolume
,
std
::
less_equal
<
int
>
(),
[
&
](
auto
I2
)
{
constexpr
int
MaxKernelVolume
=
decltype
(
I2
)
::
value
;
getSubMIndicePairsHashKernel
<
Index
,
NDim
,
MaxKernelVolume
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ks
,
st
,
pa
,
di
,
ou
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ks
,
st
,
pa
,
di
,
ou
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
TV_CHECK_CUDA_ERR_V2
(
"getSubMIndicePairsHashKernel failed"
);
});
}
}
else
{
// auto timer = spconv::CudaContextTimer<>();
prepareSubMGridKernel
<
Index
,
IndexGrid
,
NDim
>
...
...
@@ -280,16 +334,17 @@ int create_submconv_indice_pair_cuda(
auto
found
=
false
;
if
(
dilation_one
&&
(
NDim
==
2
||
NDim
==
3
))
{
auto
indiceNumCpu
=
indiceNum
.
cpu
();
if
(
NDim
==
2
)
{
tv
::
SimpleVector
<
Index
,
2
>
ou_
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
tv
::
dispatch
_int_noexcept
<
1
,
3
,
5
>
(
kernelSize
[
0
],
[
&
](
auto
K0C
)
{
tv
::
dispatch
_int_noexcept
<
1
,
3
,
5
>
(
kernelSize
[
1
],
[
&
](
auto
K1C
)
{
dispatch
er
(
kernelSize
[
0
],
[
&
](
auto
K0C
)
{
dispatch
er
(
kernelSize
[
1
],
[
&
](
auto
K1C
)
{
constexpr
int
K0
=
decltype
(
K0C
)
::
value
;
constexpr
int
K1
=
decltype
(
K1C
)
::
value
;
found
=
true
;
getSubMIndicePairsKernel2
<
Index
,
IndexGrid
,
K0
,
K1
>
getSubMIndicePairs
Unroll
Kernel2
<
Index
,
IndexGrid
,
K0
,
K1
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
...
...
@@ -301,23 +356,20 @@ int create_submconv_indice_pair_cuda(
}
else
if
(
NDim
==
3
)
{
tv
::
SimpleVector
<
Index
,
3
>
ou_
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
tv
::
dispatch_int_noexcept
<
1
,
3
,
5
>
(
kernelSize
[
0
],
[
&
](
auto
K0C
)
{
tv
::
dispatch_int_noexcept
<
1
,
3
,
5
>
(
kernelSize
[
1
],
[
&
](
auto
K1C
)
{
tv
::
dispatch_int_noexcept
<
1
,
3
,
5
>
(
kernelSize
[
2
],
[
&
](
auto
K2C
)
{
dispatcher
(
kernelSize
[
0
],
[
&
](
auto
K0C
)
{
dispatcher
(
kernelSize
[
1
],
[
&
](
auto
K1C
)
{
dispatcher
(
kernelSize
[
2
],
[
&
](
auto
K2C
)
{
constexpr
int
K0
=
decltype
(
K0C
)
::
value
;
constexpr
int
K1
=
decltype
(
K1C
)
::
value
;
constexpr
int
K2
=
decltype
(
K2C
)
::
value
;
found
=
true
;
getSubMIndicePairsKernel3
<
Index
,
IndexGrid
,
K0
,
K1
,
K2
>
getSubMIndicePairs
Unroll
Kernel3
<
Index
,
IndexGrid
,
K0
,
K1
,
K2
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ou_
,
spatialVolume
);
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ou_
,
spatialVolume
);
});
});
});
...
...
@@ -344,7 +396,7 @@ int create_submconv_indice_pair_cuda(
resetGridSubMKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
indicesIn
.
data_ptr
<
Index
>
(),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
ou
,
numActIn
);
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
ou
,
numActIn
,
spatialVolume
);
TV_CHECK_CUDA_ERR_V2
(
"resetGridKernel failed"
);
}
});
...
...
src/spconv/spconv_ops.cc
View file @
d17a00e0
...
...
@@ -2,7 +2,7 @@
namespace
spconv
{
std
::
vector
<
torch
::
Tensor
>
getIndicePairs
(
torch
::
Tensor
indices
,
int64_t
batchSize
,
getIndicePairs
(
torch
::
Tensor
indices
,
torch
::
Tensor
gridOut
,
int64_t
batchSize
,
std
::
vector
<
int64_t
>
outSpatialShape
,
std
::
vector
<
int64_t
>
spatialShape
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
...
...
@@ -47,8 +47,11 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
if
(
useHash
)
{
gridSize
=
batchSize
;
}
torch
::
Tensor
gridOut
=
torch
::
full
(
bool
resetGrid
=
gridOut
.
numel
()
!=
0
;
if
(
!
resetGrid
){
gridOut
=
torch
::
full
(
{
gridSize
},
-
1
,
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
}
gridOut
=
gridOut
.
view
({
batchSize
,
-
1
});
int64_t
numActOut
=
-
1
;
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
...
...
@@ -68,7 +71,7 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
else
if
(
indices
.
device
().
type
()
==
torch
::
kCUDA
)
{
numActOut
=
create_submconv_indice_pair_cuda
(
indices
,
gridOut
,
indicePairs
,
indiceNum
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
,
transpose
,
false
,
useHash
);
dilation
,
outSpatialShape
,
transpose
,
resetGrid
,
useHash
);
if
(
numActOut
==
-
1
)
{
auto
device
=
indices
.
device
();
indicePairs
=
indicePairs
.
to
({
torch
::
kCPU
});
...
...
@@ -98,7 +101,7 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
if
(
indices
.
device
().
type
()
==
torch
::
kCPU
)
{
numActOut
=
create_conv_indice_pair_cpu
(
indices
,
outInds
,
gridOut
,
indicePairs
,
indiceNum
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
,
transpose
,
false
,
useHash
);
padding
,
dilation
,
outSpatialShape
,
transpose
,
resetGrid
,
useHash
);
}
#ifdef TV_CUDA
else
if
(
indices
.
device
().
type
()
==
torch
::
kCUDA
)
{
...
...
@@ -110,7 +113,7 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
indicePairUnique
=
std
::
get
<
0
>
(
res
);
numActOut
=
create_conv_indice_pair_p2_cuda
(
indices
,
outInds
,
gridOut
,
indicePairs
,
indiceNum
,
indicePairUnique
,
outSpatialShape
,
transpose
,
false
,
useHash
);
outSpatialShape
,
transpose
,
resetGrid
,
useHash
);
if
(
numActOut
==
-
1
)
{
auto
device
=
indices
.
device
();
outInds
=
outInds
.
to
({
torch
::
kCPU
});
...
...
@@ -188,7 +191,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double
totalGatherTime
=
0
;
double
totalGEMMTime
=
0
;
double
totalSAddTime
=
0
;
// tv::ssprint("first subm gemm time", timer.report() / 1000.0);
// tv::ssprint("first subm gemm time", timer.report() / 1000.0, std::vector<int>(indicePairNumCpu.data_ptr<int>(),
// indicePairNumCpu.data_ptr<int>() + kernelVolume));
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
auto
nHot
=
indicePairNumCpu
.
data_ptr
<
int
>
()[
i
];
...
...
@@ -237,6 +241,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
// totalSAddTime += timer.report() / 1000.0;
}
// tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
// tv::ssprint("final subm gemm time", timer.report() / 1000.0);
return
output
;
}
...
...
test/benchmark.py
0 → 100644
View file @
d17a00e0
import
torch
import
spconv
import
numpy
as
np
from
spconv.utils
import
VoxelGeneratorV2
from
pathlib
import
Path
from
torch
import
nn
import
time
def waymo_data(batch_size=1):
    """Load the bundled benchmark point cloud and voxelize it.

    Reads ``data/benchmark-pc.npz`` next to this file, runs it through a
    ``VoxelGeneratorV2`` and prepends an all-zero column to the voxel
    coordinates.

    ``batch_size`` is accepted but not used by this function.

    Returns:
        A ``(voxels, coors, grid_size)`` tuple: the voxels reshaped to
        ``(-1, 3)``, the coordinates with the extra leading zero column, and
        the generator's ``grid_size``.
    """
    voxel_generator = VoxelGeneratorV2([0.1, 0.1, 0.1],
                                       [-80, -80, -2, 80, 80, 6], 1, 150000)
    archive = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
    generated = voxel_generator.generate(archive["pc"])
    voxels = generated["voxels"].reshape(-1, 3)
    coors = generated["coordinates"]
    # Leading zero column, one entry per coordinate row.
    zero_column = np.full([coors.shape[0], 1], 0, coors.dtype)
    coors = np.concatenate([zero_column, coors], axis=1)
    return voxels, coors, voxel_generator.grid_size
class Net(nn.Module):
    """Benchmark network: seven stages of paired 3x3x3 SubMConv3d layers.

    Consecutive stages are separated by a ``SparseMaxPool3d(2, 2)``. Both
    convolutions of a stage share one ``indice_key`` ("c0" .. "c6"). No
    BatchNorm or ReLU layers are used between stages.
    """

    def __init__(self, shape):
        super().__init__()
        # Channel width entering the first stage, then leaving each stage.
        widths = [3, 64, 96, 128, 160, 192, 224, 256]
        layers = []
        for stage, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:])):
            if stage > 0:
                # Downsample between stages (not before the first one).
                layers.append(spconv.SparseMaxPool3d(2, 2))
            key = "c%d" % stage
            layers.append(
                spconv.SubMConv3d(c_in, c_out, 3, bias=False, indice_key=key))
            layers.append(
                spconv.SubMConv3d(c_out, c_out, 3, bias=False, indice_key=key))
        self.net = spconv.SparseSequential(*layers)
        max_batch_size = 1
        # Grid filled with -1, created once here and handed to every
        # SparseConvTensor built in forward().
        grid = torch.full([max_batch_size, *shape], -1, dtype=torch.int32)
        self.grid = grid.cuda()
        # self.grid = None
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a SparseConvTensor and run the layer stack."""
        sparse_input = spconv.SparseConvTensor(features, coors, self.shape,
                                               batch_size, self.grid)
        return self.net(sparse_input)
def main():
    """Run the benchmark network on the bundled point cloud and print timings.

    One forward pass is run first and its output spatial shape printed. Then
    20 forward passes are timed under ``torch.no_grad()``, each bracketed by
    ``torch.cuda.synchronize()``, and the mean of the last 10 is printed.
    """
    voxels, coors, spatial_shape = waymo_data()
    features = torch.from_numpy(voxels).cuda().float()
    indices = torch.from_numpy(coors).cuda()
    # The network is given the spatial shape in reversed order.
    model = Net(spatial_shape[::-1]).cuda().eval().float()
    print(indices.shape)
    out = model(features, indices, 1)
    print(out.spatial_shape)

    durations = []
    with torch.no_grad():
        for _ in range(20):
            torch.cuda.synchronize()
            started = time.time()
            out = model(features, indices, 1)
            torch.cuda.synchronize()
            durations.append(time.time() - started)
        # print((net.grid == -1).float().sum(), net.grid.numel())
        # print("spconv time", time.time() - t)
    # Only the last 10 of the 20 timed runs contribute to the reported mean.
    print("spconv time", np.mean(durations[10:]))
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
\ No newline at end of file
test/data/benchmark-pc.npz
0 → 100644
View file @
d17a00e0
File added
test/test_conv.py
View file @
d17a00e0
...
...
@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32):
if
__name__
==
'__main__'
:
main
_subm
(
algo
=
spconv
.
ConvAlgo
.
Native
,
dtype
=
torch
.
float32
)
main
_subm
(
algo
=
spconv
.
ConvAlgo
.
Native
,
dtype
=
torch
.
half
)
main
(
algo
=
spconv
.
ConvAlgo
.
Native
,
dtype
=
torch
.
float32
)
main
(
algo
=
spconv
.
ConvAlgo
.
Native
,
dtype
=
torch
.
half
)
# TestCase().assertAllClose(out_my, out_ref)
# unittest.main()
# TestSpConv().testSpConv3d()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment