Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
d17a00e0
Commit
d17a00e0
authored
Jun 05, 2020
by
yanyan
Browse files
sync some code
parent
11bcbbf6
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
389 additions
and
97 deletions
+389
-97
include/spconv/indice.cu.h
include/spconv/indice.cu.h
+145
-29
include/spconv/spconv_ops.h
include/spconv/spconv_ops.h
+1
-1
include/sphash/hashmap.h
include/sphash/hashmap.h
+19
-0
include/tensorview/tensor.h
include/tensorview/tensor.h
+14
-0
spconv/__init__.py
spconv/__init__.py
+2
-0
spconv/ops.py
spconv/ops.py
+7
-17
spconv/pool.py
spconv/pool.py
+6
-2
src/spconv/all.cc
src/spconv/all.cc
+0
-1
src/spconv/indice.cu
src/spconv/indice.cu
+89
-37
src/spconv/spconv_ops.cc
src/spconv/spconv_ops.cc
+14
-8
test/benchmark.py
test/benchmark.py
+90
-0
test/data/benchmark-pc.npz
test/data/benchmark-pc.npz
+0
-0
test/test_conv.py
test/test_conv.py
+2
-2
No files found.
include/spconv/indice.cu.h
View file @
d17a00e0
...
@@ -269,7 +269,7 @@ __global__ void getSubMIndicePairsKernel(
...
@@ -269,7 +269,7 @@ __global__ void getSubMIndicePairsKernel(
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
K0
,
unsigned
K1
,
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
K0
,
unsigned
K1
,
unsigned
K2
>
unsigned
K2
>
__global__
void
getSubMIndicePairsKernel3
(
__global__
void
getSubMIndicePairs
Unroll
Kernel3
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
3
>
outSpatialShape
,
Index
spatialVolume
)
{
const
tv
::
SimpleVector
<
Index
,
3
>
outSpatialShape
,
Index
spatialVolume
)
{
...
@@ -290,25 +290,26 @@ __global__ void getSubMIndicePairsKernel3(
...
@@ -290,25 +290,26 @@ __global__ void getSubMIndicePairsKernel3(
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
K2
;
++
k
)
{
for
(
int
k
=
0
;
k
<
K2
;
++
k
)
{
offset
=
i
*
K1
*
K2
+
j
*
K2
+
k
;
offset
=
i
*
K1
*
K2
+
j
*
K2
+
k
;
if
(
offset
>
center
){
if
(
offset
>
center
)
{
continue
;
continue
;
}
}
if
(
center
==
offset
){
if
(
center
==
offset
)
{
// center of subm indice pairs dont need atomicadd
// center of subm indice pairs dont need atomicadd
indicePairs
(
1
,
offset
,
ix
)
=
ix
;
indicePairs
(
1
,
offset
,
ix
)
=
ix
;
indicePairs
(
0
,
offset
,
ix
)
=
ix
;
indicePairs
(
0
,
offset
,
ix
)
=
ix
;
}
else
{
}
else
{
point
[
2
]
=
indice_data
[
3
]
-
k
+
K2
/
2
;
point
[
2
]
=
indice_data
[
3
]
-
k
+
K2
/
2
;
point
[
1
]
=
indice_data
[
2
]
-
j
+
K1
/
2
;
point
[
1
]
=
indice_data
[
2
]
-
j
+
K1
/
2
;
point
[
0
]
=
indice_data
[
1
]
-
i
+
K0
/
2
;
point
[
0
]
=
indice_data
[
1
]
-
i
+
K0
/
2
;
if
(
point
[
1
]
>=
0
&&
point
[
1
]
<
outSpatialShape
[
1
]
&&
point
[
2
]
>=
0
&&
if
(
point
[
1
]
>=
0
&&
point
[
1
]
<
outSpatialShape
[
1
]
&&
point
[
2
]
<
outSpatialShape
[
2
]
&&
point
[
0
]
>=
0
&&
point
[
2
]
>=
0
&&
point
[
2
]
<
outSpatialShape
[
2
]
&&
point
[
0
]
<
outSpatialShape
[
0
])
{
point
[
0
]
>=
0
&&
point
[
0
]
<
outSpatialShape
[
0
])
{
index
=
tv
::
ArrayIndexRowMajor
<
3
,
3
>::
runPtrs
(
index
=
tv
::
ArrayIndexRowMajor
<
3
,
3
>::
runPtrs
(
point
,
outSpatialShape
.
data
(),
0
)
+
point
,
outSpatialShape
.
data
(),
0
)
+
spatialVolume
*
indice_data
[
0
];
spatialVolume
*
indice_data
[
0
];
if
(
gridsOut
[
index
]
!=
-
1
)
{
if
(
gridsOut
[
index
]
!=
-
1
)
{
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1]
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i
// - 1]
Index
oldNum
=
atomicAdd
(
indiceNum
.
data
()
+
offset
,
Index
(
1
));
Index
oldNum
=
atomicAdd
(
indiceNum
.
data
()
+
offset
,
Index
(
1
));
atomicAdd
(
indiceNum
.
data
()
+
KV
-
offset
-
1
,
Index
(
1
));
atomicAdd
(
indiceNum
.
data
()
+
KV
-
offset
-
1
,
Index
(
1
));
indicePairs
(
1
,
offset
,
oldNum
)
=
gridsOut
[
index
];
indicePairs
(
1
,
offset
,
oldNum
)
=
gridsOut
[
index
];
...
@@ -325,7 +326,7 @@ __global__ void getSubMIndicePairsKernel3(
...
@@ -325,7 +326,7 @@ __global__ void getSubMIndicePairsKernel3(
}
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
K0
,
unsigned
K1
>
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
K0
,
unsigned
K1
>
__global__
void
getSubMIndicePairsKernel2
(
__global__
void
getSubMIndicePairs
Unroll
Kernel2
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
2
>
outSpatialShape
,
Index
spatialVolume
)
{
const
tv
::
SimpleVector
<
Index
,
2
>
outSpatialShape
,
Index
spatialVolume
)
{
...
@@ -344,14 +345,14 @@ __global__ void getSubMIndicePairsKernel2(
...
@@ -344,14 +345,14 @@ __global__ void getSubMIndicePairsKernel2(
#pragma unroll
#pragma unroll
for
(
int
j
=
0
;
j
<
K1
;
++
j
)
{
for
(
int
j
=
0
;
j
<
K1
;
++
j
)
{
offset
=
i
*
K1
+
j
;
offset
=
i
*
K1
+
j
;
if
(
offset
>
center
){
if
(
offset
>
center
)
{
continue
;
continue
;
}
}
if
(
center
==
offset
){
if
(
center
==
offset
)
{
// center of subm indice pairs dont need atomicadd
// center of subm indice pairs dont need atomicadd
indicePairs
(
1
,
offset
,
ix
)
=
ix
;
indicePairs
(
1
,
offset
,
ix
)
=
ix
;
indicePairs
(
0
,
offset
,
ix
)
=
ix
;
indicePairs
(
0
,
offset
,
ix
)
=
ix
;
}
else
{
}
else
{
point
[
1
]
=
indice_data
[
2
]
-
j
+
K1
/
2
;
point
[
1
]
=
indice_data
[
2
]
-
j
+
K1
/
2
;
point
[
0
]
=
indice_data
[
1
]
-
i
+
K0
/
2
;
point
[
0
]
=
indice_data
[
1
]
-
i
+
K0
/
2
;
if
(
point
[
1
]
>=
0
&&
point
[
1
]
<
outSpatialShape
[
1
]
&&
point
[
0
]
>=
0
&&
if
(
point
[
1
]
>=
0
&&
point
[
1
]
<
outSpatialShape
[
1
]
&&
point
[
0
]
>=
0
&&
...
@@ -418,6 +419,130 @@ __global__ void getSubMIndicePairsHashKernel(
...
@@ -418,6 +419,130 @@ __global__ void getSubMIndicePairsHashKernel(
}
}
}
}
template
<
typename
Index
,
unsigned
K0
,
unsigned
K1
,
unsigned
K2
,
unsigned
kNumHashFunctions
=
4
>
__global__
void
getSubMIndicePairsHashUnrollKernel3
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
3
>
outSpatialShape
,
Index
spatialVolume
,
unsigned
table_size
,
const
cuhash
::
Entry
*
table
,
cuhash
::
Functions
<
kNumHashFunctions
>
constants
,
uint2
stash_constants
,
unsigned
stash_count
)
{
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
index
=
0
;
Index
offset
;
Index
point
[
3
];
constexpr
unsigned
KV
=
K0
*
K1
*
K2
;
constexpr
unsigned
center
=
KV
/
2
;
*
(
indiceNum
.
data
()
+
center
)
=
numActIn
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
const
Index
*
indice_data
=
indicesIn
.
data
()
+
ix
*
(
3
+
1
);
#pragma unroll
for
(
int
i
=
0
;
i
<
K0
;
++
i
)
{
#pragma unroll
for
(
int
j
=
0
;
j
<
K1
;
++
j
)
{
#pragma unroll
for
(
int
k
=
0
;
k
<
K2
;
++
k
)
{
offset
=
i
*
K1
*
K2
+
j
*
K2
+
k
;
if
(
offset
>
center
)
{
continue
;
}
if
(
center
==
offset
)
{
// center of subm indice pairs dont need atomicadd
indicePairs
(
1
,
offset
,
ix
)
=
ix
;
indicePairs
(
0
,
offset
,
ix
)
=
ix
;
}
else
{
point
[
2
]
=
indice_data
[
3
]
-
k
+
K2
/
2
;
point
[
1
]
=
indice_data
[
2
]
-
j
+
K1
/
2
;
point
[
0
]
=
indice_data
[
1
]
-
i
+
K0
/
2
;
if
(
point
[
1
]
>=
0
&&
point
[
1
]
<
outSpatialShape
[
1
]
&&
point
[
2
]
>=
0
&&
point
[
2
]
<
outSpatialShape
[
2
]
&&
point
[
0
]
>=
0
&&
point
[
0
]
<
outSpatialShape
[
0
])
{
index
=
tv
::
ArrayIndexRowMajor
<
3
,
3
>::
runPtrs
(
point
,
outSpatialShape
.
data
(),
0
)
+
spatialVolume
*
indice_data
[
0
];
auto
val
=
cuhash
::
retrieve
((
unsigned
)(
index
),
table_size
,
table
,
constants
,
stash_constants
,
stash_count
);
if
(
val
!=
cuhash
::
kNotFound
)
{
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i
// - 1]
Index
oldNum
=
atomicAdd
(
indiceNum
.
data
()
+
offset
,
Index
(
1
));
atomicAdd
(
indiceNum
.
data
()
+
KV
-
offset
-
1
,
Index
(
1
));
indicePairs
(
1
,
offset
,
oldNum
)
=
val
;
indicePairs
(
0
,
offset
,
oldNum
)
=
ix
;
indicePairs
(
1
,
KV
-
offset
-
1
,
oldNum
)
=
ix
;
indicePairs
(
0
,
KV
-
offset
-
1
,
oldNum
)
=
val
;
}
}
}
}
}
}
}
}
template
<
typename
Index
,
unsigned
K0
,
unsigned
K1
,
unsigned
kNumHashFunctions
=
4
>
__global__
void
getSubMIndicePairsHashUnrollKernel2
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
2
>
outSpatialShape
,
Index
spatialVolume
,
unsigned
table_size
,
const
cuhash
::
Entry
*
table
,
cuhash
::
Functions
<
kNumHashFunctions
>
constants
,
uint2
stash_constants
,
unsigned
stash_count
)
{
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
index
=
0
;
Index
offset
;
Index
point
[
2
];
constexpr
unsigned
KV
=
K0
*
K1
;
constexpr
unsigned
center
=
KV
/
2
;
*
(
indiceNum
.
data
()
+
center
)
=
numActIn
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
const
Index
*
indice_data
=
indicesIn
.
data
()
+
ix
*
(
2
+
1
);
#pragma unroll
for
(
int
i
=
0
;
i
<
K0
;
++
i
)
{
#pragma unroll
for
(
int
j
=
0
;
j
<
K1
;
++
j
)
{
offset
=
i
*
K1
+
j
;
if
(
offset
>
center
)
{
continue
;
}
if
(
center
==
offset
)
{
// center of subm indice pairs dont need atomicadd
indicePairs
(
1
,
offset
,
ix
)
=
ix
;
indicePairs
(
0
,
offset
,
ix
)
=
ix
;
}
else
{
point
[
1
]
=
indice_data
[
2
]
-
j
+
K1
/
2
;
point
[
0
]
=
indice_data
[
1
]
-
i
+
K0
/
2
;
if
(
point
[
1
]
>=
0
&&
point
[
1
]
<
outSpatialShape
[
1
]
&&
point
[
0
]
>=
0
&&
point
[
0
]
<
outSpatialShape
[
0
])
{
index
=
tv
::
ArrayIndexRowMajor
<
2
,
2
>::
runPtrs
(
point
,
outSpatialShape
.
data
(),
0
)
+
spatialVolume
*
indice_data
[
0
];
auto
val
=
cuhash
::
retrieve
((
unsigned
)(
index
),
table_size
,
table
,
constants
,
stash_constants
,
stash_count
);
if
(
val
!=
cuhash
::
kNotFound
)
{
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i -
// 1]
Index
oldNum
=
atomicAdd
(
indiceNum
.
data
()
+
offset
,
Index
(
1
));
atomicAdd
(
indiceNum
.
data
()
+
KV
-
offset
-
1
,
Index
(
1
));
indicePairs
(
1
,
offset
,
oldNum
)
=
val
;
indicePairs
(
0
,
offset
,
oldNum
)
=
ix
;
indicePairs
(
1
,
KV
-
offset
-
1
,
oldNum
)
=
ix
;
indicePairs
(
0
,
KV
-
offset
-
1
,
oldNum
)
=
val
;
}
}
}
}
}
}
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
__global__
void
resetGridKernel
(
const
Index
*
indicePairUnique
,
__global__
void
resetGridKernel
(
const
Index
*
indicePairUnique
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
...
@@ -437,22 +562,13 @@ template <typename Index, typename IndexGrid, unsigned NDim>
...
@@ -437,22 +562,13 @@ template <typename Index, typename IndexGrid, unsigned NDim>
__global__
void
__global__
void
resetGridSubMKernel
(
const
Index
*
indices
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
resetGridSubMKernel
(
const
Index
*
indices
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
int
numAct
)
{
int
numAct
,
Index
spatialVolume
)
{
Index
outSpatialShapeReg
[
NDim
];
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
outSpatialShapeReg
[
i
]
=
outSpatialShape
[
i
];
}
Index
spatialVolume
=
1
;
auto
indsPtr
=
indices
;
auto
indsPtr
=
indices
;
#pragma unroll
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
spatialVolume
*=
outSpatialShape
[
i
];
}
Index
index
;
Index
index
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numAct
))
{
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numAct
))
{
indsPtr
=
indices
+
ix
*
(
NDim
+
1
);
indsPtr
=
indices
+
ix
*
(
NDim
+
1
);
index
=
tv
::
ArrayIndexRowMajor
<
NDim
,
NDim
>::
runPtrs
(
indsPtr
+
1
,
index
=
tv
::
ArrayIndexRowMajor
<
NDim
,
NDim
>::
runPtrs
(
indsPtr
+
1
,
outSpatialShape
Reg
,
0
);
outSpatialShape
.
data
()
,
0
);
gridsOut
[
index
+
spatialVolume
*
indsPtr
[
0
]]
=
-
1
;
gridsOut
[
index
+
spatialVolume
*
indsPtr
[
0
]]
=
-
1
;
}
}
}
}
...
...
include/spconv/spconv_ops.h
View file @
d17a00e0
...
@@ -27,7 +27,7 @@ enum ConvAlgo { kNative = 0, kBatch = 1, kBatchGemmGather = 2 };
...
@@ -27,7 +27,7 @@ enum ConvAlgo { kNative = 0, kBatch = 1, kBatchGemmGather = 2 };
// torch.jit's doc says only support int64, so we need to convert to int32.
// torch.jit's doc says only support int64, so we need to convert to int32.
std
::
vector
<
torch
::
Tensor
>
std
::
vector
<
torch
::
Tensor
>
getIndicePairs
(
torch
::
Tensor
indices
,
int64_t
batchSize
,
getIndicePairs
(
torch
::
Tensor
indices
,
torch
::
Tensor
gridOut
,
int64_t
batchSize
,
std
::
vector
<
int64_t
>
outSpatialShape
,
std
::
vector
<
int64_t
>
outSpatialShape
,
std
::
vector
<
int64_t
>
spatialShape
,
std
::
vector
<
int64_t
>
spatialShape
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
...
...
include/sphash/hashmap.h
0 → 100644
View file @
d17a00e0
#include <tensorview/tensor.h>
namespace
spconv
{
enum
HashTypes
{
kDenseMap
=
0
,
kCUDPPHash
=
1
};
template
<
int
Impl
>
struct
HashMap
;
template
<
>
struct
HashMap
<
kDenseMap
>
{
};
}
\ No newline at end of file
include/tensorview/tensor.h
View file @
d17a00e0
...
@@ -510,6 +510,20 @@ struct DispatchInt<T<Args...>> {
...
@@ -510,6 +510,20 @@ struct DispatchInt<T<Args...>> {
}
}
};
};
template
<
class
T
>
struct
DispatchIntNoexcept
;
template
<
template
<
class
...
>
class
T
,
class
...
Args
>
struct
DispatchIntNoexcept
<
T
<
Args
...
>>
{
template
<
typename
F
>
inline
bool
operator
()(
int
t
,
F
&&
f
)
{
return
dispatch_int_noexcept
<
Args
::
value
...
>
(
t
,
std
::
forward
<
F
>
(
f
));
}
template
<
typename
F
,
typename
BinaryPredicate
>
inline
bool
operator
()(
int
t
,
BinaryPredicate
p
,
F
&&
f
)
{
return
dispatch_int_noexcept
<
Args
::
value
...
>
(
t
,
p
,
std
::
forward
<
F
>
(
f
));
}
};
constexpr
size_t
kTensorMaxDim
=
10
;
constexpr
size_t
kTensorMaxDim
=
10
;
using
TensorShape
=
ShapeBase
<
kTensorMaxDim
,
int64_t
>
;
using
TensorShape
=
ShapeBase
<
kTensorMaxDim
,
int64_t
>
;
...
...
spconv/__init__.py
View file @
d17a00e0
...
@@ -68,6 +68,8 @@ class SparseConvTensor(object):
...
@@ -68,6 +68,8 @@ class SparseConvTensor(object):
self
.
spatial_shape
=
spatial_shape
self
.
spatial_shape
=
spatial_shape
self
.
batch_size
=
batch_size
self
.
batch_size
=
batch_size
self
.
indice_dict
=
{}
self
.
indice_dict
=
{}
if
grid
is
None
:
grid
=
torch
.
Tensor
()
# empty tensor
self
.
grid
=
grid
self
.
grid
=
grid
@
classmethod
@
classmethod
...
...
spconv/ops.py
View file @
d17a00e0
...
@@ -88,23 +88,13 @@ def get_indice_pairs(indices,
...
@@ -88,23 +88,13 @@ def get_indice_pairs(indices,
else
:
else
:
out_shape
=
spatial_shape
out_shape
=
spatial_shape
if
grid
is
None
:
if
grid
is
None
:
res
=
torch
.
ops
.
spconv
.
get_indice_pairs
(
indices
,
batch_size
,
out_shape
,
grid
=
torch
.
Tensor
()
spatial_shape
,
ksize
,
stride
,
res
=
torch
.
ops
.
spconv
.
get_indice_pairs
(
indices
,
grid
,
batch_size
,
out_shape
,
padding
,
dilation
,
out_padding
,
spatial_shape
,
ksize
,
stride
,
int
(
subm
),
int
(
transpose
),
padding
,
dilation
,
out_padding
,
int
(
use_hash
))
int
(
subm
),
int
(
transpose
),
return
res
int
(
use_hash
))
else
:
return
res
if
ndim
==
2
:
get_indice_pairs_func
=
torch
.
ops
.
spconv
.
get_indice_pairs_grid_2d
elif
ndim
==
3
:
get_indice_pairs_func
=
torch
.
ops
.
spconv
.
get_indice_pairs_grid_3d
else
:
raise
NotImplementedError
return
get_indice_pairs_func
(
indices
,
grid
,
batch_size
,
out_shape
,
spatial_shape
,
ksize
,
stride
,
padding
,
dilation
,
out_padding
,
int
(
subm
),
int
(
transpose
),
int
(
use_hash
))
def
indice_conv
(
features
,
def
indice_conv
(
features
,
...
...
spconv/pool.py
View file @
d17a00e0
...
@@ -46,7 +46,6 @@ class SparseMaxPool(SparseModule):
...
@@ -46,7 +46,6 @@ class SparseMaxPool(SparseModule):
padding
=
[
padding
]
*
ndim
padding
=
[
padding
]
*
ndim
if
not
isinstance
(
dilation
,
(
list
,
tuple
)):
if
not
isinstance
(
dilation
,
(
list
,
tuple
)):
dilation
=
[
dilation
]
*
ndim
dilation
=
[
dilation
]
*
ndim
self
.
ndim
=
ndim
self
.
ndim
=
ndim
self
.
kernel_size
=
kernel_size
self
.
kernel_size
=
kernel_size
self
.
stride
=
stride
self
.
stride
=
stride
...
@@ -61,6 +60,8 @@ class SparseMaxPool(SparseModule):
...
@@ -61,6 +60,8 @@ class SparseMaxPool(SparseModule):
indices
=
input
.
indices
indices
=
input
.
indices
spatial_shape
=
input
.
spatial_shape
spatial_shape
=
input
.
spatial_shape
batch_size
=
input
.
batch_size
batch_size
=
input
.
batch_size
torch
.
cuda
.
synchronize
()
t
=
time
.
time
()
if
not
self
.
subm
:
if
not
self
.
subm
:
out_spatial_shape
=
ops
.
get_conv_output_size
(
out_spatial_shape
=
ops
.
get_conv_output_size
(
spatial_shape
,
self
.
kernel_size
,
self
.
stride
,
self
.
padding
,
spatial_shape
,
self
.
kernel_size
,
self
.
stride
,
self
.
padding
,
...
@@ -69,11 +70,14 @@ class SparseMaxPool(SparseModule):
...
@@ -69,11 +70,14 @@ class SparseMaxPool(SparseModule):
out_spatial_shape
=
spatial_shape
out_spatial_shape
=
spatial_shape
outids
,
indice_pairs
,
indice_pairs_num
=
ops
.
get_indice_pairs
(
outids
,
indice_pairs
,
indice_pairs_num
=
ops
.
get_indice_pairs
(
indices
,
batch_size
,
spatial_shape
,
self
.
kernel_size
,
self
.
stride
,
indices
,
batch_size
,
spatial_shape
,
self
.
kernel_size
,
self
.
stride
,
self
.
padding
,
self
.
dilation
,
0
,
self
.
subm
)
self
.
padding
,
self
.
dilation
,
0
,
self
.
subm
,
grid
=
input
.
grid
)
out_features
=
Fsp
.
indice_maxpool
(
features
,
indice_pairs
.
to
(
device
),
out_features
=
Fsp
.
indice_maxpool
(
features
,
indice_pairs
.
to
(
device
),
indice_pairs_num
.
to
(
device
),
indice_pairs_num
.
to
(
device
),
outids
.
shape
[
0
])
outids
.
shape
[
0
])
torch
.
cuda
.
synchronize
()
print
(
"maxpool"
,
spatial_shape
,
time
.
time
()
-
t
)
out_tensor
=
spconv
.
SparseConvTensor
(
out_features
,
outids
,
out_tensor
=
spconv
.
SparseConvTensor
(
out_features
,
outids
,
out_spatial_shape
,
batch_size
)
out_spatial_shape
,
batch_size
)
out_tensor
.
indice_dict
=
input
.
indice_dict
out_tensor
.
indice_dict
=
input
.
indice_dict
...
...
src/spconv/all.cc
View file @
d17a00e0
...
@@ -23,7 +23,6 @@ static auto registry =
...
@@ -23,7 +23,6 @@ static auto registry =
torch
::
RegisterOperators
()
torch
::
RegisterOperators
()
.
op
(
"spconv::get_indice_pairs"
,
&
spconv
::
getIndicePairs
)
.
op
(
"spconv::get_indice_pairs"
,
&
spconv
::
getIndicePairs
)
.
op
(
"spconv::indice_conv"
,
&
spconv
::
indiceConv
)
.
op
(
"spconv::indice_conv"
,
&
spconv
::
indiceConv
)
.
op
(
"spconv::indice_conv_batch"
,
&
spconv
::
indiceConvBatch
)
.
op
(
"spconv::indice_conv_backward"
,
&
spconv
::
indiceConvBackward
)
.
op
(
"spconv::indice_conv_backward"
,
&
spconv
::
indiceConvBackward
)
.
op
(
"spconv::fused_indice_conv_bn"
,
&
spconv
::
fusedIndiceConvBatchNorm
)
.
op
(
"spconv::fused_indice_conv_bn"
,
&
spconv
::
fusedIndiceConvBatchNorm
)
.
op
(
"spconv::indice_maxpool"
,
&
spconv
::
indiceMaxPool
)
.
op
(
"spconv::indice_maxpool"
,
&
spconv
::
indiceMaxPool
)
...
...
src/spconv/indice.cu
View file @
d17a00e0
...
@@ -42,6 +42,8 @@ int create_conv_indice_pair_p1_cuda(
...
@@ -42,6 +42,8 @@ int create_conv_indice_pair_p1_cuda(
auto
ndim
=
kernelSize
.
size
();
auto
ndim
=
kernelSize
.
size
();
auto
numActIn
=
indicesIn
.
size
(
0
);
auto
numActIn
=
indicesIn
.
size
(
0
);
auto
kernelVolume
=
indiceNum
.
size
(
0
);
auto
kernelVolume
=
indiceNum
.
size
(
0
);
// auto timer = spconv::CudaContextTimer<>();
if
(
numActIn
==
0
)
if
(
numActIn
==
0
)
return
0
;
return
0
;
tv
::
dispatch_torch
<
int32_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
IndexValue
)
{
tv
::
dispatch_torch
<
int32_t
>
(
indicesIn
.
scalar_type
(),
[
&
](
auto
IndexValue
)
{
...
@@ -77,6 +79,7 @@ int create_conv_indice_pair_p1_cuda(
...
@@ -77,6 +79,7 @@ int create_conv_indice_pair_p1_cuda(
pa
,
di
,
ou
);
pa
,
di
,
ou
);
TV_CHECK_CUDA_ERR_V2
(
"prepareIndicePairsKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"prepareIndicePairsKernel failed"
);
}
}
// tv::ssprint("prepareIndicePairsKernel", timer.report() / 1000.0);
#ifdef TV_LOG_KERNEL_INFO
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes
attr
;
cudaFuncAttributes
attr
;
checkCudaErrors
(
cudaFuncGetAttributes
(
checkCudaErrors
(
cudaFuncGetAttributes
(
...
@@ -193,10 +196,6 @@ int create_conv_indice_pair_p2_cuda(
...
@@ -193,10 +196,6 @@ int create_conv_indice_pair_p2_cuda(
return
numAct
;
return
numAct
;
}
}
template
<
typename
T
>
struct
is_valid
{
__device__
__forceinline__
bool
operator
()(
const
T
x
)
{
return
x
!=
-
1
;
}
};
int
create_submconv_indice_pair_cuda
(
int
create_submconv_indice_pair_cuda
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
gridsOut
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indicesIn
,
torch
::
Tensor
gridsOut
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
std
::
vector
<
int64_t
>
kernelSize
,
torch
::
Tensor
indiceNum
,
std
::
vector
<
int64_t
>
kernelSize
,
...
@@ -226,6 +225,8 @@ int create_submconv_indice_pair_cuda(
...
@@ -226,6 +225,8 @@ int create_submconv_indice_pair_cuda(
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
spatialVolume
*=
outSpatialShape
[
i
];
spatialVolume
*=
outSpatialShape
[
i
];
}
}
auto
dispatcher
=
tv
::
DispatchIntNoexcept
<
tv
::
mp_list_c
<
int
,
1
,
3
,
5
>>
();
if
(
useHash
)
{
if
(
useHash
)
{
auto
table
=
cuhash
::
HashTable
();
auto
table
=
cuhash
::
HashTable
();
// std::cout << "create " << numAct << " size table..." << std::endl;
// std::cout << "create " << numAct << " size table..." << std::endl;
...
@@ -252,18 +253,71 @@ int create_submconv_indice_pair_cuda(
...
@@ -252,18 +253,71 @@ int create_submconv_indice_pair_cuda(
auto
constants
=
table
.
get_constants_4
();
auto
constants
=
table
.
get_constants_4
();
auto
stash_constants
=
table
.
get_stash_constants
();
auto
stash_constants
=
table
.
get_stash_constants
();
auto
stash_count
=
table
.
get_stash_count
();
auto
stash_count
=
table
.
get_stash_count
();
tv
::
DispatchInt
<
max_kernel_vol_t
>
()(
bool
dilation_one
=
true
;
kernelVolume
,
std
::
less_equal
<
int
>
(),
[
&
](
auto
I2
)
{
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
constexpr
int
MaxKernelVolume
=
decltype
(
I2
)
::
value
;
dilation_one
&=
di
[
i
]
==
1
;
getSubMIndicePairsHashKernel
<
Index
,
NDim
,
MaxKernelVolume
>
}
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
auto
found
=
false
;
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
if
(
dilation_one
&&
(
NDim
==
2
||
NDim
==
3
))
{
tv
::
torch2tv
<
Index
>
(
indicePairs
),
auto
indiceNumCpu
=
indiceNum
.
cpu
();
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ks
,
st
,
pa
,
if
(
NDim
==
2
)
{
di
,
ou
,
tableSize
,
tableData
,
constants
,
tv
::
SimpleVector
<
Index
,
2
>
ou_
(
outSpatialShape
.
begin
(),
stash_constants
,
stash_count
);
outSpatialShape
.
end
());
TV_CHECK_CUDA_ERR_V2
(
"getSubMIndicePairsHashKernel failed"
);
dispatcher
(
kernelSize
[
0
],
[
&
](
auto
K0C
)
{
dispatcher
(
kernelSize
[
1
],
[
&
](
auto
K1C
)
{
constexpr
int
K0
=
decltype
(
K0C
)
::
value
;
constexpr
int
K1
=
decltype
(
K1C
)
::
value
;
found
=
true
;
getSubMIndicePairsHashUnrollKernel2
<
Index
,
K0
,
K1
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ou_
,
spatialVolume
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
});
});
});
}
else
if
(
NDim
==
3
)
{
tv
::
SimpleVector
<
Index
,
3
>
ou_
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
dispatcher
(
kernelSize
[
0
],
[
&
](
auto
K0C
)
{
dispatcher
(
kernelSize
[
1
],
[
&
](
auto
K1C
)
{
dispatcher
(
kernelSize
[
2
],
[
&
](
auto
K2C
)
{
constexpr
int
K0
=
decltype
(
K0C
)
::
value
;
constexpr
int
K1
=
decltype
(
K1C
)
::
value
;
constexpr
int
K2
=
decltype
(
K2C
)
::
value
;
found
=
true
;
getSubMIndicePairsHashUnrollKernel3
<
Index
,
K0
,
K1
,
K2
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ou_
,
spatialVolume
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
});
});
});
}
}
if
(
!
found
)
{
tv
::
DispatchInt
<
max_kernel_vol_t
>
()(
kernelVolume
,
std
::
less_equal
<
int
>
(),
[
&
](
auto
I2
)
{
constexpr
int
MaxKernelVolume
=
decltype
(
I2
)
::
value
;
getSubMIndicePairsHashKernel
<
Index
,
NDim
,
MaxKernelVolume
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ks
,
st
,
pa
,
di
,
ou
,
tableSize
,
tableData
,
constants
,
stash_constants
,
stash_count
);
TV_CHECK_CUDA_ERR_V2
(
"getSubMIndicePairsHashKernel failed"
);
});
}
}
else
{
}
else
{
// auto timer = spconv::CudaContextTimer<>();
// auto timer = spconv::CudaContextTimer<>();
prepareSubMGridKernel
<
Index
,
IndexGrid
,
NDim
>
prepareSubMGridKernel
<
Index
,
IndexGrid
,
NDim
>
...
@@ -280,16 +334,17 @@ int create_submconv_indice_pair_cuda(
...
@@ -280,16 +334,17 @@ int create_submconv_indice_pair_cuda(
auto
found
=
false
;
auto
found
=
false
;
if
(
dilation_one
&&
(
NDim
==
2
||
NDim
==
3
))
{
if
(
dilation_one
&&
(
NDim
==
2
||
NDim
==
3
))
{
auto
indiceNumCpu
=
indiceNum
.
cpu
();
auto
indiceNumCpu
=
indiceNum
.
cpu
();
if
(
NDim
==
2
)
{
if
(
NDim
==
2
)
{
tv
::
SimpleVector
<
Index
,
2
>
ou_
(
outSpatialShape
.
begin
(),
tv
::
SimpleVector
<
Index
,
2
>
ou_
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
outSpatialShape
.
end
());
tv
::
dispatch
_int_noexcept
<
1
,
3
,
5
>
(
kernelSize
[
0
],
[
&
](
auto
K0C
)
{
dispatch
er
(
kernelSize
[
0
],
[
&
](
auto
K0C
)
{
tv
::
dispatch
_int_noexcept
<
1
,
3
,
5
>
(
kernelSize
[
1
],
[
&
](
auto
K1C
)
{
dispatch
er
(
kernelSize
[
1
],
[
&
](
auto
K1C
)
{
constexpr
int
K0
=
decltype
(
K0C
)
::
value
;
constexpr
int
K0
=
decltype
(
K0C
)
::
value
;
constexpr
int
K1
=
decltype
(
K1C
)
::
value
;
constexpr
int
K1
=
decltype
(
K1C
)
::
value
;
found
=
true
;
found
=
true
;
getSubMIndicePairsKernel2
<
Index
,
IndexGrid
,
K0
,
K1
>
getSubMIndicePairs
Unroll
Kernel2
<
Index
,
IndexGrid
,
K0
,
K1
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicesIn
),
...
@@ -301,24 +356,21 @@ int create_submconv_indice_pair_cuda(
...
@@ -301,24 +356,21 @@ int create_submconv_indice_pair_cuda(
}
else
if
(
NDim
==
3
)
{
}
else
if
(
NDim
==
3
)
{
tv
::
SimpleVector
<
Index
,
3
>
ou_
(
outSpatialShape
.
begin
(),
tv
::
SimpleVector
<
Index
,
3
>
ou_
(
outSpatialShape
.
begin
(),
outSpatialShape
.
end
());
outSpatialShape
.
end
());
dispatcher
(
kernelSize
[
0
],
[
&
](
auto
K0C
)
{
tv
::
dispatch_int_noexcept
<
1
,
3
,
5
>
(
kernelSize
[
0
],
[
&
](
auto
K0C
)
{
dispatcher
(
kernelSize
[
1
],
[
&
](
auto
K1C
)
{
tv
::
dispatch_int_noexcept
<
1
,
3
,
5
>
(
kernelSize
[
1
],
[
&
](
auto
K1C
)
{
dispatcher
(
kernelSize
[
2
],
[
&
](
auto
K2C
)
{
tv
::
dispatch_int_noexcept
<
1
,
3
,
5
>
(
constexpr
int
K0
=
decltype
(
K0C
)
::
value
;
kernelSize
[
2
],
[
&
](
auto
K2C
)
{
constexpr
int
K1
=
decltype
(
K1C
)
::
value
;
constexpr
int
K0
=
decltype
(
K0C
)
::
value
;
constexpr
int
K2
=
decltype
(
K2C
)
::
value
;
constexpr
int
K1
=
decltype
(
K1C
)
::
value
;
found
=
true
;
constexpr
int
K2
=
decltype
(
K2C
)
::
value
;
getSubMIndicePairsUnrollKernel3
<
Index
,
IndexGrid
,
K0
,
K1
,
K2
>
found
=
true
;
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
getSubMIndicePairsKernel3
<
Index
,
IndexGrid
,
K0
,
K1
,
K2
>
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
tv
::
torch2tv
<
Index
>
(
indicesIn
),
tv
::
torch2tv
<
Index
>
(
indicePairs
),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ou_
,
spatialVolume
);
tv
::
torch2tv
<
Index
>
(
indicePairs
),
});
tv
::
torch2tv
<
Index
>
(
indiceNum
),
ou_
,
spatialVolume
);
});
});
});
});
});
}
}
...
@@ -344,7 +396,7 @@ int create_submconv_indice_pair_cuda(
...
@@ -344,7 +396,7 @@ int create_submconv_indice_pair_cuda(
resetGridSubMKernel
<
Index
,
IndexGrid
,
NDim
>
resetGridSubMKernel
<
Index
,
IndexGrid
,
NDim
>
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
<<<
tv
::
cuda
::
getBlocks
(
numActIn
),
tv
::
cuda
::
CUDA_NUM_THREADS
,
0
,
stream
>>>
(
indicesIn
.
data_ptr
<
Index
>
(),
stream
>>>
(
indicesIn
.
data_ptr
<
Index
>
(),
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
ou
,
numActIn
);
tv
::
torch2tv
<
IndexGrid
>
(
gridsOut
),
ou
,
numActIn
,
spatialVolume
);
TV_CHECK_CUDA_ERR_V2
(
"resetGridKernel failed"
);
TV_CHECK_CUDA_ERR_V2
(
"resetGridKernel failed"
);
}
}
});
});
...
...
src/spconv/spconv_ops.cc
View file @
d17a00e0
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
namespace
spconv
{
namespace
spconv
{
std
::
vector
<
torch
::
Tensor
>
std
::
vector
<
torch
::
Tensor
>
getIndicePairs
(
torch
::
Tensor
indices
,
int64_t
batchSize
,
getIndicePairs
(
torch
::
Tensor
indices
,
torch
::
Tensor
gridOut
,
int64_t
batchSize
,
std
::
vector
<
int64_t
>
outSpatialShape
,
std
::
vector
<
int64_t
>
outSpatialShape
,
std
::
vector
<
int64_t
>
spatialShape
,
std
::
vector
<
int64_t
>
spatialShape
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
...
@@ -47,8 +47,11 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
...
@@ -47,8 +47,11 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
if
(
useHash
)
{
if
(
useHash
)
{
gridSize
=
batchSize
;
gridSize
=
batchSize
;
}
}
torch
::
Tensor
gridOut
=
torch
::
full
(
bool
resetGrid
=
gridOut
.
numel
()
!=
0
;
{
gridSize
},
-
1
,
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
if
(
!
resetGrid
){
gridOut
=
torch
::
full
(
{
gridSize
},
-
1
,
torch
::
dtype
(
torch
::
kInt32
).
device
(
indices
.
device
()));
}
gridOut
=
gridOut
.
view
({
batchSize
,
-
1
});
gridOut
=
gridOut
.
view
({
batchSize
,
-
1
});
int64_t
numActOut
=
-
1
;
int64_t
numActOut
=
-
1
;
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
...
@@ -68,7 +71,7 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
...
@@ -68,7 +71,7 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
else
if
(
indices
.
device
().
type
()
==
torch
::
kCUDA
)
{
else
if
(
indices
.
device
().
type
()
==
torch
::
kCUDA
)
{
numActOut
=
create_submconv_indice_pair_cuda
(
numActOut
=
create_submconv_indice_pair_cuda
(
indices
,
gridOut
,
indicePairs
,
indiceNum
,
kernelSize
,
stride
,
padding
,
indices
,
gridOut
,
indicePairs
,
indiceNum
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
,
transpose
,
false
,
useHash
);
dilation
,
outSpatialShape
,
transpose
,
resetGrid
,
useHash
);
if
(
numActOut
==
-
1
)
{
if
(
numActOut
==
-
1
)
{
auto
device
=
indices
.
device
();
auto
device
=
indices
.
device
();
indicePairs
=
indicePairs
.
to
({
torch
::
kCPU
});
indicePairs
=
indicePairs
.
to
({
torch
::
kCPU
});
...
@@ -98,10 +101,10 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
...
@@ -98,10 +101,10 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
if
(
indices
.
device
().
type
()
==
torch
::
kCPU
)
{
if
(
indices
.
device
().
type
()
==
torch
::
kCPU
)
{
numActOut
=
create_conv_indice_pair_cpu
(
numActOut
=
create_conv_indice_pair_cpu
(
indices
,
outInds
,
gridOut
,
indicePairs
,
indiceNum
,
kernelSize
,
stride
,
indices
,
outInds
,
gridOut
,
indicePairs
,
indiceNum
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
,
transpose
,
false
,
useHash
);
padding
,
dilation
,
outSpatialShape
,
transpose
,
resetGrid
,
useHash
);
}
}
#ifdef TV_CUDA
#ifdef TV_CUDA
else
if
(
indices
.
device
().
type
()
==
torch
::
kCUDA
)
{
else
if
(
indices
.
device
().
type
()
==
torch
::
kCUDA
)
{
numActOut
=
create_conv_indice_pair_p1_cuda
(
numActOut
=
create_conv_indice_pair_p1_cuda
(
indices
,
indicePairs
,
indiceNum
,
indicePairUnique
,
kernelSize
,
stride
,
indices
,
indicePairs
,
indiceNum
,
indicePairUnique
,
kernelSize
,
stride
,
padding
,
dilation
,
outSpatialShape
,
transpose
);
padding
,
dilation
,
outSpatialShape
,
transpose
);
...
@@ -110,7 +113,7 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
...
@@ -110,7 +113,7 @@ getIndicePairs(torch::Tensor indices, int64_t batchSize,
indicePairUnique
=
std
::
get
<
0
>
(
res
);
indicePairUnique
=
std
::
get
<
0
>
(
res
);
numActOut
=
create_conv_indice_pair_p2_cuda
(
numActOut
=
create_conv_indice_pair_p2_cuda
(
indices
,
outInds
,
gridOut
,
indicePairs
,
indiceNum
,
indicePairUnique
,
indices
,
outInds
,
gridOut
,
indicePairs
,
indiceNum
,
indicePairUnique
,
outSpatialShape
,
transpose
,
false
,
useHash
);
outSpatialShape
,
transpose
,
resetGrid
,
useHash
);
if
(
numActOut
==
-
1
)
{
if
(
numActOut
==
-
1
)
{
auto
device
=
indices
.
device
();
auto
device
=
indices
.
device
();
outInds
=
outInds
.
to
({
torch
::
kCPU
});
outInds
=
outInds
.
to
({
torch
::
kCPU
});
...
@@ -188,7 +191,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
...
@@ -188,7 +191,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double
totalGatherTime
=
0
;
double
totalGatherTime
=
0
;
double
totalGEMMTime
=
0
;
double
totalGEMMTime
=
0
;
double
totalSAddTime
=
0
;
double
totalSAddTime
=
0
;
// tv::ssprint("first subm gemm time", timer.report() / 1000.0);
// tv::ssprint("first subm gemm time", timer.report() / 1000.0, std::vector<int>(indicePairNumCpu.data_ptr<int>(),
// indicePairNumCpu.data_ptr<int>() + kernelVolume));
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
auto
nHot
=
indicePairNumCpu
.
data_ptr
<
int
>
()[
i
];
auto
nHot
=
indicePairNumCpu
.
data_ptr
<
int
>
()[
i
];
...
@@ -237,6 +241,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
...
@@ -237,6 +241,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
// totalSAddTime += timer.report() / 1000.0;
// totalSAddTime += timer.report() / 1000.0;
}
}
// tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
// tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
// tv::ssprint("final subm gemm time", timer.report() / 1000.0);
return
output
;
return
output
;
}
}
...
...
test/benchmark.py
0 → 100644
View file @
d17a00e0
import
torch
import
spconv
import
numpy
as
np
from
spconv.utils
import
VoxelGeneratorV2
from
pathlib
import
Path
from
torch
import
nn
import
time
def
waymo_data
(
batch_size
=
1
):
gen
=
VoxelGeneratorV2
([
0.1
,
0.1
,
0.1
],
[
-
80
,
-
80
,
-
2
,
80
,
80
,
6
],
1
,
150000
)
data
=
np
.
load
(
Path
(
__file__
).
parent
/
"data"
/
"benchmark-pc.npz"
)
pc
=
data
[
"pc"
]
data
=
gen
.
generate
(
pc
)
voxels
=
data
[
"voxels"
].
reshape
(
-
1
,
3
)
coors
=
data
[
"coordinates"
]
N
=
coors
.
shape
[
0
]
coors
=
np
.
concatenate
([
np
.
full
([
N
,
1
],
0
,
coors
.
dtype
),
coors
],
axis
=
1
)
return
voxels
,
coors
,
gen
.
grid_size
class
Net
(
nn
.
Module
):
def
__init__
(
self
,
shape
):
super
().
__init__
()
self
.
net
=
spconv
.
SparseSequential
(
spconv
.
SubMConv3d
(
3
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
),
spconv
.
SubMConv3d
(
64
,
64
,
3
,
bias
=
False
,
indice_key
=
"c0"
),
# nn.BatchNorm1d(32),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
64
,
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
),
spconv
.
SubMConv3d
(
96
,
96
,
3
,
bias
=
False
,
indice_key
=
"c1"
),
# nn.BatchNorm1d(64),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
96
,
128
,
3
,
bias
=
False
,
indice_key
=
"c2"
),
spconv
.
SubMConv3d
(
128
,
128
,
3
,
bias
=
False
,
indice_key
=
"c2"
),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
128
,
160
,
3
,
bias
=
False
,
indice_key
=
"c3"
),
spconv
.
SubMConv3d
(
160
,
160
,
3
,
bias
=
False
,
indice_key
=
"c3"
),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
160
,
192
,
3
,
bias
=
False
,
indice_key
=
"c4"
),
spconv
.
SubMConv3d
(
192
,
192
,
3
,
bias
=
False
,
indice_key
=
"c4"
),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
192
,
224
,
3
,
bias
=
False
,
indice_key
=
"c5"
),
spconv
.
SubMConv3d
(
224
,
224
,
3
,
bias
=
False
,
indice_key
=
"c5"
),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv
.
SparseMaxPool3d
(
2
,
2
),
spconv
.
SubMConv3d
(
224
,
256
,
3
,
bias
=
False
,
indice_key
=
"c6"
),
spconv
.
SubMConv3d
(
256
,
256
,
3
,
bias
=
False
,
indice_key
=
"c6"
),
)
max_batch_size
=
1
self
.
grid
=
torch
.
full
([
max_batch_size
,
*
shape
],
-
1
,
dtype
=
torch
.
int32
).
cuda
()
# self.grid = None
self
.
shape
=
shape
def
forward
(
self
,
features
,
coors
,
batch_size
):
x
=
spconv
.
SparseConvTensor
(
features
,
coors
,
self
.
shape
,
batch_size
,
self
.
grid
)
return
self
.
net
(
x
)
def
main
():
voxels
,
coors
,
spatial_shape
=
waymo_data
()
voxels_th
=
torch
.
from_numpy
(
voxels
).
cuda
().
float
()
coors_th
=
torch
.
from_numpy
(
coors
).
cuda
()
net
=
Net
(
spatial_shape
[::
-
1
]).
cuda
().
eval
().
float
()
print
(
coors_th
.
shape
)
out
=
net
(
voxels_th
,
coors_th
,
1
)
print
(
out
.
spatial_shape
)
times
=
[]
with
torch
.
no_grad
():
for
i
in
range
(
20
):
torch
.
cuda
.
synchronize
()
t
=
time
.
time
()
out
=
net
(
voxels_th
,
coors_th
,
1
)
torch
.
cuda
.
synchronize
()
times
.
append
(
time
.
time
()
-
t
)
# print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t)
print
(
"spconv time"
,
np
.
mean
(
times
[
10
:]))
if
__name__
==
"__main__"
:
main
()
\ No newline at end of file
test/data/benchmark-pc.npz
0 → 100644
View file @
d17a00e0
File added
test/test_conv.py
View file @
d17a00e0
...
@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32):
...
@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32):
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
main
_subm
(
algo
=
spconv
.
ConvAlgo
.
Native
,
dtype
=
torch
.
float32
)
main
(
algo
=
spconv
.
ConvAlgo
.
Native
,
dtype
=
torch
.
float32
)
main
_subm
(
algo
=
spconv
.
ConvAlgo
.
Native
,
dtype
=
torch
.
half
)
main
(
algo
=
spconv
.
ConvAlgo
.
Native
,
dtype
=
torch
.
half
)
# TestCase().assertAllClose(out_my, out_ref)
# TestCase().assertAllClose(out_my, out_ref)
# unittest.main()
# unittest.main()
# TestSpConv().testSpConv3d()
# TestSpConv().testSpConv3d()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment