Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
f08268fc
Commit
f08268fc
authored
May 25, 2020
by
Yan Yan
Browse files
working on batch indice conv (almost finished)
parent
bf473de0
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
328 additions
and
23 deletions
+328
-23
include/spconv/reordering.cu.h
include/spconv/reordering.cu.h
+105
-21
include/spconv/reordering.h
include/spconv/reordering.h
+6
-0
include/spconv/spconv_ops.h
include/spconv/spconv_ops.h
+3
-0
spconv/ops.py
spconv/ops.py
+1
-1
src/spconv/all.cc
src/spconv/all.cc
+1
-0
src/spconv/reordering.cu
src/spconv/reordering.cu
+150
-0
src/spconv/spconv_ops.cc
src/spconv/spconv_ops.cc
+61
-0
test/test_conv.py
test/test_conv.py
+1
-1
No files found.
include/spconv/reordering.cu.h
View file @
f08268fc
...
@@ -14,10 +14,11 @@
...
@@ -14,10 +14,11 @@
#ifndef REORDERING_CU_H_
#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#define REORDERING_CU_H_
#include <THC/THCAtomics.cuh>
#include <tensorview/kernel_utils.h>
#include <tensorview/kernel_utils.h>
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
namespace
spconv
{
namespace
spconv
{
template
<
typename
T
,
typename
Index
,
int
NumTLP
,
int
NumILP
>
template
<
typename
T
,
typename
Index
,
int
NumTLP
,
int
NumILP
>
__global__
void
gatherGenericKernel
(
T
*
buffer
,
const
T
*
features
,
__global__
void
gatherGenericKernel
(
T
*
buffer
,
const
T
*
features
,
const
Index
*
indices
,
int
size
,
const
Index
*
indices
,
int
size
,
...
@@ -99,12 +100,13 @@ __global__ void gatherVecBlockKernel(T *buffer, const T *features,
...
@@ -99,12 +100,13 @@ __global__ void gatherVecBlockKernel(T *buffer, const T *features,
template
<
typename
T
,
typename
Index
,
int
NumTLP
,
int
NumILP
>
template
<
typename
T
,
typename
Index
,
int
NumTLP
,
int
NumILP
>
__global__
void
batchGatherGenericKernel
(
T
*
buffer
,
const
T
*
features
,
__global__
void
batchGatherGenericKernel
(
T
*
buffer
,
const
T
*
features
,
const
Index
*
indices
,
int
size
,
const
Index
*
indices
,
int
size
,
int
numPlanes
,
int
batch_stride
,
int
numPlanes
,
int
indice_
batch_stride
,
int
feature_batch_stride
)
{
int
feature_batch_stride
)
{
// size: max indice num * kernel volume
// inds: [volume, num_elems]
int
ILPStrideX
[
NumILP
];
int
ILPStrideX
[
NumILP
];
Index
inds
[
NumILP
];
Index
inds
[
NumILP
];
Index
batchIdx
[
NumILP
];
Index
inds_elem
;
#pragma unroll
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
ILPStrideX
[
ilp
]
=
ilp
*
gridDim
.
x
*
blockDim
.
x
;
ILPStrideX
[
ilp
]
=
ilp
*
gridDim
.
x
*
blockDim
.
x
;
...
@@ -113,16 +115,18 @@ __global__ void batchGatherGenericKernel(T *buffer, const T *features,
...
@@ -113,16 +115,18 @@ __global__ void batchGatherGenericKernel(T *buffer, const T *features,
#pragma unroll
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
{
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
{
if
(
ix
+
ILPStrideX
[
ilp
]
<
size
)
{
if
(
ix
+
ILPStrideX
[
ilp
]
<
size
)
{
batchIdx
[
ilp
]
=
ix
/
feature_batch_stride
;
inds_elem
=
ix
+
ILPStrideX
[
ilp
];
inds
[
ilp
]
=
indices
[
ix
+
ILPStrideX
[
ilp
]]
*
numPlanes
;
inds
[
ilp
]
=
indices
[(
inds_elem
/
feature_batch_stride
)
*
indice_batch_stride
+
inds_elem
%
feature_batch_stride
];
}
}
}
}
for
(
int
iy
:
tv
::
KernelLoopY
<
int
>
(
numPlanes
))
{
for
(
int
iy
:
tv
::
KernelLoopY
<
int
>
(
numPlanes
))
{
#pragma unroll
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
++
ilp
)
{
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
++
ilp
)
{
if
(
ix
+
ILPStrideX
[
ilp
]
<
size
)
if
(
ix
+
ILPStrideX
[
ilp
]
<
size
&&
inds
[
ilp
]
!=
-
1
)
buffer
[(
ix
+
ILPStrideX
[
ilp
])
*
numPlanes
+
iy
]
=
buffer
[(
ix
+
ILPStrideX
[
ilp
])
*
numPlanes
+
iy
]
=
features
[
inds
[
ilp
]
+
iy
];
features
[
inds
[
ilp
]
*
numPlanes
+
iy
];
}
}
}
}
}
}
...
@@ -131,9 +135,12 @@ __global__ void batchGatherGenericKernel(T *buffer, const T *features,
...
@@ -131,9 +135,12 @@ __global__ void batchGatherGenericKernel(T *buffer, const T *features,
template
<
typename
T
,
typename
Index
,
int
NumTLP
,
int
NumILP
,
typename
VecType
>
template
<
typename
T
,
typename
Index
,
int
NumTLP
,
int
NumILP
,
typename
VecType
>
__global__
void
batchGatherVecKernel
(
T
*
buffer
,
const
T
*
features
,
__global__
void
batchGatherVecKernel
(
T
*
buffer
,
const
T
*
features
,
const
Index
*
indices
,
int
size
,
const
Index
*
indices
,
int
size
,
int
numPlanes
)
{
int
feature_offset
,
int
numPlanes
,
int
indice_batch_stride
,
int
feature_batch_stride
)
{
int
ILPStrideX
[
NumILP
];
int
ILPStrideX
[
NumILP
];
Index
inds
[
NumILP
];
Index
inds
[
NumILP
];
Index
inds_elem
;
#pragma unroll
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
ILPStrideX
[
ilp
]
=
ilp
*
gridDim
.
x
*
blockDim
.
x
;
ILPStrideX
[
ilp
]
=
ilp
*
gridDim
.
x
*
blockDim
.
x
;
...
@@ -141,16 +148,21 @@ __global__ void batchGatherVecKernel(T *buffer, const T *features,
...
@@ -141,16 +148,21 @@ __global__ void batchGatherVecKernel(T *buffer, const T *features,
for
(
int
ix
:
tv
::
KernelLoopX
<
int
,
NumILP
>
(
size
))
{
for
(
int
ix
:
tv
::
KernelLoopX
<
int
,
NumILP
>
(
size
))
{
#pragma unroll
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
{
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
{
if
(
ix
+
ILPStrideX
[
ilp
]
<
size
)
if
(
ix
+
ILPStrideX
[
ilp
]
<
size
)
{
inds
[
ilp
]
=
indices
[
ix
+
ILPStrideX
[
ilp
]]
*
numPlanes
;
inds_elem
=
ix
+
ILPStrideX
[
ilp
]
+
feature_offset
;
inds
[
ilp
]
=
indices
[(
inds_elem
/
feature_batch_stride
)
*
indice_batch_stride
+
inds_elem
%
feature_batch_stride
];
}
}
}
for
(
int
iy
:
tv
::
KernelLoopY
<
int
>
(
numPlanes
))
{
for
(
int
iy
:
tv
::
KernelLoopY
<
int
>
(
numPlanes
))
{
#pragma unroll
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
++
ilp
)
{
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
++
ilp
)
{
if
(
ix
+
ILPStrideX
[
ilp
]
<
size
)
if
(
ix
+
ILPStrideX
[
ilp
]
<
size
&&
inds
[
ilp
]
!=
-
1
)
reinterpret_cast
<
VecType
*>
(
reinterpret_cast
<
VecType
*>
(
buffer
)[(
ix
+
ILPStrideX
[
ilp
])
*
numPlanes
+
iy
]
=
buffer
)[(
ix
+
ILPStrideX
[
ilp
])
*
numPlanes
+
iy
]
=
reinterpret_cast
<
const
VecType
*>
(
features
)[
inds
[
ilp
]
+
iy
];
reinterpret_cast
<
const
VecType
*>
(
features
)[
inds
[
ilp
]
*
numPlanes
+
iy
];
}
}
}
}
}
}
...
@@ -158,24 +170,34 @@ __global__ void batchGatherVecKernel(T *buffer, const T *features,
...
@@ -158,24 +170,34 @@ __global__ void batchGatherVecKernel(T *buffer, const T *features,
template
<
typename
T
,
typename
Index
,
int
NumTLP
,
int
NumILP
,
template
<
typename
T
,
typename
Index
,
int
NumTLP
,
int
NumILP
,
typename
VecType
=
int4
>
typename
VecType
=
int4
>
__global__
void
batchGatherVecBlockKernel
(
T
*
buffer
,
const
T
*
features
,
__global__
void
const
Index
*
indices
,
int
size
,
batchGatherVecBlockKernel
(
T
*
buffer
,
const
T
*
features
,
const
Index
*
indices
,
int
numPlanes
)
{
int
size
,
int
numPlanes
,
int
indice_batch_stride
,
int
feature_batch_stride
)
{
int
ILPStrideY
[
NumILP
];
int
ILPStrideY
[
NumILP
];
Index
inds
;
#pragma unroll
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
ILPStrideY
[
ilp
]
=
ilp
*
gridDim
.
y
*
blockDim
.
y
;
ILPStrideY
[
ilp
]
=
ilp
*
gridDim
.
y
*
blockDim
.
y
;
features
+=
blockIdx
.
x
*
NumTLP
;
features
+=
blockIdx
.
x
*
NumTLP
;
buffer
+=
blockIdx
.
x
*
NumTLP
;
buffer
+=
blockIdx
.
x
*
NumTLP
;
Index
inds_elem
;
for
(
int
iy
:
tv
::
KernelLoopY
<
int
,
NumILP
>
(
size
))
{
for
(
int
iy
:
tv
::
KernelLoopY
<
int
,
NumILP
>
(
size
))
{
#pragma unroll
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
++
ilp
)
{
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
++
ilp
)
{
reinterpret_cast
<
VecType
*>
(
inds_elem
=
iy
+
ILPStrideY
[
ilp
];
buffer
)[(
iy
+
ILPStrideY
[
ilp
])
*
numPlanes
+
threadIdx
.
x
]
=
inds
=
indices
[(
inds_elem
/
feature_batch_stride
)
*
indice_batch_stride
+
reinterpret_cast
<
const
VecType
*>
(
inds_elem
%
feature_batch_stride
];
features
)[
indices
[
iy
+
ILPStrideY
[
ilp
]]
*
numPlanes
+
threadIdx
.
x
];
if
(
inds
!=
-
1
)
{
reinterpret_cast
<
VecType
*>
(
buffer
)[(
iy
+
ILPStrideY
[
ilp
])
*
numPlanes
+
threadIdx
.
x
]
=
reinterpret_cast
<
const
VecType
*>
(
features
)[
inds
*
numPlanes
+
threadIdx
.
x
];
}
}
}
}
}
}
}
...
@@ -240,6 +262,68 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
...
@@ -240,6 +262,68 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
}
}
}
}
template
<
typename
T
,
typename
Index
,
int
NumTLP
,
int
NumILP
>
__global__
void
batchScatterAddGenericKernel
(
T
*
outFeatures
,
const
T
*
buffer
,
const
Index
*
indices
,
int
size
,
int
feature_offset
,
int
numPlanes
,
int
indice_batch_stride
,
int
feature_batch_stride
)
{
int
ILPStrideX
[
NumILP
];
Index
inds
[
NumILP
];
Index
inds_elem
;
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
ILPStrideX
[
ilp
]
=
ilp
*
gridDim
.
x
*
blockDim
.
x
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
,
NumILP
>
(
size
))
{
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
{
if
(
ix
+
ILPStrideX
[
ilp
]
<
size
)
{
inds_elem
=
ix
+
ILPStrideX
[
ilp
]
+
feature_offset
;
inds
[
ilp
]
=
indices
[(
inds_elem
/
feature_batch_stride
)
*
indice_batch_stride
+
inds_elem
%
feature_batch_stride
];
}
}
for
(
int
iy
:
tv
::
KernelLoopY
<
int
>
(
numPlanes
))
{
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
++
ilp
)
{
if
(
ix
+
ILPStrideX
[
ilp
]
<
size
&&
inds
[
ilp
]
!=
-
1
)
{
gpuAtomicAdd
(
outFeatures
+
inds
[
ilp
]
*
numPlanes
+
iy
,
buffer
[(
ix
+
ILPStrideX
[
ilp
])
*
numPlanes
+
iy
]);
// outFeatures[inds[ilp] * numPlanes + iy] +=
// buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];
}
}
}
}
}
template
<
typename
T
,
typename
Index
,
int
NumTLP
,
int
NumILP
>
__global__
void
batchScatterAddBlockKernel
(
T
*
outFeatures
,
const
T
*
buffer
,
const
Index
*
indices
,
int
size
,
int
numPlanes
,
int
indice_batch_stride
,
int
feature_batch_stride
)
{
int
ILPStrideY
[
NumILP
];
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
ilp
++
)
ILPStrideY
[
ilp
]
=
ilp
*
gridDim
.
y
*
blockDim
.
y
;
outFeatures
+=
blockIdx
.
x
*
NumTLP
;
buffer
+=
blockIdx
.
x
*
NumTLP
;
Index
inds
,
inds_elem
;
for
(
int
iy
:
tv
::
KernelLoopY
<
int
,
NumILP
>
(
size
))
{
#pragma unroll
for
(
int
ilp
=
0
;
ilp
<
NumILP
;
++
ilp
)
{
inds_elem
=
iy
+
ILPStrideY
[
ilp
];
inds
=
indices
[(
inds_elem
/
feature_batch_stride
)
*
indice_batch_stride
+
inds_elem
%
feature_batch_stride
];
if
(
inds
!=
-
1
)
{
gpuAtomicAdd
(
outFeatures
+
inds
*
numPlanes
+
threadIdx
.
x
,
buffer
[(
iy
+
ILPStrideY
[
ilp
])
*
numPlanes
+
threadIdx
.
x
]);
}
}
}
}
}
// namespace spconv
}
// namespace spconv
#endif
#endif
\ No newline at end of file
include/spconv/reordering.h
View file @
f08268fc
...
@@ -19,6 +19,12 @@
...
@@ -19,6 +19,12 @@
namespace
spconv
{
namespace
spconv
{
void
batch_sparse_gather_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
torch
::
Tensor
indices
,
int
size
);
void
batch_sparse_scatter_add_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
outFeatures
,
torch
::
Tensor
indices
,
int
size
);
void
sparse_gather_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
void
sparse_gather_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
torch
::
Tensor
indices
,
int
size
);
torch
::
Tensor
indices
,
int
size
);
void
sparse_scatter_add_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
outFeatures
,
void
sparse_scatter_add_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
outFeatures
,
...
...
include/spconv/spconv_ops.h
View file @
f08268fc
...
@@ -338,6 +338,9 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
...
@@ -338,6 +338,9 @@ std::vector<torch::Tensor> getIndicePairPreGrid(
return
{
outInds
.
slice
(
0
,
0
,
numActOut
),
indicePairs
,
indiceNum
};
return
{
outInds
.
slice
(
0
,
0
,
numActOut
),
indicePairs
,
indiceNum
};
}
}
}
}
torch
::
Tensor
indiceConvBatch
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
int64_t
numActOut
,
int64_t
_inverse
,
int64_t
_subM
);
torch
::
Tensor
indiceConv
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indiceConv
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
...
...
spconv/ops.py
View file @
f08268fc
...
@@ -107,7 +107,7 @@ def indice_conv(features,
...
@@ -107,7 +107,7 @@ def indice_conv(features,
num_activate_out
,
num_activate_out
,
inverse
=
False
,
inverse
=
False
,
subm
=
False
):
subm
=
False
):
return
torch
.
ops
.
spconv
.
indice_conv
(
features
,
filters
,
indice_pairs
,
return
torch
.
ops
.
spconv
.
indice_conv
_batch
(
features
,
filters
,
indice_pairs
,
indice_pair_num
,
num_activate_out
,
indice_pair_num
,
num_activate_out
,
int
(
inverse
),
int
(
subm
))
int
(
inverse
),
int
(
subm
))
...
...
src/spconv/all.cc
View file @
f08268fc
...
@@ -30,6 +30,7 @@ static auto registry =
...
@@ -30,6 +30,7 @@ static auto registry =
.
op
(
"spconv::get_indice_pairs_grid_3d"
,
.
op
(
"spconv::get_indice_pairs_grid_3d"
,
&
spconv
::
getIndicePairPreGrid
<
3
>
)
&
spconv
::
getIndicePairPreGrid
<
3
>
)
.
op
(
"spconv::indice_conv"
,
&
spconv
::
indiceConv
)
.
op
(
"spconv::indice_conv"
,
&
spconv
::
indiceConv
)
.
op
(
"spconv::indice_conv_batch"
,
&
spconv
::
indiceConvBatch
)
.
op
(
"spconv::indice_conv_backward"
,
&
spconv
::
indiceConvBackward
)
.
op
(
"spconv::indice_conv_backward"
,
&
spconv
::
indiceConvBackward
)
.
op
(
"spconv::fused_indice_conv_bn"
,
&
spconv
::
fusedIndiceConvBatchNorm
)
.
op
(
"spconv::fused_indice_conv_bn"
,
&
spconv
::
fusedIndiceConvBatchNorm
)
.
op
(
"spconv::indice_maxpool"
,
&
spconv
::
indiceMaxPool
)
.
op
(
"spconv::indice_maxpool"
,
&
spconv
::
indiceMaxPool
)
...
...
src/spconv/reordering.cu
View file @
f08268fc
...
@@ -161,4 +161,154 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
...
@@ -161,4 +161,154 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
});
});
}
}
void
batch_sparse_gather_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
features
,
torch
::
Tensor
indices
,
int
size
)
{
// indices: [volume, inds_stride]
// buffer: [volume, num_points, num_features]
// size == volume * num_points
if
(
size
<=
0
)
return
;
int
numPlanes
=
features
.
size
(
1
);
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
auto
dtype
=
features
.
scalar_type
();
auto
inds_dtype
=
indices
.
scalar_type
();
int
inds_stride
=
indices
.
size
(
1
);
int
feature_stride
=
buffer
.
size
(
1
);
tv
::
DispatchTorch
<
float_types_t
>
()(
dtype
,
[
&
](
auto
TValue
)
{
using
T
=
decltype
(
TValue
);
using
vecload_type_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
>
;
tv
::
DispatchTorch
<
int_types_t
>
()(
inds_dtype
,
[
&
](
auto
IndexValue
)
{
using
Index
=
decltype
(
IndexValue
);
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
sizeof
(
vecload_type_t
)
/
sizeof
(
T
);
tv
::
mp_for_each
<
kernel_block_t
>
(
[
=
,
&
buffer
,
&
features
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
constexpr
int
NumILP
=
NumTLP
/
4
;
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
int
nHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
if
(
notFound
)
{
if
(
numPlanes
%
NumTLP
==
0
)
{
if
(
nHotBlock
>=
NumTLP
)
{
batchGatherVecBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
stream
>>>
(
buffer
.
data_ptr
<
T
>
(),
features
.
data_ptr
<
T
>
(),
indices
.
data_ptr
<
Index
>
(),
nHotBlock
,
numPlanes
/
vecloadFactor
,
inds_stride
,
feature_stride
);
TV_CHECK_CUDA_ERR
();
}
if
(
size
-
nHotBlock
>
0
)
{
auto
indices_offset
=
(
nHotBlock
/
feature_stride
)
*
inds_stride
+
nHotBlock
%
feature_stride
;
batchGatherVecKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
,
vecload_type_t
>
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
/
vecloadFactor
),
0
,
stream
>>>
(
buffer
.
data_ptr
<
T
>
()
+
nHotBlock
*
numPlanes
,
features
.
data_ptr
<
T
>
(),
indices
.
data_ptr
<
Index
>
()
+
indices_offset
,
size
-
nHotBlock
,
nHotBlock
,
numPlanes
/
vecloadFactor
,
inds_stride
,
feature_stride
);
TV_CHECK_CUDA_ERR
();
}
notFound
=
false
;
}
}
});
if
(
notFound
)
{
constexpr
int
NumTLP
=
64
;
constexpr
int
NumILP
=
NumTLP
/
4
;
batchGatherGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
tv
::
cuda
::
DivUp
(
size
,
NumTLP
),
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
stream
>>>
(
buffer
.
data_ptr
<
T
>
(),
features
.
data_ptr
<
T
>
(),
indices
.
data_ptr
<
Index
>
(),
size
,
numPlanes
,
inds_stride
,
feature_stride
);
TV_CHECK_CUDA_ERR
();
}
});
});
}
void
batch_sparse_scatter_add_cuda
(
torch
::
Tensor
buffer
,
torch
::
Tensor
outFeatures
,
torch
::
Tensor
indices
,
int
size
)
{
// indices: [volume, inds_stride]
// buffer: [volume, num_points, num_features]
// size == volume * num_points
if
(
size
<=
0
)
return
;
int
numPlanes
=
outFeatures
.
size
(
1
);
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
auto
dtype
=
outFeatures
.
scalar_type
();
auto
inds_dtype
=
indices
.
scalar_type
();
int
inds_stride
=
indices
.
size
(
1
);
int
feature_stride
=
buffer
.
size
(
1
);
tv
::
DispatchTorch
<
float_types_t
>
()(
dtype
,
[
&
](
auto
TValue
)
{
using
T
=
decltype
(
TValue
);
using
vecload_type_t
=
std
::
conditional_t
<
std
::
is_same
<
T
,
at
::
Half
>::
value
,
int2
,
int4
>
;
using
kernel_block_t
=
tv
::
mp_list_c
<
int
,
64
,
32
,
16
>
;
tv
::
DispatchTorch
<
int_types_t
>
()(
inds_dtype
,
[
&
](
auto
IndexValue
)
{
using
Index
=
decltype
(
IndexValue
);
bool
notFound
=
true
;
constexpr
int
vecloadFactor
=
1
;
// important for half.
tv
::
mp_for_each
<
kernel_block_t
>
([
=
,
&
outFeatures
,
&
buffer
,
&
indices
,
&
notFound
](
auto
NumTLP
)
{
// constexpr int NumILP = NumTLP / (64 / (NumTLP /
// vecloadFactor));
constexpr
int
NumILP
=
NumTLP
/
4
;
int
nHotBlock
=
(
size
/
NumTLP
)
*
NumTLP
;
if
(
notFound
)
{
if
(
numPlanes
%
NumTLP
==
0
)
{
if
(
nHotBlock
>=
NumTLP
)
{
batchScatterAddBlockKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
>
<<<
dim3
(
numPlanes
/
NumTLP
,
size
/
NumTLP
),
dim3
(
NumTLP
/
vecloadFactor
,
NumTLP
/
NumILP
),
0
,
stream
>>>
(
outFeatures
.
data_ptr
<
T
>
(),
buffer
.
data_ptr
<
T
>
(),
indices
.
data_ptr
<
Index
>
(),
nHotBlock
,
numPlanes
/
vecloadFactor
,
inds_stride
,
feature_stride
);
TV_CHECK_CUDA_ERR
();
}
if
(
size
-
nHotBlock
>
0
)
{
// int indices_offset = (nHotBlock / feature_stride) * inds_stride + nHotBlock % feature_stride;
batchScatterAddGenericKernel
<
T
,
Index
,
int
(
NumTLP
),
NumILP
>
<<<
dim3
(
1
,
numPlanes
/
NumTLP
),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
stream
>>>
(
outFeatures
.
data_ptr
<
T
>
(),
buffer
.
data_ptr
<
T
>
()
+
nHotBlock
*
numPlanes
,
indices
.
data_ptr
<
Index
>
(),
size
-
nHotBlock
,
nHotBlock
,
numPlanes
,
inds_stride
,
feature_stride
);
TV_CHECK_CUDA_ERR
();
}
notFound
=
false
;
}
}
});
if
(
notFound
)
{
constexpr
int
NumTLP
=
64
;
constexpr
int
NumILP
=
NumTLP
/
4
;
batchScatterAddGenericKernel
<
T
,
Index
,
NumTLP
,
NumILP
>
<<<
dim3
(
tv
::
cuda
::
DivUp
(
size
,
NumTLP
),
tv
::
cuda
::
DivUp
(
numPlanes
,
NumTLP
)),
dim3
(
NumTLP
/
NumILP
,
NumTLP
),
0
,
stream
>>>
(
outFeatures
.
data_ptr
<
T
>
(),
buffer
.
data_ptr
<
T
>
(),
indices
.
data_ptr
<
Index
>
(),
size
,
0
,
numPlanes
,
inds_stride
,
feature_stride
);
TV_CHECK_CUDA_ERR
();
}
});
});
}
}
// namespace spconv
}
// namespace spconv
\ No newline at end of file
src/spconv/spconv_ops.cc
View file @
f08268fc
...
@@ -211,6 +211,67 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
...
@@ -211,6 +211,67 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
return
output
;
return
output
;
}
}
torch
::
Tensor
indiceConvBatch
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
int64_t
numActOut
,
int64_t
_inverse
,
int64_t
_subM
)
{
bool
subM
=
_subM
!=
0
;
bool
inverse
=
_inverse
!=
0
;
auto
device
=
features
.
device
().
type
();
auto
ndim
=
filters
.
dim
()
-
2
;
auto
kernelVolume
=
indiceNum
.
size
(
0
);
auto
numInPlanes
=
features
.
size
(
1
);
auto
numOutPlanes
=
filters
.
size
(
ndim
+
1
);
auto
indicePairNumCpu
=
indiceNum
.
to
({
torch
::
kCPU
});
auto
indicePairMaxSizeIter
=
std
::
max_element
(
indicePairNumCpu
.
data_ptr
<
int
>
(),
indicePairNumCpu
.
data_ptr
<
int
>
()
+
kernelVolume
);
int
indicePairMaxOffset
=
indicePairMaxSizeIter
-
indicePairNumCpu
.
data_ptr
<
int
>
();
int
indicePairMaxSize
=
*
indicePairMaxSizeIter
;
auto
options
=
torch
::
TensorOptions
().
dtype
(
features
.
dtype
()).
device
(
features
.
device
());
torch
::
Tensor
output
=
torch
::
zeros
({
numActOut
,
numOutPlanes
},
options
);
torch
::
Tensor
inputBuffer
=
torch
::
zeros
({
kernelVolume
,
indicePairMaxSize
,
numInPlanes
},
options
);
torch
::
Tensor
outputBuffer
=
torch
::
zeros
({
kernelVolume
,
indicePairMaxSize
,
numOutPlanes
},
options
);
filters
=
filters
.
view
({
kernelVolume
,
numInPlanes
,
numOutPlanes
});
if
(
subM
)
{
// the center index of subm conv don't need gather and scatter
// add.
torch
::
mm_out
(
output
,
features
,
filters
[
indicePairMaxOffset
]);
}
double
totalGatherTime
=
0
;
double
totalGEMMTime
=
0
;
double
totalSAddTime
=
0
;
auto
size
=
kernelVolume
*
indicePairMaxSize
;
if
(
device
==
torch
::
kCPU
)
{
TV_ASSERT_INVALID_ARG
(
false
,
"unknown device type"
);
}
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
batch_sparse_gather_cuda
(
inputBuffer
,
features
,
indicePairs
[
inverse
],
size
);
}
#endif
else
{
TV_ASSERT_INVALID_ARG
(
false
,
"unknown device type"
);
}
torch
::
bmm_out
(
outputBuffer
,
inputBuffer
,
filters
);
if
(
device
==
torch
::
kCPU
)
{
TV_ASSERT_INVALID_ARG
(
false
,
"unknown device type"
);
}
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
batch_sparse_scatter_add_cuda
(
outputBuffer
,
output
,
indicePairs
[
!
inverse
],
size
);
}
#endif
else
{
TV_ASSERT_INVALID_ARG
(
false
,
"unknown device type"
);
}
return
output
;
}
std
::
vector
<
torch
::
Tensor
>
std
::
vector
<
torch
::
Tensor
>
indiceConvBackward
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
indiceConvBackward
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
outGrad
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
outGrad
,
torch
::
Tensor
indicePairs
,
...
...
test/test_conv.py
View file @
f08268fc
...
@@ -720,6 +720,6 @@ def main_subm():
...
@@ -720,6 +720,6 @@ def main_subm():
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
main
_subm
()
main
()
# unittest.main()
# unittest.main()
# TestSpConv().testSpConv3d()
# TestSpConv().testSpConv3d()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment