Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
one
spconv
Commits
01ed382c
Commit
01ed382c
authored
Oct 18, 2021
by
yan.yan
Browse files
working on tensor core test
parent
3517290c
Changes
159
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
3116 deletions
+0
-3116
include/cuhash/hash_table.h
include/cuhash/hash_table.h
+0
-228
include/paramsgrid.h
include/paramsgrid.h
+0
-65
include/spconv/box_iou.h
include/spconv/box_iou.h
+0
-156
include/spconv/cublas_gemm.h
include/spconv/cublas_gemm.h
+0
-47
include/spconv/fused_conv.cu.h
include/spconv/fused_conv.cu.h
+0
-629
include/spconv/fused_conv.h
include/spconv/fused_conv.h
+0
-55
include/spconv/fused_spconv_ops.h
include/spconv/fused_spconv_ops.h
+0
-126
include/spconv/geometry.h
include/spconv/geometry.h
+0
-183
include/spconv/indice.cu.h
include/spconv/indice.cu.h
+0
-571
include/spconv/indice.h
include/spconv/indice.h
+0
-58
include/spconv/maxpool.h
include/spconv/maxpool.h
+0
-44
include/spconv/minkowski.cu.h
include/spconv/minkowski.cu.h
+0
-179
include/spconv/nms.h
include/spconv/nms.h
+0
-202
include/spconv/nms_functor.h
include/spconv/nms_functor.h
+0
-37
include/spconv/nms_gpu.h
include/spconv/nms_gpu.h
+0
-18
include/spconv/nms_ops.h
include/spconv/nms_ops.h
+0
-74
include/spconv/pillar_scatter_functor.h
include/spconv/pillar_scatter_functor.h
+0
-31
include/spconv/pillar_scatter_ops.h
include/spconv/pillar_scatter_ops.h
+0
-56
include/spconv/point2voxel.cu.h
include/spconv/point2voxel.cu.h
+0
-81
include/spconv/point2voxel.h
include/spconv/point2voxel.h
+0
-276
No files found.
include/cuhash/hash_table.h
deleted
100644 → 0
View file @
3517290c
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file hash_table.h
*
* @brief Header for a basic hash table that stores one value per key.
*/
#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
#include "definitions.h"
#include "hash_functions.h"
#include <cstdio>
/** \addtogroup cudpp_app
* @{
*/
/** \addtogroup cudpp_hash_data_structures
* @{
*/
/* --------------------------------------------------------------------------
Doxygen definitions.
-------------------------------------------------------------------------- */
/*! @namespace CudaHT
* @brief Encapsulates the hash table library.
*/
/*! @namespace CuckooHashing
* @brief Encapsulates the cuckoo hash table that uses stashes.
*/
/* -------------------------------------------------------------------------
Hash table code.
------------------------------------------------------------------------- */
namespace
cuhash
{
//! Compute how many thread blocks are required for the given number of threads.
dim3
ComputeGridDim
(
unsigned
threads
);
//! Compute how long an eviction chain is allowed to become for a given input
//! size.
/*! \param[in] num_keys Number of keys in the input.
* \param[in] table_size Number of slots in the hash table.
* \param[in] num_functions Number of hash functions being used.
* \returns The number of iterations that should be allowed.
*
* The latter two parameters are only needed when using an empirical
* formula for computing the chain length.
*/
unsigned
ComputeMaxIterations
(
const
unsigned
num_keys
,
const
unsigned
table_size
,
const
unsigned
num_functions
);
//! Basic hash table that stores one value for each key.
/*! The input consists of two unsigned arrays of keys and values.
* None of the keys are expected to be repeated.
*
* @todo Templatize the interface without forcing the header file to
* have CUDA calls.
* @ingroup cudpp_app
*/
class
HashTable
{
public:
HashTable
();
virtual
~
HashTable
()
{
Release
();
}
//! Initialize the hash table's memory. Must be called before \ref
//! Build() and after the random number generator has been seeded.
/*! @param[in] max_input_size Largest expected number of items in the input.
* @param[in] space_usage Size of the hash table relative to the
* input. Bigger tables are faster to build
* and retrieve from.
* @param[in] num_functions Number of hash functions to use. May be
* 2-5. More hash functions make it easier
* to build the table, but increase
* retrieval times.
* @returns Whether the hash table was initialized successfully (true)
* or not (false).
*
* The minimum space usage is dependent on the number of functions
* being used; for two through five functions, the minimum space
* usage is 2.1, 1.1, 1.03, and 1.02 respectively.
*/
virtual
bool
Initialize
(
const
unsigned
max_input_size
,
const
float
space_usage
=
1.25
,
const
unsigned
num_functions
=
4
);
//! Free all memory.
virtual
void
Release
();
//! Build the hash table.
/*! @param[in] input_size Number of key-value pairs being inserted.
* @param[in] d_keys Device memory array containing all of the input
* keys.
* @param[in] d_vals Device memory array containing the keys' values.
* @returns Whether the hash table was built successfully (true) or
* not (false).
*
* Several attempts are allowed to build the hash table in case of failure.
* The input keys are expected to be completely unique.
* To reduce the chance of a failure, increase the space usage or number of
* functions.
* Keys are not allowed to be equal to cuhash::kKeyEmpty.
*/
virtual
bool
Build
(
const
unsigned
input_size
,
const
unsigned
*
d_keys
,
const
unsigned
*
d_vals
);
//! Query the hash table.
/*! @param[in] n_queries Number of keys in the query set.
* @param[in] d_query_keys Device memory array containing all of
* the query keys.
* @param[in] d_query_results Values for the query keys.
*
* kNotFound is returned for any query key that failed to be found
* in the table.
*/
virtual
void
Retrieve
(
const
unsigned
n_queries
,
const
unsigned
*
d_query_keys
,
unsigned
*
d_query_results
);
//! @name Accessors
/// @brief Mainly needed to use the __device__ CudaHT::retrieve()
/// function directly.
/// @{
//! Returns how many slots the hash table has.
inline
unsigned
get_table_size
()
const
{
return
table_size_
;
}
//! Returns how many items are stored in the stash.
inline
unsigned
get_stash_count
()
const
{
return
stash_count_
;
}
//! Returns the constants used by the stash.
inline
uint2
get_stash_constants
()
const
{
return
stash_constants_
;
}
//! Returns the hash table contents.
inline
const
Entry
*
get_contents
()
const
{
return
d_contents_
;
}
//! Returns the number of hash functions being used.
inline
unsigned
get_num_hash_functions
()
const
{
return
num_hash_functions_
;
}
//! When using two hash functions, returns the constants.
inline
Functions
<
2
>
get_constants_2
()
const
{
return
constants_2_
;
}
//! When using three hash functions, returns the constants.
inline
Functions
<
3
>
get_constants_3
()
const
{
return
constants_3_
;
}
//! When using four hash functions, returns the constants.
inline
Functions
<
4
>
get_constants_4
()
const
{
return
constants_4_
;
}
//! When using five hash functions, returns the constants.
inline
Functions
<
5
>
get_constants_5
()
const
{
return
constants_5_
;
}
/// @}
inline
Entry
*
data
()
{
return
d_contents_
;
}
inline
const
Entry
*
data
()
const
{
return
d_contents_
;
}
protected:
unsigned
table_size_
;
//!< Size of the hash table.
unsigned
num_hash_functions_
;
//!< Number of hash functions being used.
Entry
*
d_contents_
;
//!< Device memory: The hash table contents. The stash is
//!< stored at the end.
unsigned
stash_count_
;
//!< Number of key-value pairs currently stored.
uint2
stash_constants_
;
//!< Hash function constants for the stash.
Functions
<
2
>
constants_2_
;
//!< Constants for a set of two hash functions.
Functions
<
3
>
constants_3_
;
//!< Constants for a set of three hash functions.
Functions
<
4
>
constants_4_
;
//!< Constants for a set of four hash functions.
Functions
<
5
>
constants_5_
;
//!< Constants for a set of five hash functions.
unsigned
*
d_failures_
;
//!< Device memory: General use error flag.
};
/*! @name Internal
* @{
*/
namespace
CUDAWrapper
{
//! Fills a 64-bit array with a particular value.
void
ClearTable
(
const
unsigned
slots_in_table
,
const
Entry
fill_value
,
Entry
*
d_array
);
//! Calls the Cuckoo Hash construction kernel.
void
CallCuckooHash
(
const
unsigned
n_entries
,
const
unsigned
num_hash_functions
,
const
unsigned
*
d_keys
,
const
unsigned
*
d_values
,
const
unsigned
table_size
,
const
Functions
<
2
>
constants_2
,
const
Functions
<
3
>
constants_3
,
const
Functions
<
4
>
constants_4
,
const
Functions
<
5
>
constants_5
,
const
unsigned
max_iteration_attempts
,
Entry
*
d_contents
,
uint2
stash_constants
,
unsigned
*
d_stash_count
,
unsigned
*
d_failures
,
unsigned
*
d_iterations_taken
);
//! Calls the kernel that performs retrievals.
void
CallHashRetrieve
(
const
unsigned
n_queries
,
const
unsigned
num_hash_functions
,
const
unsigned
*
keys_in
,
const
unsigned
table_size
,
const
Entry
*
table
,
const
Functions
<
2
>
constants_2
,
const
Functions
<
3
>
constants_3
,
const
Functions
<
4
>
constants_4
,
const
Functions
<
5
>
constants_5
,
const
uint2
stash_constants
,
const
unsigned
stash_count
,
unsigned
*
values_out
);
};
// namespace CUDAWrapper
/// @}
};
// namespace cuhash
/** @} */
// end hash table data structures
/** @} */
// end cudpp_app
#endif
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
include/paramsgrid.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file is used for c++ unit test, but pytorch jit ops don't support c++
// debug build.
#ifndef PARAMS_GRID_H_
#define PARAMS_GRID_H_
#include <tuple>
#include <vector>
namespace
detail
{
template
<
class
T
>
int
getTotalSize
(
std
::
vector
<
T
>
arg
)
{
return
arg
.
size
();
}
template
<
class
T
,
class
...
TArgs
>
int
getTotalSize
(
std
::
vector
<
T
>
arg
,
std
::
vector
<
TArgs
>
...
args
)
{
return
arg
.
size
()
*
getTotalSize
(
args
...);
}
template
<
typename
T
>
int
getSize
(
std
::
vector
<
T
>
arg
)
{
return
arg
.
size
();
}
template
<
int
Idx
,
class
TT
,
class
T
>
void
assigner
(
TT
&
src
,
std
::
vector
<
int
>
counter
,
std
::
vector
<
T
>
&
arg
)
{
std
::
get
<
Idx
>
(
src
)
=
arg
[
counter
[
Idx
]];
}
template
<
int
Idx
,
class
TT
,
class
T
,
class
...
TArgs
>
void
assigner
(
TT
&
src
,
std
::
vector
<
int
>
counter
,
std
::
vector
<
T
>
&
arg
,
std
::
vector
<
TArgs
>
&
...
args
)
{
std
::
get
<
Idx
>
(
src
)
=
arg
[
counter
[
Idx
]];
assigner
<
Idx
+
1
>
(
src
,
counter
,
args
...);
}
}
// namespace detail
template
<
class
...
TArgs
>
std
::
vector
<
std
::
tuple
<
TArgs
...
>>
paramsGrid
(
std
::
vector
<
TArgs
>
...
args
)
{
int
length
=
detail
::
getTotalSize
(
args
...);
std
::
vector
<
int
>
sizes
=
{
detail
::
getSize
(
args
)...};
int
size
=
sizes
.
size
();
std
::
vector
<
std
::
tuple
<
TArgs
...
>>
params
(
length
);
std
::
vector
<
int
>
counter
(
size
);
for
(
int
i
=
0
;
i
<
length
;
++
i
)
{
detail
::
assigner
<
0
>
(
params
[
i
],
counter
,
args
...);
counter
[
size
-
1
]
+=
1
;
for
(
int
c
=
size
-
1
;
c
>=
0
;
--
c
)
{
if
(
counter
[
c
]
==
sizes
[
c
]
&&
c
>
0
)
{
counter
[
c
-
1
]
+=
1
;
counter
[
c
]
=
0
;
}
}
}
return
params
;
}
#endif
\ No newline at end of file
include/spconv/box_iou.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef BOX_IOU_H
#define BOX_IOU_H
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
#include <algorithm>
#include <boost/geometry.hpp>
#include <pybind11/numpy.h>
namespace
spconv
{
// #include "voxelnet/core/cc/pybind11_helper.h"
namespace
py
=
pybind11
;
using
namespace
pybind11
::
literals
;
template
<
typename
DType
,
typename
ShapeContainer
>
inline
py
::
array_t
<
DType
>
constant
(
ShapeContainer
shape
,
DType
value
)
{
// create ROWMAJOR array.
py
::
array_t
<
DType
>
array
(
shape
);
std
::
fill
(
array
.
mutable_data
(),
array
.
mutable_data
()
+
array
.
size
(),
value
);
return
array
;
}
template
<
typename
DType
>
inline
py
::
array_t
<
DType
>
zeros
(
std
::
vector
<
long
int
>
shape
)
{
return
constant
<
DType
,
std
::
vector
<
long
int
>>
(
shape
,
0
);
}
template
<
typename
DType
>
py
::
array_t
<
DType
>
rbbox_iou
(
py
::
array_t
<
DType
>
box_corners
,
py
::
array_t
<
DType
>
qbox_corners
,
py
::
array_t
<
DType
>
standup_iou
,
DType
standup_thresh
)
{
namespace
bg
=
boost
::
geometry
;
typedef
bg
::
model
::
point
<
DType
,
2
,
bg
::
cs
::
cartesian
>
point_t
;
typedef
bg
::
model
::
polygon
<
point_t
>
polygon_t
;
polygon_t
poly
,
qpoly
;
std
::
vector
<
polygon_t
>
poly_inter
,
poly_union
;
DType
inter_area
,
union_area
;
auto
box_corners_r
=
box_corners
.
template
unchecked
<
3
>();
auto
qbox_corners_r
=
qbox_corners
.
template
unchecked
<
3
>();
auto
standup_iou_r
=
standup_iou
.
template
unchecked
<
2
>();
auto
N
=
box_corners_r
.
shape
(
0
);
auto
K
=
qbox_corners_r
.
shape
(
0
);
py
::
array_t
<
DType
>
overlaps
=
zeros
<
DType
>
({
int
(
N
),
int
(
K
)});
auto
overlaps_rw
=
overlaps
.
template
mutable_unchecked
<
2
>();
if
(
N
==
0
||
K
==
0
)
{
return
overlaps
;
}
for
(
int
k
=
0
;
k
<
K
;
++
k
)
{
for
(
int
n
=
0
;
n
<
N
;
++
n
)
{
if
(
standup_iou_r
(
n
,
k
)
<=
standup_thresh
)
continue
;
bg
::
append
(
poly
,
point_t
(
box_corners_r
(
n
,
0
,
0
),
box_corners_r
(
n
,
0
,
1
)));
bg
::
append
(
poly
,
point_t
(
box_corners_r
(
n
,
1
,
0
),
box_corners_r
(
n
,
1
,
1
)));
bg
::
append
(
poly
,
point_t
(
box_corners_r
(
n
,
2
,
0
),
box_corners_r
(
n
,
2
,
1
)));
bg
::
append
(
poly
,
point_t
(
box_corners_r
(
n
,
3
,
0
),
box_corners_r
(
n
,
3
,
1
)));
bg
::
append
(
poly
,
point_t
(
box_corners_r
(
n
,
0
,
0
),
box_corners_r
(
n
,
0
,
1
)));
bg
::
append
(
qpoly
,
point_t
(
qbox_corners_r
(
k
,
0
,
0
),
qbox_corners_r
(
k
,
0
,
1
)));
bg
::
append
(
qpoly
,
point_t
(
qbox_corners_r
(
k
,
1
,
0
),
qbox_corners_r
(
k
,
1
,
1
)));
bg
::
append
(
qpoly
,
point_t
(
qbox_corners_r
(
k
,
2
,
0
),
qbox_corners_r
(
k
,
2
,
1
)));
bg
::
append
(
qpoly
,
point_t
(
qbox_corners_r
(
k
,
3
,
0
),
qbox_corners_r
(
k
,
3
,
1
)));
bg
::
append
(
qpoly
,
point_t
(
qbox_corners_r
(
k
,
0
,
0
),
qbox_corners_r
(
k
,
0
,
1
)));
bg
::
intersection
(
poly
,
qpoly
,
poly_inter
);
if
(
!
poly_inter
.
empty
())
{
inter_area
=
bg
::
area
(
poly_inter
.
front
());
bg
::
union_
(
poly
,
qpoly
,
poly_union
);
if
(
!
poly_union
.
empty
())
{
union_area
=
bg
::
area
(
poly_union
.
front
());
overlaps_rw
(
n
,
k
)
=
inter_area
/
union_area
;
}
poly_union
.
clear
();
}
poly
.
clear
();
qpoly
.
clear
();
poly_inter
.
clear
();
}
}
return
overlaps
;
}
template
<
typename
DType
>
py
::
array_t
<
DType
>
rbbox_intersection
(
py
::
array_t
<
DType
>
box_corners
,
py
::
array_t
<
DType
>
qbox_corners
,
py
::
array_t
<
DType
>
standup_iou
,
DType
standup_thresh
)
{
namespace
bg
=
boost
::
geometry
;
typedef
bg
::
model
::
point
<
DType
,
2
,
bg
::
cs
::
cartesian
>
point_t
;
typedef
bg
::
model
::
polygon
<
point_t
>
polygon_t
;
polygon_t
poly
,
qpoly
;
std
::
vector
<
polygon_t
>
poly_inter
,
poly_union
;
DType
inter_area
,
union_area
;
auto
box_corners_r
=
box_corners
.
template
unchecked
<
3
>();
auto
qbox_corners_r
=
qbox_corners
.
template
unchecked
<
3
>();
auto
standup_iou_r
=
standup_iou
.
template
unchecked
<
2
>();
auto
N
=
box_corners_r
.
shape
(
0
);
auto
K
=
qbox_corners_r
.
shape
(
0
);
py
::
array_t
<
DType
>
overlaps
=
zeros
<
DType
>
({
int
(
N
),
int
(
K
)});
auto
overlaps_rw
=
overlaps
.
template
mutable_unchecked
<
2
>();
if
(
N
==
0
||
K
==
0
)
{
return
overlaps
;
}
for
(
int
k
=
0
;
k
<
K
;
++
k
)
{
for
(
int
n
=
0
;
n
<
N
;
++
n
)
{
if
(
standup_iou_r
(
n
,
k
)
<=
standup_thresh
)
continue
;
bg
::
append
(
poly
,
point_t
(
box_corners_r
(
n
,
0
,
0
),
box_corners_r
(
n
,
0
,
1
)));
bg
::
append
(
poly
,
point_t
(
box_corners_r
(
n
,
1
,
0
),
box_corners_r
(
n
,
1
,
1
)));
bg
::
append
(
poly
,
point_t
(
box_corners_r
(
n
,
2
,
0
),
box_corners_r
(
n
,
2
,
1
)));
bg
::
append
(
poly
,
point_t
(
box_corners_r
(
n
,
3
,
0
),
box_corners_r
(
n
,
3
,
1
)));
bg
::
append
(
poly
,
point_t
(
box_corners_r
(
n
,
0
,
0
),
box_corners_r
(
n
,
0
,
1
)));
bg
::
append
(
qpoly
,
point_t
(
qbox_corners_r
(
k
,
0
,
0
),
qbox_corners_r
(
k
,
0
,
1
)));
bg
::
append
(
qpoly
,
point_t
(
qbox_corners_r
(
k
,
1
,
0
),
qbox_corners_r
(
k
,
1
,
1
)));
bg
::
append
(
qpoly
,
point_t
(
qbox_corners_r
(
k
,
2
,
0
),
qbox_corners_r
(
k
,
2
,
1
)));
bg
::
append
(
qpoly
,
point_t
(
qbox_corners_r
(
k
,
3
,
0
),
qbox_corners_r
(
k
,
3
,
1
)));
bg
::
append
(
qpoly
,
point_t
(
qbox_corners_r
(
k
,
0
,
0
),
qbox_corners_r
(
k
,
0
,
1
)));
bg
::
intersection
(
poly
,
qpoly
,
poly_inter
);
if
(
!
poly_inter
.
empty
())
{
inter_area
=
bg
::
area
(
poly_inter
.
front
());
overlaps_rw
(
n
,
k
)
=
inter_area
;
}
poly
.
clear
();
qpoly
.
clear
();
poly_inter
.
clear
();
}
}
return
overlaps
;
}
}
// namespace spconv
#endif
\ No newline at end of file
include/spconv/cublas_gemm.h
deleted
100644 → 0
View file @
3517290c
#pragma once
#include <cublas_v2.h>
#include <tensorview/tensorview.h>
namespace
spconv
{
template
<
class
T
>
cublasStatus_t
cublasTgemm
(
cublasHandle_t
handle
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
const
T
*
alpha
,
const
T
*
A
,
int
lda
,
const
T
*
B
,
int
ldb
,
const
T
*
beta
,
T
*
C
,
int
ldc
);
template
<
class
T
>
cublasStatus_t
cublasTgemmRow
(
cublasHandle_t
handle
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
const
T
*
alpha
,
const
T
*
A
,
int
lda
,
const
T
*
B
,
int
ldb
,
const
T
*
beta
,
T
*
C
,
int
ldc
)
{
return
cublasTgemm
<
T
>
(
handle
,
transb
,
transa
,
n
,
m
,
k
,
alpha
,
B
,
ldb
,
A
,
lda
,
beta
,
C
,
ldc
);
}
template
<
class
T
>
inline
T
constant_scalar
(
float
data
)
{
return
T
(
data
);
}
template
<
class
T
>
cublasStatus_t
gemm
(
cublasHandle_t
handle
,
bool
transa
,
bool
transb
,
const
tv
::
TensorView
<
T
>
A
,
const
tv
::
TensorView
<
T
>
B
,
tv
::
TensorView
<
T
>
C
)
{
TV_ASSERT_RT_ERR
(
A
.
ndim
()
==
2
,
"error"
);
TV_ASSERT_RT_ERR
(
B
.
ndim
()
==
2
,
"error"
);
auto
transa_cublas
=
transa
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
auto
transb_cublas
=
transb
?
CUBLAS_OP_T
:
CUBLAS_OP_N
;
int
m
=
transa
?
A
.
dim
(
1
)
:
A
.
dim
(
0
);
int
n
=
transb
?
B
.
dim
(
0
)
:
B
.
dim
(
1
);
int
ka
=
transa
?
A
.
dim
(
0
)
:
A
.
dim
(
1
);
int
kb
=
transb
?
B
.
dim
(
1
)
:
B
.
dim
(
0
);
int
lda
=
transa
?
m
:
ka
;
int
ldb
=
transb
?
ka
:
n
;
int
ldc
=
n
;
TV_ASSERT_RT_ERR
(
ka
==
kb
,
"error"
);
T
alpha
=
constant_scalar
<
T
>
(
1
);
T
beta
=
constant_scalar
<
T
>
(
0
);
return
cublasTgemmRow
<
T
>
(
handle
,
transa_cublas
,
transb_cublas
,
m
,
n
,
ka
,
&
alpha
,
A
.
data
(),
lda
,
B
.
data
(),
ldb
,
&
beta
,
C
.
data
(),
ldc
);
}
}
// namespace spconv
include/spconv/fused_conv.cu.h
deleted
100644 → 0
View file @
3517290c
/*
BSD License
For SparseConvNet software
Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name Facebook nor the names of its contributors may be used to
endorse or promote products derived from this software without specific
prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define TACC double
template
<
typename
T
,
int32_t
K
,
int32_t
V
>
__global__
void
dConvolution_KMxKN_forwardA
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
int32_t
*
rulesIn
,
int32_t
*
rulesOut
,
int32_t
nHot
,
int32_t
input_nPlanes
,
int32_t
input_stride
,
int32_t
output_nPlanes
,
int32_t
output_stride
)
{
// nHot must be a multiple of K!!
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
int32_t
M
=
input_nPlanes
/
K
;
// N = gridDim.y == output_nPlanes/K
int32_t
n
=
blockIdx
.
y
;
int32_t
g
=
blockIdx
.
z
;
inFeatures
+=
g
*
input_nPlanes
;
outFeatures
+=
n
*
K
+
g
*
output_nPlanes
;
w
+=
n
*
K
+
g
*
input_nPlanes
*
output_nPlanes
;
TACC
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
int32_t
R0
[
V
];
int32_t
R1
[
V
];
const
int32_t
tx
=
threadIdx
.
x
;
int32_t
ty
[
V
];
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int32_t
m
=
0
;
m
<
M
;
m
++
)
{
// Read w
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
int32_t
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
R0
[
v
]
=
rulesIn
[
s
+
ty
[
v
]];
R1
[
v
]
=
rulesOut
[
s
+
ty
[
v
]];
}
__syncthreads
();
// Read input, reset O[]
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int32_t
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
__syncthreads
();
}
w
+=
K
*
output_nPlanes
;
inFeatures
+=
K
;
}
}
template
<
typename
T
,
int32_t
K
,
int32_t
V
>
__global__
void
dConvolution_KMxKN_forwardB
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
int32_t
*
rulesIn
,
int32_t
*
rulesOut
,
int32_t
nHot
,
int32_t
input_nPlanes
,
int32_t
input_stride
,
int32_t
output_nPlanes
,
int32_t
output_stride
)
{
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
int32_t
M
=
input_nPlanes
/
K
;
// N = gridDim.y == output_nPlanes/K
int32_t
n
=
blockIdx
.
y
;
int32_t
g
=
blockIdx
.
z
;
inFeatures
+=
g
*
input_nPlanes
;
outFeatures
+=
n
*
K
+
g
*
output_nPlanes
;
w
+=
n
*
K
+
g
*
input_nPlanes
*
output_nPlanes
;
TACC
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
int32_t
R0
[
V
];
int32_t
R1
[
V
];
const
int32_t
tx
=
threadIdx
.
x
;
int32_t
ty
[
V
];
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int32_t
m
=
0
;
m
<
M
;
m
++
)
{
// Read w
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
int32_t
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
R0
[
v
]
=
rulesIn
[
s
+
ty
[
v
]];
R1
[
v
]
=
rulesOut
[
s
+
ty
[
v
]];
}
}
__syncthreads
();
// Read input, reset O[]
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int32_t
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
O
[
v
]
+=
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
outFeatures
[
R1
[
v
]
*
output_stride
+
tx
]
=
O
[
v
];
__syncthreads
();
}
w
+=
K
*
output_nPlanes
;
inFeatures
+=
K
;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
int32_t o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_forwardA<T, K, V> \
<<<dim3(std::min(o / K, (int32_t)512), output_nPlanes / K, \
nGroups), \
dim3(K, K / V), 0, s>>>( \
inFeatures, outFeatures, w, rulesIn, rulesOut, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_forwardB<T, K, V> \
<<<dim3(1, output_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>( \
inFeatures, outFeatures, w, rulesIn + o, rulesOut + o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template
<
typename
T
>
void
dConvolution_forward
(
cudaStream_t
s
,
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
int32_t
*
rulesIn
,
int32_t
*
rulesOut
,
int32_t
nHot
,
int32_t
input_nPlanes
,
int32_t
input_stride
,
int32_t
output_nPlanes
,
int32_t
output_stride
,
int32_t
nGroups
)
{
FOO
(
T
,
64
,
16
)
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
8
,
2
)
assert
(
false
);
}
template
<
>
void
dConvolution_forward
<
double
>
(
cudaStream_t
s
,
double
*
inFeatures
,
double
*
outFeatures
,
double
*
w
,
int32_t
*
rulesIn
,
int32_t
*
rulesOut
,
int32_t
nHot
,
int32_t
input_nPlanes
,
int32_t
input_stride
,
int32_t
output_nPlanes
,
int32_t
output_stride
,
int32_t
nGroups
)
{
FOO
(
double
,
32
,
8
)
FOO
(
double
,
16
,
4
)
FOO
(
double
,
8
,
2
)
assert
(
false
);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template
<
typename
T
,
int32_t
K
,
int32_t
V
>
__global__
void
dConvolution_KMxKN_backward_dW_A
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
int32_t
*
rulesIn
,
int32_t
*
rulesOut
,
int32_t
nHot
,
int32_t
input_nPlanes
,
int32_t
input_stride
,
int32_t
output_nPlanes
,
int32_t
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
int32_t
N
=
output_nPlanes
/
K
;
int32_t
m
=
blockIdx
.
y
;
int32_t
g
=
blockIdx
.
z
;
inFeatures
+=
m
*
K
+
g
*
input_nPlanes
;
dInFeatures
+=
m
*
K
+
g
*
input_nPlanes
;
dOutFeatures
+=
g
*
output_nPlanes
;
w
+=
m
*
K
*
output_nPlanes
+
g
*
input_nPlanes
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
+
g
*
input_nPlanes
*
output_nPlanes
;
TACC
dI
[
V
];
TACC
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
int32_t
R0
[
V
];
int32_t
R1
[
V
];
const
int32_t
tx
=
threadIdx
.
x
;
int32_t
ty
[
V
];
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int32_t
n
=
0
;
n
<
N
;
n
++
)
{
// Read w, reset dW
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
dW
[
v
]
=
0
;
}
for
(
int32_t
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
R0
[
v
]
=
rulesIn
[
s
+
ty
[
v
]];
R1
[
v
]
=
rulesOut
[
s
+
ty
[
v
]];
dI
[
v
]
=
0
;
}
__syncthreads
();
// Read input and dOutput
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
}
__syncthreads
();
#pragma unroll
for
(
int32_t
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
}
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
__syncthreads
();
}
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
w
+=
K
;
dw
+=
K
;
dOutFeatures
+=
K
;
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template
<
typename
T
,
int32_t
K
,
int32_t
V
>
__global__
void
dConvolution_KMxKN_backward_dW_B
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
int32_t
*
rulesIn
,
int32_t
*
rulesOut
,
int32_t
nHot
,
int32_t
input_nPlanes
,
int32_t
input_stride
,
int32_t
output_nPlanes
,
int32_t
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
int32_t
N
=
output_nPlanes
/
K
;
int32_t
m
=
blockIdx
.
y
;
int32_t
g
=
blockIdx
.
z
;
inFeatures
+=
m
*
K
+
g
*
input_nPlanes
;
dInFeatures
+=
m
*
K
+
g
*
input_nPlanes
;
dOutFeatures
+=
g
*
output_nPlanes
;
w
+=
m
*
K
*
output_nPlanes
+
g
*
input_nPlanes
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
+
g
*
input_nPlanes
*
output_nPlanes
;
TACC
dI
[
V
];
TACC
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
int32_t
R0
[
V
];
int32_t
R1
[
V
];
const
int32_t
tx
=
threadIdx
.
x
;
int32_t
ty
[
V
];
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int32_t
n
=
0
;
n
<
N
;
n
++
)
{
// Read w, reset dW
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
dW
[
v
]
=
0
;
}
for
(
int32_t
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
if
(
s
+
ty
[
v
]
<
nHot
)
{
R0
[
v
]
=
rulesIn
[
s
+
ty
[
v
]];
R1
[
v
]
=
rulesOut
[
s
+
ty
[
v
]];
}
dI
[
v
]
=
0
;
}
__syncthreads
();
// Read input and dOutput
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
{
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R1
[
v
]
*
output_stride
+
tx
];
}
else
{
I
[
ty
[
v
]][
tx
]
=
0
;
dO
[
ty
[
v
]][
tx
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int32_t
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
}
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
dI
[
v
]
+=
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
];
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
if
(
s
+
ty
[
v
]
<
nHot
)
dInFeatures
[
R0
[
v
]
*
input_stride
+
tx
]
=
dI
[
v
];
__syncthreads
();
}
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
w
+=
K
;
dw
+=
K
;
dOutFeatures
+=
K
;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
int32_t o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_backward_dW_A<T, K, V> \
<<<dim3(std::min(o / K, (int32_t)512), input_nPlanes / K, \
nGroups), \
dim3(K, K / V), 0, s>>>(inFeatures, dInFeatures, dOutFeatures, \
w, dw, rulesIn, rulesOut, o, \
input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_backward_dW_B<T, K, V> \
<<<dim3(1, input_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn + o, \
rulesOut + o, nHot - o, input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
return; \
} \
}
template
<
typename
T
>
void
dConvolution_backward_dW
(
cudaStream_t
s
,
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
int32_t
*
rulesIn
,
int32_t
*
rulesOut
,
int32_t
nHot
,
int32_t
input_nPlanes
,
int32_t
input_stride
,
int32_t
output_nPlanes
,
int32_t
output_stride
,
int32_t
nGroups
)
{
FOO
(
T
,
32
,
8
)
FOO
(
T
,
16
,
4
)
FOO
(
T
,
8
,
2
)
assert
(
false
);
}
#undef FOO
template
<
typename
T
,
int32_t
K
,
int32_t
V
>
__global__
void
dConvolution_KMxKN_forward2
(
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
int32_t
*
rulesIn
,
int32_t
*
rulesOut
,
int32_t
nHot
,
int32_t
input_nPlanes
,
int32_t
input_stride
,
int32_t
output_nPlanes
,
int32_t
output_stride
)
{
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
int32_t
M
=
(
input_nPlanes
+
K
-
1
)
/
K
;
// N = gridDim.y ~ output_nPlanes/K
int32_t
n
=
blockIdx
.
y
;
int32_t
g
=
blockIdx
.
z
;
inFeatures
+=
g
*
input_nPlanes
;
outFeatures
+=
n
*
K
+
g
*
output_nPlanes
;
w
+=
n
*
K
+
g
*
input_nPlanes
*
output_nPlanes
;
int32_t
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
TACC
O
[
V
];
__shared__
T
W
[
K
][
K
];
__shared__
T
I
[
K
][
K
];
__shared__
int32_t
R
[
K
*
2
];
const
int32_t
tx
=
threadIdx
.
x
;
int32_t
ty
[
V
];
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int32_t
m
=
0
;
m
<
M
;
m
++
)
{
int32_t
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
// Read w
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
for
(
int32_t
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
1
)
{
if
(
s
+
tx
<
nHot
)
{
R
[
2
*
tx
]
=
rulesIn
[
s
+
tx
];
R
[
2
*
tx
+
1
]
=
rulesOut
[
s
+
tx
];
}
// R[q] = rules[2 * s + q];
}
}
__syncthreads
();
// Read input, reset O[]
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]]
*
input_stride
+
tx
];
O
[
v
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int32_t
k
=
0
;
k
<
KI
;
k
++
)
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
O
[
v
]
+=
I
[
ty
[
v
]][
k
]
*
W
[
k
][
tx
];
__syncthreads
();
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
outFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
output_stride
+
tx
]
+=
O
[
v
];
__syncthreads
();
}
w
+=
K
*
output_nPlanes
;
inFeatures
+=
K
;
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template
<
typename
T
,
int32_t
K
,
int32_t
V
>
__global__
void
dConvolution_KMxKN_backward_dW2
(
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
int32_t
*
rulesIn
,
int32_t
*
rulesOut
,
int32_t
nHot
,
int32_t
input_nPlanes
,
int32_t
input_stride
,
int32_t
output_nPlanes
,
int32_t
output_stride
)
{
// M = gridDim.y == input_nPlanes / K
int32_t
N
=
(
output_nPlanes
+
K
-
1
)
/
K
;
int32_t
m
=
blockIdx
.
y
;
int32_t
g
=
blockIdx
.
z
;
inFeatures
+=
m
*
K
+
g
*
input_nPlanes
;
dInFeatures
+=
m
*
K
+
g
*
input_nPlanes
;
dOutFeatures
+=
g
*
output_nPlanes
;
w
+=
m
*
K
*
output_nPlanes
+
g
*
input_nPlanes
*
output_nPlanes
;
dw
+=
m
*
K
*
output_nPlanes
+
g
*
input_nPlanes
*
output_nPlanes
;
int32_t
KI
=
min
(
K
,
input_nPlanes
-
K
*
m
);
TACC
dI
[
V
];
TACC
dW
[
V
];
__shared__
T
I
[
K
][
K
];
__shared__
T
dO
[
K
][
K
];
__shared__
T
W
[
K
][
K
];
__shared__
int32_t
R
[
K
*
2
];
const
int32_t
tx
=
threadIdx
.
x
;
int32_t
ty
[
V
];
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
ty
[
v
]
=
threadIdx
.
y
+
v
*
(
K
/
V
);
for
(
int32_t
n
=
0
;
n
<
N
;
n
++
)
{
int32_t
KO
=
min
(
K
,
output_nPlanes
-
K
*
n
);
// Read w, reset dW
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
W
[
ty
[
v
]][
tx
]
=
w
[
ty
[
v
]
*
output_nPlanes
+
tx
];
dW
[
v
]
=
0
;
}
for
(
int32_t
s
=
blockIdx
.
x
*
K
;
s
<
nHot
;
s
+=
K
*
gridDim
.
x
)
{
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
if
(
ty
[
v
]
<
1
)
{
if
(
s
+
tx
<
nHot
)
{
R
[
2
*
tx
]
=
rulesIn
[
s
+
tx
];
R
[
2
*
tx
+
1
]
=
rulesOut
[
s
+
tx
];
}
// R[q] = rules[2 * s + q];
}
dI
[
v
]
=
0
;
}
__syncthreads
();
// Read input and dOutput
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
{
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
I
[
ty
[
v
]][
tx
]
=
inFeatures
[
R
[
2
*
ty
[
v
]]
*
input_stride
+
tx
];
else
I
[
ty
[
v
]][
tx
]
=
0
;
if
(
tx
<
KO
and
s
+
ty
[
v
]
<
nHot
)
dO
[
ty
[
v
]][
tx
]
=
dOutFeatures
[
R
[
2
*
ty
[
v
]
+
1
]
*
output_stride
+
tx
];
else
dO
[
ty
[
v
]][
tx
]
=
0
;
}
__syncthreads
();
#pragma unroll
for
(
int32_t
k
=
0
;
k
<
KO
;
k
++
)
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
dI
[
v
]
+=
dO
[
ty
[
v
]][
k
]
*
W
[
tx
][
k
];
#pragma unroll
for
(
int32_t
k
=
0
;
k
<
K
;
k
++
)
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
dW
[
v
]
+=
I
[
k
][
ty
[
v
]]
*
dO
[
k
][
tx
];
__syncthreads
();
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
if
(
tx
<
KI
and
s
+
ty
[
v
]
<
nHot
)
dInFeatures
[
R
[
2
*
ty
[
v
]]
*
input_stride
+
tx
]
+=
dI
[
v
];
__syncthreads
();
}
#pragma unroll
for
(
int32_t
v
=
0
;
v
<
V
;
v
++
)
if
(
ty
[
v
]
<
KI
and
tx
<
KO
)
atomicAdd
(
&
dw
[
ty
[
v
]
*
output_nPlanes
+
tx
],
dW
[
v
]);
w
+=
K
;
dw
+=
K
;
dOutFeatures
+=
K
;
}
}
template
<
typename
T
>
void
dConvolution_forward2
(
cudaStream_t
s
,
T
*
inFeatures
,
T
*
outFeatures
,
T
*
w
,
int32_t
*
rulesIn
,
int32_t
*
rulesOut
,
int32_t
nHot
,
int32_t
input_nPlanes
,
int32_t
input_stride
,
int32_t
output_nPlanes
,
int32_t
output_stride
,
int32_t
nGroups
)
{
int32_t
c
=
input_nPlanes
*
output_nPlanes
*
nGroups
;
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int32_t
K
=
16
;
const
int32_t
V
=
4
;
dConvolution_KMxKN_forward2
<
T
,
K
,
V
>
<<<
dim3
(
128
,
(
output_nPlanes
+
K
-
1
)
/
K
,
nGroups
),
dim3
(
K
,
K
/
V
),
0
,
s
>>>
(
inFeatures
,
outFeatures
,
w
,
rulesIn
,
rulesOut
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
}
else
{
dConvolution_forward
(
s
,
inFeatures
,
outFeatures
,
w
,
rulesIn
,
rulesOut
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
nGroups
);
}
}
template
<
typename
T
>
void
dConvolution_backward_dW2
(
cudaStream_t
s
,
T
*
inFeatures
,
T
*
dInFeatures
,
T
*
dOutFeatures
,
T
*
w
,
T
*
dw
,
int32_t
*
rulesIn
,
int32_t
*
rulesOut
,
int32_t
nHot
,
int32_t
input_nPlanes
,
int32_t
input_stride
,
int32_t
output_nPlanes
,
int32_t
output_stride
,
int32_t
nGroups
)
{
int32_t
c
=
input_nPlanes
*
output_nPlanes
*
nGroups
;
if
(
input_nPlanes
%
8
!=
0
or
output_nPlanes
%
8
!=
0
)
{
const
int32_t
K
=
16
;
const
int32_t
V
=
4
;
dConvolution_KMxKN_backward_dW2
<
T
,
K
,
V
>
<<<
dim3
(
128
,
(
input_nPlanes
+
K
-
1
)
/
K
,
nGroups
),
dim3
(
K
,
K
/
V
),
0
,
s
>>>
(
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rulesIn
,
rulesOut
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
);
}
else
{
dConvolution_backward_dW
(
s
,
inFeatures
,
dInFeatures
,
dOutFeatures
,
w
,
dw
,
rulesIn
,
rulesOut
,
nHot
,
input_nPlanes
,
input_stride
,
output_nPlanes
,
output_stride
,
nGroups
);
}
}
#undef TACC
\ No newline at end of file
include/spconv/fused_conv.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda_runtime_api.h>
#include <tensorview/tensor.h>
#include <torch/script.h>
namespace
spconv
{
enum
FusedConvAlgo
{
kFSparseConvNet
,
kFMinkowskiEngine
};
using
all_fused_conv_algos_t
=
tv
::
mp_list_c
<
int
,
kFSparseConvNet
,
kFMinkowskiEngine
>
;
void
fused_conv_cuda
(
torch
::
Tensor
output
,
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicesOut
,
int
nHot
);
void
fused_conv_backward_cuda
(
torch
::
Tensor
features
,
torch
::
Tensor
din
,
torch
::
Tensor
dout
,
torch
::
Tensor
filters
,
torch
::
Tensor
dfilters
,
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicesOut
,
int
nHot
);
void
fused_conv_cuda_minkowski
(
torch
::
Tensor
output
,
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicesOut
,
int
nHot
);
void
fused_conv_backward_cuda_minkowski
(
torch
::
Tensor
features
,
torch
::
Tensor
din
,
torch
::
Tensor
dout
,
torch
::
Tensor
filters
,
torch
::
Tensor
dfilters
,
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicesOut
,
int
nHot
);
template
<
int
Algo
>
struct
FusedConvDispatch
;
template
<
>
struct
FusedConvDispatch
<
kFSparseConvNet
>
{
constexpr
static
auto
*
fwd
=
fused_conv_cuda
;
constexpr
static
auto
*
bwd
=
fused_conv_backward_cuda
;
};
template
<
>
struct
FusedConvDispatch
<
kFMinkowskiEngine
>
{
constexpr
static
auto
*
fwd
=
fused_conv_cuda_minkowski
;
constexpr
static
auto
*
bwd
=
fused_conv_backward_cuda_minkowski
;
};
}
// namespace spconv
include/spconv/fused_spconv_ops.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef FUSED_SPARSE_CONV_OP_H_
#define FUSED_SPARSE_CONV_OP_H_
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace
spconv
{
// torch.jit's doc says only support int64, so we need to convert to int32.
torch
::
Tensor
fusedIndiceConvBatchNorm
(
torch
::
Tensor
features
,
torch
::
Tensor
filters
,
torch
::
Tensor
bias
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
int64_t
numActOut
,
int64_t
_inverse
,
int64_t
_subM
)
{
bool
subM
=
_subM
!=
0
;
bool
inverse
=
_inverse
!=
0
;
auto
device
=
features
.
device
().
type
();
auto
ndim
=
filters
.
dim
()
-
2
;
auto
kernelVolume
=
indicePairs
.
size
(
0
);
auto
numInPlanes
=
features
.
size
(
1
);
auto
numOutPlanes
=
filters
.
size
(
ndim
+
1
);
auto
indicePairNumCpu
=
indiceNum
.
to
({
torch
::
kCPU
});
auto
indicePairMaxSizeIter
=
std
::
max_element
(
indicePairNumCpu
.
data_ptr
<
int
>
(),
indicePairNumCpu
.
data_ptr
<
int
>
()
+
kernelVolume
);
int
indicePairMaxOffset
=
indicePairMaxSizeIter
-
indicePairNumCpu
.
data_ptr
<
int
>
();
int
indicePairMaxSize
=
*
indicePairMaxSizeIter
;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
indicePairNumVec.begin(), indicePairNumVec.end());
indicePairMaxSize = *indicePairVecMaxSizeIter;
}*/
auto
options
=
torch
::
TensorOptions
().
dtype
(
features
.
dtype
()).
device
(
features
.
device
());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch
::
Tensor
output
=
torch
::
zeros
({
numActOut
,
numOutPlanes
},
options
).
copy_
(
bias
);
torch
::
Tensor
inputBuffer
=
torch
::
zeros
({
indicePairMaxSize
,
numInPlanes
},
options
);
torch
::
Tensor
outputBuffer
=
torch
::
zeros
({
indicePairMaxSize
,
numOutPlanes
},
options
);
filters
=
filters
.
view
({
-
1
,
numInPlanes
,
numOutPlanes
});
if
(
subM
)
{
// the center index of subm conv don't need gather and scatter
// add.
torch
::
mm_out
(
output
,
features
,
filters
[
indicePairMaxOffset
]);
}
double
totalGatherTime
=
0
;
double
totalGEMMTime
=
0
;
double
totalSAddTime
=
0
;
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
auto
nHot
=
indicePairNumCpu
.
data_ptr
<
int
>
()[
i
];
if
(
nHot
<=
0
||
(
subM
&&
i
==
indicePairMaxOffset
))
{
continue
;
}
// auto timer = spconv::CudaContextTimer<>();
auto
outputBufferBlob
=
torch
::
from_blob
(
outputBuffer
.
data_ptr
(),
{
nHot
,
numOutPlanes
},
options
);
auto
inputBufferBlob
=
torch
::
from_blob
(
inputBuffer
.
data_ptr
(),
{
nHot
,
numInPlanes
},
options
);
if
(
device
==
torch
::
kCPU
)
{
sparse_gather_cpu
(
inputBuffer
,
features
,
indicePairs
[
i
][
inverse
],
nHot
);
}
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
sparse_gather_cuda
(
inputBuffer
,
features
,
indicePairs
[
i
][
inverse
],
nHot
);
}
#endif
else
{
TV_ASSERT_INVALID_ARG
(
false
,
"unknown device type"
);
}
// totalGatherTime += timer.report() / 1000.0;
torch
::
mm_out
(
outputBufferBlob
,
inputBufferBlob
,
filters
[
i
]);
// totalGEMMTime += timer.report() / 1000.0;
if
(
device
==
torch
::
kCPU
)
{
sparse_scatter_add_cpu
(
outputBuffer
,
output
,
indicePairs
[
i
][
!
inverse
],
nHot
);
}
#ifdef TV_CUDA
else
if
(
device
==
torch
::
kCUDA
)
{
sparse_scatter_add_cuda
(
outputBuffer
,
output
,
indicePairs
[
i
][
!
inverse
],
nHot
);
}
#endif
else
{
TV_ASSERT_INVALID_ARG
(
false
,
"unknown device type"
);
}
// totalSAddTime += timer.report() / 1000.0;
}
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return
output
;
}
}
// namespace spconv
#endif
\ No newline at end of file
include/spconv/geometry.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPCONV_GEOMETRY_H_
#define SPCONV_GEOMETRY_H_
#include <iostream>
#include <limits>
#include <tensorview/tensorview.h>
#include <tsl/robin_map.h>
#include <unordered_map>
namespace
spconv
{
namespace
detail
{
template
<
typename
T
>
struct
ToUnsigned
;
template
<
>
struct
ToUnsigned
<
int
>
{
using
type
=
uint32_t
;
};
template
<
>
struct
ToUnsigned
<
long
>
{
using
type
=
uint64_t
;
};
template
<
typename
T
>
struct
FNVInternal
;
template
<
>
struct
FNVInternal
<
uint32_t
>
{
constexpr
static
uint32_t
defaultOffsetBasis
=
0x811C9DC5
;
constexpr
static
uint32_t
prime
=
0x01000193
;
};
template
<
>
struct
FNVInternal
<
uint64_t
>
{
constexpr
static
uint64_t
defaultOffsetBasis
=
0xcbf29ce484222325
;
constexpr
static
uint64_t
prime
=
0x100000001b3
;
};
}
// namespace detail
template
<
typename
T
>
using
to_unsigned_t
=
typename
detail
::
ToUnsigned
<
std
::
remove_const_t
<
T
>>::
type
;
template
<
typename
T
>
struct
FNV1a
:
detail
::
FNVInternal
<
T
>
{
std
::
size_t
operator
()(
const
T
*
data
,
std
::
size_t
size
)
{
to_unsigned_t
<
T
>
hash
=
detail
::
FNVInternal
<
T
>::
defaultOffsetBasis
;
for
(
std
::
size_t
i
=
0
;
i
<
size
;
++
i
)
{
hash
*=
detail
::
FNVInternal
<
T
>::
prime
;
hash
^=
static_cast
<
to_unsigned_t
<
T
>>
(
data
[
i
]);
}
return
hash
;
}
};
template
<
typename
Index
,
unsigned
NDim
>
TV_HOST_DEVICE
Index
getValidOutPos
(
const
Index
*
input_pos
,
const
Index
*
kernelSize
,
const
Index
*
stride
,
const
Index
*
padding
,
const
Index
*
dilation
,
const
Index
*
outSpatialShape
,
Index
*
out
)
{
Index
lowers
[
NDim
];
Index
uppers
[
NDim
];
Index
counter
[
NDim
];
Index
counterSize
[
NDim
];
Index
pointCounter
=
0
;
Index
val
;
Index
numPoints
=
1
;
Index
m
,
offset
;
bool
valid
=
false
;
#pragma unroll
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
lowers
[
i
]
=
(
input_pos
[
i
]
-
(
kernelSize
[
i
]
-
1
)
*
dilation
[
i
]
-
1
+
stride
[
i
]
+
padding
[
i
])
/
stride
[
i
];
uppers
[
i
]
=
(
input_pos
[
i
]
+
padding
[
i
])
/
stride
[
i
];
}
#pragma unroll
for
(
unsigned
i
=
0
;
i
<
NDim
;
++
i
)
{
counterSize
[
i
]
=
((
uppers
[
i
]
-
lowers
[
i
])
/
dilation
[
i
]
+
1
);
numPoints
*=
counterSize
[
i
];
}
#pragma unroll
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
counter
[
i
]
=
0
;
}
for
(
int
i
=
0
;
i
<
numPoints
;
++
i
)
{
valid
=
true
;
m
=
1
;
offset
=
0
;
#pragma unroll
for
(
int
j
=
NDim
-
1
;
j
>=
0
;
--
j
)
{
val
=
uppers
[
j
]
-
counter
[
j
]
*
dilation
[
j
];
out
[
pointCounter
*
(
NDim
+
1
)
+
j
]
=
val
;
if
(
val
<
0
||
(
val
>
outSpatialShape
[
j
]
-
1
))
{
valid
=
false
;
// break;
}
offset
+=
m
*
(
input_pos
[
j
]
-
val
*
stride
[
j
]
+
padding
[
j
])
/
dilation
[
j
];
m
*=
kernelSize
[
j
];
}
out
[
pointCounter
*
(
NDim
+
1
)
+
NDim
]
=
offset
;
if
(
valid
)
++
pointCounter
;
counter
[
NDim
-
1
]
+=
1
;
#pragma unroll
for
(
int
c
=
NDim
-
1
;
c
>=
0
;
--
c
)
{
if
(
counter
[
c
]
==
counterSize
[
c
]
&&
c
>
0
)
{
counter
[
c
-
1
]
+=
1
;
counter
[
c
]
=
0
;
}
}
}
return
pointCounter
;
}
template
<
typename
Index
,
unsigned
NDim
>
TV_HOST_DEVICE
Index
getValidOutPosTranspose
(
const
Index
*
input_pos
,
const
Index
*
kernelSize
,
const
Index
*
stride
,
const
Index
*
padding
,
const
Index
*
dilation
,
const
Index
*
outSpatialShape
,
Index
*
out
)
{
Index
lowers
[
NDim
];
Index
uppers
[
NDim
];
Index
counter
[
NDim
];
Index
counterSize
[
NDim
];
Index
pointCounter
=
0
;
Index
val
;
Index
numPoints
=
1
;
Index
m
,
offset
;
bool
valid
=
false
;
#pragma unroll
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
lowers
[
i
]
=
input_pos
[
i
]
*
stride
[
i
]
-
padding
[
i
];
uppers
[
i
]
=
lowers
[
i
]
+
(
kernelSize
[
i
]
-
1
)
*
dilation
[
i
];
}
#pragma unroll
for
(
unsigned
i
=
0
;
i
<
NDim
;
++
i
)
{
counterSize
[
i
]
=
((
uppers
[
i
]
-
lowers
[
i
])
/
dilation
[
i
]
+
1
);
numPoints
*=
counterSize
[
i
];
}
#pragma unroll
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
counter
[
i
]
=
0
;
}
for
(
int
i
=
0
;
i
<
numPoints
;
++
i
)
{
valid
=
true
;
m
=
1
;
offset
=
0
;
#pragma unroll
for
(
int
j
=
NDim
-
1
;
j
>=
0
;
--
j
)
{
val
=
uppers
[
j
]
-
counter
[
j
]
*
dilation
[
j
];
out
[
pointCounter
*
(
NDim
+
1
)
+
j
]
=
val
;
if
(
val
<
0
||
(
val
>
outSpatialShape
[
j
]
-
1
))
{
valid
=
false
;
// break;
}
offset
+=
m
*
(
val
-
lowers
[
j
])
/
dilation
[
j
];
m
*=
kernelSize
[
j
];
}
out
[
pointCounter
*
(
NDim
+
1
)
+
NDim
]
=
offset
;
if
(
valid
)
++
pointCounter
;
counter
[
NDim
-
1
]
+=
1
;
#pragma unroll
for
(
int
c
=
NDim
-
1
;
c
>=
0
;
--
c
)
{
if
(
counter
[
c
]
==
counterSize
[
c
]
&&
c
>
0
)
{
counter
[
c
-
1
]
+=
1
;
counter
[
c
]
=
0
;
}
}
}
return
pointCounter
;
}
}
// namespace spconv
#endif
\ No newline at end of file
include/spconv/indice.cu.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef INDICE_CU_H_
#define INDICE_CU_H_
#include <cuhash/hash_table.cuh>
#include <spconv/geometry.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/tensorview.h>
namespace
spconv
{
template
<
bool
UseDeconv
,
typename
Index
,
unsigned
NDim
>
struct
ConvIndiceDispatch
;
template
<
typename
Index
,
unsigned
NDim
>
struct
ConvIndiceDispatch
<
true
,
Index
,
NDim
>
{
constexpr
static
auto
*
func
=
getValidOutPosTranspose
<
Index
,
NDim
>
;
};
template
<
typename
Index
,
unsigned
NDim
>
struct
ConvIndiceDispatch
<
false
,
Index
,
NDim
>
{
constexpr
static
auto
*
func
=
getValidOutPos
<
Index
,
NDim
>
;
};
template
<
typename
Index
,
unsigned
NDim
,
bool
UseDeconv
,
int
KernelMaxVolume
=
256
,
typename
Index1D
=
int
>
__global__
void
prepareIndicePairsKernel
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
tv
::
TensorView
<
Index1D
>
indicePairUnique
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
kernelSize
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
stride
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
padding
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
dilation
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
)
{
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
spatialVolume
=
1
;
#pragma unroll
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
spatialVolume
*=
outSpatialShape
[
i
];
}
Index
kernelVolume
=
1
;
#pragma unroll
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
kernelVolume
*=
kernelSize
[
i
];
}
Index
numValidPoints
=
0
;
Index
validPoints
[
KernelMaxVolume
*
(
NDim
+
1
)];
Index
*
pointPtr
=
nullptr
;
auto
indicePairsDim2
=
indicePairs
.
dim
(
2
);
Index
index
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
numValidPoints
=
ConvIndiceDispatch
<
UseDeconv
,
Index
,
NDim
>::
func
(
indicesIn
.
data
()
+
ix
*
(
NDim
+
1
)
+
1
,
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
outSpatialShape
.
data
(),
validPoints
);
for
(
Index
i
=
0
;
i
<
numValidPoints
;
++
i
)
{
pointPtr
=
validPoints
+
i
*
(
NDim
+
1
);
auto
offset
=
pointPtr
[
NDim
];
Index
oldNum
=
atomicAdd
(
indiceNum
.
data
()
+
offset
,
Index
(
1
));
indicePairs
(
0
,
offset
,
oldNum
)
=
ix
;
index
=
tv
::
ArrayIndexRowMajor
<
NDim
,
NDim
>::
runPtrs
(
pointPtr
,
outSpatialShape
.
data
(),
0
)
+
spatialVolume
*
indicesIn
(
ix
,
0
);
indicePairs
(
1
,
offset
,
oldNum
)
=
index
;
indicePairUnique
[
offset
*
indicePairsDim2
+
oldNum
]
=
index
;
}
}
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
__global__
void
assignGridAndIndiceOutKernel
(
tv
::
TensorView
<
Index
>
indicesOut
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
int
numAct
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indicePairUnique
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
int
batchSize
)
{
Index
index
;
auto
indicesOutPtr
=
indicesOut
.
data
();
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numAct
))
{
index
=
indicePairUnique
[
ix
];
gridsOut
[
index
]
=
ix
;
index
=
tv
::
rowArrayIdxInv
<
Index
,
NDim
>
(
index
,
indicesOutPtr
+
ix
*
(
NDim
+
1
)
+
1
,
outSpatialShape
.
data
());
indicesOut
[
ix
*
(
NDim
+
1
)]
=
index
%
batchSize
;
}
}
template
<
typename
Index
,
unsigned
NDim
,
unsigned
kNumHashFunctions
=
4
>
__global__
void
assignIndiceOutKernel
(
tv
::
TensorView
<
Index
>
indicesOut
,
int
numAct
,
tv
::
TensorView
<
Index
>
indicePairUnique
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
int
batchSize
)
{
Index
index
;
auto
indicesOutPtr
=
indicesOut
.
data
();
for
(
unsigned
ix
:
tv
::
KernelLoopX
<
unsigned
>
(
numAct
))
{
index
=
indicePairUnique
[
ix
];
index
=
tv
::
rowArrayIdxInv
<
Index
,
NDim
>
(
index
,
indicesOutPtr
+
ix
*
(
NDim
+
1
)
+
1
,
outSpatialShape
.
data
());
indicesOut
[
ix
*
(
NDim
+
1
)]
=
index
%
batchSize
;
}
}
template
<
typename
Index
,
unsigned
NDim
,
unsigned
kNumHashFunctions
=
4
>
__global__
void
assignIndicePairsHashKernel
(
tv
::
TensorView
<
Index
>
indicesOut
,
int
numActIn
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indicePairUnique
,
unsigned
table_size
,
const
cuhash
::
Entry
*
table
,
cuhash
::
Functions
<
kNumHashFunctions
>
constants
,
uint2
stash_constants
,
unsigned
stash_count
)
{
Index
index
;
int
kernelVolume
=
indicePairs
.
dim
(
1
);
auto
indicePairsOut
=
indicePairs
.
subview
(
1
);
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
index
=
indicePairsOut
(
i
,
ix
);
if
(
index
>
-
1
)
{
auto
val
=
cuhash
::
retrieve
((
unsigned
)(
index
),
table_size
,
table
,
constants
,
stash_constants
,
stash_count
);
assert
(
val
!=
cuhash
::
kNotFound
);
indicePairsOut
(
i
,
ix
)
=
(
unsigned
)
val
;
}
}
}
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
__global__
void
assignIndicePairsKernel
(
tv
::
TensorView
<
Index
>
indicesOut
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
int
numActIn
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indicePairUnique
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
)
{
Index
index
;
int
kernelVolume
=
indicePairs
.
dim
(
1
);
auto
indicePairsOut
=
indicePairs
.
subview
(
1
);
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
index
=
indicePairsOut
(
i
,
ix
);
if
(
index
>
-
1
)
{
indicePairsOut
(
i
,
ix
)
=
gridsOut
[
index
];
}
}
}
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
__global__
void
assignIndicePairsLimitedKernel
(
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
int
numActIn
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
)
{
Index
index
,
val
;
int
kernelVolume
=
indicePairs
.
dim
(
0
);
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
for
(
int
i
=
0
;
i
<
kernelVolume
;
++
i
)
{
index
=
indicePairs
(
i
,
1
,
ix
);
if
(
index
!=
-
1
)
{
val
=
gridsOut
[
index
];
if
(
val
!=
-
1
)
{
auto
oldNum
=
atomicAdd
(
indiceNum
.
data
()
+
i
,
Index
(
1
));
indicePairs
(
i
,
0
,
oldNum
)
=
indicePairs
(
i
,
0
,
ix
);
indicePairs
(
i
,
1
,
oldNum
)
=
val
;
}
}
}
}
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
__global__
void
prepareSubMGridKernel
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
Index
spatialVolume
)
{
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
index
=
0
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
index
=
tv
::
ArrayIndexRowMajor
<
NDim
,
NDim
>::
runPtrs
(
indicesIn
.
data
()
+
ix
*
(
NDim
+
1
)
+
1
,
outSpatialShape
.
data
(),
0
)
+
spatialVolume
*
indicesIn
(
ix
,
0
);
gridsOut
[
index
]
=
ix
;
}
}
template
<
typename
Index
,
unsigned
NDim
>
__global__
void
prepareSubMHashKernel
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
unsigned
*
keys
,
unsigned
*
values
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
)
{
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
spatialVolume
=
1
;
#pragma unroll
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
spatialVolume
*=
outSpatialShape
[
i
];
}
Index
index
=
0
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
index
=
tv
::
rowArrayIdx
<
Index
,
NDim
>
(
indicesIn
.
data
()
+
ix
*
(
NDim
+
1
)
+
1
,
outSpatialShape
.
data
())
+
spatialVolume
*
indicesIn
(
ix
,
0
);
keys
[
ix
]
=
index
;
values
[
ix
]
=
ix
;
}
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
,
int
KernelMaxVolume
=
256
>
__global__
void
getSubMIndicePairsKernel
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
kernelSize
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
stride
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
padding
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
dilation
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
)
{
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
spatialVolume
=
1
;
#pragma unroll
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
spatialVolume
*=
outSpatialShape
[
i
];
}
Index
numValidPoints
=
0
;
Index
validPoints
[
KernelMaxVolume
*
(
NDim
+
1
)];
Index
*
pointPtr
=
nullptr
;
Index
index
=
0
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
numValidPoints
=
getValidOutPos
<
Index
,
NDim
>
(
indicesIn
.
data
()
+
ix
*
(
NDim
+
1
)
+
1
,
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
outSpatialShape
.
data
(),
validPoints
);
for
(
int
i
=
0
;
i
<
numValidPoints
;
++
i
)
{
pointPtr
=
validPoints
+
i
*
(
NDim
+
1
);
auto
offset
=
pointPtr
[
NDim
];
index
=
tv
::
ArrayIndexRowMajor
<
NDim
,
NDim
>::
runPtrs
(
pointPtr
,
outSpatialShape
.
data
(),
0
)
+
spatialVolume
*
indicesIn
(
ix
,
0
);
if
(
gridsOut
[
index
]
>
-
1
)
{
Index
oldNum
=
atomicAdd
(
indiceNum
.
data
()
+
offset
,
Index
(
1
));
indicePairs
(
1
,
offset
,
oldNum
)
=
gridsOut
[
index
];
indicePairs
(
0
,
offset
,
oldNum
)
=
ix
;
}
}
}
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
K0
,
unsigned
K1
,
unsigned
K2
>
__global__
void
getSubMIndicePairsUnrollKernel3
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
3
>
outSpatialShape
,
Index
spatialVolume
)
{
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
point
[
3
];
Index
index
=
0
;
Index
offset
;
constexpr
unsigned
KV
=
K0
*
K1
*
K2
;
constexpr
unsigned
center
=
KV
/
2
;
*
(
indiceNum
.
data
()
+
center
)
=
numActIn
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
const
Index
*
indice_data
=
indicesIn
.
data
()
+
ix
*
(
3
+
1
);
#pragma unroll
for
(
int
i
=
0
;
i
<
K0
;
++
i
)
{
#pragma unroll
for
(
int
j
=
0
;
j
<
K1
;
++
j
)
{
#pragma unroll
for
(
int
k
=
0
;
k
<
K2
;
++
k
)
{
offset
=
i
*
K1
*
K2
+
j
*
K2
+
k
;
if
(
offset
>
center
)
{
continue
;
}
if
(
center
==
offset
)
{
// center of subm indice pairs dont need atomicadd
indicePairs
(
1
,
offset
,
ix
)
=
ix
;
indicePairs
(
0
,
offset
,
ix
)
=
ix
;
}
else
{
point
[
2
]
=
indice_data
[
3
]
-
k
+
K2
/
2
;
point
[
1
]
=
indice_data
[
2
]
-
j
+
K1
/
2
;
point
[
0
]
=
indice_data
[
1
]
-
i
+
K0
/
2
;
if
(
point
[
1
]
>=
0
&&
point
[
1
]
<
outSpatialShape
[
1
]
&&
point
[
2
]
>=
0
&&
point
[
2
]
<
outSpatialShape
[
2
]
&&
point
[
0
]
>=
0
&&
point
[
0
]
<
outSpatialShape
[
0
])
{
index
=
tv
::
ArrayIndexRowMajor
<
3
,
3
>::
runPtrs
(
point
,
outSpatialShape
.
data
(),
0
)
+
spatialVolume
*
indice_data
[
0
];
if
(
gridsOut
[
index
]
!=
-
1
)
{
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i
// - 1]
Index
oldNum
=
atomicAdd
(
indiceNum
.
data
()
+
offset
,
Index
(
1
));
atomicAdd
(
indiceNum
.
data
()
+
KV
-
offset
-
1
,
Index
(
1
));
indicePairs
(
1
,
offset
,
oldNum
)
=
gridsOut
[
index
];
indicePairs
(
0
,
offset
,
oldNum
)
=
ix
;
indicePairs
(
1
,
KV
-
offset
-
1
,
oldNum
)
=
ix
;
indicePairs
(
0
,
KV
-
offset
-
1
,
oldNum
)
=
gridsOut
[
index
];
}
}
}
}
}
}
}
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
K0
,
unsigned
K1
>
__global__
void
getSubMIndicePairsUnrollKernel2
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
2
>
outSpatialShape
,
Index
spatialVolume
)
{
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
point
[
2
];
Index
index
=
0
;
Index
offset
;
constexpr
unsigned
KV
=
K0
*
K1
;
constexpr
unsigned
center
=
KV
/
2
;
*
(
indiceNum
.
data
()
+
center
)
=
numActIn
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
const
Index
*
indice_data
=
indicesIn
.
data
()
+
ix
*
(
2
+
1
);
#pragma unroll
for
(
int
i
=
0
;
i
<
K0
;
++
i
)
{
#pragma unroll
for
(
int
j
=
0
;
j
<
K1
;
++
j
)
{
offset
=
i
*
K1
+
j
;
if
(
offset
>
center
)
{
continue
;
}
if
(
center
==
offset
)
{
// center of subm indice pairs dont need atomicadd
indicePairs
(
1
,
offset
,
ix
)
=
ix
;
indicePairs
(
0
,
offset
,
ix
)
=
ix
;
}
else
{
point
[
1
]
=
indice_data
[
2
]
-
j
+
K1
/
2
;
point
[
0
]
=
indice_data
[
1
]
-
i
+
K0
/
2
;
if
(
point
[
1
]
>=
0
&&
point
[
1
]
<
outSpatialShape
[
1
]
&&
point
[
0
]
>=
0
&&
point
[
0
]
<
outSpatialShape
[
0
])
{
index
=
tv
::
ArrayIndexRowMajor
<
2
,
2
>::
runPtrs
(
point
,
outSpatialShape
.
data
(),
0
)
+
spatialVolume
*
indice_data
[
0
];
if
(
gridsOut
[
index
]
>
-
1
)
{
Index
oldNum
=
atomicAdd
(
indiceNum
.
data
()
+
offset
,
Index
(
1
));
atomicAdd
(
indiceNum
.
data
()
+
KV
-
offset
-
1
,
Index
(
1
));
indicePairs
(
1
,
offset
,
oldNum
)
=
gridsOut
[
index
];
indicePairs
(
0
,
offset
,
oldNum
)
=
ix
;
indicePairs
(
1
,
KV
-
offset
-
1
,
oldNum
)
=
ix
;
indicePairs
(
0
,
KV
-
offset
-
1
,
oldNum
)
=
gridsOut
[
index
];
}
}
}
}
}
}
}
template
<
typename
Index
,
unsigned
NDim
,
int
KernelMaxVolume
=
256
,
unsigned
kNumHashFunctions
=
4
>
__global__
void
getSubMIndicePairsHashKernel
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
kernelSize
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
stride
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
padding
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
dilation
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
unsigned
table_size
,
const
cuhash
::
Entry
*
table
,
cuhash
::
Functions
<
kNumHashFunctions
>
constants
,
uint2
stash_constants
,
unsigned
stash_count
)
{
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
spatialVolume
=
1
;
#pragma unroll
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
spatialVolume
*=
outSpatialShape
[
i
];
}
Index
numValidPoints
=
0
;
Index
validPoints
[
KernelMaxVolume
*
(
NDim
+
1
)];
Index
*
pointPtr
=
nullptr
;
Index
index
=
0
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
numValidPoints
=
getValidOutPos
<
Index
,
NDim
>
(
indicesIn
.
data
()
+
ix
*
(
NDim
+
1
)
+
1
,
kernelSize
.
data
(),
stride
.
data
(),
padding
.
data
(),
dilation
.
data
(),
outSpatialShape
.
data
(),
validPoints
);
for
(
int
i
=
0
;
i
<
numValidPoints
;
++
i
)
{
pointPtr
=
validPoints
+
i
*
(
NDim
+
1
);
auto
offset
=
pointPtr
[
NDim
];
index
=
tv
::
ArrayIndexRowMajor
<
NDim
,
NDim
>::
runPtrs
(
pointPtr
,
outSpatialShape
.
data
(),
0
)
+
spatialVolume
*
indicesIn
(
ix
,
0
);
auto
val
=
cuhash
::
retrieve
((
unsigned
)(
index
),
table_size
,
table
,
constants
,
stash_constants
,
stash_count
);
if
(
val
!=
cuhash
::
kNotFound
)
{
Index
oldNum
=
atomicAdd
(
indiceNum
.
data
()
+
offset
,
Index
(
1
));
indicePairs
(
1
,
offset
,
oldNum
)
=
val
;
indicePairs
(
0
,
offset
,
oldNum
)
=
ix
;
}
}
}
}
template
<
typename
Index
,
unsigned
K0
,
unsigned
K1
,
unsigned
K2
,
unsigned
kNumHashFunctions
=
4
>
__global__
void
getSubMIndicePairsHashUnrollKernel3
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
3
>
outSpatialShape
,
Index
spatialVolume
,
unsigned
table_size
,
const
cuhash
::
Entry
*
table
,
cuhash
::
Functions
<
kNumHashFunctions
>
constants
,
uint2
stash_constants
,
unsigned
stash_count
)
{
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
index
=
0
;
Index
offset
;
Index
point
[
3
];
constexpr
unsigned
KV
=
K0
*
K1
*
K2
;
constexpr
unsigned
center
=
KV
/
2
;
*
(
indiceNum
.
data
()
+
center
)
=
numActIn
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
const
Index
*
indice_data
=
indicesIn
.
data
()
+
ix
*
(
3
+
1
);
#pragma unroll
for
(
int
i
=
0
;
i
<
K0
;
++
i
)
{
#pragma unroll
for
(
int
j
=
0
;
j
<
K1
;
++
j
)
{
#pragma unroll
for
(
int
k
=
0
;
k
<
K2
;
++
k
)
{
offset
=
i
*
K1
*
K2
+
j
*
K2
+
k
;
if
(
offset
>
center
)
{
continue
;
}
if
(
center
==
offset
)
{
// center of subm indice pairs dont need atomicadd
indicePairs
(
1
,
offset
,
ix
)
=
ix
;
indicePairs
(
0
,
offset
,
ix
)
=
ix
;
}
else
{
point
[
2
]
=
indice_data
[
3
]
-
k
+
K2
/
2
;
point
[
1
]
=
indice_data
[
2
]
-
j
+
K1
/
2
;
point
[
0
]
=
indice_data
[
1
]
-
i
+
K0
/
2
;
if
(
point
[
1
]
>=
0
&&
point
[
1
]
<
outSpatialShape
[
1
]
&&
point
[
2
]
>=
0
&&
point
[
2
]
<
outSpatialShape
[
2
]
&&
point
[
0
]
>=
0
&&
point
[
0
]
<
outSpatialShape
[
0
])
{
index
=
tv
::
ArrayIndexRowMajor
<
3
,
3
>::
runPtrs
(
point
,
outSpatialShape
.
data
(),
0
)
+
spatialVolume
*
indice_data
[
0
];
auto
val
=
cuhash
::
retrieve
((
unsigned
)(
index
),
table_size
,
table
,
constants
,
stash_constants
,
stash_count
);
if
(
val
!=
cuhash
::
kNotFound
)
{
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i
// - 1]
Index
oldNum
=
atomicAdd
(
indiceNum
.
data
()
+
offset
,
Index
(
1
));
atomicAdd
(
indiceNum
.
data
()
+
KV
-
offset
-
1
,
Index
(
1
));
indicePairs
(
1
,
offset
,
oldNum
)
=
val
;
indicePairs
(
0
,
offset
,
oldNum
)
=
ix
;
indicePairs
(
1
,
KV
-
offset
-
1
,
oldNum
)
=
ix
;
indicePairs
(
0
,
KV
-
offset
-
1
,
oldNum
)
=
val
;
}
}
}
}
}
}
}
}
template
<
typename
Index
,
unsigned
K0
,
unsigned
K1
,
unsigned
kNumHashFunctions
=
4
>
__global__
void
getSubMIndicePairsHashUnrollKernel2
(
tv
::
TensorView
<
const
Index
>
indicesIn
,
tv
::
TensorView
<
Index
>
indicePairs
,
tv
::
TensorView
<
Index
>
indiceNum
,
const
tv
::
SimpleVector
<
Index
,
2
>
outSpatialShape
,
Index
spatialVolume
,
unsigned
table_size
,
const
cuhash
::
Entry
*
table
,
cuhash
::
Functions
<
kNumHashFunctions
>
constants
,
uint2
stash_constants
,
unsigned
stash_count
)
{
auto
numActIn
=
indicesIn
.
dim
(
0
);
Index
index
=
0
;
Index
offset
;
Index
point
[
2
];
constexpr
unsigned
KV
=
K0
*
K1
;
constexpr
unsigned
center
=
KV
/
2
;
*
(
indiceNum
.
data
()
+
center
)
=
numActIn
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numActIn
))
{
const
Index
*
indice_data
=
indicesIn
.
data
()
+
ix
*
(
2
+
1
);
#pragma unroll
for
(
int
i
=
0
;
i
<
K0
;
++
i
)
{
#pragma unroll
for
(
int
j
=
0
;
j
<
K1
;
++
j
)
{
offset
=
i
*
K1
+
j
;
if
(
offset
>
center
)
{
continue
;
}
if
(
center
==
offset
)
{
// center of subm indice pairs dont need atomicadd
indicePairs
(
1
,
offset
,
ix
)
=
ix
;
indicePairs
(
0
,
offset
,
ix
)
=
ix
;
}
else
{
point
[
1
]
=
indice_data
[
2
]
-
j
+
K1
/
2
;
point
[
0
]
=
indice_data
[
1
]
-
i
+
K0
/
2
;
if
(
point
[
1
]
>=
0
&&
point
[
1
]
<
outSpatialShape
[
1
]
&&
point
[
0
]
>=
0
&&
point
[
0
]
<
outSpatialShape
[
0
])
{
index
=
tv
::
ArrayIndexRowMajor
<
2
,
2
>::
runPtrs
(
point
,
outSpatialShape
.
data
(),
0
)
+
spatialVolume
*
indice_data
[
0
];
auto
val
=
cuhash
::
retrieve
((
unsigned
)(
index
),
table_size
,
table
,
constants
,
stash_constants
,
stash_count
);
if
(
val
!=
cuhash
::
kNotFound
)
{
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i -
// 1]
Index
oldNum
=
atomicAdd
(
indiceNum
.
data
()
+
offset
,
Index
(
1
));
atomicAdd
(
indiceNum
.
data
()
+
KV
-
offset
-
1
,
Index
(
1
));
indicePairs
(
1
,
offset
,
oldNum
)
=
val
;
indicePairs
(
0
,
offset
,
oldNum
)
=
ix
;
indicePairs
(
1
,
KV
-
offset
-
1
,
oldNum
)
=
ix
;
indicePairs
(
0
,
KV
-
offset
-
1
,
oldNum
)
=
val
;
}
}
}
}
}
}
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
__global__
void
resetGridKernel
(
const
Index
*
indicePairUnique
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
int
numAct
)
{
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numAct
))
{
gridsOut
[
indicePairUnique
[
ix
]]
=
-
1
;
}
}
template
<
typename
T
>
__global__
void
arangeKernel
(
T
*
data
,
int
size
)
{
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
size
))
{
data
[
ix
]
=
ix
;
}
}
template
<
typename
Index
,
typename
IndexGrid
,
unsigned
NDim
>
__global__
void
resetGridSubMKernel
(
const
Index
*
indices
,
tv
::
TensorView
<
IndexGrid
>
gridsOut
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
outSpatialShape
,
int
numAct
,
Index
spatialVolume
)
{
auto
indsPtr
=
indices
;
Index
index
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numAct
))
{
indsPtr
=
indices
+
ix
*
(
NDim
+
1
);
index
=
tv
::
ArrayIndexRowMajor
<
NDim
,
NDim
>::
runPtrs
(
indsPtr
+
1
,
outSpatialShape
.
data
(),
0
);
gridsOut
[
index
+
spatialVolume
*
indsPtr
[
0
]]
=
-
1
;
}
}
}
// namespace spconv
#undef atomicAdd
#endif
\ No newline at end of file
include/spconv/indice.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
#define SPARSE_CONV_INDICE_FUNCTOR_H_
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace
spconv
{
int
create_conv_indice_pair_p1_cuda
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
torch
::
Tensor
indicePairUnique
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
);
int
create_conv_indice_pair_p2_cuda
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicesOut
,
torch
::
Tensor
gridsOut
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
torch
::
Tensor
indicePairUnique
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
);
int
create_submconv_indice_pair_cuda
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
gridsOut
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
);
int
create_conv_indice_pair_cpu
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicesOut
,
torch
::
Tensor
gridsOut
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
);
int
create_submconv_indice_pair_cpu
(
torch
::
Tensor
indicesIn
,
torch
::
Tensor
gridsOut
,
torch
::
Tensor
indicePairs
,
torch
::
Tensor
indiceNum
,
std
::
vector
<
int64_t
>
kernelSize
,
std
::
vector
<
int64_t
>
stride
,
std
::
vector
<
int64_t
>
padding
,
std
::
vector
<
int64_t
>
dilation
,
std
::
vector
<
int64_t
>
outSpatialShape
,
bool
transpose
,
bool
resetGrid
,
bool
useHash
);
}
// namespace spconv
#endif
\ No newline at end of file
include/spconv/maxpool.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
#define SPARSE_MAXPOOL_FUNCTOR_H_
#include <tensorview/mp_helper.h>
#include <tensorview/tensor.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
namespace
spconv
{
void
maxpool_bwd_cpu
(
torch
::
Tensor
outFeatures
,
torch
::
Tensor
inFeatures
,
torch
::
Tensor
dout
,
torch
::
Tensor
din
,
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicesOut
,
int
size
);
void
maxpool_fwd_cpu
(
torch
::
Tensor
outFeatures
,
torch
::
Tensor
inFeatures
,
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicesOut
,
int
size
);
void
maxpool_bwd_cuda
(
torch
::
Tensor
outFeatures
,
torch
::
Tensor
inFeatures
,
torch
::
Tensor
dout
,
torch
::
Tensor
din
,
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicesOut
,
int
size
);
void
maxpool_fwd_cuda
(
torch
::
Tensor
outFeatures
,
torch
::
Tensor
inFeatures
,
torch
::
Tensor
indicesIn
,
torch
::
Tensor
indicesOut
,
int
size
);
}
// namespace spconv
#endif
\ No newline at end of file
include/spconv/minkowski.cu.h
deleted
100644 → 0
View file @
3517290c
/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu).
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
* Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
* of the code.
*/
template
<
typename
Dtype
,
typename
Itype
,
int
BLOCK_SIZE
>
__global__
void
matmul
(
const
Dtype
*
A
,
const
int
wA
,
const
int
hA
,
const
Dtype
*
B
,
const
int
wB
,
const
int
hB
,
Dtype
*
C
,
const
Itype
*
in_map
,
const
Itype
*
out_map
)
{
// Use in_feat as A and kernel as B
// Block index
const
int
bx
=
blockIdx
.
x
;
const
int
by
=
blockIdx
.
y
;
// Thread index
const
int
tx
=
threadIdx
.
x
;
const
int
ty
=
threadIdx
.
y
;
// Coordinate. x is for rows, y is for columns.
const
int
x
=
BLOCK_SIZE
*
bx
+
tx
;
const
int
y
=
BLOCK_SIZE
*
by
+
ty
;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
Dtype
Csub
=
0
;
const
Itype
in_row
=
y
<
hA
?
in_map
[
y
]
:
0
;
const
Itype
out_row
=
y
<
hA
?
out_map
[
y
]
:
0
;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for
(
int
s
=
0
;
s
<
wA
;
s
+=
BLOCK_SIZE
)
{
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__
Dtype
As
[
BLOCK_SIZE
][
BLOCK_SIZE
];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__
Dtype
Bs
[
BLOCK_SIZE
][
BLOCK_SIZE
];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As
[
ty
][
tx
]
=
((
s
+
tx
)
<
wA
&&
y
<
hA
)
?
A
[
wA
*
in_row
+
s
+
tx
]
:
0
;
Bs
[
ty
][
tx
]
=
((
s
+
ty
)
<
hB
&&
x
<
wB
)
?
B
[
wB
*
(
s
+
ty
)
+
x
]
:
0
;
// Synchronize to make sure the matrices are loaded
__syncthreads
();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for
(
int
k
=
0
;
k
<
BLOCK_SIZE
;
++
k
)
{
Csub
+=
As
[
ty
][
k
]
*
Bs
[
k
][
tx
];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads
();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
if
(
y
<
hA
&&
x
<
wB
)
atomicAdd
(
&
C
[
wB
*
out_row
+
x
],
Csub
);
// C[wB * out_row + x] += Csub;
}
template
<
typename
Dtype
,
typename
Itype
,
int
BLOCK_SIZE
>
__global__
void
matmul2
(
const
Dtype
*
A
,
const
int
wA
,
const
int
hA
,
const
Dtype
*
B
,
const
int
wB
,
const
int
hB
,
const
Dtype
*
D
,
const
int
wD
,
const
int
hD
,
Dtype
*
C
,
Dtype
*
E
,
const
Itype
*
in_map
,
const
Itype
*
out_map
)
{
// Use grad_out_feat as A, transposed kernel weight as B, and in_feat as D
// Block index
const
int
bx
=
blockIdx
.
x
;
const
int
by
=
blockIdx
.
y
;
// Thread index
const
int
tx
=
threadIdx
.
x
;
const
int
ty
=
threadIdx
.
y
;
// Coordinate. y is for rows, x is for columns.
const
int
x
=
BLOCK_SIZE
*
bx
+
tx
;
const
int
y
=
BLOCK_SIZE
*
by
+
ty
;
const
Itype
in_row
=
y
<
hA
?
in_map
[
y
]
:
0
;
const
Itype
out_row
=
y
<
hA
?
out_map
[
y
]
:
0
;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
Dtype
Csub
=
0
;
Dtype
Esub
=
0
;
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__
Dtype
As
[
BLOCK_SIZE
][
BLOCK_SIZE
];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__
Dtype
BTs
[
BLOCK_SIZE
][
BLOCK_SIZE
];
// Declaration of the shared memory array Ds used to
// store the sub-matrix of D
__shared__
Dtype
DTs
[
BLOCK_SIZE
][
BLOCK_SIZE
];
// For Ds = D^T[...:..., ...:...], use the transposed grid dimension for A
DTs
[
ty
][
tx
]
=
(
x
<
wD
&&
y
<
hD
)
?
D
[
wD
*
in_row
+
x
]
:
0
;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for
(
int
s
=
0
;
s
<
wA
;
s
+=
BLOCK_SIZE
)
{
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As
[
ty
][
tx
]
=
((
s
+
tx
)
<
wA
&&
y
<
hA
)
?
A
[
wA
*
out_row
+
s
+
tx
]
:
0
;
// Transposed kernel
BTs
[
ty
][
tx
]
=
((
s
+
ty
)
<
wB
&&
x
<
hB
)
?
B
[
wB
*
x
+
s
+
ty
]
:
0
;
// Synchronize to make sure the matrices are loaded
__syncthreads
();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for
(
int
k
=
0
;
k
<
BLOCK_SIZE
;
++
k
)
{
Csub
+=
As
[
ty
][
k
]
*
BTs
[
k
][
tx
];
}
// For Esub, reset to 0
Esub
=
0
;
#pragma unroll
for
(
int
k
=
0
;
k
<
BLOCK_SIZE
;
++
k
)
{
Esub
+=
DTs
[
k
][
ty
]
*
As
[
k
][
tx
];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads
();
// For the E matrix which requires accmulation of multiple blocks, use
// atomic addition. This can be replaced with a more sophisticaed reduction
// algorithm.
if
((
bx
*
BLOCK_SIZE
+
ty
)
<
wD
&&
(
s
+
tx
)
<
wA
)
atomicAdd
(
&
E
[
wA
*
(
bx
*
BLOCK_SIZE
+
ty
)
+
(
s
+
tx
)],
Esub
);
}
// Write the block sub-matrix to device memory;
// each thread writes one element
if
(
y
<
hA
&&
x
<
hB
)
atomicAdd
(
&
C
[
hB
*
in_row
+
x
],
Csub
);
}
include/spconv/nms.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef NMS_CPU_H
#define NMS_CPU_H
#include <pybind11/pybind11.h>
// must include pybind11/stl.h if using containers in STL in arguments.
#include "box_iou.h"
#include "nms_gpu.h"
#include <algorithm>
#include <boost/geometry.hpp>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <vector>
namespace
spconv
{
namespace
py
=
pybind11
;
using
namespace
pybind11
::
literals
;
template
<
typename
DType
>
std
::
vector
<
int
>
non_max_suppression_cpu
(
py
::
array_t
<
DType
>
boxes
,
py
::
array_t
<
int
>
order
,
DType
thresh
,
DType
eps
=
0
)
{
auto
ndets
=
boxes
.
shape
(
0
);
auto
boxes_r
=
boxes
.
template
unchecked
<
2
>();
auto
order_r
=
order
.
template
unchecked
<
1
>();
auto
suppressed
=
zeros
<
int
>
({
int
(
ndets
)});
auto
suppressed_rw
=
suppressed
.
template
mutable_unchecked
<
1
>();
auto
area
=
zeros
<
DType
>
({
int
(
ndets
)});
auto
area_rw
=
area
.
template
mutable_unchecked
<
1
>();
// get areas
for
(
int
i
=
0
;
i
<
ndets
;
++
i
)
{
area_rw
(
i
)
=
(
boxes_r
(
i
,
2
)
-
boxes_r
(
i
,
0
)
+
eps
)
*
(
boxes_r
(
i
,
3
)
-
boxes_r
(
i
,
1
)
+
eps
);
}
std
::
vector
<
int
>
keep
;
int
i
,
j
;
DType
xx1
,
xx2
,
w
,
h
,
inter
,
ovr
;
for
(
int
_i
=
0
;
_i
<
ndets
;
++
_i
)
{
i
=
order_r
(
_i
);
if
(
suppressed_rw
(
i
)
==
1
)
continue
;
keep
.
push_back
(
i
);
for
(
int
_j
=
_i
+
1
;
_j
<
ndets
;
++
_j
)
{
j
=
order_r
(
_j
);
if
(
suppressed_rw
(
j
)
==
1
)
continue
;
xx2
=
std
::
min
(
boxes_r
(
i
,
2
),
boxes_r
(
j
,
2
));
xx1
=
std
::
max
(
boxes_r
(
i
,
0
),
boxes_r
(
j
,
0
));
w
=
xx2
-
xx1
+
eps
;
if
(
w
>
0
)
{
xx2
=
std
::
min
(
boxes_r
(
i
,
3
),
boxes_r
(
j
,
3
));
xx1
=
std
::
max
(
boxes_r
(
i
,
1
),
boxes_r
(
j
,
1
));
h
=
xx2
-
xx1
+
eps
;
if
(
h
>
0
)
{
inter
=
w
*
h
;
ovr
=
inter
/
(
area_rw
(
i
)
+
area_rw
(
j
)
-
inter
);
if
(
ovr
>=
thresh
)
suppressed_rw
(
j
)
=
1
;
}
}
}
}
return
keep
;
}
template
<
typename
DType
>
std
::
vector
<
int
>
rotate_non_max_suppression_cpu
(
py
::
array_t
<
DType
>
box_corners
,
py
::
array_t
<
int
>
order
,
py
::
array_t
<
DType
>
standup_iou
,
DType
thresh
)
{
auto
ndets
=
box_corners
.
shape
(
0
);
auto
box_corners_r
=
box_corners
.
template
unchecked
<
3
>();
auto
order_r
=
order
.
template
unchecked
<
1
>();
auto
suppressed
=
zeros
<
int
>
({
int
(
ndets
)});
auto
suppressed_rw
=
suppressed
.
template
mutable_unchecked
<
1
>();
auto
standup_iou_r
=
standup_iou
.
template
unchecked
<
2
>();
std
::
vector
<
int
>
keep
;
int
i
,
j
;
namespace
bg
=
boost
::
geometry
;
typedef
bg
::
model
::
point
<
DType
,
2
,
bg
::
cs
::
cartesian
>
point_t
;
typedef
bg
::
model
::
polygon
<
point_t
>
polygon_t
;
polygon_t
poly
,
qpoly
;
std
::
vector
<
polygon_t
>
poly_inter
,
poly_union
;
DType
inter_area
,
union_area
,
overlap
;
for
(
int
_i
=
0
;
_i
<
ndets
;
++
_i
)
{
i
=
order_r
(
_i
);
if
(
suppressed_rw
(
i
)
==
1
)
continue
;
keep
.
push_back
(
i
);
for
(
int
_j
=
_i
+
1
;
_j
<
ndets
;
++
_j
)
{
j
=
order_r
(
_j
);
if
(
suppressed_rw
(
j
)
==
1
)
continue
;
if
(
standup_iou_r
(
i
,
j
)
<=
0.0
)
continue
;
// std::cout << "pre_poly" << std::endl;
try
{
bg
::
append
(
poly
,
point_t
(
box_corners_r
(
i
,
0
,
0
),
box_corners_r
(
i
,
0
,
1
)));
bg
::
append
(
poly
,
point_t
(
box_corners_r
(
i
,
1
,
0
),
box_corners_r
(
i
,
1
,
1
)));
bg
::
append
(
poly
,
point_t
(
box_corners_r
(
i
,
2
,
0
),
box_corners_r
(
i
,
2
,
1
)));
bg
::
append
(
poly
,
point_t
(
box_corners_r
(
i
,
3
,
0
),
box_corners_r
(
i
,
3
,
1
)));
bg
::
append
(
poly
,
point_t
(
box_corners_r
(
i
,
0
,
0
),
box_corners_r
(
i
,
0
,
1
)));
bg
::
append
(
qpoly
,
point_t
(
box_corners_r
(
j
,
0
,
0
),
box_corners_r
(
j
,
0
,
1
)));
bg
::
append
(
qpoly
,
point_t
(
box_corners_r
(
j
,
1
,
0
),
box_corners_r
(
j
,
1
,
1
)));
bg
::
append
(
qpoly
,
point_t
(
box_corners_r
(
j
,
2
,
0
),
box_corners_r
(
j
,
2
,
1
)));
bg
::
append
(
qpoly
,
point_t
(
box_corners_r
(
j
,
3
,
0
),
box_corners_r
(
j
,
3
,
1
)));
bg
::
append
(
qpoly
,
point_t
(
box_corners_r
(
j
,
0
,
0
),
box_corners_r
(
j
,
0
,
1
)));
bg
::
intersection
(
poly
,
qpoly
,
poly_inter
);
}
catch
(
const
std
::
exception
&
e
)
{
std
::
cout
<<
"box i corners:"
<<
std
::
endl
;
for
(
int
k
=
0
;
k
<
4
;
++
k
)
{
std
::
cout
<<
box_corners_r
(
i
,
k
,
0
)
<<
" "
<<
box_corners_r
(
i
,
k
,
1
)
<<
std
::
endl
;
}
std
::
cout
<<
"box j corners:"
<<
std
::
endl
;
for
(
int
k
=
0
;
k
<
4
;
++
k
)
{
std
::
cout
<<
box_corners_r
(
j
,
k
,
0
)
<<
" "
<<
box_corners_r
(
j
,
k
,
1
)
<<
std
::
endl
;
}
// throw e;
continue
;
}
// std::cout << "post_poly" << std::endl;
// std::cout << "post_intsec" << std::endl;
if
(
!
poly_inter
.
empty
())
{
inter_area
=
bg
::
area
(
poly_inter
.
front
());
// std::cout << "pre_union" << " " << inter_area << std::endl;
bg
::
union_
(
poly
,
qpoly
,
poly_union
);
/*
if (poly_union.empty()){
std::cout << "intsec area:" << " " << inter_area << std::endl;
std::cout << "box i corners:" << std::endl;
for(int k = 0; k < 4; ++k){
std::cout << box_corners_r(i, k, 0) << " " << box_corners_r(i,
k, 1) << std::endl;
}
std::cout << "box j corners:" << std::endl;
for(int k = 0; k < 4; ++k){
std::cout << box_corners_r(j, k, 0) << " " << box_corners_r(j,
k, 1) << std::endl;
}
}*/
// std::cout << "post_union" << poly_union.empty() << std::endl;
if
(
!
poly_union
.
empty
())
{
// ignore invalid box
union_area
=
bg
::
area
(
poly_union
.
front
());
// std::cout << "post union area" << std::endl;
// std::cout << union_area << "debug" << std::endl;
overlap
=
inter_area
/
union_area
;
if
(
overlap
>=
thresh
)
suppressed_rw
(
j
)
=
1
;
poly_union
.
clear
();
}
}
poly
.
clear
();
qpoly
.
clear
();
poly_inter
.
clear
();
}
}
return
keep
;
}
#ifdef TV_CUDA
constexpr
int
const
threadsPerBlock
=
sizeof
(
unsigned
long
long
)
*
8
;
template
<
typename
DType
>
int
non_max_suppression
(
py
::
array_t
<
DType
>
boxes
,
py
::
array_t
<
int
>
keep_out
,
DType
nms_overlap_thresh
,
int
device_id
)
{
py
::
buffer_info
info
=
boxes
.
request
();
auto
boxes_ptr
=
static_cast
<
DType
*>
(
info
.
ptr
);
py
::
buffer_info
info_k
=
keep_out
.
request
();
auto
keep_out_ptr
=
static_cast
<
int
*>
(
info_k
.
ptr
);
return
_nms_gpu
<
DType
,
threadsPerBlock
>
(
keep_out_ptr
,
boxes_ptr
,
boxes
.
shape
(
0
),
boxes
.
shape
(
1
),
nms_overlap_thresh
,
device_id
);
}
#endif
}
// namespace spconv
#endif
include/spconv/nms_functor.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef NMS_FUNCTOR_H_
#define NMS_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace
spconv
{
namespace
functor
{
template
<
typename
Device
,
typename
T
,
typename
Index
>
struct
NonMaxSupressionFunctor
{
Index
operator
()(
const
Device
&
d
,
tv
::
TensorView
<
Index
>
keep
,
tv
::
TensorView
<
const
T
>
boxes
,
T
threshold
,
T
eps
);
};
template
<
typename
Device
,
typename
T
,
typename
Index
>
struct
rotateNonMaxSupressionFunctor
{
Index
operator
()(
const
Device
&
d
,
tv
::
TensorView
<
Index
>
keep
,
tv
::
TensorView
<
const
T
>
boxCorners
,
tv
::
TensorView
<
const
T
>
standupIoU
,
T
threshold
);
};
}
// namespace functor
}
// namespace spconv
#endif
\ No newline at end of file
include/spconv/nms_gpu.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
template
<
typename
DType
,
int
BLOCK_THREADS
>
int
_nms_gpu
(
int
*
keep_out
,
const
DType
*
boxes_host
,
int
boxes_num
,
int
boxes_dim
,
DType
nms_overlap_thresh
,
int
device_id
);
include/spconv/nms_ops.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef NMS_TORCH_OP_H_
#define NMS_TORCH_OP_H_
#include <spconv/indice.h>
#include <spconv/nms_functor.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace
spconv
{
// torch.jit's doc says only support int64, so we need to convert to int32.
template
<
typename
T
>
torch
::
Tensor
nonMaxSuppression
(
torch
::
Tensor
boxes
,
torch
::
Tensor
scores
,
int64_t
preMaxSize
,
int64_t
postMaxSize
,
double
thresh
,
double
eps
)
{
// auto timer = spconv::CudaContextTimer<>();
tv
::
check_torch_dtype
<
T
>
(
boxes
);
auto
resOptions
=
torch
::
TensorOptions
().
dtype
(
torch
::
kInt64
).
device
(
boxes
.
device
());
if
(
boxes
.
size
(
0
)
==
0
)
{
return
torch
::
zeros
({
0
},
resOptions
);
}
torch
::
Tensor
indices
;
if
(
preMaxSize
>
0
)
{
auto
numKeepedScores
=
scores
.
size
(
0
);
preMaxSize
=
std
::
min
(
numKeepedScores
,
preMaxSize
);
auto
res
=
torch
::
topk
(
scores
,
preMaxSize
);
indices
=
std
::
get
<
1
>
(
res
);
boxes
=
torch
::
index_select
(
boxes
,
0
,
indices
);
}
else
{
indices
=
std
::
get
<
1
>
(
torch
::
sort
(
scores
));
boxes
=
torch
::
index_select
(
boxes
,
0
,
indices
);
}
if
(
boxes
.
size
(
0
)
==
0
)
return
torch
::
zeros
({
0
},
resOptions
);
auto
keep
=
torch
::
zeros
({
boxes
.
size
(
0
)},
resOptions
);
int64_t
keepNum
=
0
;
if
(
boxes
.
device
().
type
()
==
torch
::
kCPU
)
{
auto
nmsFunctor
=
functor
::
NonMaxSupressionFunctor
<
tv
::
CPU
,
T
,
int64_t
>
();
keepNum
=
nmsFunctor
(
tv
::
CPU
(),
tv
::
torch2tv
<
int64_t
>
(
keep
),
tv
::
torch2tv
<
const
T
>
(
boxes
),
T
(
thresh
),
T
(
eps
));
}
else
{
TV_ASSERT_RT_ERR
(
false
,
"not implemented"
);
}
if
(
postMaxSize
<=
0
)
{
postMaxSize
=
keepNum
;
}
// std::cout << keep << std::endl;
keep
=
keep
.
slice
(
0
,
0
,
std
::
min
(
keepNum
,
postMaxSize
));
if
(
preMaxSize
>
0
)
{
return
torch
::
index_select
(
indices
,
0
,
keep
);
}
return
keep
;
}
}
// namespace spconv
#endif
\ No newline at end of file
include/spconv/pillar_scatter_functor.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef POINTPILLARS_SCATTER_FUNCTOR_H_
#define POINTPILLARS_SCATTER_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace
spconv
{
namespace
functor
{
template
<
typename
Device
,
typename
T
,
typename
Index
>
struct
PointPillarScatter
{
void
operator
()(
const
Device
&
d
,
tv
::
TensorView
<
T
>
canvas
,
tv
::
TensorView
<
const
T
>
features
,
tv
::
TensorView
<
const
T
>
coors
);
};
}
// namespace functor
}
// namespace spconv
#endif
\ No newline at end of file
include/spconv/pillar_scatter_ops.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PILLAR_SCATTER_OP_H_
#define PILLAR_SCATTER_OP_H_
#include <spconv/pillar_scatter_functor.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace
spconv
{
// torch.jit's doc says only support int64, so we need to convert to int32.
template
<
typename
T
>
torch
::
Tensor
pointPillarScatter
(
torch
::
Tensor
features
,
torch
::
Tensor
coors
,
torch
::
Tensor
shape
)
{
TV_ASSERT_RT_ERR
(
shape
.
device
().
type
()
==
torch
::
kCPU
,
"error"
);
TV_ASSERT_RT_ERR
(
features
.
device
().
type
()
==
torch
::
kCUDA
,
"error"
);
TV_ASSERT_RT_ERR
(
shape
.
dim
()
==
1
,
"error"
);
TV_ASSERT_RT_ERR
(
shape
.
size
(
0
)
==
4
,
"error"
);
TV_ASSERT_RT_ERR
(
features
.
dim
()
>=
3
,
"error"
);
TV_ASSERT_RT_ERR
(
features
.
size
(
0
)
==
1
,
"feature first dim must be 1"
);
TV_ASSERT_RT_ERR
(
coors
.
size
(
0
)
==
1
,
"coors first dim must be 1"
);
TV_ASSERT_RT_ERR
(
features
.
size
(
2
)
==
coors
.
size
(
2
),
"err"
);
tv
::
check_torch_dtype
<
int
>
(
shape
);
tv
::
check_torch_dtype
<
T
>
(
coors
);
auto
shapeData
=
shape
.
data_ptr
<
int
>
();
torch
::
Tensor
canvas
=
torch
::
zeros
({
shapeData
[
0
],
shapeData
[
1
],
shapeData
[
2
],
shapeData
[
3
]},
features
.
options
());
TV_ASSERT_RT_ERR
(
shapeData
[
1
]
==
features
.
size
(
1
),
"error"
);
#ifdef TV_CUDA
functor
::
PointPillarScatter
<
tv
::
GPU
,
T
,
int
>
ftor
;
ftor
(
tv
::
TorchGPU
(),
tv
::
torch2tv
<
T
>
(
canvas
),
tv
::
torch2tv
<
const
T
>
(
features
.
squeeze
()),
tv
::
torch2tv
<
const
T
>
(
coors
.
squeeze
()));
#endif
return
canvas
;
}
}
// namespace spconv
#endif
\ No newline at end of file
include/spconv/point2voxel.cu.h
deleted
100644 → 0
View file @
3517290c
#pragma once
#include <tensorview/kernel_utils.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace
spconv
{
template
<
typename
Index
,
unsigned
NDim
>
__global__
void
scatterPointToGridKernel
(
tv
::
TensorView
<
const
float
>
points
,
tv
::
TensorView
<
const
Index
>
indexes
,
tv
::
TensorView
<
float
>
grids
,
tv
::
TensorView
<
Index
>
numPointsPerGrid
,
tv
::
TensorView
<
Index
>
pointIndex
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
gridShape
)
{
Index
index
;
int
numPoints
=
points
.
dim
(
0
);
int
numFeatures
=
points
.
dim
(
1
);
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numPoints
))
{
index
=
tv
::
ArrayIndexRowMajor
<
NDim
,
NDim
>::
runPtrs
(
indexes
.
data
()
+
ix
*
NDim
,
gridShape
.
data
(),
0
);
pointIndex
(
ix
)
=
index
;
atomicAdd
(
numPointsPerGrid
.
data
()
+
index
,
Index
(
1
));
#pragma unroll
for
(
int
k
=
0
;
k
!=
numFeatures
;
++
k
)
{
atomicAdd
(
grids
.
data
()
+
index
*
numFeatures
+
k
,
*
(
points
.
data
()
+
ix
*
numFeatures
+
k
));
}
}
}
template
<
typename
Index
,
unsigned
NDim
>
__global__
void
gatherPointFromGridKernel
(
tv
::
TensorView
<
const
float
>
grids
,
tv
::
TensorView
<
const
Index
>
numPointsPerGrid
,
tv
::
TensorView
<
const
Index
>
pointIndexUnique
,
tv
::
TensorView
<
float
>
voxels
,
tv
::
TensorView
<
Index
>
coors
,
const
tv
::
SimpleVector
<
Index
,
NDim
>
gridShape
)
{
Index
index
;
int
numVoxels
=
voxels
.
dim
(
0
);
int
numFeatures
=
grids
.
dim
(
1
);
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numVoxels
))
{
index
=
pointIndexUnique
(
ix
);
#pragma unroll
for
(
int
k
=
0
;
k
!=
numFeatures
;
++
k
)
{
voxels
(
ix
,
k
)
=
grids
(
index
,
k
)
/
numPointsPerGrid
(
index
);
}
index
=
tv
::
rowArrayIdxInv
<
Index
,
NDim
>
(
index
,
coors
.
data
()
+
ix
*
NDim
,
gridShape
.
data
());
}
}
template
<
typename
Index
>
__global__
void
resetGridKernel
(
tv
::
TensorView
<
float
>
grids
,
tv
::
TensorView
<
Index
>
numPointsPerGrid
,
tv
::
TensorView
<
Index
>
pointIndexUnique
)
{
Index
index
;
int
numVoxels
=
pointIndexUnique
.
dim
(
0
)
-
1
;
int
numFeatures
=
grids
.
dim
(
1
);
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
numVoxels
))
{
index
=
pointIndexUnique
(
ix
);
#pragma unroll
for
(
int
k
=
0
;
k
!=
numFeatures
;
++
k
)
{
grids
(
index
,
k
)
=
0
;
numPointsPerGrid
(
index
)
=
0
;
}
}
}
template
<
typename
Index
>
__global__
void
resetPointIndexKernel
(
tv
::
TensorView
<
Index
>
pointIndex
,
const
Index
gridVolume
)
{
int
num_max_points
=
pointIndex
.
dim
(
0
)
-
1
;
for
(
int
ix
:
tv
::
KernelLoopX
<
int
>
(
num_max_points
))
{
pointIndex
(
ix
)
=
gridVolume
;
}
}
}
// namespace spconv
include/spconv/point2voxel.h
deleted
100644 → 0
View file @
3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
// must include pybind11/stl.h if using containers in STL in arguments.
#include <algorithm>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
// #include <vector>
#include <iostream>
#include <math.h>
namespace
spconv
{
namespace
py
=
pybind11
;
using
namespace
pybind11
::
literals
;
template
<
typename
DType
,
int
NDim
>
int
points_to_voxel_3d_np
(
py
::
array_t
<
DType
>
points
,
py
::
array_t
<
DType
>
voxels
,
py
::
array_t
<
DType
>
voxel_point_mask
,
py
::
array_t
<
int
>
coors
,
py
::
array_t
<
int
>
num_points_per_voxel
,
py
::
array_t
<
int
>
coor_to_voxelidx
,
std
::
vector
<
DType
>
voxel_size
,
std
::
vector
<
DType
>
coors_range
,
int
max_points
,
int
max_voxels
)
{
auto
points_rw
=
points
.
template
mutable_unchecked
<
2
>();
auto
voxels_rw
=
voxels
.
template
mutable_unchecked
<
3
>();
auto
voxel_point_mask_rw
=
voxel_point_mask
.
template
mutable_unchecked
<
2
>();
auto
coors_rw
=
coors
.
mutable_unchecked
<
2
>
();
auto
num_points_per_voxel_rw
=
num_points_per_voxel
.
mutable_unchecked
<
1
>
();
auto
coor_to_voxelidx_rw
=
coor_to_voxelidx
.
mutable_unchecked
<
NDim
>
();
auto
N
=
points_rw
.
shape
(
0
);
auto
num_features
=
points_rw
.
shape
(
1
);
// auto ndim = points_rw.shape(1) - 1;
constexpr
int
ndim_minus_1
=
NDim
-
1
;
int
voxel_num
=
0
;
bool
failed
=
false
;
int
coor
[
NDim
];
int
c
;
int
grid_size
[
NDim
];
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
grid_size
[
i
]
=
round
((
coors_range
[
NDim
+
i
]
-
coors_range
[
i
])
/
voxel_size
[
i
]);
}
int
voxelidx
,
num
;
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
failed
=
false
;
for
(
int
j
=
0
;
j
<
NDim
;
++
j
)
{
c
=
floor
((
points_rw
(
i
,
j
)
-
coors_range
[
j
])
/
voxel_size
[
j
]);
if
((
c
<
0
||
c
>=
grid_size
[
j
]))
{
failed
=
true
;
break
;
}
coor
[
ndim_minus_1
-
j
]
=
c
;
}
if
(
failed
)
continue
;
voxelidx
=
coor_to_voxelidx_rw
(
coor
[
0
],
coor
[
1
],
coor
[
2
]);
if
(
voxelidx
==
-
1
)
{
voxelidx
=
voxel_num
;
if
(
voxel_num
>=
max_voxels
)
continue
;
voxel_num
+=
1
;
coor_to_voxelidx_rw
(
coor
[
0
],
coor
[
1
],
coor
[
2
])
=
voxelidx
;
for
(
int
k
=
0
;
k
<
NDim
;
++
k
)
{
coors_rw
(
voxelidx
,
k
)
=
coor
[
k
];
}
}
num
=
num_points_per_voxel_rw
(
voxelidx
);
if
(
num
<
max_points
)
{
voxel_point_mask_rw
(
voxelidx
,
num
)
=
DType
(
1
);
for
(
int
k
=
0
;
k
<
num_features
;
++
k
)
{
voxels_rw
(
voxelidx
,
num
,
k
)
=
points_rw
(
i
,
k
);
}
num_points_per_voxel_rw
(
voxelidx
)
+=
1
;
}
}
for
(
int
i
=
0
;
i
<
voxel_num
;
++
i
)
{
coor_to_voxelidx_rw
(
coors_rw
(
i
,
0
),
coors_rw
(
i
,
1
),
coors_rw
(
i
,
2
))
=
-
1
;
}
return
voxel_num
;
}
template
<
typename
DType
,
int
NDim
>
int
points_to_voxel_3d_np_mean
(
py
::
array_t
<
DType
>
points
,
py
::
array_t
<
DType
>
voxel_point_mask
,
py
::
array_t
<
DType
>
voxels
,
py
::
array_t
<
DType
>
means
,
py
::
array_t
<
int
>
coors
,
py
::
array_t
<
int
>
num_points_per_voxel
,
py
::
array_t
<
int
>
coor_to_voxelidx
,
std
::
vector
<
DType
>
voxel_size
,
std
::
vector
<
DType
>
coors_range
,
int
max_points
,
int
max_voxels
)
{
auto
points_rw
=
points
.
template
mutable_unchecked
<
2
>();
auto
means_rw
=
means
.
template
mutable_unchecked
<
2
>();
auto
voxels_rw
=
voxels
.
template
mutable_unchecked
<
3
>();
auto
voxel_point_mask_rw
=
voxel_point_mask
.
template
mutable_unchecked
<
2
>();
auto
coors_rw
=
coors
.
mutable_unchecked
<
2
>
();
auto
num_points_per_voxel_rw
=
num_points_per_voxel
.
mutable_unchecked
<
1
>
();
auto
coor_to_voxelidx_rw
=
coor_to_voxelidx
.
mutable_unchecked
<
NDim
>
();
auto
N
=
points_rw
.
shape
(
0
);
auto
num_features
=
points_rw
.
shape
(
1
);
// auto ndim = points_rw.shape(1) - 1;
constexpr
int
ndim_minus_1
=
NDim
-
1
;
int
voxel_num
=
0
;
bool
failed
=
false
;
int
coor
[
NDim
];
int
c
;
int
grid_size
[
NDim
];
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
grid_size
[
i
]
=
round
((
coors_range
[
NDim
+
i
]
-
coors_range
[
i
])
/
voxel_size
[
i
]);
}
int
voxelidx
,
num
;
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
failed
=
false
;
for
(
int
j
=
0
;
j
<
NDim
;
++
j
)
{
c
=
floor
((
points_rw
(
i
,
j
)
-
coors_range
[
j
])
/
voxel_size
[
j
]);
if
((
c
<
0
||
c
>=
grid_size
[
j
]))
{
failed
=
true
;
break
;
}
coor
[
ndim_minus_1
-
j
]
=
c
;
}
if
(
failed
)
continue
;
voxelidx
=
coor_to_voxelidx_rw
(
coor
[
0
],
coor
[
1
],
coor
[
2
]);
if
(
voxelidx
==
-
1
)
{
voxelidx
=
voxel_num
;
if
(
voxel_num
>=
max_voxels
)
continue
;
voxel_num
+=
1
;
coor_to_voxelidx_rw
(
coor
[
0
],
coor
[
1
],
coor
[
2
])
=
voxelidx
;
for
(
int
k
=
0
;
k
<
NDim
;
++
k
)
{
coors_rw
(
voxelidx
,
k
)
=
coor
[
k
];
}
}
num
=
num_points_per_voxel_rw
(
voxelidx
);
if
(
num
<
max_points
)
{
voxel_point_mask_rw
(
voxelidx
,
num
)
=
DType
(
1
);
for
(
int
k
=
0
;
k
<
num_features
;
++
k
)
{
voxels_rw
(
voxelidx
,
num
,
k
)
=
points_rw
(
i
,
k
);
}
num_points_per_voxel_rw
(
voxelidx
)
+=
1
;
for
(
int
k
=
0
;
k
<
num_features
;
++
k
)
{
means_rw
(
voxelidx
,
k
)
+=
(
points_rw
(
i
,
k
)
-
means_rw
(
voxelidx
,
k
))
/
DType
(
num
+
1
);
}
}
}
for
(
int
i
=
0
;
i
<
voxel_num
;
++
i
)
{
coor_to_voxelidx_rw
(
coors_rw
(
i
,
0
),
coors_rw
(
i
,
1
),
coors_rw
(
i
,
2
))
=
-
1
;
num
=
num_points_per_voxel_rw
(
i
);
for
(
int
j
=
num
;
j
<
max_points
;
++
j
)
{
for
(
int
k
=
0
;
k
<
num_features
;
++
k
)
{
voxels_rw
(
i
,
j
,
k
)
=
means_rw
(
i
,
k
);
}
}
}
return
voxel_num
;
}
template
<
typename
DType
,
int
NDim
>
int
points_to_voxel_3d_with_filtering
(
py
::
array_t
<
DType
>
points
,
py
::
array_t
<
DType
>
voxels
,
py
::
array_t
<
DType
>
voxel_point_mask
,
py
::
array_t
<
int
>
voxel_mask
,
py
::
array_t
<
DType
>
mins
,
py
::
array_t
<
DType
>
maxs
,
py
::
array_t
<
int
>
coors
,
py
::
array_t
<
int
>
num_points_per_voxel
,
py
::
array_t
<
int
>
coor_to_voxelidx
,
std
::
vector
<
DType
>
voxel_size
,
std
::
vector
<
DType
>
coors_range
,
int
max_points
,
int
max_voxels
,
int
block_factor
,
int
block_size
,
DType
height_threshold
,
DType
height_high_threshold
)
{
auto
points_rw
=
points
.
template
mutable_unchecked
<
2
>();
auto
mins_rw
=
mins
.
template
mutable_unchecked
<
2
>();
auto
maxs_rw
=
maxs
.
template
mutable_unchecked
<
2
>();
auto
voxels_rw
=
voxels
.
template
mutable_unchecked
<
3
>();
auto
voxel_point_mask_rw
=
voxel_point_mask
.
template
mutable_unchecked
<
2
>();
auto
voxel_mask_rw
=
voxel_mask
.
template
mutable_unchecked
<
1
>();
auto
coors_rw
=
coors
.
mutable_unchecked
<
2
>
();
auto
num_points_per_voxel_rw
=
num_points_per_voxel
.
mutable_unchecked
<
1
>
();
auto
coor_to_voxelidx_rw
=
coor_to_voxelidx
.
mutable_unchecked
<
NDim
>
();
auto
N
=
points_rw
.
shape
(
0
);
auto
num_features
=
points_rw
.
shape
(
1
);
// auto ndim = points_rw.shape(1) - 1;
constexpr
int
ndim_minus_1
=
NDim
-
1
;
int
voxel_num
=
0
;
bool
failed
=
false
;
int
coor
[
NDim
];
int
c
;
int
grid_size
[
NDim
];
DType
max_value
,
min_value
;
for
(
int
i
=
0
;
i
<
NDim
;
++
i
)
{
grid_size
[
i
]
=
round
((
coors_range
[
NDim
+
i
]
-
coors_range
[
i
])
/
voxel_size
[
i
]);
}
int
block_shape_H
=
grid_size
[
1
]
/
block_factor
;
int
block_shape_W
=
grid_size
[
0
]
/
block_factor
;
int
voxelidx
,
num
;
int
block_coor
[
2
];
int
startx
,
stopx
,
starty
,
stopy
;
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
failed
=
false
;
for
(
int
j
=
0
;
j
<
NDim
;
++
j
)
{
c
=
floor
((
points_rw
(
i
,
j
)
-
coors_range
[
j
])
/
voxel_size
[
j
]);
if
((
c
<
0
||
c
>=
grid_size
[
j
]))
{
failed
=
true
;
break
;
}
coor
[
ndim_minus_1
-
j
]
=
c
;
}
if
(
failed
)
continue
;
voxelidx
=
coor_to_voxelidx_rw
(
coor
[
0
],
coor
[
1
],
coor
[
2
]);
if
(
voxelidx
==
-
1
)
{
voxelidx
=
voxel_num
;
if
(
voxel_num
>=
max_voxels
)
continue
;
voxel_num
+=
1
;
coor_to_voxelidx_rw
(
coor
[
0
],
coor
[
1
],
coor
[
2
])
=
voxelidx
;
for
(
int
k
=
0
;
k
<
NDim
;
++
k
)
{
coors_rw
(
voxelidx
,
k
)
=
coor
[
k
];
}
}
num
=
num_points_per_voxel_rw
(
voxelidx
);
if
(
num
<
max_points
)
{
voxel_point_mask_rw
(
voxelidx
,
num
)
=
DType
(
1
);
for
(
int
k
=
0
;
k
<
num_features
;
++
k
)
{
voxels_rw
(
voxelidx
,
num
,
k
)
=
points_rw
(
i
,
k
);
}
block_coor
[
0
]
=
coor
[
1
]
/
block_factor
;
block_coor
[
1
]
=
coor
[
2
]
/
block_factor
;
mins_rw
(
block_coor
[
0
],
block_coor
[
1
])
=
std
::
min
(
points_rw
(
i
,
2
),
mins_rw
(
block_coor
[
0
],
block_coor
[
1
]));
maxs_rw
(
block_coor
[
0
],
block_coor
[
1
])
=
std
::
max
(
points_rw
(
i
,
2
),
maxs_rw
(
block_coor
[
0
],
block_coor
[
1
]));
num_points_per_voxel_rw
(
voxelidx
)
+=
1
;
}
}
for
(
int
i
=
0
;
i
<
voxel_num
;
++
i
)
{
coor
[
1
]
=
coors_rw
(
i
,
1
);
coor
[
2
]
=
coors_rw
(
i
,
2
);
coor_to_voxelidx_rw
(
coors_rw
(
i
,
0
),
coor
[
1
],
coor
[
2
])
=
-
1
;
block_coor
[
0
]
=
coor
[
1
]
/
block_factor
;
block_coor
[
1
]
=
coor
[
2
]
/
block_factor
;
min_value
=
mins_rw
(
block_coor
[
0
],
block_coor
[
1
]);
max_value
=
maxs_rw
(
block_coor
[
0
],
block_coor
[
1
]);
startx
=
std
::
max
(
0
,
block_coor
[
0
]
-
block_size
/
2
);
stopx
=
std
::
min
(
block_shape_H
,
block_coor
[
0
]
+
block_size
-
block_size
/
2
);
starty
=
std
::
max
(
0
,
block_coor
[
1
]
-
block_size
/
2
);
stopy
=
std
::
min
(
block_shape_W
,
block_coor
[
1
]
+
block_size
-
block_size
/
2
);
for
(
int
j
=
startx
;
j
<
stopx
;
++
j
)
{
for
(
int
k
=
starty
;
k
<
stopy
;
++
k
)
{
min_value
=
std
::
min
(
min_value
,
mins_rw
(
j
,
k
));
max_value
=
std
::
max
(
max_value
,
maxs_rw
(
j
,
k
));
}
}
voxel_mask_rw
(
i
)
=
((
max_value
-
min_value
)
>
height_threshold
)
&&
((
max_value
-
min_value
)
<
height_high_threshold
);
}
return
voxel_num
;
}
}
// namespace spconv
Prev
1
2
3
4
5
6
…
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment