OpenDAS / mmdetection3d · Commits

Commit 333536f6 (Unverified)
Authored Apr 06, 2022 by Wenwei Zhang; committed by GitHub on Apr 06, 2022.

Release v1.0.0rc1

Parents: 9c7270d0, f747daab
Changes: 219
Showing 20 changed files, with 0 additions and 3659 deletions (+0 / -3659):

- mmdet3d/ops/spconv/include/tensorview/tensorview.h (+0 / -1163)
- mmdet3d/ops/spconv/include/torch_utils.h (+0 / -71)
- mmdet3d/ops/spconv/include/utility/timer.h (+0 / -54)
- mmdet3d/ops/spconv/modules.py (+0 / -203)
- mmdet3d/ops/spconv/ops.py (+0 / -183)
- mmdet3d/ops/spconv/pool.py (+0 / -85)
- mmdet3d/ops/spconv/src/all.cc (+0 / -51)
- mmdet3d/ops/spconv/src/indice.cc (+0 / -85)
- mmdet3d/ops/spconv/src/indice_cuda.cu (+0 / -158)
- mmdet3d/ops/spconv/src/maxpool.cc (+0 / -82)
- mmdet3d/ops/spconv/src/maxpool_cuda.cu (+0 / -478)
- mmdet3d/ops/spconv/src/reordering.cc (+0 / -69)
- mmdet3d/ops/spconv/src/reordering_cuda.cu (+0 / -155)
- mmdet3d/ops/spconv/structure.py (+0 / -70)
- mmdet3d/ops/spconv/test_utils.py (+0 / -194)
- mmdet3d/ops/voxel/__init__.py (+0 / -5)
- mmdet3d/ops/voxel/scatter_points.py (+0 / -108)
- mmdet3d/ops/voxel/src/scatter_points_cpu.cpp (+0 / -122)
- mmdet3d/ops/voxel/src/scatter_points_cuda.cu (+0 / -310)
- mmdet3d/ops/voxel/src/voxelization.cpp (+0 / -13)
mmdet3d/ops/spconv/include/tensorview/tensorview.h (deleted, mode 100644 → 0)
```cpp
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <cuda_runtime_api.h>

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <memory>
// #include <prettyprint.h>
#include <sstream>
#include <type_traits>
#include <vector>

namespace tv {

#ifdef __NVCC__
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#define TV_ASSERT(expr) assert(expr)
#elif defined(__CUDACC_RTC__)
#define TV_ASSERT(expr) assert(expr)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#else
#define TV_ASSERT(x) assert(x)
#define TV_HOST_DEVICE_INLINE inline
#define TV_HOST_DEVICE
#endif

#define TV_REQUIRE(expr, ...) \
  {                           \
    if (!(expr)) {            \
      printf(__VA_ARGS__);    \
      assert(expr);           \
    }                         \
  }

#define TV_DEVICE_REQUIRE(expr, ...)                      \
  {                                                       \
    if (!(expr) && threadIdx.x == 0) printf(__VA_ARGS__); \
    assert(expr);                                         \
  }

template <class SStream, class T>
void sstream_print(SStream &ss, T val) {
  ss << val;
}

template <class SStream, class T, class... TArgs>
void sstream_print(SStream &ss, T val, TArgs... args) {
  ss << val << " ";
  sstream_print(ss, args...);
}

#define TV_ASSERT_RT_ERR(expr, ...)                      \
  {                                                      \
    if (!(expr)) {                                       \
      std::stringstream __macro_s;                       \
      __macro_s << __FILE__ << " " << __LINE__ << "\n";  \
      __macro_s << #expr << " assert failed. ";          \
      tv::sstream_print(__macro_s, __VA_ARGS__);         \
      throw std::runtime_error(__macro_s.str());         \
    }                                                    \
  }

#define TV_ASSERT_INVALID_ARG(expr, ...)                 \
  {                                                      \
    if (!(expr)) {                                       \
      std::stringstream __macro_s;                       \
      __macro_s << __FILE__ << " " << __LINE__ << "\n";  \
      __macro_s << #expr << " assert failed. ";          \
      tv::sstream_print(__macro_s, __VA_ARGS__);         \
      throw std::invalid_argument(__macro_s.str());      \
    }                                                    \
  }

#define TV_CHECK_CUDA_ERR()                                    \
  {                                                            \
    auto err = cudaGetLastError();                             \
    if (err != cudaSuccess) {                                  \
      std::stringstream __macro_s;                             \
      __macro_s << __FILE__ << " " << __LINE__ << "\n";        \
      __macro_s << "cuda execution failed with error " << err; \
      throw std::runtime_error(__macro_s.str());               \
    }                                                          \
  }

struct GPU {
  GPU(cudaStream_t s = 0) : mStream(s) {}
  virtual cudaStream_t getStream() const { return mStream; }
  cudaStream_t mStream = 0;
};

struct CPU {};

#define TV_MAX_DIM 6

/*
template <typename T>
constexpr size_t calc_align(size_t ndim)
{
  if (ndim * sizeof(T) == 1)
    return 1;
  else if (ndim * sizeof(T) == 2)
    return 2;
  else if (ndim * sizeof(T) <= 4 && ndim * sizeof(T) > 2)
    return 4;
  else if (ndim * sizeof(T) <= 8 && ndim * sizeof(T) > 4)
    return 8;
  else if (ndim * sizeof(T) <= 16 && ndim * sizeof(T) > 8)
    return 16;
  else if (ndim * sizeof(T) <= 32 && ndim * sizeof(T) > 16)
    return 32;
  else
    return 64;
}
*/

template <typename T, size_t MaxDim = TV_MAX_DIM>
struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector {
 public:
  TV_HOST_DEVICE_INLINE SimpleVector(){};
  TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<T> q) {
    TV_ASSERT(q.size() <= MaxDim);
    mSize = 0;
    for (T s : q) {
      mArray[mSize++] = s;
    }
    mSize = q.size();
  }
  SimpleVector(const std::vector<T> &arr) {
    TV_ASSERT(arr.size() <= MaxDim);
    for (size_t i = 0; i < arr.size(); ++i) {
      mArray[i] = arr[i];
    }
    mSize = arr.size();
  }
  TV_HOST_DEVICE_INLINE SimpleVector(const SimpleVector<T, MaxDim> &arr) {
    TV_ASSERT(arr.size() <= MaxDim);
    for (size_t i = 0; i < arr.size(); ++i) {
      mArray[i] = arr[i];
    }
    mSize = arr.size();
  }
  TV_HOST_DEVICE_INLINE T &operator[](int idx) {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < mSize);
#endif
    return mArray[idx];
  }
  TV_HOST_DEVICE_INLINE const T &operator[](int idx) const {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < mSize);
#endif
    return mArray[idx];
  }
  TV_HOST_DEVICE_INLINE void push_back(T s) {
#ifdef TV_DEBUG
    TV_ASSERT(mSize < MaxDim);
#endif
    mArray[mSize] = s;
    mSize++;
  }
  TV_HOST_DEVICE_INLINE void pop_back() {
#ifdef TV_DEBUG
    TV_ASSERT(mSize > 0);
#endif
    mSize--;
  }
  TV_HOST_DEVICE_INLINE size_t size() const { return mSize; }
  TV_HOST_DEVICE_INLINE const T *data() const { return mArray; }
  TV_HOST_DEVICE_INLINE size_t empty() const { return mSize == 0; }

  typedef size_t size_type;

  class iterator {
   public:
    typedef iterator self_type;
    typedef T value_type;
    typedef T &reference;
    typedef T *pointer;
    typedef std::forward_iterator_tag iterator_category;
    typedef std::ptrdiff_t difference_type;
    TV_HOST_DEVICE_INLINE iterator(pointer ptr) : ptr_(ptr) {}
    TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
      self_type i = *this;
      ptr_++;
      return i;
    }
    TV_HOST_DEVICE_INLINE self_type operator++() {
      ptr_++;
      return *this;
    }
    TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
    TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
      return ptr_ == rhs.ptr_;
    }
    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
      return ptr_ != rhs.ptr_;
    }

   private:
    pointer ptr_;
  };

  class const_iterator {
   public:
    typedef const_iterator self_type;
    typedef T value_type;
    typedef const T &reference;
    typedef const T *pointer;
    typedef std::ptrdiff_t difference_type;
    typedef std::forward_iterator_tag iterator_category;
    TV_HOST_DEVICE_INLINE const_iterator(pointer ptr) : ptr_(ptr) {}
    TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
      self_type i = *this;
      ptr_++;
      return i;
    }
    TV_HOST_DEVICE_INLINE self_type operator++() {
      ptr_++;
      return *this;
    }
    TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
    TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
    TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
      return ptr_ == rhs.ptr_;
    }
    TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
      return ptr_ != rhs.ptr_;
    }

   private:
    pointer ptr_;
  };

  TV_HOST_DEVICE_INLINE iterator begin() { return iterator(mArray); }
  TV_HOST_DEVICE_INLINE iterator end() { return iterator(mArray + mSize); }
  TV_HOST_DEVICE_INLINE const_iterator begin() const {
    return const_iterator(mArray);
  }
  TV_HOST_DEVICE_INLINE const_iterator end() const {
    return const_iterator(mArray + mSize);
  }
  TV_HOST_DEVICE_INLINE const_iterator cbegin() const {
    return const_iterator(mArray);
  }
  TV_HOST_DEVICE_INLINE const_iterator cend() const {
    return const_iterator(mArray + mSize);
  }

 protected:
  T mArray[MaxDim];
  size_t mSize = 0;
};

template <typename T, size_t MaxDim>
bool operator==(const SimpleVector<T, MaxDim> &lfs,
                const SimpleVector<T, MaxDim> &rfs) {
  if (lfs.size() != rfs.size()) return false;
  for (size_t i = 0; i < lfs.size(); ++i) {
    if (lfs[i] != rfs[i]) return false;
  }
  return true;
}

template <typename T, size_t MaxDim>
bool operator!=(const SimpleVector<T, MaxDim> &lfs,
                const SimpleVector<T, MaxDim> &rfs) {
  return !(lfs == rfs);
}

struct Slice {
  template <class... Integers>
  TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
    static_assert(sizeof...(ints) <= 3, "slice init must smaller than 3");
    SimpleVector<int, 3> slices{int(ints)...};
    mSlices[0] = -1;
    mSlices[1] = -1;
    mSlices[2] = -1;
    for (size_t i = 0; i < slices.size(); ++i) {
      mSlices[i] = slices[i];
    }
  }
  TV_HOST_DEVICE_INLINE Slice() {
    mSlices[0] = -1;
    mSlices[1] = -1;
    mSlices[2] = -1;
  }
  template <typename T>
  TV_HOST_DEVICE_INLINE Slice(std::initializer_list<T> slice) {
    mSlices[0] = -1;
    mSlices[1] = -1;
    mSlices[2] = -1;
    TV_ASSERT(slice.size() <= 3);
    int idx = 0;
    for (T s : slice) {
      mSlices[idx] = int(s);
      ++idx;
    }
  }
  TV_HOST_DEVICE_INLINE int &operator[](int idx) {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < 3);
#endif
    return mSlices[idx];
  }
  TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < 3);
#endif
    return mSlices[idx];
  }

 protected:
  int mSlices[3];
};

template <size_t MaxDim = TV_MAX_DIM>
struct ShapeBase : public SimpleVector<int, MaxDim> {
  TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>(){};
  TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
      : SimpleVector<int, MaxDim>(shape) {}
  template <typename T, template <class...> class Container>
  ShapeBase(Container<T> shape) : SimpleVector<int, MaxDim>(shape) {}
  TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
      : SimpleVector<int, MaxDim>(shape) {}
  ShapeBase(const std::vector<int> &arr) : SimpleVector<int, MaxDim>(arr) {}
  ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
#ifdef TV_DEBUG
    TV_ASSERT(start >= 0 && end < this->mSize && end > start);
#endif
    ShapeBase<MaxDim> shape;
    for (int i = start; i < end; ++i) {
      shape.push_back(this->mArray[i]);
    }
    return shape;
  }
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start) const {
#ifdef TV_DEBUG
    TV_ASSERT(start >= 0 && start <= this->mSize);
#endif
    ShapeBase<MaxDim> shape;
    for (int i = start; i < this->mSize; ++i) {
      shape.push_back(this->mArray[i]);
    }
    return shape;
  }
  TV_HOST_DEVICE_INLINE size_t size() const {
    if (this->mSize == 0) return 0;
    size_t s = 1;
    for (int i = 0; i < int(this->mSize); ++i) {
      s *= this->mArray[i];
    }
    return s;
  }
  TV_HOST_DEVICE_INLINE size_t ndim() const { return this->mSize; }
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
    ShapeBase<MaxDim> shape;
    for (int i = 0; i < this->mSize; ++i) {
      if (this->mArray[i] != 1) shape.push_back(this->mArray[i]);
    }
    return shape;
  }
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
    ShapeBase<MaxDim> shape;
    for (int i = 0; i < this->mSize; ++i) {
      if (i != dim || this->mArray[i] != 1) shape.push_back(this->mArray[i]);
    }
    return shape;
  }
};

using Shape = ShapeBase<TV_MAX_DIM>;

template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
                                           Inds... indexes) {
  unsigned offset = 0;
  unsigned m = 1;
  int indexes_vec[sizeof...(indexes)] = {indexes...};
#ifdef TV_DEBUG
  TV_ASSERT(sizeof...(indexes) == shape.size());
#endif
#pragma unroll
  for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
    offset += m * indexes_vec[i];
    m *= shape[i];
  }
  return offset;
}

TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
                                           std::vector<int> &indexes_vec) {
  unsigned offset = 0;
  unsigned m = 1;
  for (int i = shape.size() - 1; i >= 0; --i) {
    offset += m * indexes_vec[i];
    m *= shape[i];
  }
  return offset;
}

template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
                                           Inds... indexes) {
  unsigned offset = 0;
  unsigned m = 1;
  int indexes_vec[sizeof...(indexes)] = {indexes...};
#pragma unroll
  for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
    offset += m * indexes_vec[i];
    m *= shape[i];
  }
  return offset;
}

TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
                                           const Shape &indexes_vec) {
  unsigned offset = 0;
  unsigned m = 1;
  for (int i = indexes_vec.ndim() - 1; i >= 0; --i) {
    offset += m * indexes_vec[i];
    m *= shape[i];
  }
  return offset;
}

template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
                                           const Index *shape) {
  unsigned offset = 0;
  unsigned m = 1;
#pragma unroll
  for (int i = NDim - 1; i >= 0; --i) {
    offset += m * indexes[i];
    m *= shape[i];
  }
  return offset;
}

template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
                                           const Index *shape) {
#pragma unroll
  for (int i = NDim - 1; i >= 0; --i) {
    output[i] = index % shape[i];
    index -= output[i];
    index /= shape[i];
  }
  return index;
}

template <int N>
struct ArrayIndexRowMajor {
  // mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
                                            const Shape &indexes) {
    return indexes[N - 1] +
           shape[N - 1] * ArrayIndexRowMajor<N - 1>::run(shape, indexes);
  }
};

template <>
struct ArrayIndexRowMajor<0> {
  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
                                            const Shape &indexes) {
    return 0;
  }
};

namespace detail {
template <typename T>
constexpr const char *simpleTypeName(T val = T());
template <>
constexpr const char *simpleTypeName(float val) {
  return "float32";
}
template <>
constexpr const char *simpleTypeName(double val) {
  return "float64";
}
template <>
constexpr const char *simpleTypeName(int val) {
  return "int32";
}
template <>
constexpr const char *simpleTypeName(unsigned val) {
  return "uint32";
}
template <>
constexpr const char *simpleTypeName(long val) {
  return "int64";
}
template <>
constexpr const char *simpleTypeName(unsigned long val) {
  return "uint64";
}
};  // namespace detail

template <typename T, int Rank = -1>
struct TensorView {
  TV_HOST_DEVICE_INLINE TensorView() {}
  explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Shape shape)
      : mPtr(ptr), mShape(shape) {}
  // explicit TV_HOST_DEVICE_INLINE TensorView(const
  // TensorView<std::remove_const_t<T>> &tview) : mPtr(tview.data()),
  // mShape(tview.shape()) {}
  template <class... Integers>
  explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Integers... shapes)
      : mPtr(ptr) {
    mShape = {int(shapes)...};
  }
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &assign(
      const TensorView<T, Rank> &tensor) {
    TV_REQUIRE(tensor.shape() == shape(), "you must provide same input size%s",
               "\n");
    T *ptr = mPtr;
    const T *other_ptr = tensor.data();
    for (size_t i = 0; i < size(); ++i) *(ptr++) = *(other_ptr++);
    return *this;
  }
  template <typename T1>
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &assign(
      std::initializer_list<T1> seq) {
    TV_REQUIRE(seq.size() == size(), "you must provide same input size%s",
               "\n");
    T *ptr = mPtr;
    for (const T1 &s : seq) *(ptr++) = T(s);
    return *this;
  }
  template <class... Inds>
  TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
#ifdef TV_DEBUG
    int idxes[sizeof...(Inds)]{int(inds)...};
    TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
               "you provide %d indexes, but dim is %d\n", sizeof...(inds),
               mShape.ndim());
    for (int i = 0; i < sizeof...(inds); ++i) {
      TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
                 "index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
                 mShape[i]);
    }
#endif
    return mPtr[rowArrayIdx(mShape, int(inds)...)];
  }
  template <class... Inds>
  TV_HOST_DEVICE_INLINE const T &operator()(Inds... inds) const {
#ifdef TV_DEBUG
    int idxes[sizeof...(Inds)]{int(inds)...};
    TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
               "you provide %d indexes, but dim is %d\n", sizeof...(inds),
               mShape.ndim());
    for (int i = 0; i < sizeof...(inds); ++i) {
      TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
                 "index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
                 mShape[i]);
    }
#endif
    return mPtr[rowArrayIdx(mShape, int(inds)...)];
  }
  TV_HOST_DEVICE_INLINE T &operator()() {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mPtr != nullptr,
                      "you want get value but the view is empty.%s", "\n");
    TV_DEVICE_REQUIRE(mShape.ndim() == 0,
                      "you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
    TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
               "\n");
    TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
               mShape.ndim());
#endif
#endif
    return mPtr[0];
  }
  TV_HOST_DEVICE_INLINE const T &operator()() const {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mPtr != nullptr,
                      "you want get value but the view is empty.%s", "\n");
    TV_DEVICE_REQUIRE(mShape.ndim() == 0,
                      "you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
    TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
               "\n");
    TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
               mShape.ndim());
#endif
#endif
    return mPtr[0];
  }
  template <class T1>
  TV_HOST_DEVICE_INLINE T &operator()(T1 i1) {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 1,
                      "you provide 1 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#else
    TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#endif
#endif
    return mPtr[i1];
  }
  template <class T1, class T2>
  TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 2,
                      "you provide 2 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                      mShape[0]);
    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
                      mShape[1]);
#else
    TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
    return mPtr[i1 * mShape[1] + i2];
  }
  template <class T1, class T2, class T3>
  TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 3,
                      "you provide 3 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                      mShape[0]);
    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
                      mShape[1]);
    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
                      mShape[2]);
#else
    TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
    return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
  }
  template <class T1, class T2, class T3, class T4>
  TV_HOST_DEVICE_INLINE T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 4,
                      "you provide 4 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                      mShape[0]);
    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
                      mShape[1]);
    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
                      mShape[2]);
    TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
                      "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
                      mShape[3]);
#else
    TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
    TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
               "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
    return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
  }
  template <class T1>
  TV_HOST_DEVICE_INLINE const T &operator()(T1 i1) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 1,
                      "you provide 1 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                      mShape[0]);
#else
    TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
#endif
#endif
    return mPtr[i1];
  }
  template <class T1, class T2>
  TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 2,
                      "you provide 2 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                      mShape[0]);
    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
                      mShape[1]);
#else
    TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
    return mPtr[i1 * mShape[1] + i2];
  }
  template <class T1, class T2, class T3>
  TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 3,
                      "you provide 3 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                      mShape[0]);
    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
                      mShape[1]);
    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
                      mShape[2]);
#else
    TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
    return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
  }
  template <class T1, class T2, class T3, class T4>
  TV_HOST_DEVICE_INLINE const T &operator()(T1 i1, T2 i2, T3 i3, T4 i4) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 4,
                      "you provide 4 indexes, but dim is %ld\n", mShape.ndim());
    TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
                      "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
                      mShape[0]);
    TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
                      "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
                      mShape[1]);
    TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
                      "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
                      mShape[2]);
    TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
                      "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
                      mShape[3]);
#else
    TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
               mShape.ndim());
    TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
               "index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
    TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
               "index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
    TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
               "index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
    TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
               "index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
    return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
  }
  TV_HOST_DEVICE_INLINE T &operator[](int idx) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
                      "index(%d) out-of-range: [0, %ld)\n", int(idx), size());
#else
    TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
               int(idx), size());
#endif
#endif
    return mPtr[idx];
  }
  // TODO: this is conflcit with operator[](SimpleVector<Slice> slice_vec).
  /*TV_HOST_DEVICE_INLINE T &operator[](const Shape index) {
    int idx = rowArrayIdx(mShape, index);
#ifdef TV_DEBUG
    TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
               int(idx), size());
#endif
    return mPtr[idx];
  }
  TV_HOST_DEVICE_INLINE const T &operator[](const Shape index) const {
    int idx = rowArrayIdx(mShape, index);
#ifdef TV_DEBUG
    TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
               int(idx), size());
#endif
    return mPtr[idx];
  }*/
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> operator[](
      SimpleVector<Slice> slice_vec) {
    return _subview(slice_vec);
  }
  TV_HOST_DEVICE_INLINE const TensorView<T, Rank> operator[](
      SimpleVector<Slice> slice_vec) const {
    return _subview(slice_vec);
  }
  TV_HOST_DEVICE_INLINE bool empty() const { return mPtr == nullptr; }
  TV_HOST_DEVICE_INLINE T *data() { return mPtr; }
  TV_HOST_DEVICE_INLINE const T *data() const { return mPtr; }
  TV_HOST_DEVICE_INLINE const Shape &shape() const { return mShape; }
  TV_HOST_DEVICE_INLINE int dim(int idx) const { return mShape[idx]; }
  TV_HOST_DEVICE_INLINE int ndim() const { return mShape.ndim(); }
  template <class... Inds>
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &reshape(Inds... newShapes) {
    Shape shapes{int(newShapes)...};
    TV_ASSERT(shapes.size() == size());
    mShape = shapes;
    return *this;
  }
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &reshape(Shape shapes) {
    TV_ASSERT(shapes.size() == size());
    mShape = shapes;
    return *this;
  }
  template <class... Inds>
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> view(Inds... newShapes) const {
    Shape shapes{int(newShapes)...};
    for (size_t i = 0; i < shapes.ndim(); ++i) {
      if (shapes[i] == -1) {
        shapes[i] = 1;
        shapes[i] = size() / shapes.size();
        break;
      }
    }
    TV_ASSERT(shapes.size() == size());
    return TensorView<T, Rank>(mPtr, shapes);
  }
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> view(Shape shapes) const {
    TV_ASSERT(shapes.size() == size());
    return TensorView<T, Rank>(mPtr, shapes);
  }
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> squeeze() const {
    return TensorView<T, Rank>(mPtr, mShape.squeeze());
  }
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> squeeze(int dim) const {
    return TensorView<T, Rank>(mPtr, mShape.squeeze(dim));
  }
  TV_HOST_DEVICE_INLINE size_t size() const { return mShape.size(); }
  template <class... Slices>
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(Slice slice,
                                                    Slices... slices) const {
    return subview<float, Slice, Slices...>(slice, slices...);
  }
  template <class T2 = float, class... Slices>
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(Slices... slices) const {
    Slice slice_vec[sizeof...(Slices)] = {to_slice(slices)...};
    Shape new_shape{to_slice(slices)[0]...};
    Shape start{to_slice(slices)[0]...};
    TV_ASSERT(new_shape.ndim() <= mShape.ndim());
    TV_ASSERT(new_shape.ndim() != 0);
    size_t idxsize = new_shape.ndim();
    for (size_t i = idxsize; i < mShape.ndim(); ++i) {
      new_shape.push_back(0);
      start.push_back(0);
    }
#pragma unroll
    for (size_t i = 0; i < sizeof...(Slices); ++i) {
      if (slice_vec[i][1] != -1) {
        new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
        TV_ASSERT(new_shape[i] >= 0);
      } else {
        new_shape[i] = 1;  // reduce dim
      }
    }
    auto offset = rowArrayIdx(mShape, start);
#pragma unroll
    for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
      new_shape[i] = mShape[i];
      TV_ASSERT(new_shape[i] >= 0);
    }
    Shape reduced_shape;
#pragma unroll
    for (size_t i = 0; i < sizeof...(Slices); ++i) {
      if (slice_vec[i][1] != -1) {
        reduced_shape.push_back(new_shape[i]);
      }
    }
#pragma unroll
    for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
      reduced_shape.push_back(new_shape[i]);
    }
    return TensorView<T, Rank>(mPtr + offset, reduced_shape);
  }
  template <class... Integers>
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> subview(int id, Integers... ints) {
    Shape start = {id, ints...};
    for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
      start.push_back(0);
    }
    return TensorView<T, Rank>(mPtr + rowArrayIdx(mShape, start),
                               mShape.subshape(sizeof...(ints) + 1));
  }
  std::string repr() const {
    std::ostringstream ss;
    if (empty()) return "";
    if (mShape.ndim() == 0) {
      ss << *mPtr;
      // ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
      // detail::simpleTypeName<T>());
      ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
      return ss.str();
    }
    Shape counter = mShape;
    auto tensor_flat = this->view(-1);
    for (int i = 0; i < counter.ndim(); ++i) {
      counter[i] = 0;
      ss << "[";
    }
    for (size_t i = 0; i < this->size(); ++i) {
      ss << tensor_flat(rowArrayIdx(mShape, counter));
      counter[counter.ndim() - 1] += 1;
      int inc_count = 0;
      bool print_comma = true;
      for (int c = counter.ndim() - 1; c >= 0; --c) {
        if (counter[c] == this->dim(c) && c > 0) {
          ++inc_count;
          counter[c - 1] += 1;
          counter[c] = 0;
          print_comma = false;
        }
      }
      if (print_comma && i != this->size() - 1) ss << ", ";
      for (int j = 0; j < inc_count; ++j) {
        ss << "]";
      }
      if (i != this->size() - 1) {
        if (inc_count != 0) ss << "\n";
        for (int j = 0; j < inc_count; ++j) {
          ss << "[";
        }
      }
    }
    ss << "]";
    // ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
    // detail::simpleTypeName<T>());
    ss << "Tensor: dtype=" << detail::simpleTypeName<T>();
    return ss.str();
  }

 protected:
  // TODO: make this function public.
  // currently this function is called unexpectedly when using subview({0, 0}).
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> _subview(
      SimpleVector<Slice> slice_vec) {
    Shape new_shape;
    for (int i = 0; i < slice_vec.size(); ++i) {
      new_shape.push_back(slice_vec[i][0]);
    }
    Shape start = new_shape;
    TV_ASSERT(new_shape.ndim() <= mShape.ndim());
    TV_ASSERT(new_shape.ndim() != 0);
    size_t idxsize = new_shape.ndim();
    for (size_t i = idxsize; i < mShape.ndim(); ++i) {
      new_shape.push_back(0);
      start.push_back(0);
    }
    for (size_t i = 0; i < slice_vec.size(); ++i) {
      if (slice_vec[i][1] != -1) {
        new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
        TV_ASSERT(new_shape[i] >= 0);
      } else {
        new_shape[i] = 1;  // reduce dim
      }
    }
    auto offset = rowArrayIdx(mShape, start);
    for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
      new_shape[i] = mShape[i];
      TV_ASSERT(new_shape[i] >= 0);
    }
    Shape reduced_shape;
    for (size_t i = 0; i < slice_vec.size(); ++i) {
      if (slice_vec[i][1] != -1) {
        reduced_shape.push_back(new_shape[i]);
      }
    }
    for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
      reduced_shape.push_back(new_shape[i]);
    }
    return TensorView<T, Rank>(mPtr + offset, reduced_shape);
  }
  template <typename T1>
  TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
    return Slice{int(s), -1, -1};
  }
  TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const { return Slice(s); }

  T *mPtr = nullptr;
  Shape mShape;
};

template <typename Os, typename T, int Rank>
Os &operator<<(Os &os, const TensorView<T, Rank> &dt) {
  os << dt.repr();
  return os;
}

template <typename Os, typename T, int Rank>
Os &operator<<(Os &os, const TensorView<const T, Rank> &dt) {
  os << dt.repr();
  return os;
}

namespace detail {
template <typename T>
constexpr const char *printfTypeFormat(T val = T());
template <>
constexpr const char *printfTypeFormat(float val) {
  return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(double val) {
  return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(int val) {
  return "%d";
}
template <>
constexpr const char *printfTypeFormat(unsigned val) {
  return "%u";
}
template <>
constexpr const char *printfTypeFormat(long val) {
  return "%ld";
}
template <>
constexpr const char *printfTypeFormat(unsigned long val) {
  return "%lu";
}
};  // namespace detail

template <typename T>
TV_HOST_DEVICE void printTensorView(const TensorView<T> tensor,
                                    const char *format) {
  if (tensor.empty()) return;
  if (tensor.ndim() == 0) {
    printf(format, tensor());
    printf("\n");
    return;
  }
  Shape counter = tensor.shape();
  auto tensor_flat = tensor.view(-1);
  for (int i = 0; i < counter.ndim(); ++i) {
    counter[i] = 0;
    printf("[");
  }
  for (size_t i = 0; i < tensor.size(); ++i) {
    printf(format, tensor_flat(rowArrayIdx(tensor.shape(), counter)));
    counter[counter.ndim() - 1] += 1;
    int inc_count = 0;
    bool print_comma = true;
    for (int c = counter.ndim() - 1; c >= 0; --c) {
      if (counter[c] == tensor.dim(c) && c > 0) {
        ++inc_count;
        counter[c - 1] += 1;
        counter[c] = 0;
        print_comma = false;
      }
    }
    if (print_comma && i != tensor.size() - 1) printf(", ");
    for (int j = 0; j < inc_count; ++j) {
      printf("]");
    }
    if (i != tensor.size() - 1) {
      if (inc_count != 0) printf("\n");
      for (int j = 0; j < inc_count; ++j) {
        printf("[");
      }
    }
  }
  printf("]\n");
}

template <typename T>
TV_HOST_DEVICE void printTensorView(TensorView<T> tensor) {
  using Traw = typename std::remove_const<T>::type;
  return printTensorView(tensor, detail::printfTypeFormat<Traw>());
}

template <typename T>
TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape) {
  using Traw = typename std::remove_const<T>::type;
  return printTensorView(TensorView<const T>(ptr, shape),
                         detail::printfTypeFormat<Traw>());
}

template <typename T>
TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
                                    const char *format) {
  return printTensorView(TensorView<const T>(ptr, shape), format);
}
}  // namespace tv
```
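The indexing helpers in this header (the rowArrayIdx overloads and the ArrayIndexRowMajor recursion) all compute the same row-major flattening of a multi-index. A minimal Python sketch of that arithmetic, for reference only and not part of the deleted header:

```python
# Illustrative only: the row-major offset computed by tv::rowArrayIdx /
# tv::ArrayIndexRowMajor, re-expressed in Python and cross-checked with numpy.
import numpy as np


def row_array_idx(shape, indexes):
    """Flatten a multi-index into a row-major (C-order) offset."""
    offset, m = 0, 1
    for i in range(len(shape) - 1, -1, -1):  # walk dims from last to first
        offset += m * indexes[i]
        m *= shape[i]
    return offset


shape, idx = (2, 3, 4), (1, 2, 3)
assert row_array_idx(shape, idx) == int(np.ravel_multi_index(idx, shape))
# offset = (1 * 3 + 2) * 4 + 3 = 23, the ((i1*s1+i2)*s2+i3) form noted in the
# ArrayIndexRowMajor comment above.
```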
mmdet3d/ops/spconv/include/torch_utils.h (deleted, mode 100644 → 0)
```cpp
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>

namespace tv {

struct TorchGPU : public tv::GPU {
  virtual cudaStream_t getStream() const override {
    return at::cuda::getCurrentCUDAStream();
  }
};

template <typename T>
void check_torch_dtype(const torch::Tensor &tensor) {
  switch (tensor.type().scalarType()) {
    case at::ScalarType::Double: {
      auto val = std::is_same<std::remove_const_t<T>, double>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Float: {
      auto val = std::is_same<std::remove_const_t<T>, float>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Int: {
      auto val = std::is_same<std::remove_const_t<T>, int>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Half: {
      auto val = std::is_same<std::remove_const_t<T>, at::Half>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    case at::ScalarType::Long: {
      auto val = std::is_same<std::remove_const_t<T>, long>::value;
      TV_ASSERT_RT_ERR(val, "error");
      break;
    }
    default:
      TV_ASSERT_RT_ERR(false, "error");
  }
}

template <typename T>
tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
  check_torch_dtype<T>(tensor);
  tv::Shape shape;
  for (auto i : tensor.sizes()) {
    shape.push_back(i);
  }
  return tv::TensorView<T>(tensor.data_ptr<std::remove_const_t<T>>(), shape);
}
}  // namespace tv
```
mmdet3d/ops/spconv/include/utility/timer.h (deleted, mode 100644 → 0)
```cpp
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <chrono>
#include <cuda_runtime_api.h>
#include <iostream>

namespace spconv {
template <typename TimeT = std::chrono::microseconds>
struct CudaContextTimer {
  CudaContextTimer() {
    cudaDeviceSynchronize();
    mCurTime = std::chrono::steady_clock::now();
  }
  typename TimeT::rep report() {
    cudaDeviceSynchronize();
    auto duration = std::chrono::duration_cast<TimeT>(
        std::chrono::steady_clock::now() - mCurTime);
    auto res = duration.count();
    mCurTime = std::chrono::steady_clock::now();
    return res;
  }

 private:
  std::chrono::time_point<std::chrono::steady_clock> mCurTime;
};

template <typename TimeT = std::chrono::microseconds>
struct CPUTimer {
  CPUTimer() { mCurTime = std::chrono::steady_clock::now(); }
  typename TimeT::rep report() {
    auto duration = std::chrono::duration_cast<TimeT>(
        std::chrono::steady_clock::now() - mCurTime);
    auto res = duration.count();
    mCurTime = std::chrono::steady_clock::now();
    return res;
  }

 private:
  std::chrono::time_point<std::chrono::steady_clock> mCurTime;
};
}  // namespace spconv
```
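The point of CudaContextTimer versus CPUTimer is that it calls cudaDeviceSynchronize() before reading the clock, so asynchronous kernel launches are included in the measured interval. A rough Python analogue of that pattern, purely illustrative and not an API from this repository:

```python
# Rough Python analogue of CudaContextTimer: synchronize before reading the
# clock so pending CUDA work is counted. Illustrative only.
import time

import torch


class CudaTimer:

    def __init__(self):
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        self._t = time.perf_counter()

    def report_ms(self):
        """Return milliseconds elapsed since construction or the last report."""
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        now = time.perf_counter()
        elapsed_ms = (now - self._t) * 1000.0
        self._t = now
        return elapsed_ms
```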
mmdet3d/ops/spconv/modules.py (deleted, mode 100644 → 0)
```python
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from collections import OrderedDict

import torch
from torch import nn

from .structure import SparseConvTensor


def is_spconv_module(module):
    spconv_modules = (SparseModule, )
    return isinstance(module, spconv_modules)


def is_sparse_conv(module):
    from .conv import SparseConvolution
    return isinstance(module, SparseConvolution)


def _mean_update(vals, m_vals, t):
    outputs = []
    if not isinstance(vals, list):
        vals = [vals]
    if not isinstance(m_vals, list):
        m_vals = [m_vals]
    for val, m_val in zip(vals, m_vals):
        output = t / float(t + 1) * m_val + 1 / float(t + 1) * val
        outputs.append(output)
    if len(outputs) == 1:
        outputs = outputs[0]
    return outputs


class SparseModule(nn.Module):
    """place holder, All module subclass from this will take sptensor in
    SparseSequential."""
    pass


class SparseSequential(SparseModule):
    r"""A sequential container.

    Modules will be added to it in the order they are passed in the
    constructor.
    Alternatively, an ordered dict of modules can also be passed in.

    To make it easier to understand, given is a small example::

        # Example of using Sequential
        model = SparseSequential(
                  SparseConv2d(1,20,5),
                  nn.ReLU(),
                  SparseConv2d(20,64,5),
                  nn.ReLU()
                )

        # Example of using Sequential with OrderedDict
        model = SparseSequential(OrderedDict([
                  ('conv1', SparseConv2d(1,20,5)),
                  ('relu1', nn.ReLU()),
                  ('conv2', SparseConv2d(20,64,5)),
                  ('relu2', nn.ReLU())
                ]))

        # Example of using Sequential with kwargs(python 3.6+)
        model = SparseSequential(
                  conv1=SparseConv2d(1,20,5),
                  relu1=nn.ReLU(),
                  conv2=SparseConv2d(20,64,5),
                  relu2=nn.ReLU()
                )
    """

    def __init__(self, *args, **kwargs):
        super(SparseSequential, self).__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            for key, module in args[0].items():
                self.add_module(key, module)
        else:
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)
        for name, module in kwargs.items():
            if sys.version_info < (3, 6):
                raise ValueError('kwargs only supported in py36+')
            if name in self._modules:
                raise ValueError('name exists.')
            self.add_module(name, module)
        self._sparity_dict = {}

    def __getitem__(self, idx):
        if not (-len(self) <= idx < len(self)):
            raise IndexError('index {} is out of range'.format(idx))
        if idx < 0:
            idx += len(self)
        it = iter(self._modules.values())
        for i in range(idx):
            next(it)
        return next(it)

    def __len__(self):
        return len(self._modules)

    @property
    def sparity_dict(self):
        return self._sparity_dict

    def add(self, module, name=None):
        if name is None:
            name = str(len(self._modules))
            if name in self._modules:
                raise KeyError('name exists')
        self.add_module(name, module)

    def forward(self, input):
        for k, module in self._modules.items():
            if is_spconv_module(module):
                # use SpConvTensor as input
                assert isinstance(input, SparseConvTensor)
                self._sparity_dict[k] = input.sparity
                input = module(input)
            else:
                if isinstance(input, SparseConvTensor):
                    if input.indices.shape[0] != 0:
                        input.features = module(input.features)
                else:
                    input = module(input)
        return input

    def fused(self):
        """don't use this.

        no effect.
        """
        from .conv import SparseConvolution
        mods = [v for k, v in self._modules.items()]
        fused_mods = []
        idx = 0
        while idx < len(mods):
            if is_sparse_conv(mods[idx]):
                if idx < len(mods) - 1 and isinstance(mods[idx + 1],
                                                      nn.BatchNorm1d):
                    new_module = SparseConvolution(
                        ndim=mods[idx].ndim,
                        in_channels=mods[idx].in_channels,
                        out_channels=mods[idx].out_channels,
                        kernel_size=mods[idx].kernel_size,
                        stride=mods[idx].stride,
                        padding=mods[idx].padding,
                        dilation=mods[idx].dilation,
                        groups=mods[idx].groups,
                        bias=True,
                        subm=mods[idx].subm,
                        output_padding=mods[idx].output_padding,
                        transposed=mods[idx].transposed,
                        inverse=mods[idx].inverse,
                        indice_key=mods[idx].indice_key,
                        fused_bn=True,
                    )
                    new_module.load_state_dict(mods[idx].state_dict(), False)
                    new_module.to(mods[idx].weight.device)
                    conv = new_module
                    bn = mods[idx + 1]
                    conv.bias.data.zero_()
                    conv.weight.data[:] = conv.weight.data * bn.weight.data / (
                        torch.sqrt(bn.running_var) + bn.eps)
                    conv.bias.data[:] = (conv.bias.data -
                                         bn.running_mean) * bn.weight.data / (
                        torch.sqrt(bn.running_var) + bn.eps) + bn.bias.data
                    fused_mods.append(conv)
                    idx += 2
                else:
                    fused_mods.append(mods[idx])
                    idx += 1
            else:
                fused_mods.append(mods[idx])
                idx += 1
        return SparseSequential(*fused_mods)


class ToDense(SparseModule):
    """convert SparseConvTensor to NCHW dense tensor."""

    def forward(self, x: SparseConvTensor):
        return x.dense()


class RemoveGrid(SparseModule):
    """remove pre-allocated grid buffer."""

    def forward(self, x: SparseConvTensor):
        x.grid = None
        return x
```
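For orientation, a hypothetical usage sketch of the removed API: SparseSequential routes SparseConvTensor inputs through spconv modules and applies plain nn modules to the `.features` tensor only. It assumes the companion `.conv` and `.structure` modules (SparseConv3d, SparseConvTensor) that were removed in the same commit, so it only runs against a pre-v1.0.0rc1 checkout.

```python
# Hypothetical usage of the removed SparseSequential (pre-v1.0.0rc1 layout).
from torch import nn

from mmdet3d.ops.spconv.conv import SparseConv3d        # removed in this commit
from mmdet3d.ops.spconv.modules import SparseSequential  # removed in this commit

model = SparseSequential(
    SparseConv3d(16, 32, 3, indice_key='cp0'),
    nn.BatchNorm1d(32),  # dense module: applied to sparse_tensor.features
    nn.ReLU(),
)
# model.fused() would fold each (SparseConvolution, BatchNorm1d) pair into a
# single convolution with rescaled weight and bias, as implemented above.
```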
mmdet3d/ops/spconv/ops.py (deleted, mode 100644 → 0)
```python
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch

from . import sparse_conv_ext


def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
    ndim = len(input_size)
    output_size = []
    for i in range(ndim):
        size = (input_size[i] + 2 * padding[i] - dilation[i] *
                (kernel_size[i] - 1) - 1) // stride[i] + 1
        if kernel_size[i] == -1:
            output_size.append(1)
        else:
            output_size.append(size)
    return output_size


def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
                           output_padding):
    ndim = len(input_size)
    output_size = []
    for i in range(ndim):
        if kernel_size[i] == -1:
            raise ValueError("deconv don't support kernel_size < 0")
        size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + \
            kernel_size[i] + output_padding[i]
        output_size.append(size)
    return output_size


def get_indice_pairs(indices,
                     batch_size,
                     spatial_shape,
                     ksize=3,
                     stride=1,
                     padding=0,
                     dilation=1,
                     out_padding=0,
                     subm=False,
                     transpose=False,
                     grid=None):
    ndim = indices.shape[1] - 1
    if not isinstance(ksize, (list, tuple)):
        ksize = [ksize] * ndim
    if not isinstance(stride, (list, tuple)):
        stride = [stride] * ndim
    if not isinstance(padding, (list, tuple)):
        padding = [padding] * ndim
    if not isinstance(dilation, (list, tuple)):
        dilation = [dilation] * ndim
    if not isinstance(out_padding, (list, tuple)):
        out_padding = [out_padding] * ndim

    for d, s in zip(dilation, stride):
        assert any([s == 1, d == 1]), "don't support this."

    if not subm:
        if transpose:
            out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
                                               padding, dilation, out_padding)
        else:
            out_shape = get_conv_output_size(spatial_shape, ksize, stride,
                                             padding, dilation)
    else:
        out_shape = spatial_shape
    if grid is None:
        if ndim == 2:
            get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_2d
        elif ndim == 3:
            get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_3d
        elif ndim == 4:
            get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_4d
        else:
            raise NotImplementedError
        return get_indice_pairs_func(indices, batch_size, out_shape,
                                     spatial_shape, ksize, stride, padding,
                                     dilation, out_padding, int(subm),
                                     int(transpose))
    else:
        if ndim == 2:
            get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_grid_2d
        elif ndim == 3:
            get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_grid_3d
        else:
            raise NotImplementedError
        return get_indice_pairs_func(indices, grid, batch_size, out_shape,
                                     spatial_shape, ksize, stride, padding,
                                     dilation, out_padding, int(subm),
                                     int(transpose))


def indice_conv(features,
                filters,
                indice_pairs,
                indice_pair_num,
                num_activate_out,
                inverse=False,
                subm=False):
    if filters.dtype == torch.float32:
        return sparse_conv_ext.indice_conv_fp32(features, filters,
                                                indice_pairs, indice_pair_num,
                                                num_activate_out, int(inverse),
                                                int(subm))
    elif filters.dtype == torch.half:
        return sparse_conv_ext.indice_conv_half(features, filters,
                                                indice_pairs, indice_pair_num,
                                                num_activate_out, int(inverse),
                                                int(subm))
    else:
        raise NotImplementedError


def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
                      num_activate_out, inverse, subm):
    if features.dtype == torch.half:
        func = sparse_conv_ext.fused_indice_conv_half
    elif filters.dtype == torch.float32:
        func = sparse_conv_ext.fused_indice_conv_fp32
    else:
        raise NotImplementedError

    return func(features, filters, bias, indice_pairs, indice_pair_num,
                num_activate_out, int(inverse), int(subm))


def indice_conv_backward(features,
                         filters,
                         out_bp,
                         indice_pairs,
                         indice_pair_num,
                         inverse=False,
                         subm=False):
    if filters.dtype == torch.float32:
        return sparse_conv_ext.indice_conv_backward_fp32(
            features, filters, out_bp, indice_pairs, indice_pair_num,
            int(inverse), int(subm))
    elif filters.dtype == torch.half:
        return sparse_conv_ext.indice_conv_backward_half(
            features, filters, out_bp, indice_pairs, indice_pair_num,
            int(inverse), int(subm))
    else:
        raise NotImplementedError


def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
    if features.dtype == torch.float32:
        return sparse_conv_ext.indice_maxpool_fp32(features, indice_pairs,
                                                   indice_pair_num,
                                                   num_activate_out)
    elif features.dtype == torch.half:
        return sparse_conv_ext.indice_maxpool_half(features, indice_pairs,
                                                   indice_pair_num,
                                                   num_activate_out)
    else:
        raise NotImplementedError


def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
                            indice_pair_num):
    if features.dtype == torch.float32:
        return sparse_conv_ext.indice_maxpool_backward_fp32(
            features, out_features, out_bp, indice_pairs, indice_pair_num)
    elif features.dtype == torch.half:
        return sparse_conv_ext.indice_maxpool_backward_half(
            features, out_features, out_bp, indice_pairs, indice_pair_num)
    else:
        raise NotImplementedError
```
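A worked example of the per-axis arithmetic used by get_conv_output_size above. The shapes are only an illustration (a 3D voxel grid is assumed), not values taken from this commit:

```python
# out = (in + 2*padding - dilation*(kernel - 1) - 1) // stride + 1, per axis.
def conv_out(in_size, k, s, p, d):
    return (in_size + 2 * p - d * (k - 1) - 1) // s + 1


# Example grid [41, 1600, 1408] with kernel 3, stride 2, padding 1,
# dilation 1: each axis is roughly halved.
assert [conv_out(n, 3, 2, 1, 1) for n in (41, 1600, 1408)] == [21, 800, 704]
```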
mmdet3d/ops/spconv/pool.py (deleted, mode 100644 → 0)
```python
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import functional as Fsp
from . import ops
from .modules import SparseModule
from .structure import SparseConvTensor


class SparseMaxPool(SparseModule):

    def __init__(self,
                 ndim,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 subm=False):
        super(SparseMaxPool, self).__init__()
        if not isinstance(kernel_size, (list, tuple)):
            kernel_size = [kernel_size] * ndim
        if not isinstance(stride, (list, tuple)):
            stride = [stride] * ndim
        if not isinstance(padding, (list, tuple)):
            padding = [padding] * ndim
        if not isinstance(dilation, (list, tuple)):
            dilation = [dilation] * ndim

        self.ndim = ndim
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.subm = subm
        self.dilation = dilation

    def forward(self, input):
        assert isinstance(input, SparseConvTensor)
        features = input.features
        device = features.device
        indices = input.indices
        spatial_shape = input.spatial_shape
        batch_size = input.batch_size
        if not self.subm:
            out_spatial_shape = ops.get_conv_output_size(
                spatial_shape, self.kernel_size, self.stride, self.padding,
                self.dilation)
        else:
            out_spatial_shape = spatial_shape
        outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
            indices, batch_size, spatial_shape, self.kernel_size, self.stride,
            self.padding, self.dilation, 0, self.subm)
        out_features = Fsp.indice_maxpool(features, indice_pairs.to(device),
                                          indice_pairs_num.to(device),
                                          outids.shape[0])
        out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape,
                                      batch_size)
        out_tensor.indice_dict = input.indice_dict
        out_tensor.grid = input.grid
        return out_tensor


class SparseMaxPool2d(SparseMaxPool):

    def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
        super(SparseMaxPool2d, self).__init__(2, kernel_size, stride, padding,
                                              dilation)


class SparseMaxPool3d(SparseMaxPool):

    def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
        super(SparseMaxPool3d, self).__init__(3, kernel_size, stride, padding,
                                              dilation)
```
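A hypothetical usage sketch of the removed SparseMaxPool3d. It assumes the companion SparseConvTensor structure, the compiled sparse_conv_ext extension, and a CUDA device, so again it only applies to a pre-v1.0.0rc1 checkout:

```python
# Hypothetical usage of the removed sparse max pooling (pre-v1.0.0rc1 layout).
import torch

from mmdet3d.ops.spconv.pool import SparseMaxPool3d            # removed here
from mmdet3d.ops.spconv.structure import SparseConvTensor      # removed here

num_points, channels = 100, 16
features = torch.randn(num_points, channels).cuda()
coords = torch.randint(0, 40, (num_points, 3), dtype=torch.int32)
batch_idx = torch.zeros(num_points, 1, dtype=torch.int32)
indices = torch.cat([batch_idx, coords], dim=1).cuda()  # (batch, z, y, x)

x = SparseConvTensor(features, indices, [40, 40, 40], batch_size=1)
pool = SparseMaxPool3d(kernel_size=2, stride=2)
y = pool(x)  # SparseConvTensor whose spatial_shape is computed by
             # ops.get_conv_output_size, i.e. [20, 20, 20] here
```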
mmdet3d/ops/spconv/src/all.cc (deleted, mode 100644 → 0)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_runtime_api.h>
#include <spconv/fused_spconv_ops.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <torch/extension.h>
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("get_indice_pairs_2d", &spconv::getIndicePair<2>,
        "get_indice_pairs_2d");
  m.def("get_indice_pairs_3d", &spconv::getIndicePair<3>,
        "get_indice_pairs_3d");
  m.def("get_indice_pairs_4d", &spconv::getIndicePair<4>,
        "get_indice_pairs_4d");
  m.def("get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>,
        "get_indice_pairs_grid_2d");
  m.def("get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>,
        "get_indice_pairs_grid_3d");
  m.def("indice_conv_fp32", &spconv::indiceConv<float>, "indice_conv_fp32");
  m.def("indice_conv_backward_fp32", &spconv::indiceConvBackward<float>,
        "indice_conv_backward_fp32");
  m.def("indice_conv_half", &spconv::indiceConv<at::Half>, "indice_conv_half");
  m.def("indice_conv_backward_half", &spconv::indiceConvBackward<at::Half>,
        "indice_conv_backward_half");
  m.def("fused_indice_conv_fp32", &spconv::fusedIndiceConvBatchNorm<float>,
        "fused_indice_conv_fp32");
  m.def("fused_indice_conv_half", &spconv::fusedIndiceConvBatchNorm<at::Half>,
        "fused_indice_conv_half");
  m.def("indice_maxpool_fp32", &spconv::indiceMaxPool<float>,
        "indice_maxpool_fp32");
  m.def("indice_maxpool_backward_fp32", &spconv::indiceMaxPoolBackward<float>,
        "indice_maxpool_backward_fp32");
  m.def("indice_maxpool_half", &spconv::indiceMaxPool<at::Half>,
        "indice_maxpool_half");
  m.def("indice_maxpool_backward_half",
        &spconv::indiceMaxPoolBackward<at::Half>,
        "indice_maxpool_backward_half");
}
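For orientation, these bindings are what the Python side calls through the compiled extension; the `indice_maxpool_backward` fragment at the top of this section is one such caller. Below is a minimal sketch of that dtype-based dispatch, assuming the extension is importable as `sparse_conv_ext` the way `ops.py` in this package uses it (the wrapper name and import path here are illustrative):

# Illustrative sketch only, not part of the original file.
import torch

from mmdet3d.ops.spconv import sparse_conv_ext  # assumed import location


def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
                            indice_pair_num):
    """Pick the fp32 or half binding registered above based on dtype."""
    if features.dtype == torch.float32:
        return sparse_conv_ext.indice_maxpool_backward_fp32(
            features, out_features, out_bp, indice_pairs, indice_pair_num)
    elif features.dtype == torch.half:
        return sparse_conv_ext.indice_maxpool_backward_half(
            features, out_features, out_bp, indice_pairs, indice_pair_num)
    else:
        raise NotImplementedError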
mmdet3d/ops/spconv/src/indice.cc
deleted
100644 → 0
View file @
9c7270d0
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/geometry.h>
#include <spconv/indice.h>
#include <spconv/spconv_ops.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::CPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    if (transpose)
      return getIndicePairsDeConv<Index, IndexGrid, NDim>(
          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
          outSpatialShape.data());
    else
      return getIndicePairsConv<Index, IndexGrid, NDim>(
          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
          outSpatialShape.data());
  }
};

template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::CPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    return getIndicePairsSubM<Index, IndexGrid, NDim>(
        indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(),
        outSpatialShape.data());
  }
};
}  // namespace functor

#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM)                           \
  template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, \
                                                       NDIM>;               \
  template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
                                                       NDIM>;

#define DECLARE_CPU_INDEX(Index)          \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2); \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 3); \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 4);

DECLARE_CPU_INDEX(int);
DECLARE_CPU_INDEX(long);

#undef DECLARE_CPU_INDEX
#undef DECLARE_CPU_SPECS_INDEX_NDIM

}  // namespace spconv
mmdet3d/ops/spconv/src/indice_cuda.cu
deleted
100644 → 0
View file @
9c7270d0
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <spconv/indice.cu.h>
#include <spconv/indice.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <utility/timer.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv {
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose) {
    Index batchSize = gridsOut.dim(0);
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    // auto timer = spconv::CudaContextTimer<>();
    if (transpose)
      prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                              indiceNum, indicePairUnique, kernelSize, stride,
                              padding, dilation, outSpatialShape);
    else
      prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                              indiceNum, indicePairUnique, kernelSize, stride,
                              padding, dilation, outSpatialShape);
    TV_CHECK_CUDA_ERR();
    // std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
    return 1;
  }
};

template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    Index batchSize = gridsOut.dim(0);
    auto kernelVolume = indicePairs.dim(0);
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    Index numAct = indicePairUnique.dim(0) - 1;
    assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
                            indicePairUnique, outSpatialShape, batchSize);
    TV_CHECK_CUDA_ERR();
    assignIndicePairsKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
                            indicePairUnique, outSpatialShape);
    TV_CHECK_CUDA_ERR();
    if (resetGrid) {
      resetGridKernel<Index, IndexGrid, NDim>
          <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
      TV_CHECK_CUDA_ERR();
    }
    return numAct;
  }
};

template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    // auto timer = spconv::CudaContextTimer<>();
    prepareSubMGridKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
    TV_CHECK_CUDA_ERR();
    getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
                            kernelSize, stride, padding, dilation,
                            outSpatialShape);
    TV_CHECK_CUDA_ERR();
    // std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
    if (resetGrid) {
      resetGridSubMKernel<Index, IndexGrid, NDim>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
                              numActIn);
      TV_CHECK_CUDA_ERR();
    }
    return numActIn;
  }
};
}  // namespace functor

#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM)                              \
  template struct functor::CreateConvIndicePairFunctor<tv::GPU, Index, int,    \
                                                       NDIM>;                  \
  template struct functor::CreateConvIndicePairFunctorP1<tv::GPU, Index, int,  \
                                                         NDIM>;                \
  template struct functor::CreateConvIndicePairFunctorP2<tv::GPU, Index, int,  \
                                                         NDIM>;                \
  template struct functor::CreateSubMIndicePairFunctor<tv::GPU, Index, int,    \
                                                       NDIM>;

#define DECLARE_GPU_INDEX(Index)          \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);

DECLARE_GPU_INDEX(int);

#undef DECLARE_GPU_INDEX
#undef DECLARE_GPU_SPECS_INDEX_NDIM

}  // namespace spconv
mmdet3d/ops/spconv/src/maxpool.cc
deleted
100644 → 0
View file @
9c7270d0
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/maxpool.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::CPU, T, Index> {
  void operator()(const tv::CPU &d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const Index> indices, int size) {
    int stride = outFeatures.dim(1);
    auto outFeaturesData = outFeatures.data();
    auto inFeaturesData = inFeatures.data();
    auto indicesIn = indices.subview(0).data();
    auto indicesOut = indices.subview(1).data();
    Index idxi, idxo;
    for (int row = 0; row < size; row++) {
      idxi = indicesIn[row] * stride;
      idxo = indicesOut[row] * stride;
      for (int plane = 0; plane < stride; ++plane)
        if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])
          outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];
    }
  }
};

template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::CPU, T, Index> {
  void operator()(const tv::CPU &d, tv::TensorView<const T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const T> dout, tv::TensorView<T> din,
                  tv::TensorView<const Index> indices, int size) {
    int stride = outFeatures.dim(1);
    auto outFeaturesData = outFeatures.data();
    auto inFeaturesData = inFeatures.data();
    auto doutData = dout.data();
    auto dinData = din.data();
    auto indicesIn = indices.subview(0).data();
    auto indicesOut = indices.subview(1).data();
    Index idxi, idxo;
    for (int row = 0; row < size; row++) {
      idxi = indicesIn[row] * stride;
      idxo = indicesOut[row] * stride;
      for (int plane = 0; plane < stride; ++plane)
        if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])
          dinData[idxi + plane] += doutData[idxo + plane];
    }
  }
};
}  // namespace functor

#define DECLARE_CPU_SPECS_T_INDEX(T, Index)                                \
  template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \
  template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>;

#define DECLARE_CPU_SPECS(T)         \
  DECLARE_CPU_SPECS_T_INDEX(T, int); \
  DECLARE_CPU_SPECS_T_INDEX(T, long);

DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);

#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX

}  // namespace spconv
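The CPU functors above encode the sparse max-pool rule: the forward pass scatter-maxes input rows into output rows along precomputed index pairs, and the backward pass routes each output gradient to the input rows whose value equals the pooled maximum. A tiny NumPy sketch of that rule, with made-up values and pairs (not the library's API, just the arithmetic):

# Illustrative sketch only, not part of the original file.
import numpy as np

# Forward: scatter-max over (input_row, output_row) index pairs.
in_feats = np.array([[1.0], [5.0], [3.0]])
out_feats = np.full((2, 1), -np.inf)
pairs = [(0, 0), (1, 0), (2, 1)]            # (input row, output row)
for i, o in pairs:
    out_feats[o] = np.maximum(out_feats[o], in_feats[i])

# Backward: gradient flows only where the input equals the pooled maximum,
# matching the `==` test in SparseMaxPoolBackwardFunctor.
d_out = np.array([[10.0], [20.0]])
d_in = np.zeros_like(in_feats)
for i, o in pairs:
    d_in[i] += np.where(in_feats[i] == out_feats[o], d_out[o], 0.0)
print(out_feats.ravel(), d_in.ravel())      # [5. 3.] [ 0. 10. 20.]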
mmdet3d/ops/spconv/src/maxpool_cuda.cu
deleted
100644 → 0
View file @
9c7270d0
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <spconv/maxpool.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv {
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures,
                                      const Index *indicesIn,
                                      const Index *indicesOut, int numHot,
                                      int numPlanes) {
  T in, out;
  int ILPStrideY[NumILP];
  Index idxo, idxi;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  for (int ix = blockIdx.x * blockDim.x; ix < numHot;
       ix += blockDim.x * gridDim.x) {
    {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
        idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
        in = inFeatures[idxi];
        out = outFeatures[idxo];
        if (in > out) {
          outFeatures[idxo] = in;
        }
      }
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericBlockKernel(T *outFeatures,
                                             const T *inFeatures,
                                             const Index *indicesIn,
                                             const Index *indicesOut,
                                             int numHot, int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  int ILPStrideX[NumILP];
  Index RI[NumILP];
  Index RO[NumILP];
  T in, out;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        in = inFeatures[RI[ilp] + iy];
        out = outFeatures[RO[ilp] + iy];
        if (in > out) {
          outFeatures[RO[ilp] + iy] = in;
        }
      }
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP,
          typename VecType>
__global__ void maxPoolFwdVecBlockKernel(T *outFeatures, const T *inFeatures,
                                         const Index *indicesIn,
                                         const Index *indicesOut, int numHot,
                                         int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  int ILPStrideY[NumILP];
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
  T bufi[vecloadFactor];
  T bufo[vecloadFactor];
  Index idxi, idxo;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
       ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      reinterpret_cast<VecType *>(bufo)[0] =
          reinterpret_cast<VecType *>(outFeatures)[idxo];
      reinterpret_cast<VecType *>(bufi)[0] =
          reinterpret_cast<const VecType *>(inFeatures)[idxi];
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        if (bufi[i] > bufo[i]) {
          bufo[i] = bufi[i];
        }
      }
      reinterpret_cast<VecType *>(outFeatures)[idxo] =
          reinterpret_cast<VecType *>(bufo)[0];
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures,
                                        const Index *indicesIn,
                                        const Index *indicesOut, int numHot,
                                        int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  int ILPStrideX[NumILP];
  Index RI[NumILP];
  Index RO[NumILP];
  T in, out;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < numHot) {
        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < numHot) {
          in = inFeatures[RI[ilp] + iy];
          out = outFeatures[RO[ilp] + iy];
          if (in > out) {
            outFeatures[RO[ilp] + iy] = in;
          }
        }
      }
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdBlockKernel(const T *outFeatures,
                                      const T *inFeatures, const T *dout,
                                      T *din, const Index *indicesIn,
                                      const Index *indicesOut, int numHot,
                                      int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  T in, out;
  Index idxo, idxi;
  int ILPStrideY[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  dout += blockIdx.y * NumTLP;
  din += blockIdx.y * NumTLP;
  for (int ix = blockIdx.x * blockDim.x; ix < numHot;
       ix += blockDim.x * gridDim.x) {
    {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
        idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
        in = inFeatures[idxi];
        out = outFeatures[idxo];
        if (in == out) {
          din[idxi] += dout[idxo];
        }
      }
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures,
                                             const T *inFeatures,
                                             const T *dout, T *din,
                                             const Index *indicesIn,
                                             const Index *indicesOut,
                                             int numHot, int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  int ILPStrideX[NumILP];
  Index RI[NumILP];
  Index RO[NumILP];
  T in, out;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        in = inFeatures[RI[ilp] + iy];
        out = outFeatures[RO[ilp] + iy];
        if (in == out) {
          din[RI[ilp] + iy] += dout[RO[ilp] + iy];
        }
      }
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP,
          typename VecType>
__global__ void maxPoolBwdVecBlockKernel(const T *outFeatures,
                                         const T *inFeatures, const T *dout,
                                         T *din, const Index *indicesIn,
                                         const Index *indicesOut, int numHot,
                                         int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  int ILPStrideY[NumILP];
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
  T bufi[vecloadFactor];
  T bufo[vecloadFactor];
  T bufdi[vecloadFactor];
  T bufdo[vecloadFactor];
  Index idxi, idxo;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
       ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      reinterpret_cast<VecType *>(bufo)[0] =
          reinterpret_cast<const VecType *>(outFeatures)[idxo];
      reinterpret_cast<VecType *>(bufi)[0] =
          reinterpret_cast<const VecType *>(inFeatures)[idxi];
      reinterpret_cast<VecType *>(bufdo)[0] =
          reinterpret_cast<const VecType *>(dout)[idxo];
      reinterpret_cast<VecType *>(bufdi)[0] =
          reinterpret_cast<VecType *>(din)[idxi];
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        if (bufi[i] == bufo[i]) {
          bufdi[i] += bufdo[i];
        }
      }
      reinterpret_cast<VecType *>(din)[idxi] =
          reinterpret_cast<VecType *>(bufdi)[0];
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericKernel(const T *outFeatures,
                                        const T *inFeatures, const T *dout,
                                        T *din, const Index *indicesIn,
                                        const Index *indicesOut, int numHot,
                                        int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  int ILPStrideX[NumILP];
  Index RI[NumILP];
  Index RO[NumILP];
  T in, out;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < numHot) {
        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < numHot) {
          in = inFeatures[RI[ilp] + iy];
          out = outFeatures[RO[ilp] + iy];
          if (in == out) {
            din[RI[ilp] + iy] += dout[RO[ilp] + iy];
          }
        }
      }
    }
  }
}

namespace functor {
template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = inFeatures.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
                                 &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (numHotBlock >= NumTLP) {
            maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                    indices.subview(0).data(),
                                    indices.subview(1).data(), numHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size > numHotBlock) {
            maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(
                    outFeatures.data(), inFeatures.data(),
                    indices.subview(0).data() + numHotBlock,
                    indices.subview(1).data() + numHotBlock,
                    size - numHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });
    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (numHotBlock >= NumTLP) {
        maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(),
                indices.subview(0).data(), indices.subview(1).data(),
                numHotBlock, numPlanes);
        TV_CHECK_CUDA_ERR();
      }
      if (size > numHotBlock) {
        maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(),
                indices.subview(0).data() + numHotBlock,
                indices.subview(1).data() + numHotBlock, size - numHotBlock,
                numPlanes);
        TV_CHECK_CUDA_ERR();
      }
    }
  }
};

template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::GPU &d, tv::TensorView<const T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const T> dout, tv::TensorView<T> din,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = inFeatures.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout, &din,
                                 &indices, &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (numHotBlock >= NumTLP) {
            maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                    dout.data(), din.data(),
                                    indices.subview(0).data(),
                                    indices.subview(1).data(), numHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size > numHotBlock) {
            maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(
                    outFeatures.data(), inFeatures.data(), dout.data(),
                    din.data(), indices.subview(0).data() + numHotBlock,
                    indices.subview(1).data() + numHotBlock,
                    size - numHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });
    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (numHotBlock >= NumTLP) {
        maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(), dout.data(),
                din.data(), indices.subview(0).data(),
                indices.subview(1).data(), numHotBlock, numPlanes);
        TV_CHECK_CUDA_ERR();
      }
      if (size > numHotBlock) {
        maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(), dout.data(),
                din.data(), indices.subview(0).data() + numHotBlock,
                indices.subview(1).data() + numHotBlock, size - numHotBlock,
                numPlanes);
        TV_CHECK_CUDA_ERR();
      }
    }
  }
};
}  // namespace functor

#define DECLARE_GPU_SPECS_T_INDEX(T, Index)                                \
  template struct functor::SparseMaxPoolForwardFunctor<tv::GPU, T, Index>; \
  template struct functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, Index>;

#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);

DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);

#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX

}  // namespace spconv
mmdet3d/ops/spconv/src/reordering.cc
deleted
100644 → 0
View file @
9c7270d0
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/reordering.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseGatherFunctor<tv::CPU, T, Index> {
  void operator()(const tv::CPU &d, tv::TensorView<T> buffer,
                  tv::TensorView<const T> features,
                  tv::TensorView<const Index> indices, int size) {
    int numPlanes = features.dim(1);
    for (int i = 0; i < size; ++i) {
      std::memcpy(buffer.data() + i * numPlanes,
                  features.data() + indices[i] * numPlanes,
                  sizeof(T) * numPlanes);
    }
  }
};

template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::CPU, T, Index> {
  void operator()(const tv::CPU &d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> buffer,
                  tv::TensorView<const Index> indices, int size,
                  bool stable) {
    int numPlanes = outFeatures.dim(1);
    const T *buf = buffer.data();
    T *out = outFeatures.data();
    for (int i = 0; i < size; ++i) {
      buf = buffer.data() + i * numPlanes;
      out = outFeatures.data() + indices[i] * numPlanes;
      for (int j = 0; j < numPlanes; ++j) {
        out[j] += buf[j];
      }
    }
  }
};
}  // namespace functor

#define DECLARE_CPU_SPECS_T_INDEX(T, Index)                         \
  template struct functor::SparseGatherFunctor<tv::CPU, T, Index>;  \
  template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;

#define DECLARE_CPU_SPECS(T)         \
  DECLARE_CPU_SPECS_T_INDEX(T, int); \
  DECLARE_CPU_SPECS_T_INDEX(T, long);

DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);

#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX

}  // namespace spconv
mmdet3d/ops/spconv/src/reordering_cuda.cu
deleted
100644 → 0
View file @
9c7270d0
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <spconv/mp_helper.h>
#include <spconv/reordering.cu.h>
#include <spconv/reordering.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <utility/timer.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseGatherFunctor<tv::GPU, T, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
                  tv::TensorView<const T> features,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = features.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
    mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
                                 &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;
      // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
      int nHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (nHotBlock >= NumTLP) {
            gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
                                 vecload_type_t>
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(buffer.data(), features.data(),
                                    indices.data(), nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size - nHotBlock > 0) {
            gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
                <<<dim3(1, numPlanes / NumTLP),
                   dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
                   d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
                                    features.data(),
                                    indices.data() + nHotBlock,
                                    size - nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });
    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      gatherGenericKernel<T, Index, NumTLP, NumILP>
          <<<dim3(tv::launch::DivUp(size, NumTLP),
                  tv::launch::DivUp(numPlanes, NumTLP)),
             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              buffer.data(), features.data(), indices.data(), size,
              numPlanes);
      TV_CHECK_CUDA_ERR();
    }
  }
};

template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::GPU, T, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> buffer,
                  tv::TensorView<const Index> indices, int size,
                  bool stable) {
    if (size <= 0) return;
    int numPlanes = outFeatures.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor =
        sizeof(vecload_type_t) / sizeof(T);  // important for half.
    mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
                                 &notFound](auto NumTLP) {
      // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
      constexpr int NumILP = NumTLP / 4;
      int nHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (nHotBlock >= NumTLP) {
            scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), buffer.data(),
                                    indices.data(), nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size - nHotBlock > 0) {
            scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(
                    outFeatures.data(),
                    buffer.data() + nHotBlock * numPlanes,
                    indices.data() + nHotBlock, size - nHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });
    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      scatterAddGenericKernel<T, Index, NumTLP, NumILP>
          <<<dim3(tv::launch::DivUp(size, NumTLP),
                  tv::launch::DivUp(numPlanes, NumTLP)),
             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              outFeatures.data(), buffer.data(), indices.data(), size,
              numPlanes);
      TV_CHECK_CUDA_ERR();
    }
  }
};
}  // namespace functor

#define DECLARE_GPU_SPECS_T_INDEX(T, Index)                         \
  template struct functor::SparseGatherFunctor<tv::GPU, T, Index>;  \
  template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;

#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);

DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);

#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX

}  // namespace spconv
mmdet3d/ops/spconv/structure.py
deleted
100644 → 0
View file @
9c7270d0
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch


def scatter_nd(indices, updates, shape):
    """PyTorch edition of TensorFlow's scatter_nd.

    This function contains no error handling, so use it carefully: when an
    index repeats, only one of the updates is kept; the repeated-index
    accumulation that TensorFlow supports is not implemented here.
    """
    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
    ndim = indices.shape[-1]
    output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
    flatted_indices = indices.view(-1, ndim)
    slices = [flatted_indices[:, i] for i in range(ndim)]
    slices += [Ellipsis]
    ret[slices] = updates.view(*output_shape)
    return ret


class SparseConvTensor(object):

    def __init__(self, features, indices, spatial_shape, batch_size,
                 grid=None):
        """
        Args:
            grid: pre-allocated grid tensor.
                should be used when the volume of the spatial shape
                is very large.
        """
        self.features = features
        self.indices = indices
        if self.indices.dtype != torch.int32:
            self.indices = self.indices.int()
        self.spatial_shape = spatial_shape
        self.batch_size = batch_size
        self.indice_dict = {}
        self.grid = grid

    @property
    def spatial_size(self):
        return np.prod(self.spatial_shape)

    def find_indice_pair(self, key):
        if key is None:
            return None
        if key in self.indice_dict:
            return self.indice_dict[key]
        return None

    def dense(self, channels_first=True):
        output_shape = [self.batch_size] + list(
            self.spatial_shape) + [self.features.shape[1]]
        res = scatter_nd(self.indices.long(), self.features, output_shape)
        if not channels_first:
            return res
        ndim = len(self.spatial_shape)
        trans_params = list(range(0, ndim + 1))
        trans_params.insert(1, ndim + 1)
        return res.permute(*trans_params).contiguous()

    @property
    def sparity(self):
        return (self.indices.shape[0] / np.prod(self.spatial_shape) /
                self.batch_size)
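The `dense()` round trip above is easiest to see on a tiny input; the following is a minimal sketch (shapes and values made up for illustration, import path assumed from this file's location) of building a `SparseConvTensor` from per-voxel features plus `[batch_idx, z, y, x]` indices and converting it back to a dense grid:

# Illustrative sketch only, not part of the original file.
import torch

from mmdet3d.ops.spconv.structure import SparseConvTensor

# Two active voxels in a batch of size 1 on a 4x4x4 grid (made-up numbers).
features = torch.tensor([[1.0, 2.0], [3.0, 4.0]])            # [N, C]
indices = torch.tensor([[0, 0, 1, 2], [0, 3, 3, 3]],          # [N, 1 + ndim]
                       dtype=torch.int32)                     # (batch, z, y, x)
x = SparseConvTensor(features, indices, [4, 4, 4], batch_size=1)

dense = x.dense()  # scatter_nd then permute -> [1, 2, 4, 4, 4], channels first
print(dense.shape, x.sparity)  # sparity = 2 active voxels / (4*4*4 * 1 batch)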
mmdet3d/ops/spconv/test_utils.py
deleted
100644 → 0
View file @
9c7270d0
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import numpy as np


class TestCase(unittest.TestCase):

    def _GetNdArray(self, a):
        if not isinstance(a, np.ndarray):
            a = np.array(a)
        return a

    def assertAllEqual(self, a, b):
        """Asserts that two numpy arrays have the same values.

        Args:
            a: the expected numpy ndarray or anything can be converted to one.
            b: the actual numpy ndarray or anything can be converted to one.
        """
        a = self._GetNdArray(a)
        b = self._GetNdArray(b)
        self.assertEqual(a.shape, b.shape,
                         'Shape mismatch: expected %s, got %s.' %
                         (a.shape, b.shape))
        same = (a == b)

        if a.dtype == np.float32 or a.dtype == np.float64:
            same = np.logical_or(same,
                                 np.logical_and(np.isnan(a), np.isnan(b)))
        if not np.all(same):
            # Prints more details than np.testing.assert_array_equal.
            diff = np.logical_not(same)
            if a.ndim:
                x = a[np.where(diff)]
                y = b[np.where(diff)]
                print('not equal where = ', np.where(diff))
            else:
                # np.where is broken for scalars
                x, y = a, b
            print('not equal lhs = ', x)
            print('not equal rhs = ', y)
            np.testing.assert_array_equal(a, b)

    def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
        """Asserts that two numpy arrays, or dicts of same, have near values.

        This does not support nested dicts.

        Args:
            a: The expected numpy ndarray (or anything can be converted to
                one), or dict of same. Must be a dict iff `b` is a dict.
            b: The actual numpy ndarray (or anything can be converted to
                one), or dict of same. Must be a dict iff `a` is a dict.
            rtol: relative tolerance.
            atol: absolute tolerance.

        Raises:
            ValueError: if only one of `a` and `b` is a dict.
        """
        is_a_dict = isinstance(a, dict)
        if is_a_dict != isinstance(b, dict):
            raise ValueError("Can't compare dict to non-dict, %s vs %s." %
                             (a, b))
        if is_a_dict:
            self.assertCountEqual(
                a.keys(),
                b.keys(),
                msg='mismatched keys, expected %s, got %s' %
                (a.keys(), b.keys()))
            for k in a:
                self._assertArrayLikeAllClose(
                    a[k],
                    b[k],
                    rtol=rtol,
                    atol=atol,
                    msg='%s: expected %s, got %s.' % (k, a, b))
        else:
            self._assertArrayLikeAllClose(a, b, rtol=rtol, atol=atol)

    def _assertArrayLikeAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
        a = self._GetNdArray(a)
        b = self._GetNdArray(b)
        self.assertEqual(a.shape, b.shape,
                         'Shape mismatch: expected %s, got %s.' %
                         (a.shape, b.shape))
        if not np.allclose(a, b, rtol=rtol, atol=atol):
            # Prints more details than np.testing.assert_allclose.
            #
            # NOTE: numpy.allclose (and numpy.testing.assert_allclose)
            # checks whether two arrays are element-wise equal within a
            # tolerance. The relative difference (rtol * abs(b)) and the
            # absolute difference atol are added together to compare against
            # the absolute difference between a and b. Here, we want to
            # print out which elements violate such conditions.
            cond = np.logical_or(
                np.abs(a - b) > atol + rtol * np.abs(b),
                np.isnan(a) != np.isnan(b))
            if a.ndim:
                x = a[np.where(cond)]
                y = b[np.where(cond)]
                print('not close where = ', np.where(cond))
            else:
                # np.where is broken for scalars
                x, y = a, b
            print('not close lhs = ', x)
            print('not close rhs = ', y)
            print('not close dif = ', np.abs(x - y))
            print('not close tol = ', atol + rtol * np.abs(y))
            print('dtype = %s, shape = %s' % (a.dtype, a.shape))
            np.testing.assert_allclose(
                a, b, rtol=rtol, atol=atol, err_msg=msg)


def params_grid(*params):
    size = len(params)
    length = 1
    for p in params:
        length *= len(p)
    sizes = [len(p) for p in params]
    counter = [0] * size
    total = []
    for i in range(length):
        total.append([0] * size)
    for i in range(length):
        for j in range(size):
            total[i][j] = params[j][counter[j]]
        counter[size - 1] += 1
        for c in range(size - 1, -1, -1):
            if (counter[c] == sizes[c] and c > 0):
                counter[c - 1] += 1
                counter[c] = 0
    return total


def generate_sparse_data(shape,
                         num_points,
                         num_channels,
                         integer=False,
                         data_range=(-1, 1),
                         with_dense=True,
                         dtype=np.float32):
    dense_shape = shape
    ndim = len(dense_shape)
    # num_points = np.random.randint(10, 100, size=[batch_size, ndim])
    num_points = np.array(num_points)
    # num_points = np.array([3, 2])
    batch_size = len(num_points)
    batch_indices = []
    coors_total = np.stack(
        np.meshgrid(*[np.arange(0, s) for s in shape]), axis=-1)
    coors_total = coors_total.reshape(-1, ndim)
    for i in range(batch_size):
        np.random.shuffle(coors_total)
        inds_total = coors_total[:num_points[i]]
        inds_total = np.pad(
            inds_total, ((0, 0), (0, 1)), mode='constant', constant_values=i)
        batch_indices.append(inds_total)
    if integer:
        sparse_data = np.random.randint(
            data_range[0], data_range[1],
            size=[num_points.sum(), num_channels]).astype(dtype)
    else:
        sparse_data = np.random.uniform(
            data_range[0], data_range[1],
            size=[num_points.sum(), num_channels]).astype(dtype)

    res = {
        'features': sparse_data.astype(dtype),
    }
    if with_dense:
        dense_data = np.zeros([batch_size, num_channels, *dense_shape],
                              dtype=sparse_data.dtype)
        start = 0
        for i, inds in enumerate(batch_indices):
            for j, ind in enumerate(inds):
                dense_slice = (i, slice(None), *ind[:-1])
                dense_data[dense_slice] = sparse_data[start + j]
            start += len(inds)
        res['features_dense'] = dense_data.astype(dtype)
    batch_indices = np.concatenate(batch_indices, axis=0)
    res['indices'] = batch_indices.astype(np.int32)
    return res
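The two helpers above are easiest to read from a call site. A minimal sketch of how a test might combine them, with made-up shapes and parameter values and an import path assumed from this file's location:

# Illustrative sketch only, not part of the original file.
from mmdet3d.ops.spconv.test_utils import generate_sparse_data, params_grid

# Cartesian product of test parameters: 2 * 2 = 4 combinations.
for kernel_size, stride in params_grid([2, 3], [1, 2]):
    # One batch item with 20 active sites on a 16x16x16 grid, 4 channels.
    data = generate_sparse_data([16, 16, 16], [20], 4)
    features = data['features']        # (20, 4) float32
    indices = data['indices']          # (20, 4) int32, last column = batch id
    dense = data['features_dense']     # (1, 4, 16, 16, 16)
    assert dense.shape == (1, 4, 16, 16, 16)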
mmdet3d/ops/voxel/__init__.py
deleted
100644 → 0
View file @
9c7270d0
# Copyright (c) OpenMMLab. All rights reserved.
from .scatter_points import DynamicScatter, dynamic_scatter
from .voxelize import Voxelization, voxelization

__all__ = ['Voxelization', 'voxelization', 'dynamic_scatter', 'DynamicScatter']
mmdet3d/ops/voxel/scatter_points.py
deleted
100644 → 0
View file @
9c7270d0
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import nn
from torch.autograd import Function

from .voxel_layer import (dynamic_point_to_voxel_backward,
                          dynamic_point_to_voxel_forward)


class _dynamic_scatter(Function):

    @staticmethod
    def forward(ctx, feats, coors, reduce_type='max'):
        """Convert KITTI points (N, >=3) to voxels.

        Args:
            feats: [N, C] float tensor. point features to be reduced
                into voxels.
            coors: [N, ndim] int tensor. corresponding voxel coordinates
                (specifically, the multi-dim voxel index) of each point.
            reduce_type: str. reduce op. supports 'max', 'sum' and 'mean'.

        Returns:
            tuple
            voxel_feats: [M, C] float tensor. reduced features. input
                features that share the same voxel coordinates are reduced
                to one row.
            coordinates: [M, ndim] int tensor, voxel coordinates.
        """
        results = dynamic_point_to_voxel_forward(feats, coors, reduce_type)
        (voxel_feats, voxel_coors, point2voxel_map,
         voxel_points_count) = results
        ctx.reduce_type = reduce_type
        ctx.save_for_backward(feats, voxel_feats, point2voxel_map,
                              voxel_points_count)
        ctx.mark_non_differentiable(voxel_coors)
        return voxel_feats, voxel_coors

    @staticmethod
    def backward(ctx, grad_voxel_feats, grad_voxel_coors=None):
        (feats, voxel_feats, point2voxel_map,
         voxel_points_count) = ctx.saved_tensors
        grad_feats = torch.zeros_like(feats)
        # TODO: whether to use index put or use cuda_backward
        # To use index put, need point to voxel index
        dynamic_point_to_voxel_backward(grad_feats,
                                        grad_voxel_feats.contiguous(), feats,
                                        voxel_feats, point2voxel_map,
                                        voxel_points_count, ctx.reduce_type)
        return grad_feats, None, None


dynamic_scatter = _dynamic_scatter.apply


class DynamicScatter(nn.Module):

    def __init__(self, voxel_size, point_cloud_range, average_points: bool):
        super(DynamicScatter, self).__init__()
        """Scatters points into voxels, used in the voxel encoder with
           dynamic voxelization.

        **Note**: The CPU and GPU implementations produce the same output up
        to a small numerical difference after summation and division
        (e.g., 5e-7).

        Args:
            voxel_size (list): list [x, y, z] size of three dimensions.
            point_cloud_range (list):
                [x_min, y_min, z_min, x_max, y_max, z_max]
            average_points (bool): whether to use avg pooling to scatter
                points into voxels.
        """
        self.voxel_size = voxel_size
        self.point_cloud_range = point_cloud_range
        self.average_points = average_points

    def forward_single(self, points, coors):
        reduce = 'mean' if self.average_points else 'max'
        return dynamic_scatter(points.contiguous(), coors.contiguous(),
                               reduce)

    def forward(self, points, coors):
        """
        Args:
            input: NC points
        """
        if coors.size(-1) == 3:
            return self.forward_single(points, coors)
        else:
            batch_size = coors[-1, 0] + 1
            voxels, voxel_coors = [], []
            for i in range(batch_size):
                inds = torch.where(coors[:, 0] == i)
                voxel, voxel_coor = self.forward_single(
                    points[inds], coors[inds][:, 1:])
                coor_pad = nn.functional.pad(
                    voxel_coor, (1, 0), mode='constant', value=i)
                voxel_coors.append(coor_pad)
                voxels.append(voxel)
            features = torch.cat(voxels, dim=0)
            feature_coors = torch.cat(voxel_coors, dim=0)
            return features, feature_coors

    def __repr__(self):
        tmpstr = self.__class__.__name__ + '('
        tmpstr += 'voxel_size=' + str(self.voxel_size)
        tmpstr += ', point_cloud_range=' + str(self.point_cloud_range)
        tmpstr += ', average_points=' + str(self.average_points)
        tmpstr += ')'
        return tmpstr
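As a quick illustration of the module above, here is a minimal sketch of dynamic scatter on already-computed voxel coordinates. The voxel size, range, and point values are made up; `coors` carries a leading batch column, and the tensors must live on a CUDA device with the voxel extension built:

# Illustrative sketch only, not part of the original file.
import torch

from mmdet3d.ops.voxel import DynamicScatter

# Made-up example: 5 points with 3 features, already mapped to voxel indices.
points = torch.rand(5, 3, device='cuda')
coors = torch.tensor(                        # (batch_idx, z, y, x) per point
    [[0, 1, 2, 3], [0, 1, 2, 3], [0, 0, 0, 1], [0, 0, 0, 1], [0, 2, 1, 0]],
    dtype=torch.int32, device='cuda')

scatter = DynamicScatter(
    voxel_size=[0.2, 0.2, 4],
    point_cloud_range=[0, -40, -3, 70.4, 40, 1],
    average_points=True)                     # mean-reduce points per voxel
voxel_feats, voxel_coors = scatter(points, coors)
print(voxel_feats.shape, voxel_coors.shape)  # 3 unique voxels -> (3, 3), (3, 4)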
mmdet3d/ops/voxel/src/scatter_points_cpu.cpp
deleted
100644 → 0
View file @
9c7270d0
#include <ATen/TensorUtils.h>
#include <torch/extension.h>
// #include "voxelization.h"
namespace {

template <typename T_int>
void determin_max_points_kernel(
    torch::TensorAccessor<T_int, 2> coor,
    torch::TensorAccessor<T_int, 1> point_to_voxelidx,
    torch::TensorAccessor<T_int, 1> num_points_per_voxel,
    torch::TensorAccessor<T_int, 3> coor_to_voxelidx, int &voxel_num,
    int &max_points, const int num_points) {
  int voxelidx, num;
  for (int i = 0; i < num_points; ++i) {
    if (coor[i][0] == -1) continue;
    voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
    // record voxel
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      voxel_num += 1;
      coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx;
    }
    // put points into voxel
    num = num_points_per_voxel[voxelidx];
    point_to_voxelidx[i] = num;
    num_points_per_voxel[voxelidx] += 1;
    // update max points per voxel
    max_points = std::max(max_points, num + 1);
  }
  return;
}

template <typename T, typename T_int>
void scatter_point_to_voxel_kernel(
    const torch::TensorAccessor<T, 2> points,
    torch::TensorAccessor<T_int, 2> coor,
    torch::TensorAccessor<T_int, 1> point_to_voxelidx,
    torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
    torch::TensorAccessor<T, 3> voxels,
    torch::TensorAccessor<T_int, 2> voxel_coors, const int num_features,
    const int num_points, const int NDim) {
  for (int i = 0; i < num_points; ++i) {
    int num = point_to_voxelidx[i];
    int voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]];
    for (int k = 0; k < num_features; ++k) {
      voxels[voxelidx][num][k] = points[i][k];
    }
    for (int k = 0; k < NDim; ++k) {
      voxel_coors[voxelidx][k] = coor[i][k];
    }
  }
}

}  // namespace

namespace voxelization {

std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
    const at::Tensor &points, const at::Tensor &voxel_mapping,
    const std::vector<float> voxel_size,
    const std::vector<float> coors_range) {
  // current version tooks about 0.02s_0.03s for one frame on cpu
  // check device
  AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor");

  const int NDim = voxel_mapping.size(1);
  const int num_points = points.size(0);
  const int num_features = points.size(1);

  std::vector<int> grid_size(NDim);
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }

  at::Tensor num_points_per_voxel =
      at::zeros({num_points, }, voxel_mapping.options());
  at::Tensor coor_to_voxelidx = -at::ones(
      {grid_size[2], grid_size[1], grid_size[0]}, voxel_mapping.options());
  at::Tensor point_to_voxelidx =
      -at::ones({num_points, }, voxel_mapping.options());

  int voxel_num = 0;
  int max_points = 0;
  AT_DISPATCH_ALL_TYPES(
      voxel_mapping.scalar_type(), "determin_max_point", [&] {
        determin_max_points_kernel<scalar_t>(
            voxel_mapping.accessor<scalar_t, 2>(),
            point_to_voxelidx.accessor<scalar_t, 1>(),
            num_points_per_voxel.accessor<scalar_t, 1>(),
            coor_to_voxelidx.accessor<scalar_t, 3>(), voxel_num, max_points,
            num_points);
      });

  at::Tensor voxels =
      at::zeros({voxel_num, max_points, num_features}, points.options());
  at::Tensor voxel_coors =
      at::zeros({voxel_num, NDim}, points.options().dtype(at::kInt));
  AT_DISPATCH_ALL_TYPES(points.scalar_type(), "scatter_point_to_voxel", [&] {
    scatter_point_to_voxel_kernel<scalar_t, int>(
        points.accessor<scalar_t, 2>(), voxel_mapping.accessor<int, 2>(),
        point_to_voxelidx.accessor<int, 1>(),
        coor_to_voxelidx.accessor<int, 3>(), voxels.accessor<scalar_t, 3>(),
        voxel_coors.accessor<int, 2>(), num_features, num_points, NDim);
  });

  at::Tensor num_points_per_voxel_out =
      num_points_per_voxel.slice(/*dim=*/0, /*start=*/0, /*end=*/voxel_num);
  return {voxels, voxel_coors, num_points_per_voxel_out};
}

}  // namespace voxelization
mmdet3d/ops/voxel/src/scatter_points_cuda.cu
deleted
100644 → 0
View file @
9c7270d0
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/types.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
typedef
enum
{
SUM
=
0
,
MEAN
=
1
,
MAX
=
2
}
reduce_t
;
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
namespace
{
int
const
threadsPerBlock
=
512
;
int
const
maxGridDim
=
50000
;
}
// namespace
__device__
__forceinline__
static
void
reduceMax
(
float
*
address
,
float
val
)
{
int
*
address_as_i
=
reinterpret_cast
<
int
*>
(
address
);
int
old
=
*
address_as_i
,
assumed
;
do
{
assumed
=
old
;
old
=
atomicCAS
(
address_as_i
,
assumed
,
__float_as_int
(
fmaxf
(
val
,
__int_as_float
(
assumed
))));
}
while
(
assumed
!=
old
||
__int_as_float
(
old
)
<
val
);
}
__device__ __forceinline__ static void reduceMax(double *address, double val) {
  unsigned long long *address_as_ull =
      reinterpret_cast<unsigned long long *>(address);
  unsigned long long old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(
        address_as_ull, assumed,
        __double_as_longlong(fmax(val, __longlong_as_double(assumed))));
  } while (assumed != old || __longlong_as_double(old) < val);
}
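// Host-side analogue (illustration only, not used by the kernels in this
// file): the CAS loops in reduceMax implement an "atomic max" -- retry the
// compare-and-swap until the stored value is at least `val`. With std::atomic
// the same idea reads as follows (the helper name is hypothetical):
#include <atomic>
static inline void atomic_max_host(std::atomic<float> *address, float val) {
  float old = address->load();
  // On failure, compare_exchange_weak reloads `old` with the current value,
  // so the loop exits as soon as the stored value is >= val.
  while (old < val && !address->compare_exchange_weak(old, val)) {
  }
}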
// get rid of meaningless warnings when compiling host code
#ifdef __CUDA_ARCH__
__device__ __forceinline__ static void reduceAdd(float *address, float val) {
#if (__CUDA_ARCH__ < 200)
#warning \
    "compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32"
  int *address_as_i = reinterpret_cast<int *>(address);
  int old = *address_as_i, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_i, assumed,
                    __float_as_int(val + __int_as_float(assumed)));
  } while (assumed != old);
#else
  atomicAdd(address, val);
#endif
}
__device__ __forceinline__ static void reduceAdd(double *address, double val) {
#if (__CUDA_ARCH__ < 600)
#warning \
    "compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64"
  unsigned long long *address_as_ull =
      reinterpret_cast<unsigned long long *>(address);
  unsigned long long old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
#else
  atomicAdd(address, val);
#endif
}
#endif
template <typename T>
__global__ void feats_reduce_kernel(
    const T *feats, const int32_t *coors_map,
    T *reduced_feats,  // shall be 0 at initialization
    const int num_input, const int num_feats, const reduce_t reduce_type) {
  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
       x += gridDim.x * blockDim.x) {
    int32_t reduce_to = coors_map[x];
    if (reduce_to == -1) continue;

    const T *feats_offset = feats + x * num_feats;
    T *reduced_feats_offset = reduced_feats + reduce_to * num_feats;
    if (reduce_type == reduce_t::MAX) {
      for (int i = 0; i < num_feats; i++) {
        reduceMax(&reduced_feats_offset[i], feats_offset[i]);
      }
    } else {
      for (int i = 0; i < num_feats; i++) {
        reduceAdd(&reduced_feats_offset[i], feats_offset[i]);
      }
    }
  }
}
template <typename T>
__global__ void add_reduce_traceback_grad_kernel(
    T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map,
    const int32_t *reduce_count, const int num_input, const int num_feats,
    const reduce_t reduce_type) {
  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
       x += gridDim.x * blockDim.x) {
    int32_t reduce_to = coors_map[x];
    if (reduce_to == -1) {
      continue;
    }

    const int input_offset = x * num_feats;
    T *grad_feats_offset = grad_feats + input_offset;
    const int reduced_offset = reduce_to * num_feats;
    const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;

    if (reduce_type == reduce_t::SUM) {
      for (int i = 0; i < num_feats; i++) {
        grad_feats_offset[i] = grad_reduced_feats_offset[i];
      }
    } else if (reduce_type == reduce_t::MEAN) {
      for (int i = 0; i < num_feats; i++) {
        grad_feats_offset[i] = grad_reduced_feats_offset[i] /
                               static_cast<T>(reduce_count[reduce_to]);
      }
    }
  }
}
template <typename T>
__global__ void max_reduce_traceback_scatter_idx_kernel(
    const T *feats, const T *reduced_feats, int32_t *reduce_from,
    const int32_t *coors_map, const int num_input, const int num_feats) {
  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_input;
       x += gridDim.x * blockDim.x) {
    int32_t reduce_to = coors_map[x];

    const int input_offset = x * num_feats;
    const T *feats_offset = feats + input_offset;

    if (reduce_to == -1) {
      continue;
    }

    const int reduced_offset = reduce_to * num_feats;
    const T *reduced_feats_offset = reduced_feats + reduced_offset;
    int32_t *reduce_from_offset = reduce_from + reduced_offset;

    for (int i = 0; i < num_feats; i++) {
      if (feats_offset[i] == reduced_feats_offset[i]) {
        atomicMin(&reduce_from_offset[i], static_cast<int32_t>(x));
      }
    }
  }
}
template <typename T>
__global__ void max_reduce_scatter_grad_kernel(T *grad_feats,
                                               const T *grad_reduced_feats,
                                               const int32_t *reduce_from,
                                               const int num_reduced,
                                               const int num_feats) {
  for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < num_reduced;
       x += gridDim.x * blockDim.x) {
    const int reduced_offset = x * num_feats;
    const int32_t *scatter_to_offset = reduce_from + reduced_offset;
    const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;

    for (int i = 0; i < num_feats; i++) {
      grad_feats[scatter_to_offset[i] * num_feats + i] =
          grad_reduced_feats_offset[i];
    }
  }
}
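// Illustration (not part of the original file) of how the two MAX-path kernels
// cooperate: the traceback kernel records, per reduced feature, the smallest
// input row whose value equals the reduced maximum, and the scatter kernel
// then routes the reduced gradient only to that row. The same logic for one
// voxel with a single feature, written as plain host code (names are
// hypothetical):
static inline void max_traceback_sketch() {
  const float feats[2] = {1.f, 3.f};  // two input rows reduced into one voxel
  const float reduced = 3.f;          // forward result of the max reduction
  int reduce_from = 2;                // sentinel: num_input
  for (int x = 0; x < 2; ++x)
    if (feats[x] == reduced && x < reduce_from) reduce_from = x;  // -> row 1
  float grad_feats[2] = {0.f, 0.f};
  grad_feats[reduce_from] = 1.f;  // only the argmax row receives the gradient
  (void)grad_feats;
}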
namespace voxelization {

std::vector<at::Tensor> dynamic_point_to_voxel_forward_gpu(
    const at::Tensor &feats, const at::Tensor &coors,
    const reduce_t reduce_type) {
  CHECK_INPUT(feats);
  CHECK_INPUT(coors);

  const int num_input = feats.size(0);
  const int num_feats = feats.size(1);

  if (num_input == 0)
    return {feats.clone().detach(), coors.clone().detach(),
            coors.new_empty({0}, torch::kInt32),
            coors.new_empty({0}, torch::kInt32)};

  at::Tensor out_coors;
  at::Tensor coors_map;
  at::Tensor reduce_count;

  auto coors_clean = coors.masked_fill(coors.lt(0).any(-1, true), -1);

  std::tie(out_coors, coors_map, reduce_count) =
      at::unique_dim(coors_clean, 0, true, true, true);

  if (out_coors.index({0, 0}).lt(0).item<bool>()) {
    // the first element of out_coors is (-1,-1,-1) and should be removed
    out_coors = out_coors.slice(0, 1);
    reduce_count = reduce_count.slice(0, 1);
    coors_map = coors_map - 1;
  }

  coors_map = coors_map.to(torch::kInt32);
  reduce_count = reduce_count.to(torch::kInt32);

  auto reduced_feats =
      at::empty({out_coors.size(0), num_feats}, feats.options());

  AT_DISPATCH_FLOATING_TYPES(
      feats.scalar_type(), "feats_reduce_kernel", ([&] {
        if (reduce_type == reduce_t::MAX)
          reduced_feats.fill_(-std::numeric_limits<scalar_t>::infinity());
        else
          reduced_feats.fill_(static_cast<scalar_t>(0));

        dim3 blocks(std::min(
            at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
        dim3 threads(threadsPerBlock);
        feats_reduce_kernel<<<blocks, threads>>>(
            feats.data_ptr<scalar_t>(), coors_map.data_ptr<int32_t>(),
            reduced_feats.data_ptr<scalar_t>(), num_input, num_feats,
            reduce_type);
        if (reduce_type == reduce_t::MEAN)
          reduced_feats /= reduce_count.unsqueeze(-1).to(reduced_feats.dtype());
      }));
  AT_CUDA_CHECK(cudaGetLastError());

  return {reduced_feats, out_coors, coors_map, reduce_count};
}
void dynamic_point_to_voxel_backward_gpu(
    at::Tensor &grad_feats, const at::Tensor &grad_reduced_feats,
    const at::Tensor &feats, const at::Tensor &reduced_feats,
    const at::Tensor &coors_map, const at::Tensor &reduce_count,
    const reduce_t reduce_type) {
  CHECK_INPUT(grad_feats);
  CHECK_INPUT(grad_reduced_feats);
  CHECK_INPUT(feats);
  CHECK_INPUT(reduced_feats);
  CHECK_INPUT(coors_map);
  CHECK_INPUT(reduce_count);

  const int num_input = feats.size(0);
  const int num_reduced = reduced_feats.size(0);
  const int num_feats = feats.size(1);

  grad_feats.fill_(0);
  // copy voxel grad to points
  if (num_input == 0 || num_reduced == 0) return;

  if (reduce_type == reduce_t::MEAN || reduce_type == reduce_t::SUM) {
    AT_DISPATCH_FLOATING_TYPES(
        grad_reduced_feats.scalar_type(), "add_reduce_traceback_grad_kernel",
        ([&] {
          dim3 blocks(std::min(
              at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
          dim3 threads(threadsPerBlock);
          add_reduce_traceback_grad_kernel<<<blocks, threads>>>(
              grad_feats.data_ptr<scalar_t>(),
              grad_reduced_feats.data_ptr<scalar_t>(),
              coors_map.data_ptr<int32_t>(), reduce_count.data_ptr<int32_t>(),
              num_input, num_feats, reduce_type);
        }));
    AT_CUDA_CHECK(cudaGetLastError());
  } else {
    auto reduce_from = at::full({num_reduced, num_feats}, num_input,
                                coors_map.options().dtype(torch::kInt32));
    AT_DISPATCH_FLOATING_TYPES(
        grad_reduced_feats.scalar_type(),
        "max_reduce_traceback_scatter_idx_kernel", ([&] {
          dim3 blocks(std::min(
              at::cuda::ATenCeilDiv(num_input, threadsPerBlock), maxGridDim));
          dim3 threads(threadsPerBlock);
          max_reduce_traceback_scatter_idx_kernel<<<blocks, threads>>>(
              feats.data_ptr<scalar_t>(), reduced_feats.data_ptr<scalar_t>(),
              reduce_from.data_ptr<int32_t>(), coors_map.data_ptr<int32_t>(),
              num_input, num_feats);
        }));
    AT_CUDA_CHECK(cudaGetLastError());

    AT_DISPATCH_FLOATING_TYPES(
        grad_reduced_feats.scalar_type(),
        "max_reduce_traceback_scatter_idx_kernel", ([&] {
          dim3 blocks(std::min(
              at::cuda::ATenCeilDiv(num_reduced, threadsPerBlock), maxGridDim));
          dim3 threads(threadsPerBlock);
          max_reduce_scatter_grad_kernel<<<blocks, threads>>>(
              grad_feats.data_ptr<scalar_t>(),
              grad_reduced_feats.data_ptr<scalar_t>(),
              reduce_from.data_ptr<int32_t>(), num_reduced, num_feats);
        }));
    AT_CUDA_CHECK(cudaGetLastError());
  }
  return;
}

}  // namespace voxelization
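The coordinate deduplication at the heart of dynamic_point_to_voxel_forward_gpu is plain at::unique_dim bookkeeping: out-of-range rows are clamped to (-1, -1, -1), the unique rows become the output voxels, the inverse indices become coors_map, and the counts become reduce_count. A small illustrative libtorch sketch of that step (not part of this commit; the toy values are assumptions):

#include <torch/torch.h>
#include <iostream>

int main() {
  // three points: two share voxel (0, 0, 0), one is out of range (negative coordinate)
  auto coors =
      torch::tensor({0, 0, 0, -1, 2, 3, 0, 0, 0}, torch::kLong).view({3, 3});
  auto clean = coors.masked_fill(coors.lt(0).any(-1, true), -1);
  at::Tensor out_coors, coors_map, reduce_count;
  std::tie(out_coors, coors_map, reduce_count) =
      at::unique_dim(clean, 0, /*sorted=*/true, /*return_inverse=*/true,
                     /*return_counts=*/true);
  // out_coors    -> [[-1, -1, -1], [0, 0, 0]]  (the op drops the first row)
  // coors_map    -> [1, 0, 1]                  (then shifted down by one)
  // reduce_count -> [1, 2]
  std::cout << out_coors << "\n" << coors_map << "\n" << reduce_count << std::endl;
  return 0;
}

Because unique_dim sorts the rows, an all-negative row always ends up first, which is why the forward pass only has to check out_coors[0][0] before slicing it off and shifting coors_map.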
mmdet3d/ops/voxel/src/voxelization.cpp
deleted
100644 → 0
View file @
9c7270d0
#include <torch/extension.h>
#include "voxelization.h"
namespace voxelization {

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("hard_voxelize", &hard_voxelize, "hard voxelize");
  m.def("dynamic_voxelize", &dynamic_voxelize, "dynamic voxelization");
  m.def("dynamic_point_to_voxel_forward", &dynamic_point_to_voxel_forward,
        "dynamic point to voxel forward");
  m.def("dynamic_point_to_voxel_backward", &dynamic_point_to_voxel_backward,
        "dynamic point to voxel backward");
}

}  // namespace voxelization