OpenDAS / mmdetection3d / Commits / d1aac35d

Commit d1aac35d, authored Apr 14, 2020 by zhangwenwei

Initial commit

Changes: 214
Showing 20 changed files with 4191 additions and 0 deletions (+4191, -0)
mmdet3d/ops/spconv/include/spconv/point2voxel.h            +414   -0
mmdet3d/ops/spconv/include/spconv/pool_ops.h               +97    -0
mmdet3d/ops/spconv/include/spconv/reordering.cu.h          +161   -0
mmdet3d/ops/spconv/include/spconv/reordering.h             +40    -0
mmdet3d/ops/spconv/include/spconv/spconv_ops.h             +561   -0
mmdet3d/ops/spconv/include/tensorview/helper_kernel.cu.h   +81    -0
mmdet3d/ops/spconv/include/tensorview/helper_launch.h      +21    -0
mmdet3d/ops/spconv/include/tensorview/tensorview.h         +1144  -0
mmdet3d/ops/spconv/include/torch_utils.h                   +70    -0
mmdet3d/ops/spconv/include/utility/timer.h                 +54    -0
mmdet3d/ops/spconv/modules.py                              +205   -0
mmdet3d/ops/spconv/ops.py                                  +183   -0
mmdet3d/ops/spconv/pool.py                                 +85    -0
mmdet3d/ops/spconv/src/all.cc                              +51    -0
mmdet3d/ops/spconv/src/indice.cc                           +89    -0
mmdet3d/ops/spconv/src/indice_cuda.cu                      +158   -0
mmdet3d/ops/spconv/src/maxpool.cc                          +82    -0
mmdet3d/ops/spconv/src/maxpool_cuda.cu                     +471   -0
mmdet3d/ops/spconv/src/reordering.cc                       +69    -0
mmdet3d/ops/spconv/src/reordering_cuda.cu                  +155   -0
mmdet3d/ops/spconv/include/spconv/point2voxel.h (new file, mode 100644)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
// must include pybind11/stl.h if using containers in STL in arguments.
#include <algorithm>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
// #include <vector>
#include <iostream>
#include <math.h>
namespace spconv {
namespace py = pybind11;
using namespace pybind11::literals;

template <typename DType, int NDim>
int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
                          py::array_t<int> coors,
                          py::array_t<int> num_points_per_voxel,
                          py::array_t<int> coor_to_voxelidx,
                          std::vector<DType> voxel_size,
                          std::vector<DType> coors_range, int max_points,
                          int max_voxels) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto voxels_rw = voxels.template mutable_unchecked<3>();
  auto coors_rw = coors.mutable_unchecked<2>();
  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
  auto N = points_rw.shape(0);
  auto num_features = points_rw.shape(1);
  // auto ndim = points_rw.shape(1) - 1;
  constexpr int ndim_minus_1 = NDim - 1;
  int voxel_num = 0;
  bool failed = false;
  int coor[NDim];
  int c;
  int grid_size[NDim];
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }
  int voxelidx, num;
  for (int i = 0; i < N; ++i) {
    failed = false;
    for (int j = 0; j < NDim; ++j) {
      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
      if ((c < 0 || c >= grid_size[j])) {
        failed = true;
        break;
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (voxel_num >= max_voxels) break;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
        coors_rw(voxelidx, k) = coor[k];
      }
    }
    num = num_points_per_voxel_rw(voxelidx);
    if (num < max_points) {
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(voxelidx, num, k) = points_rw(i, k);
      }
      num_points_per_voxel_rw(voxelidx) += 1;
    }
  }
  for (int i = 0; i < voxel_num; ++i) {
    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
  }
  return voxel_num;
}

template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(py::array_t<DType> points,
                               py::array_t<DType> voxels,
                               py::array_t<DType> means, py::array_t<int> coors,
                               py::array_t<int> num_points_per_voxel,
                               py::array_t<int> coor_to_voxelidx,
                               std::vector<DType> voxel_size,
                               std::vector<DType> coors_range, int max_points,
                               int max_voxels) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto means_rw = means.template mutable_unchecked<2>();
  auto voxels_rw = voxels.template mutable_unchecked<3>();
  auto coors_rw = coors.mutable_unchecked<2>();
  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
  auto N = points_rw.shape(0);
  auto num_features = points_rw.shape(1);
  // auto ndim = points_rw.shape(1) - 1;
  constexpr int ndim_minus_1 = NDim - 1;
  int voxel_num = 0;
  bool failed = false;
  int coor[NDim];
  int c;
  int grid_size[NDim];
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }
  int voxelidx, num;
  for (int i = 0; i < N; ++i) {
    failed = false;
    for (int j = 0; j < NDim; ++j) {
      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
      if ((c < 0 || c >= grid_size[j])) {
        failed = true;
        break;
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (voxel_num >= max_voxels) break;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
        coors_rw(voxelidx, k) = coor[k];
      }
    }
    num = num_points_per_voxel_rw(voxelidx);
    if (num < max_points) {
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(voxelidx, num, k) = points_rw(i, k);
      }
      num_points_per_voxel_rw(voxelidx) += 1;
      for (int k = 0; k < num_features; ++k) {
        means_rw(voxelidx, k) +=
            (points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
      }
    }
  }
  for (int i = 0; i < voxel_num; ++i) {
    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
    num = num_points_per_voxel_rw(i);
    for (int j = num; j < max_points; ++j) {
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(i, j, k) = means_rw(i, k);
      }
    }
  }
  return voxel_num;
}

template <typename DType, int NDim>
int points_to_voxel_3d_np_height(
    py::array_t<DType> points, py::array_t<DType> voxels,
    py::array_t<DType> height, py::array_t<DType> maxs, py::array_t<int> coors,
    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
    std::vector<DType> voxel_size, std::vector<DType> coors_range,
    int max_points, int max_voxels) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto height_rw = height.template mutable_unchecked<2>();
  auto maxs_rw = maxs.template mutable_unchecked<2>();
  auto voxels_rw = voxels.template mutable_unchecked<3>();
  auto coors_rw = coors.mutable_unchecked<2>();
  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
  auto N = points_rw.shape(0);
  auto num_features = points_rw.shape(1);
  // auto ndim = points_rw.shape(1) - 1;
  constexpr int ndim_minus_1 = NDim - 1;
  int voxel_num = 0;
  bool failed = false;
  int coor[NDim];
  int c;
  int grid_size[NDim];
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }
  int voxelidx, num;
  for (int i = 0; i < N; ++i) {
    failed = false;
    for (int j = 0; j < NDim; ++j) {
      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
      if ((c < 0 || c >= grid_size[j])) {
        failed = true;
        break;
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (voxel_num >= max_voxels) break;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
        coors_rw(voxelidx, k) = coor[k];
      }
    }
    num = num_points_per_voxel_rw(voxelidx);
    if (num < max_points) {
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(voxelidx, num, k) = points_rw(i, k);
        height_rw(voxelidx, k) =
            std::min(points_rw(i, k), height_rw(voxelidx, k));
        maxs_rw(voxelidx, k) = std::max(points_rw(i, k), maxs_rw(voxelidx, k));
      }
      num_points_per_voxel_rw(voxelidx) += 1;
    }
  }
  for (int i = 0; i < voxel_num; ++i) {
    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
    for (int k = 0; k < num_features; ++k) {
      height_rw(i, k) = maxs_rw(i, k) - height_rw(i, k);
    }
  }
  return voxel_num;
}

template <typename DType, int NDim>
int block_filtering(py::array_t<DType> points, py::array_t<int> mask,
                    py::array_t<DType> height, py::array_t<DType> maxs,
                    py::array_t<int> coor_to_voxelidx,
                    std::vector<DType> voxel_size,
                    std::vector<DType> coors_range, int max_voxels,
                    DType eps) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto height_rw = height.template mutable_unchecked<1>();
  auto maxs_rw = maxs.template mutable_unchecked<1>();
  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
  auto N = points_rw.shape(0);
  auto num_features = points_rw.shape(1);
  // auto ndim = points_rw.shape(1) - 1;
  constexpr int ndim_minus_1 = NDim - 1;
  int voxel_num = 0;
  bool failed = false;
  int coor[NDim];
  int c;
  int grid_size[NDim];
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }
  int voxelidx, num;
  for (int i = 0; i < N; ++i) {
    failed = false;
    for (int j = 0; j < NDim; ++j) {
      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
      if ((c < 0 || c >= grid_size[j])) {
        failed = true;
        break;
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
    }
    height_rw(voxelidx) = std::min(points_rw(i, 2), height_rw(voxelidx));
    maxs_rw(voxelidx) = std::max(points_rw(i, 2), maxs_rw(voxelidx));
  }
  for (int i = 0; i < N; ++i) {
    failed = false;
    for (int j = 0; j < NDim; ++j) {
      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
      if ((c < 0 || c >= grid_size[j])) {
        failed = true;
        break;
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if ((maxs_rw(voxelidx) - height_rw(voxelidx, 2)) < eps) {
      mask(i) = 0;
    }
  }
}

template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(
    py::array_t<DType> points, py::array_t<DType> voxels,
    py::array_t<int> voxel_mask, py::array_t<DType> mins,
    py::array_t<DType> maxs, py::array_t<int> coors,
    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
    std::vector<DType> voxel_size, std::vector<DType> coors_range,
    int max_points, int max_voxels, int block_factor, int block_size,
    DType height_threshold) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto mins_rw = mins.template mutable_unchecked<2>();
  auto maxs_rw = maxs.template mutable_unchecked<2>();
  auto voxels_rw = voxels.template mutable_unchecked<3>();
  auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
  auto coors_rw = coors.mutable_unchecked<2>();
  auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
  auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
  auto N = points_rw.shape(0);
  auto num_features = points_rw.shape(1);
  // auto ndim = points_rw.shape(1) - 1;
  constexpr int ndim_minus_1 = NDim - 1;
  int voxel_num = 0;
  bool failed = false;
  int coor[NDim];
  int c;
  int grid_size[NDim];
  DType max_value, min_value;
  for (int i = 0; i < NDim; ++i) {
    grid_size[i] =
        round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
  }
  int block_shape_H = grid_size[1] / block_factor;
  int block_shape_W = grid_size[0] / block_factor;
  int voxelidx, num;
  int block_coor[2];
  int startx, stopx, starty, stopy;
  for (int i = 0; i < N; ++i) {
    failed = false;
    for (int j = 0; j < NDim; ++j) {
      c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
      if ((c < 0 || c >= grid_size[j])) {
        failed = true;
        break;
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (voxel_num >= max_voxels) break;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
        coors_rw(voxelidx, k) = coor[k];
      }
    }
    num = num_points_per_voxel_rw(voxelidx);
    if (num < max_points) {
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(voxelidx, num, k) = points_rw(i, k);
      }
      block_coor[0] = coor[1] / block_factor;
      block_coor[1] = coor[2] / block_factor;
      mins_rw(block_coor[0], block_coor[1]) =
          std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
      maxs_rw(block_coor[0], block_coor[1]) =
          std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
      num_points_per_voxel_rw(voxelidx) += 1;
    }
  }
  for (int i = 0; i < voxel_num; ++i) {
    coor[1] = coors_rw(i, 1);
    coor[2] = coors_rw(i, 2);
    coor_to_voxelidx_rw(coors_rw(i, 0), coor[1], coor[2]) = -1;
    block_coor[0] = coor[1] / block_factor;
    block_coor[1] = coor[2] / block_factor;
    min_value = mins_rw(block_coor[0], block_coor[1]);
    max_value = maxs_rw(block_coor[0], block_coor[1]);
    startx = std::max(0, block_coor[0] - block_size / 2);
    stopx =
        std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);
    starty = std::max(0, block_coor[1] - block_size / 2);
    stopy =
        std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);
    for (int j = startx; j < stopx; ++j) {
      for (int k = starty; k < stopy; ++k) {
        min_value = std::min(min_value, mins_rw(j, k));
        max_value = std::max(max_value, maxs_rw(j, k));
      }
    }
    voxel_mask_rw(i) = (max_value - min_value) > height_threshold;
  }
  return voxel_num;
}

} // namespace spconv
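All four voxelization routines above share the same per-point binning: shift each coordinate by the range minimum, divide by the voxel size, floor it, reject points that fall outside the grid, and store the result in reversed axis order (x, y, z becomes z, y, x). The following standalone sketch shows just that mapping; the voxel sizes, range, and example point are made up for illustration and are not part of the commit.

#include <cmath>
#include <cstdio>
#include <vector>

// Minimal sketch of the per-point binning used in points_to_voxel_3d_np.
// coors_range holds [xmin, ymin, zmin, xmax, ymax, zmax].
int main() {
  const int NDim = 3;
  std::vector<float> voxel_size = {0.05f, 0.05f, 0.1f};
  std::vector<float> coors_range = {0.f, -40.f, -3.f, 70.4f, 40.f, 1.f};
  float point[NDim] = {12.3f, 4.5f, -1.2f};  // example x, y, z

  int grid_size[NDim];
  for (int i = 0; i < NDim; ++i)
    grid_size[i] =
        std::round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);

  int coor[NDim];
  bool failed = false;
  for (int j = 0; j < NDim; ++j) {
    int c = std::floor((point[j] - coors_range[j]) / voxel_size[j]);
    if (c < 0 || c >= grid_size[j]) { failed = true; break; }
    coor[NDim - 1 - j] = c;  // store reversed, so coor is (z, y, x)
  }
  if (!failed)
    std::printf("voxel coordinate (z, y, x) = (%d, %d, %d)\n",
                coor[0], coor[1], coor[2]);
  return 0;
}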
mmdet3d/ops/spconv/include/spconv/pool_ops.h (new file, mode 100644)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_POOL_OP_H_
#define SPARSE_POOL_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/maxpool.h>
#include <torch/script.h>
#include <torch_utils.h>
#include <utility/timer.h>
namespace spconv {

template <typename T>
torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
                            torch::Tensor indiceNum, int64_t numAct) {
  auto device = features.device().type();
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
  double totalTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data<int>()[i];
    if (nHot <= 0) {
      continue;
    }
    // auto timer = spconv::CudaContextTimer<>();
    if (device == torch::kCPU) {
      functor::SparseMaxPoolForwardFunctor<tv::CPU, T, int> forwardFtor;
      forwardFtor(tv::CPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(features),
                  tv::torch2tv<const int>(indicePairs).subview(i), nHot);
    } else {
      functor::SparseMaxPoolForwardFunctor<tv::GPU, T, int> forwardFtor;
      forwardFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(features),
                  tv::torch2tv<const int>(indicePairs).subview(i), nHot);
      TV_CHECK_CUDA_ERR();
    }
    // totalTime += timer.report() / 1000.0;
  }
  // std::cout << "maxpool forward time " << totalTime << std::endl;
  return output;
}

template <typename T>
torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
                                    torch::Tensor outFeatures,
                                    torch::Tensor outGrad,
                                    torch::Tensor indicePairs,
                                    torch::Tensor indiceNum) {
  auto device = features.device().type();
  auto numInPlanes = features.size(1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
  auto kernelVolume = indicePairs.size(0);
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data<int>()[i];
    if (nHot <= 0) {
      continue;
    }
    if (device == torch::kCPU) {
      functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, int> backwardFtor;
      backwardFtor(tv::CPU(), tv::torch2tv<const T>(outFeatures),
                   tv::torch2tv<const T>(features),
                   tv::torch2tv<const T>(outGrad), tv::torch2tv<T>(inputGrad),
                   tv::torch2tv<const int>(indicePairs).subview(i), nHot);
    } else {
      functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, int> backwardFtor;
      backwardFtor(tv::TorchGPU(), tv::torch2tv<const T>(outFeatures),
                   tv::torch2tv<const T>(features),
                   tv::torch2tv<const T>(outGrad), tv::torch2tv<T>(inputGrad),
                   tv::torch2tv<const int>(indicePairs).subview(i), nHot);
      TV_CHECK_CUDA_ERR();
    }
  }
  return inputGrad;
}

} // namespace spconv
#endif
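indiceMaxPool reduces active input rows into output rows along precomputed (input, output) index pairs, one slice of pairs per kernel offset, with only the first nHot entries of each slice valid. The CPU sketch below shows the core reduction that the forward functor performs per offset; the container types and the tiny example data are illustrative only, and the real op works on a zero-initialized torch::Tensor output rather than nested vectors.

#include <algorithm>
#include <vector>

// Per-offset reduction: out[outIdx[k]][c] = max(out[outIdx[k]][c], in[inIdx[k]][c]).
void maxPoolByPairs(std::vector<std::vector<float>> &out,
                    const std::vector<std::vector<float>> &in,
                    const std::vector<int> &inIdx,
                    const std::vector<int> &outIdx, int nHot) {
  for (int k = 0; k < nHot; ++k)
    for (size_t c = 0; c < in[inIdx[k]].size(); ++c)
      out[outIdx[k]][c] = std::max(out[outIdx[k]][c], in[inIdx[k]][c]);
}

int main() {
  // Two active input rows pooled into one output row (illustrative data).
  std::vector<std::vector<float>> in = {{1.f, 5.f}, {3.f, 2.f}};
  std::vector<std::vector<float>> out(1, std::vector<float>(2, 0.f));
  maxPoolByPairs(out, in, /*inIdx=*/{0, 1}, /*outIdx=*/{0, 0}, /*nHot=*/2);
  // out[0] is now {3, 5}.
  return 0;
}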
mmdet3d/ops/spconv/include/spconv/reordering.cu.h (new file, mode 100644)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#include <tensorview/helper_kernel.cu.h>
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
namespace spconv {

template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void gatherGenericKernel(T *buffer, const T *features,
                                    const Index *indices, int size,
                                    int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size)
          buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
              features[inds[ilp] + iy];
      }
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void gatherVecKernel(T *buffer, const T *features,
                                const Index *indices, int size,
                                int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size)
          reinterpret_cast<VecType *>(
              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
              reinterpret_cast<const VecType *>(features)[inds[ilp] + iy];
      }
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void gatherVecBlockKernel(T *buffer, const T *features,
                                     const Index *indices, int size,
                                     int numPlanes) {
  int ILPStrideY[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  features += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;
  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      reinterpret_cast<VecType *>(
          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =
          reinterpret_cast<const VecType *>(
              features)[indices[iy + ILPStrideY[ilp]] * numPlanes +
                        threadIdx.x];
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddGenericKernel(T *outFeatures, const T *buffer,
                                        const Index *indices, int size,
                                        int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size) {
          outFeatures[inds[ilp] + iy] +=
              buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];
        }
      }
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
                                         const Index *indices, int size,
                                         int numPlanes) {
  int ILPStrideY[NumILP];
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  outFeatures += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;
  T buf[vecloadFactor];
  T buf2[vecloadFactor];
  Index idx;
  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      reinterpret_cast<VecType *>(buf)[0] =
          reinterpret_cast<VecType *>(outFeatures)[idx];
      reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x];
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        buf[i] += buf2[i];
      }
      reinterpret_cast<VecType *>(outFeatures)[idx] =
          reinterpret_cast<VecType *>(buf)[0];
    }
  }
}

} // namespace spconv
#endif
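The kernels in this header implement two data movements used by the sparse convolution: gather (copy feature rows into a dense buffer in the order given by an index list) and scatter-add (accumulate buffer rows back into the rows named by an index list). The ILP and vectorized variants only change how that work is distributed over threads; a host-side reference of the plain semantics (a sketch only, with container types chosen for brevity) is:

#include <vector>

// Gather: buffer[k] = features[indices[k]] for k in [0, size).
void gather(std::vector<std::vector<float>> &buffer,
            const std::vector<std::vector<float>> &features,
            const std::vector<int> &indices, int size) {
  for (int k = 0; k < size; ++k) buffer[k] = features[indices[k]];
}

// Scatter-add: outFeatures[indices[k]] += buffer[k] for k in [0, size).
void scatterAdd(std::vector<std::vector<float>> &outFeatures,
                const std::vector<std::vector<float>> &buffer,
                const std::vector<int> &indices, int size) {
  for (int k = 0; k < size; ++k)
    for (size_t c = 0; c < buffer[k].size(); ++c)
      outFeatures[indices[k]][c] += buffer[k][c];
}

int main() {
  std::vector<std::vector<float>> features = {{1.f}, {2.f}, {3.f}};
  std::vector<std::vector<float>> buffer(2, std::vector<float>(1, 0.f));
  std::vector<std::vector<float>> out(3, std::vector<float>(1, 0.f));
  gather(buffer, features, {2, 0}, 2);  // buffer = {{3}, {1}}
  scatterAdd(out, buffer, {1, 1}, 2);   // out[1] = 3 + 1 = 4
  return 0;
}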
mmdet3d/ops/spconv/include/spconv/reordering.h (new file, mode 100644)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv {
namespace functor {

template <typename Device, typename T, typename Index>
struct SparseGatherFunctor {
  void operator()(const Device &d, tv::TensorView<T> buffer,
                  tv::TensorView<const T> features,
                  tv::TensorView<const Index> indices, int size);
};

template <typename Device, typename T, typename Index>
struct SparseScatterAddFunctor {
  void operator()(const Device &d, tv::TensorView<T> out_features,
                  tv::TensorView<const T> buffer,
                  tv::TensorView<const Index> indices, int size,
                  bool stable = false);
};

} // namespace functor
} // namespace spconv
#endif
mmdet3d/ops/spconv/include/spconv/spconv_ops.h (new file, mode 100644)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_OP_H_
#define SPARSE_CONV_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <torch/script.h>
#include <torch_utils.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.

template <unsigned NDim>
std::vector<torch::Tensor>
getIndicePair(torch::Tensor indices, int64_t batchSize,
              std::vector<int64_t> outSpatialShape,
              std::vector<int64_t> spatialShape,
              std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
              std::vector<int64_t> padding, std::vector<int64_t> dilation,
              std::vector<int64_t> outPadding, int64_t _subM,
              int64_t _transpose) {
  // auto timer = spconv::CudaContextTimer<>();
  bool subM = _subM != 0;
  bool transpose = _transpose != 0;
  auto numAct = indices.size(0);
  auto coorDim = indices.size(1) - 1;
  // batchIdx + xyz
  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
  auto kernelVolume = kernelSize[0];
  for (int i = 1; i < kernelSize.size(); ++i) {
    kernelVolume *= kernelSize[i];
  }
  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
  auto outputVolume = outSpatialShape[0];
  for (int i = 1; i < outSpatialShape.size(); ++i) {
    outputVolume *= outSpatialShape[i];
  }
  torch::Tensor indicePairs =
      torch::full({kernelVolume, 2, numAct}, -1,
                  torch::dtype(torch::kInt32).device(indices.device()));
  torch::Tensor indiceNum = torch::zeros(
      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
  torch::Tensor gridOut =
      torch::full({batchSize * outputVolume}, -1,
                  torch::dtype(torch::kInt32).device(indices.device()));
  // std::cout << "full time " << timer.report() / 1000.0 << std::endl;
  int64_t numActOut = -1;
  tv::SimpleVector<int, NDim> outSpatialShape32;
  tv::SimpleVector<int, NDim> kernelSize32;
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> padding32;
  tv::SimpleVector<int, NDim> dilation32;
  auto indicePairUnique = torch::full(
      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
      torch::dtype(torch::kInt32).device(indices.device()));
  for (int i = 0; i < NDim; ++i) {
    outSpatialShape32.push_back(outSpatialShape[i]);
    kernelSize32.push_back(kernelSize[i]);
    if (subM) {
      stride32.push_back(1);
      padding32.push_back(kernelSize[i] / 2);
      dilation32.push_back(dilation[i]);
    } else {
      stride32.push_back(stride[i]);
      padding32.push_back(padding[i]);
      dilation32.push_back(dilation[i]);
    }
  }
  if (subM) {
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
    } else {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
    }
    return {indices, indicePairs, indiceNum};
  } else {
    torch::Tensor outInds =
        torch::zeros({numAct * kernelVolume, coorDim + 1},
                     torch::dtype(torch::kInt32).device(indices.device()));
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
          transpose);
    } else {
      auto getIndicePairFtorP1 =
          functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
      auto getIndicePairFtorP2 =
          functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtorP1(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
          padding32, dilation32, outSpatialShape32, transpose);
      if (numActOut > 0) {
        auto res = torch::_unique(indicePairUnique);
        indicePairUnique = std::get<0>(res);
        numActOut = getIndicePairFtorP2(
            tv::TorchGPU(), tv::torch2tv<const int>(indices),
            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose);
      }
    }
    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
  }
}

template <unsigned NDim>
std::vector<torch::Tensor> getIndicePairPreGrid(
    torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
    std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
    std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
    std::vector<int64_t> padding, std::vector<int64_t> dilation,
    std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
  // auto timer = spconv::CudaContextTimer<>();
  bool subM = _subM != 0;
  bool transpose = _transpose != 0;
  auto numAct = indices.size(0);
  auto coorDim = indices.size(1) - 1;
  // batchIdx + xyz
  TV_ASSERT_RT_ERR(NDim == coorDim, "error");
  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
  auto kernelVolume = kernelSize[0];
  for (int i = 1; i < kernelSize.size(); ++i) {
    kernelVolume *= kernelSize[i];
  }
  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
  auto outputVolume = outSpatialShape[0];
  for (int i = 1; i < outSpatialShape.size(); ++i) {
    outputVolume *= outSpatialShape[i];
  }
  TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
  torch::Tensor indicePairs =
      torch::full({kernelVolume, 2, numAct}, -1,
                  torch::dtype(torch::kInt32).device(indices.device()));
  torch::Tensor indiceNum = torch::zeros(
      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
  // std::cout << "full time " << timer.report() / 1000.0 << std::endl;
  int64_t numActOut = -1;
  tv::SimpleVector<int, NDim> outSpatialShape32;
  tv::SimpleVector<int, NDim> kernelSize32;
  tv::SimpleVector<int, NDim> stride32;
  tv::SimpleVector<int, NDim> padding32;
  tv::SimpleVector<int, NDim> dilation32;
  auto indicePairUnique = torch::full(
      {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
      torch::dtype(torch::kInt32).device(indices.device()));
  for (int i = 0; i < NDim; ++i) {
    outSpatialShape32.push_back(outSpatialShape[i]);
    kernelSize32.push_back(kernelSize[i]);
    if (subM) {
      stride32.push_back(1);
      padding32.push_back(kernelSize[i] / 2);
      dilation32.push_back(dilation[i]);
    } else {
      stride32.push_back(stride[i]);
      padding32.push_back(padding[i]);
      dilation32.push_back(dilation[i]);
    }
  }
  if (subM) {
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose);
      gridOut.fill_(-1);
    } else {
      auto getIndicePairFtor =
          functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
          tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
          dilation32, outSpatialShape32, transpose, true);
    }
    return {indices, indicePairs, indiceNum};
  } else {
    torch::Tensor outInds =
        torch::zeros({numAct * kernelVolume, coorDim + 1},
                     torch::dtype(torch::kInt32).device(indices.device()));
    if (indices.device().type() == torch::kCPU) {
      auto getIndicePairFtor =
          functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
      numActOut = getIndicePairFtor(
          tv::CPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
          transpose, true);
      gridOut.fill_(-1);
    } else {
      auto getIndicePairFtorP1 =
          functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
      auto getIndicePairFtorP2 =
          functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
      numActOut = getIndicePairFtorP1(
          tv::TorchGPU(), tv::torch2tv<const int>(indices),
          tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
          tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
          tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
          padding32, dilation32, outSpatialShape32, transpose);
      if (numActOut > 0) {
        auto res = torch::_unique(indicePairUnique);
        indicePairUnique = std::get<0>(res);
        numActOut = getIndicePairFtorP2(
            tv::TorchGPU(), tv::torch2tv<const int>(indices),
            tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
            tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
            tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
            true);
      }
    }
    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
  }
}

template <typename T>
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
                         torch::Tensor indicePairs, torch::Tensor indiceNum,
                         int64_t numActOut, int64_t _inverse, int64_t _subM) {
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumCpu.data<int>(),
                       indicePairNumCpu.data<int>() + kernelVolume);
  int indicePairMaxOffset =
      indicePairMaxSizeIter - indicePairNumCpu.data<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;
  /*if (_subM){
    std::vector<int> indicePairNumVec(indicePairNumCpu.data<int>(), indicePairNumCpu.data<int>() + kernelVolume);
    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
    auto indicePairVecMaxSizeIter = std::max_element(
        indicePairNumVec.begin(), indicePairNumVec.end());
    indicePairMaxSize = *indicePairVecMaxSizeIter;
  }*/
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  // auto indicePairOptions =
  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  if (subM) {
    // the center index of subm conv don't need gather and scatter
    // add.
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
  }
  double totalGatherTime = 0;
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    // auto timer = spconv::CudaContextTimer<>();
    auto outputBufferBlob = torch::from_blob(outputBuffer.data<T>(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob =
        torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);
    if (device == torch::kCPU) {
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
    } else {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
      TV_CHECK_CUDA_ERR();
      /* slower than SparseGatherFunctor, may due to int->long conversion
      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
      auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(), {nHot},
                                             indicePairOptions);
      torch::index_select_out(inputBufferBlob, features, 0,
                              indicePairBlob);*/
    }
    // totalGatherTime += timer.report() / 1000.0;
    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
    // totalGEMMTime += timer.report() / 1000.0;
    if (device == torch::kCPU) {
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
    } else {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
      TV_CHECK_CUDA_ERR();
    }
    // totalSAddTime += timer.report() / 1000.0;
  }
  // std::cout << "gather time " << totalGatherTime << std::endl;
  // std::cout << "gemm time " << totalGEMMTime << std::endl;
  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
  return output;
}

template <typename T>
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
                   torch::Tensor outGrad, torch::Tensor indicePairs,
                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumCpu.data<int>(),
                       indicePairNumCpu.data<int>() + kernelVolume);
  int indicePairMaxOffset =
      indicePairMaxSizeIter - indicePairNumCpu.data<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  auto filterShape = filters.sizes();
  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
  torch::Tensor inputBuffer =
      torch::zeros({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
  if (subM) {
    auto filterGradSub = filtersGrad[indicePairMaxOffset];
    torch::mm_out(filterGradSub, features.t(), outGrad);
    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
  }
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    if (device == torch::kCPU) {
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
      gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
                    tv::torch2tv<const T>(outGrad),
                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                    nHot);
    } else {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
      TV_CHECK_CUDA_ERR();
      gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
                    tv::torch2tv<const T>(outGrad),
                    tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                    nHot);
      TV_CHECK_CUDA_ERR();
    }
    auto filterGradSub = filtersGrad[i];
    auto outputBufferBlob = torch::from_blob(outputBuffer.data<T>(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob =
        torch::from_blob(inputBuffer.data<T>(), {nHot, numInPlanes}, options);
    torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
    torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
    if (device == torch::kCPU) {
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
                  tv::torch2tv<const T>(inputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                  nHot);
    } else {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
                  tv::torch2tv<const T>(inputBuffer),
                  tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                  nHot);
      TV_CHECK_CUDA_ERR();
    }
  }
  return {inputGrad, filtersGrad.view(filterShape)};
}

template <typename T>
torch::Tensor indiceConvDevelopDontUse(torch::Tensor features,
                                       torch::Tensor filters,
                                       torch::Tensor indicePairs,
                                       torch::Tensor indiceNum,
                                       int64_t numActOut, int64_t _inverse,
                                       int64_t _subM) {
  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto kernelVolume = indicePairs.size(0);
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});
  auto totalActsTen = indicePairNumCpu.sum();
  auto totalActs = indicePairNumCpu.data<int>()[0];
  auto indicePairMaxSizeIter =
      std::max_element(indicePairNumCpu.data<int>(),
                       indicePairNumCpu.data<int>() + kernelVolume);
  int indicePairMaxOffset =
      indicePairMaxSizeIter - indicePairNumCpu.data<int>();
  int indicePairMaxSize = *indicePairMaxSizeIter;
  std::vector<int> indicePairNumVec(
      indicePairNumCpu.data<int>(),
      indicePairNumCpu.data<int>() + kernelVolume);
  indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
  int subRuleMaxSize =
      *std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
  if (subM) {
    indicePairMaxSize = subRuleMaxSize;
  }
  auto timer = spconv::CudaContextTimer<>();
  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  // auto indicePairOptions =
  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
  torch::Tensor inputBuffer =
      torch::zeros({kernelVolume, indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});
  std::cout << "create time " << timer.report() / 1000.0 << std::endl;
  if (subM) {
    // the center index of subm conv don't need gather and scatter
    // add.
    torch::mm_out(output, features, filters[indicePairMaxOffset]);
  }
  double totalGatherTime = 0;
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
  // auto timer = spconv::CudaContextTimer<>();
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    //
    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
                                            {nHot, numInPlanes}, options);
    if (device == torch::kCPU) {
      functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
      gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
    } else {
      functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
      gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
                 tv::torch2tv<const T>(features),
                 tv::torch2tv<const int>(indicePairs).subview(i, inverse),
                 nHot);
      TV_CHECK_CUDA_ERR();
    }
    // }
    // for (int i = 0; i < kernelVolume; ++i) {
    //   totalGatherTime += timer.report() / 1000.0;
    //   auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
    //   {nHot, numOutPlanes}, options);
  }
  // totalGatherTime += timer.report() / 1000.0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
                                            {nHot, numInPlanes}, options);
    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
  }
  // totalGEMMTime += timer.report() / 1000.0;
  // totalGEMMTime += timer.report() / 1000.0;
  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    auto outputBufferBlob = torch::from_blob(outputBuffer[i].data<T>(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob = torch::from_blob(inputBuffer[i].data<T>(),
                                            {nHot, numInPlanes}, options);
    if (device == torch::kCPU) {
      functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
      scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBufferBlob),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
    } else {
      functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
      scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
                  tv::torch2tv<const T>(outputBufferBlob),
                  tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
                  nHot, true);
      TV_CHECK_CUDA_ERR();
    }
    // totalSAddTime += timer.report() / 1000.0;
  }
  // totalSAddTime += timer.report() / 1000.0;
  // std::cout << "gather time " << totalGatherTime << std::endl;
  // std::cout << "gemm time " << totalGEMMTime << std::endl;
  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
  return output;
}

} // namespace spconv
#endif
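Both indiceConv and its backward pass run the same per-offset pipeline over the indice pairs produced by getIndicePair: gather the nHot active input rows into a buffer, multiply the buffer by that offset's filter slice, and scatter-add the result into the output rows. The dense reference below is a sketch of that structure only, with hand-rolled matrices instead of torch::Tensor and illustrative names; it is not the code the header dispatches to.

#include <vector>

using Mat = std::vector<std::vector<float>>;

// One kernel-offset step of the gather -> GEMM -> scatter-add pipeline.
// inIdx/outIdx hold the first nHot entries of one indice-pair slice.
void convOffsetStep(Mat &output, const Mat &features, const Mat &filter,
                    const std::vector<int> &inIdx,
                    const std::vector<int> &outIdx, int nHot) {
  const int inPlanes = filter.size(), outPlanes = filter[0].size();
  Mat inBuf(nHot, std::vector<float>(inPlanes, 0.f));
  Mat outBuf(nHot, std::vector<float>(outPlanes, 0.f));
  for (int k = 0; k < nHot; ++k)  // gather active input rows
    inBuf[k] = features[inIdx[k]];
  for (int k = 0; k < nHot; ++k)  // GEMM: outBuf = inBuf * filter
    for (int o = 0; o < outPlanes; ++o)
      for (int c = 0; c < inPlanes; ++c)
        outBuf[k][o] += inBuf[k][c] * filter[c][o];
  for (int k = 0; k < nHot; ++k)  // scatter-add into the output rows
    for (int o = 0; o < outPlanes; ++o)
      output[outIdx[k]][o] += outBuf[k][o];
}

int main() {
  Mat features = {{1.f, 0.f}, {0.f, 2.f}};  // 2 active sites, 2 input planes
  Mat filter = {{1.f}, {1.f}};              // 2 input planes -> 1 output plane
  Mat output(2, std::vector<float>(1, 0.f));
  convOffsetStep(output, features, filter, {0, 1}, {1, 0}, 2);
  // output is now {{2}, {1}}.
  return 0;
}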
mmdet3d/ops/spconv/include/tensorview/helper_kernel.cu.h (new file, mode 100644)
#pragma once
// from tensorflow
namespace tv {
namespace detail {

template <typename T> class KernelLoop {
  struct Iterator {
    __forceinline__ __device__ Iterator(T index, T delta)
        : index_(index), delta_(delta) {}
    __forceinline__ __device__ T operator*() const { return index_; }
    __forceinline__ __device__ Iterator &operator++() {
      index_ += delta_;
      return *this;
    }
    __forceinline__ __device__ bool operator!=(const Iterator &other) const {
      bool greater = index_ > other.index_;
      bool less = index_ < other.index_;
      // Anything past an end iterator (delta_ == 0) is equal.
      // In range-based for loops, this optimizes to 'return less'.
      if (!other.delta_) {
        return less;
      }
      if (!delta_) {
        return greater;
      }
      return less || greater;
    }

  private:
    T index_;
    const T delta_;
  };

public:
  __forceinline__ __device__ KernelLoop(T begin, T delta, T end)
      : begin_(begin), delta_(delta), end_(end) {}
  __forceinline__ __device__ Iterator begin() const {
    return Iterator{begin_, delta_};
  }
  __forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }

private:
  T begin_;
  T delta_;
  T end_;
};

} // namespace detail

template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count) {
  return detail::KernelLoop<T>(blockIdx.x * blockDim.x + threadIdx.x,
                               gridDim.x * blockDim.x * NumILP, count);
}

// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count) {
  return detail::KernelLoop<T>(blockIdx.y * blockDim.y + threadIdx.y,
                               gridDim.y * blockDim.y * NumILP, count);
}

// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count) {
  return detail::KernelLoop<T>(blockIdx.z * blockDim.z + threadIdx.z,
                               gridDim.z * blockDim.z * NumILP, count);
}

} // namespace tv
mmdet3d/ops/spconv/include/tensorview/helper_launch.h (new file, mode 100644)
#pragma once
// from pytorch.aten
#include "tensorview.h"
namespace tv {
namespace launch {

template <typename T1, typename T2> inline int DivUp(const T1 a, const T2 b) {
  return (a + b - 1) / b;
}

// Use 1024 threads per block, which requires cuda sm_2x or above
constexpr int CUDA_NUM_THREADS = 1024;

// CUDA: number of blocks for threads.
inline int getBlocks(const int N) {
  TV_ASSERT_RT_ERR(N > 0,
                   "CUDA kernel launch blocks must be positive, but got N=",
                   N);
  return DivUp(N, CUDA_NUM_THREADS);
}

} // namespace launch
} // namespace tv
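getBlocks is a ceiling division of the element count by the fixed 1024-thread block size, so every element gets a thread and at most one partially filled block is launched. A quick standalone check of that arithmetic (plain C++ mirroring DivUp rather than including this header):

#include <cstdio>

constexpr int kCudaNumThreads = 1024;  // same constant as CUDA_NUM_THREADS above

// Ceiling division, as in tv::launch::DivUp.
int divUp(int a, int b) { return (a + b - 1) / b; }

int main() {
  for (int n : {1, 1024, 1025, 5000}) {
    std::printf("N=%d -> %d block(s) of %d threads\n", n,
                divUp(n, kCudaNumThreads), kCudaNumThreads);
  }
  return 0;
}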
mmdet3d/ops/spconv/include/tensorview/tensorview.h (new file, mode 100644; listing truncated below)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <cuda_runtime_api.h>
#include <iostream>
#include <memory>
// #include <prettyprint.h>
#include <sstream>
#include <type_traits>
#include <vector>
namespace tv {
#ifdef __NVCC__
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#define TV_ASSERT(expr) assert(expr)
#elif defined(__CUDACC_RTC__)
#define TV_ASSERT(expr) assert(expr)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#else
#define TV_ASSERT(x) assert(x)
#define TV_HOST_DEVICE_INLINE inline
#define TV_HOST_DEVICE
#endif
#define TV_REQUIRE(expr, ...) \
{ \
if (!(expr)) { \
printf(__VA_ARGS__); \
assert(expr); \
} \
}
#define TV_DEVICE_REQUIRE(expr, ...) \
{ \
if (!(expr) && threadIdx.x == 0) \
printf(__VA_ARGS__); \
assert(expr); \
}
template <class SStream, class T>
void sstream_print(SStream &ss, T val) {
  ss << val;
}

template <class SStream, class T, class... TArgs>
void sstream_print(SStream &ss, T val, TArgs... args) {
  ss << val << " ";
  sstream_print(ss, args...);
}
#define TV_ASSERT_RT_ERR(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert faild. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
} \
}
#define TV_ASSERT_INVALID_ARG(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert faild. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::invalid_argument(__macro_s.str()); \
} \
}
#define TV_CHECK_CUDA_ERR() \
{ \
auto err = cudaGetLastError(); \
if (err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << err; \
throw std::runtime_error(__macro_s.str()); \
} \
}
struct GPU {
  GPU(cudaStream_t s = 0) : mStream(s) {}
  virtual cudaStream_t getStream() const { return mStream; }
  cudaStream_t mStream = 0;
};

struct CPU {};
#define TV_MAX_DIM 6
/*
template <typename T>
constexpr size_t calc_align(size_t ndim)
{
if (ndim * sizeof(T) == 1)
return 1;
else if (ndim * sizeof(T) == 2)
return 2;
else if (ndim * sizeof(T) <= 4 && ndim * sizeof(T) > 2)
return 4;
else if (ndim * sizeof(T) <= 8 && ndim * sizeof(T) > 4)
return 8;
else if (ndim * sizeof(T) <= 16 && ndim * sizeof(T) > 8)
return 16;
else if (ndim * sizeof(T) <= 32 && ndim * sizeof(T) > 16)
return 32;
else
return 64;
}
*/
template
<
typename
T
,
size_t
MaxDim
=
TV_MAX_DIM
>
struct
/*alignas(calc_align<T>(MaxDim))*/
SimpleVector
{
public:
TV_HOST_DEVICE_INLINE
SimpleVector
(){};
TV_HOST_DEVICE_INLINE
SimpleVector
(
std
::
initializer_list
<
T
>
q
)
{
TV_ASSERT
(
q
.
size
()
<=
MaxDim
);
mSize
=
0
;
for
(
T
s
:
q
)
{
mArray
[
mSize
++
]
=
s
;
}
mSize
=
q
.
size
();
}
SimpleVector
(
const
std
::
vector
<
T
>
&
arr
)
{
TV_ASSERT
(
arr
.
size
()
<=
MaxDim
);
for
(
size_t
i
=
0
;
i
<
arr
.
size
();
++
i
)
{
mArray
[
i
]
=
arr
[
i
];
}
mSize
=
arr
.
size
();
}
TV_HOST_DEVICE_INLINE
SimpleVector
(
const
SimpleVector
<
T
,
MaxDim
>
&
arr
)
{
TV_ASSERT
(
arr
.
size
()
<=
MaxDim
);
for
(
size_t
i
=
0
;
i
<
arr
.
size
();
++
i
)
{
mArray
[
i
]
=
arr
[
i
];
}
mSize
=
arr
.
size
();
}
TV_HOST_DEVICE_INLINE
T
&
operator
[](
int
idx
)
{
#ifdef TV_DEBUG
TV_ASSERT
(
idx
>=
0
&&
idx
<
mSize
);
#endif
return
mArray
[
idx
];
}
TV_HOST_DEVICE_INLINE
const
T
&
operator
[](
int
idx
)
const
{
#ifdef TV_DEBUG
TV_ASSERT
(
idx
>=
0
&&
idx
<
mSize
);
#endif
return
mArray
[
idx
];
}
TV_HOST_DEVICE_INLINE
void
push_back
(
T
s
)
{
#ifdef TV_DEBUG
TV_ASSERT
(
mSize
<
MaxDim
);
#endif
mArray
[
mSize
]
=
s
;
mSize
++
;
}
TV_HOST_DEVICE_INLINE
void
pop_back
()
{
#ifdef TV_DEBUG
TV_ASSERT
(
mSize
>
0
);
#endif
mSize
--
;
}
TV_HOST_DEVICE_INLINE
size_t
size
()
const
{
return
mSize
;
}
TV_HOST_DEVICE_INLINE
const
T
*
data
()
const
{
return
mArray
;
}
TV_HOST_DEVICE_INLINE
size_t
empty
()
const
{
return
mSize
==
0
;
}
typedef
size_t
size_type
;
class
iterator
{
public:
typedef
iterator
self_type
;
typedef
T
value_type
;
typedef
T
&
reference
;
typedef
T
*
pointer
;
typedef
std
::
forward_iterator_tag
iterator_category
;
typedef
std
::
ptrdiff_t
difference_type
;
TV_HOST_DEVICE_INLINE
iterator
(
pointer
ptr
)
:
ptr_
(
ptr
)
{}
TV_HOST_DEVICE_INLINE
self_type
operator
++
(
int
junk
)
{
self_type
i
=
*
this
;
ptr_
++
;
return
i
;
}
TV_HOST_DEVICE_INLINE
self_type
operator
++
()
{
ptr_
++
;
return
*
this
;
}
TV_HOST_DEVICE_INLINE
reference
operator
*
()
{
return
*
ptr_
;
}
TV_HOST_DEVICE_INLINE
pointer
operator
->
()
{
return
ptr_
;
}
TV_HOST_DEVICE_INLINE
bool
operator
==
(
const
self_type
&
rhs
)
{
return
ptr_
==
rhs
.
ptr_
;
}
TV_HOST_DEVICE_INLINE
bool
operator
!=
(
const
self_type
&
rhs
)
{
return
ptr_
!=
rhs
.
ptr_
;
}
private:
pointer
ptr_
;
};
class
const_iterator
{
public:
typedef
const_iterator
self_type
;
typedef
T
value_type
;
typedef
const
T
&
reference
;
typedef
const
T
*
pointer
;
typedef
std
::
ptrdiff_t
difference_type
;
typedef
std
::
forward_iterator_tag
iterator_category
;
TV_HOST_DEVICE_INLINE
const_iterator
(
pointer
ptr
)
:
ptr_
(
ptr
)
{}
TV_HOST_DEVICE_INLINE
self_type
operator
++
(
int
junk
)
{
self_type
i
=
*
this
;
ptr_
++
;
return
i
;
}
TV_HOST_DEVICE_INLINE
self_type
operator
++
()
{
ptr_
++
;
return
*
this
;
}
TV_HOST_DEVICE_INLINE
reference
operator
*
()
{
return
*
ptr_
;
}
TV_HOST_DEVICE_INLINE
pointer
operator
->
()
{
return
ptr_
;
}
TV_HOST_DEVICE_INLINE
bool
operator
==
(
const
self_type
&
rhs
)
{
return
ptr_
==
rhs
.
ptr_
;
}
TV_HOST_DEVICE_INLINE
bool
operator
!=
(
const
self_type
&
rhs
)
{
return
ptr_
!=
rhs
.
ptr_
;
}
private:
pointer
ptr_
;
};
TV_HOST_DEVICE_INLINE
iterator
begin
()
{
return
iterator
(
mArray
);
}
TV_HOST_DEVICE_INLINE
iterator
end
()
{
return
iterator
(
mArray
+
mSize
);
}
TV_HOST_DEVICE_INLINE
const_iterator
begin
()
const
{
return
const_iterator
(
mArray
);
}
TV_HOST_DEVICE_INLINE
const_iterator
end
()
const
{
return
const_iterator
(
mArray
+
mSize
);
}
TV_HOST_DEVICE_INLINE
const_iterator
cbegin
()
const
{
return
const_iterator
(
mArray
);
}
TV_HOST_DEVICE_INLINE
const_iterator
cend
()
const
{
return
const_iterator
(
mArray
+
mSize
);
}
protected:
T
mArray
[
MaxDim
];
size_t
mSize
=
0
;
};
template <typename T, size_t MaxDim>
bool operator==(const SimpleVector<T, MaxDim> &lfs,
                const SimpleVector<T, MaxDim> &rfs) {
  if (lfs.size() != rfs.size()) return false;
  for (size_t i = 0; i < lfs.size(); ++i) {
    if (lfs[i] != rfs[i]) return false;
  }
  return true;
}

template <typename T, size_t MaxDim>
bool operator!=(const SimpleVector<T, MaxDim> &lfs,
                const SimpleVector<T, MaxDim> &rfs) {
  return !(lfs == rfs);
}

struct Slice {
  template <class... Integers>
  TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
    static_assert(sizeof...(ints) <= 3, "slice init must smaller than 3");
    SimpleVector<int, 3> slices{int(ints)...};
    mSlices[0] = -1;
    mSlices[1] = -1;
    mSlices[2] = -1;
    for (size_t i = 0; i < slices.size(); ++i) {
      mSlices[i] = slices[i];
    }
  }

  TV_HOST_DEVICE_INLINE Slice() {
    mSlices[0] = -1;
    mSlices[1] = -1;
    mSlices[2] = -1;
  }

  template <typename T>
  TV_HOST_DEVICE_INLINE Slice(std::initializer_list<T> slice) {
    mSlices[0] = -1;
    mSlices[1] = -1;
    mSlices[2] = -1;
    TV_ASSERT(slice.size() <= 3);
    int idx = 0;
    for (T s : slice) {
      mSlices[idx] = int(s);
      ++idx;
    }
  }

  TV_HOST_DEVICE_INLINE int &operator[](int idx) {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < 3);
#endif
    return mSlices[idx];
  }

  TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
#ifdef TV_DEBUG
    TV_ASSERT(idx >= 0 && idx < 3);
#endif
    return mSlices[idx];
  }

 protected:
  int mSlices[3];
};

template <size_t MaxDim = TV_MAX_DIM>
struct ShapeBase : public SimpleVector<int, MaxDim> {
  TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>(){};
  TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
      : SimpleVector<int, MaxDim>(shape) {}

  template <typename T, template <class...> class Container>
  ShapeBase(Container<T> shape) : SimpleVector<int, MaxDim>(shape) {}

  TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
      : SimpleVector<int, MaxDim>(shape) {}

  ShapeBase(const std::vector<int> &arr) : SimpleVector<int, MaxDim>(arr) {}

  ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;

  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
#ifdef TV_DEBUG
    TV_ASSERT(start >= 0 && end < this->mSize && end > start);
#endif
    ShapeBase<MaxDim> shape;
    for (int i = start; i < end; ++i) {
      shape.push_back(this->mArray[i]);
    }
    return shape;
  }

  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start) const {
#ifdef TV_DEBUG
    TV_ASSERT(start >= 0 && start <= this->mSize);
#endif
    ShapeBase<MaxDim> shape;
    for (int i = start; i < this->mSize; ++i) {
      shape.push_back(this->mArray[i]);
    }
    return shape;
  }

  TV_HOST_DEVICE_INLINE size_t size() const {
    if (this->mSize == 0) return 0;
    size_t s = 1;
    for (int i = 0; i < int(this->mSize); ++i) {
      s *= this->mArray[i];
    }
    return s;
  }

  TV_HOST_DEVICE_INLINE size_t ndim() const { return this->mSize; }

  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
    ShapeBase<MaxDim> shape;
    for (int i = 0; i < this->mSize; ++i) {
      if (this->mArray[i] != 1) shape.push_back(this->mArray[i]);
    }
    return shape;
  }

  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
    ShapeBase<MaxDim> shape;
    for (int i = 0; i < this->mSize; ++i) {
      if (i != dim || this->mArray[i] != 1) shape.push_back(this->mArray[i]);
    }
    return shape;
  }
};

using Shape = ShapeBase<TV_MAX_DIM>;

template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
                                           Inds... indexes) {
  unsigned offset = 0;
  unsigned m = 1;
  int indexes_vec[sizeof...(indexes)] = {indexes...};
#ifdef TV_DEBUG
  TV_ASSERT(sizeof...(indexes) == shape.size());
#endif
#pragma unroll
  for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
    offset += m * indexes_vec[i];
    m *= shape[i];
  }
  return offset;
}

TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
                                           std::vector<int> &indexes_vec) {
  unsigned offset = 0;
  unsigned m = 1;
  for (int i = shape.size() - 1; i >= 0; --i) {
    offset += m * indexes_vec[i];
    m *= shape[i];
  }
  return offset;
}

template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
                                           Inds... indexes) {
  unsigned offset = 0;
  unsigned m = 1;
  int indexes_vec[sizeof...(indexes)] = {indexes...};
#pragma unroll
  for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
    offset += m * indexes_vec[i];
    m *= shape[i];
  }
  return offset;
}

TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
                                           const Shape &indexes_vec) {
  unsigned offset = 0;
  unsigned m = 1;
  for (int i = indexes_vec.ndim() - 1; i >= 0; --i) {
    offset += m * indexes_vec[i];
    m *= shape[i];
  }
  return offset;
}

template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
                                           const Index *shape) {
  unsigned offset = 0;
  unsigned m = 1;
#pragma unroll
  for (int i = NDim - 1; i >= 0; --i) {
    offset += m * indexes[i];
    m *= shape[i];
  }
  return offset;
}

template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
                                           const Index *shape) {
#pragma unroll
  for (int i = NDim - 1; i >= 0; --i) {
    output[i] = index % shape[i];
    index -= output[i];
    index /= shape[i];
  }
  return index;
}

template <int N>
struct ArrayIndexRowMajor {
  // mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
                                            const Shape &indexes) {
    return indexes[N - 1] +
           shape[N - 1] * ArrayIndexRowMajor<N - 1>::run(shape, indexes);
  }
};

template <>
struct ArrayIndexRowMajor<0> {
  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
                                            const Shape &indexes) {
    return 0;
  }
};

namespace detail {

template <typename T>
constexpr const char *simpleTypeName(T val = T());
template <>
constexpr const char *simpleTypeName(float val) {
  return "float32";
}
template <>
constexpr const char *simpleTypeName(double val) {
  return "float64";
}
template <>
constexpr const char *simpleTypeName(int val) {
  return "int32";
}
template <>
constexpr const char *simpleTypeName(unsigned val) {
  return "uint32";
}
template <>
constexpr const char *simpleTypeName(long val) {
  return "int64";
}
template <>
constexpr const char *simpleTypeName(unsigned long val) {
  return "uint64";
}
};  // namespace detail
template
<
typename
T
,
int
Rank
=
-
1
>
struct
TensorView
{
TV_HOST_DEVICE_INLINE
TensorView
()
{}
explicit
TV_HOST_DEVICE_INLINE
TensorView
(
T
*
ptr
,
Shape
shape
)
:
mPtr
(
ptr
),
mShape
(
shape
)
{}
// explicit TV_HOST_DEVICE_INLINE TensorView(const
// TensorView<std::remove_const_t<T>> &tview) : mPtr(tview.data()),
// mShape(tview.shape()) {}
template
<
class
...
Integers
>
explicit
TV_HOST_DEVICE_INLINE
TensorView
(
T
*
ptr
,
Integers
...
shapes
)
:
mPtr
(
ptr
)
{
mShape
=
{
int
(
shapes
)...};
}
TV_HOST_DEVICE_INLINE
TensorView
<
T
,
Rank
>
&
assign
(
const
TensorView
<
T
,
Rank
>
&
tensor
)
{
TV_REQUIRE
(
tensor
.
shape
()
==
shape
(),
"you must provide same input size%s"
,
"
\n
"
);
T
*
ptr
=
mPtr
;
const
T
*
other_ptr
=
tensor
.
data
();
for
(
size_t
i
=
0
;
i
<
size
();
++
i
)
*
(
ptr
++
)
=
*
(
other_ptr
++
);
return
*
this
;
}
template
<
typename
T1
>
TV_HOST_DEVICE_INLINE
TensorView
<
T
,
Rank
>
&
assign
(
std
::
initializer_list
<
T1
>
seq
)
{
TV_REQUIRE
(
seq
.
size
()
==
size
(),
"you must provide same input size%s"
,
"
\n
"
);
T
*
ptr
=
mPtr
;
for
(
const
T1
&
s
:
seq
)
*
(
ptr
++
)
=
T
(
s
);
return
*
this
;
}
template
<
class
...
Inds
>
TV_HOST_DEVICE_INLINE
T
&
operator
()(
Inds
...
inds
)
{
#ifdef TV_DEBUG
int
idxes
[
sizeof
...(
Inds
)]{
int
(
inds
)...};
TV_REQUIRE
(
sizeof
...(
inds
)
==
mShape
.
ndim
(),
"you provide %d indexes, but dim is %d
\n
"
,
sizeof
...(
inds
),
mShape
.
ndim
());
for
(
int
i
=
0
;
i
<
sizeof
...(
inds
);
++
i
)
{
TV_REQUIRE
(
idxes
[
i
]
>=
0
&&
idxes
[
i
]
<
mShape
[
i
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
i
,
idxes
[
i
],
mShape
[
i
]);
}
#endif
return
mPtr
[
rowArrayIdx
(
mShape
,
int
(
inds
)...)];
}
template
<
class
...
Inds
>
TV_HOST_DEVICE_INLINE
const
T
&
operator
()(
Inds
...
inds
)
const
{
#ifdef TV_DEBUG
int
idxes
[
sizeof
...(
Inds
)]{
int
(
inds
)...};
TV_REQUIRE
(
sizeof
...(
inds
)
==
mShape
.
ndim
(),
"you provide %d indexes, but dim is %d
\n
"
,
sizeof
...(
inds
),
mShape
.
ndim
());
for
(
int
i
=
0
;
i
<
sizeof
...(
inds
);
++
i
)
{
TV_REQUIRE
(
idxes
[
i
]
>=
0
&&
idxes
[
i
]
<
mShape
[
i
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
i
,
idxes
[
i
],
mShape
[
i
]);
}
#endif
return
mPtr
[
rowArrayIdx
(
mShape
,
int
(
inds
)...)];
}
TV_HOST_DEVICE_INLINE
T
&
operator
()()
{
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE
(
mPtr
!=
nullptr
,
"you want get value but the view is empty.%s"
,
"
\n
"
);
TV_DEVICE_REQUIRE
(
mShape
.
ndim
()
==
0
,
"you provide 0 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
#else
TV_REQUIRE
(
mPtr
!=
nullptr
,
"you want get value but the view is empty.%s"
,
"
\n
"
);
TV_REQUIRE
(
mShape
.
ndim
()
==
0
,
"you provide 0 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
#endif
#endif
return
mPtr
[
0
];
}
TV_HOST_DEVICE_INLINE
const
T
&
operator
()()
const
{
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE
(
mPtr
!=
nullptr
,
"you want get value but the view is empty.%s"
,
"
\n
"
);
TV_DEVICE_REQUIRE
(
mShape
.
ndim
()
==
0
,
"you provide 0 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
#else
TV_REQUIRE
(
mPtr
!=
nullptr
,
"you want get value but the view is empty.%s"
,
"
\n
"
);
TV_REQUIRE
(
mShape
.
ndim
()
==
0
,
"you provide 0 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
#endif
#endif
return
mPtr
[
0
];
}
template
<
class
T1
>
TV_HOST_DEVICE_INLINE
T
&
operator
()(
T1
i1
)
{
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE
(
mShape
.
ndim
()
==
1
,
"you provide 1 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_DEVICE_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
i1
,
mShape
[
0
]);
#else
TV_REQUIRE
(
mShape
.
ndim
()
==
1
,
"you provide 1 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
i1
,
mShape
[
0
]);
#endif
#endif
return
mPtr
[
i1
];
}
template
<
class
T1
,
class
T2
>
TV_HOST_DEVICE_INLINE
T
&
operator
()(
T1
i1
,
T2
i2
)
{
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE
(
mShape
.
ndim
()
==
2
,
"you provide 2 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_DEVICE_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
int
(
i1
),
mShape
[
0
]);
TV_DEVICE_REQUIRE
(
i2
>=
0
&&
i2
<
mShape
[
1
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
1
,
int
(
i2
),
mShape
[
1
]);
#else
TV_REQUIRE
(
mShape
.
ndim
()
==
2
,
"you provide 2 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
int
(
i1
),
mShape
[
0
]);
TV_REQUIRE
(
i2
>=
0
&&
i2
<
mShape
[
1
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
1
,
int
(
i2
),
mShape
[
1
]);
#endif
#endif
return
mPtr
[
i1
*
mShape
[
1
]
+
i2
];
}
template
<
class
T1
,
class
T2
,
class
T3
>
TV_HOST_DEVICE_INLINE
T
&
operator
()(
T1
i1
,
T2
i2
,
T3
i3
)
{
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE
(
mShape
.
ndim
()
==
3
,
"you provide 3 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_DEVICE_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
int
(
i1
),
mShape
[
0
]);
TV_DEVICE_REQUIRE
(
i2
>=
0
&&
i2
<
mShape
[
1
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
1
,
int
(
i2
),
mShape
[
1
]);
TV_DEVICE_REQUIRE
(
i3
>=
0
&&
i3
<
mShape
[
2
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
2
,
int
(
i3
),
mShape
[
2
]);
#else
TV_REQUIRE
(
mShape
.
ndim
()
==
3
,
"you provide 3 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
int
(
i1
),
mShape
[
0
]);
TV_REQUIRE
(
i2
>=
0
&&
i2
<
mShape
[
1
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
1
,
int
(
i2
),
mShape
[
1
]);
TV_REQUIRE
(
i3
>=
0
&&
i3
<
mShape
[
2
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
2
,
int
(
i3
),
mShape
[
2
]);
#endif
#endif
return
mPtr
[(
i1
*
mShape
[
1
]
+
i2
)
*
mShape
[
2
]
+
i3
];
}
template
<
class
T1
,
class
T2
,
class
T3
,
class
T4
>
TV_HOST_DEVICE_INLINE
T
&
operator
()(
T1
i1
,
T2
i2
,
T3
i3
,
T4
i4
)
{
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE
(
mShape
.
ndim
()
==
4
,
"you provide 4 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_DEVICE_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
int
(
i1
),
mShape
[
0
]);
TV_DEVICE_REQUIRE
(
i2
>=
0
&&
i2
<
mShape
[
1
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
1
,
int
(
i2
),
mShape
[
1
]);
TV_DEVICE_REQUIRE
(
i3
>=
0
&&
i3
<
mShape
[
2
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
2
,
int
(
i3
),
mShape
[
2
]);
TV_DEVICE_REQUIRE
(
i4
>=
0
&&
i4
<
mShape
[
3
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
3
,
int
(
i4
),
mShape
[
3
]);
#else
TV_REQUIRE
(
mShape
.
ndim
()
==
4
,
"you provide 4 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
int
(
i1
),
mShape
[
0
]);
TV_REQUIRE
(
i2
>=
0
&&
i2
<
mShape
[
1
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
1
,
int
(
i2
),
mShape
[
1
]);
TV_REQUIRE
(
i3
>=
0
&&
i3
<
mShape
[
2
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
2
,
int
(
i3
),
mShape
[
2
]);
TV_REQUIRE
(
i4
>=
0
&&
i4
<
mShape
[
3
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
3
,
int
(
i4
),
mShape
[
3
]);
#endif
#endif
return
mPtr
[((
i1
*
mShape
[
1
]
+
i2
)
*
mShape
[
2
]
+
i3
)
*
mShape
[
3
]
+
i4
];
}
template
<
class
T1
>
TV_HOST_DEVICE_INLINE
const
T
&
operator
()(
T1
i1
)
const
{
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE
(
mShape
.
ndim
()
==
1
,
"you provide 1 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_DEVICE_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
int
(
i1
),
mShape
[
0
]);
#else
TV_REQUIRE
(
mShape
.
ndim
()
==
1
,
"you provide 1 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
int
(
i1
),
mShape
[
0
]);
#endif
#endif
return
mPtr
[
i1
];
}
template
<
class
T1
,
class
T2
>
TV_HOST_DEVICE_INLINE
const
T
&
operator
()(
T1
i1
,
T2
i2
)
const
{
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE
(
mShape
.
ndim
()
==
2
,
"you provide 2 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_DEVICE_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
int
(
i1
),
mShape
[
0
]);
TV_DEVICE_REQUIRE
(
i2
>=
0
&&
i2
<
mShape
[
1
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
1
,
int
(
i2
),
mShape
[
1
]);
#else
TV_REQUIRE
(
mShape
.
ndim
()
==
2
,
"you provide 2 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
int
(
i1
),
mShape
[
0
]);
TV_REQUIRE
(
i2
>=
0
&&
i2
<
mShape
[
1
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
1
,
int
(
i2
),
mShape
[
1
]);
#endif
#endif
return
mPtr
[
i1
*
mShape
[
1
]
+
i2
];
}
template
<
class
T1
,
class
T2
,
class
T3
>
TV_HOST_DEVICE_INLINE
const
T
&
operator
()(
T1
i1
,
T2
i2
,
T3
i3
)
const
{
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE
(
mShape
.
ndim
()
==
3
,
"you provide 3 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_DEVICE_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
int
(
i1
),
mShape
[
0
]);
TV_DEVICE_REQUIRE
(
i2
>=
0
&&
i2
<
mShape
[
1
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
1
,
int
(
i2
),
mShape
[
1
]);
TV_DEVICE_REQUIRE
(
i3
>=
0
&&
i3
<
mShape
[
2
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
2
,
int
(
i3
),
mShape
[
2
]);
#else
TV_REQUIRE
(
mShape
.
ndim
()
==
3
,
"you provide 3 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
int
(
i1
),
mShape
[
0
]);
TV_REQUIRE
(
i2
>=
0
&&
i2
<
mShape
[
1
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
1
,
int
(
i2
),
mShape
[
1
]);
TV_REQUIRE
(
i3
>=
0
&&
i3
<
mShape
[
2
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
2
,
int
(
i3
),
mShape
[
2
]);
#endif
#endif
return
mPtr
[(
i1
*
mShape
[
1
]
+
i2
)
*
mShape
[
2
]
+
i3
];
}
template
<
class
T1
,
class
T2
,
class
T3
,
class
T4
>
TV_HOST_DEVICE_INLINE
const
T
&
operator
()(
T1
i1
,
T2
i2
,
T3
i3
,
T4
i4
)
const
{
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE
(
mShape
.
ndim
()
==
4
,
"you provide 4 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_DEVICE_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
int
(
i1
),
mShape
[
0
]);
TV_DEVICE_REQUIRE
(
i2
>=
0
&&
i2
<
mShape
[
1
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
1
,
int
(
i2
),
mShape
[
1
]);
TV_DEVICE_REQUIRE
(
i3
>=
0
&&
i3
<
mShape
[
2
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
2
,
int
(
i3
),
mShape
[
2
]);
TV_DEVICE_REQUIRE
(
i4
>=
0
&&
i4
<
mShape
[
3
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
3
,
int
(
i4
),
mShape
[
3
]);
#else
TV_REQUIRE
(
mShape
.
ndim
()
==
4
,
"you provide 4 indexes, but dim is %ld
\n
"
,
mShape
.
ndim
());
TV_REQUIRE
(
i1
>=
0
&&
i1
<
mShape
[
0
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
0
,
int
(
i1
),
mShape
[
0
]);
TV_REQUIRE
(
i2
>=
0
&&
i2
<
mShape
[
1
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
1
,
int
(
i2
),
mShape
[
1
]);
TV_REQUIRE
(
i3
>=
0
&&
i3
<
mShape
[
2
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
2
,
int
(
i3
),
mShape
[
2
]);
TV_REQUIRE
(
i4
>=
0
&&
i4
<
mShape
[
3
],
"index-%d(%d) out-of-range: [0, %d)
\n
"
,
3
,
int
(
i4
),
mShape
[
3
]);
#endif
#endif
return
mPtr
[((
i1
*
mShape
[
1
]
+
i2
)
*
mShape
[
2
]
+
i3
)
*
mShape
[
3
]
+
i4
];
}
TV_HOST_DEVICE_INLINE
T
&
operator
[](
int
idx
)
{
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE
(
idx
>=
0
&&
idx
<
size
(),
"index(%d) out-of-range: [0, %ld)
\n
"
,
int
(
idx
),
size
());
#else
TV_REQUIRE
(
idx
>=
0
&&
idx
<
size
(),
"index(%d) out-of-range: [0, %ld)
\n
"
,
int
(
idx
),
size
());
#endif
#endif
return
mPtr
[
idx
];
}
// TODO: this is conflcit with operator[](SimpleVector<Slice> slice_vec).
/*TV_HOST_DEVICE_INLINE T &operator[](const Shape index) {
int idx = rowArrayIdx(mShape, index);
#ifdef TV_DEBUG
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
return mPtr[idx];
}
TV_HOST_DEVICE_INLINE const T &operator[](const Shape index) const {
int idx = rowArrayIdx(mShape, index);
#ifdef TV_DEBUG
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
return mPtr[idx];
}*/
TV_HOST_DEVICE_INLINE
TensorView
<
T
,
Rank
>
operator
[](
SimpleVector
<
Slice
>
slice_vec
)
{
return
_subview
(
slice_vec
);
}
TV_HOST_DEVICE_INLINE
const
TensorView
<
T
,
Rank
>
operator
[](
SimpleVector
<
Slice
>
slice_vec
)
const
{
return
_subview
(
slice_vec
);
}
TV_HOST_DEVICE_INLINE
bool
empty
()
const
{
return
mPtr
==
nullptr
;
}
TV_HOST_DEVICE_INLINE
T
*
data
()
{
return
mPtr
;
}
TV_HOST_DEVICE_INLINE
const
T
*
data
()
const
{
return
mPtr
;
}
TV_HOST_DEVICE_INLINE
const
Shape
&
shape
()
const
{
return
mShape
;
}
TV_HOST_DEVICE_INLINE
int
dim
(
int
idx
)
const
{
return
mShape
[
idx
];
}
TV_HOST_DEVICE_INLINE
int
ndim
()
const
{
return
mShape
.
ndim
();
}
template
<
class
...
Inds
>
TV_HOST_DEVICE_INLINE
TensorView
<
T
,
Rank
>
&
reshape
(
Inds
...
newShapes
)
{
Shape
shapes
{
int
(
newShapes
)...};
TV_ASSERT
(
shapes
.
size
()
==
size
());
mShape
=
shapes
;
return
*
this
;
}
TV_HOST_DEVICE_INLINE
TensorView
<
T
,
Rank
>
&
reshape
(
Shape
shapes
)
{
TV_ASSERT
(
shapes
.
size
()
==
size
());
mShape
=
shapes
;
return
*
this
;
}
template
<
class
...
Inds
>
TV_HOST_DEVICE_INLINE
TensorView
<
T
,
Rank
>
view
(
Inds
...
newShapes
)
const
{
Shape
shapes
{
int
(
newShapes
)...};
for
(
size_t
i
=
0
;
i
<
shapes
.
ndim
();
++
i
)
{
if
(
shapes
[
i
]
==
-
1
)
{
shapes
[
i
]
=
1
;
shapes
[
i
]
=
size
()
/
shapes
.
size
();
break
;
}
}
TV_ASSERT
(
shapes
.
size
()
==
size
());
return
TensorView
<
T
,
Rank
>
(
mPtr
,
shapes
);
}
TV_HOST_DEVICE_INLINE
TensorView
<
T
,
Rank
>
view
(
Shape
shapes
)
const
{
TV_ASSERT
(
shapes
.
size
()
==
size
());
return
TensorView
<
T
,
Rank
>
(
mPtr
,
shapes
);
}
TV_HOST_DEVICE_INLINE
TensorView
<
T
,
Rank
>
squeeze
()
const
{
return
TensorView
<
T
,
Rank
>
(
mPtr
,
mShape
.
squeeze
());
}
TV_HOST_DEVICE_INLINE
TensorView
<
T
,
Rank
>
squeeze
(
int
dim
)
const
{
return
TensorView
<
T
,
Rank
>
(
mPtr
,
mShape
.
squeeze
(
dim
));
}
TV_HOST_DEVICE_INLINE
size_t
size
()
const
{
return
mShape
.
size
();
}
template
<
class
...
Slices
>
TV_HOST_DEVICE_INLINE
TensorView
<
T
,
Rank
>
subview
(
Slice
slice
,
Slices
...
slices
)
const
{
return
subview
<
float
,
Slice
,
Slices
...
>
(
slice
,
slices
...);
}
template
<
class
T2
=
float
,
class
...
Slices
>
TV_HOST_DEVICE_INLINE
TensorView
<
T
,
Rank
>
subview
(
Slices
...
slices
)
const
{
Slice
slice_vec
[
sizeof
...(
Slices
)]
=
{
to_slice
(
slices
)...};
Shape
new_shape
{
to_slice
(
slices
)[
0
]...};
Shape
start
{
to_slice
(
slices
)[
0
]...};
TV_ASSERT
(
new_shape
.
ndim
()
<=
mShape
.
ndim
());
TV_ASSERT
(
new_shape
.
ndim
()
!=
0
);
size_t
idxsize
=
new_shape
.
ndim
();
for
(
size_t
i
=
idxsize
;
i
<
mShape
.
ndim
();
++
i
)
{
new_shape
.
push_back
(
0
);
start
.
push_back
(
0
);
}
#pragma unroll
for
(
size_t
i
=
0
;
i
<
sizeof
...(
Slices
);
++
i
)
{
if
(
slice_vec
[
i
][
1
]
!=
-
1
)
{
new_shape
[
i
]
=
slice_vec
[
i
][
1
]
-
slice_vec
[
i
][
0
];
TV_ASSERT
(
new_shape
[
i
]
>=
0
);
}
else
{
new_shape
[
i
]
=
1
;
// reduce dim
}
}
auto
offset
=
rowArrayIdx
(
mShape
,
start
);
#pragma unroll
for
(
size_t
i
=
sizeof
...(
Slices
);
i
<
mShape
.
ndim
();
++
i
)
{
new_shape
[
i
]
=
mShape
[
i
];
TV_ASSERT
(
new_shape
[
i
]
>=
0
);
}
Shape
reduced_shape
;
#pragma unroll
for
(
size_t
i
=
0
;
i
<
sizeof
...(
Slices
);
++
i
)
{
if
(
slice_vec
[
i
][
1
]
!=
-
1
)
{
reduced_shape
.
push_back
(
new_shape
[
i
]);
}
}
#pragma unroll
for
(
size_t
i
=
sizeof
...(
Slices
);
i
<
mShape
.
ndim
();
++
i
)
{
reduced_shape
.
push_back
(
new_shape
[
i
]);
}
return
TensorView
<
T
,
Rank
>
(
mPtr
+
offset
,
reduced_shape
);
}
template
<
class
...
Integers
>
TV_HOST_DEVICE_INLINE
TensorView
<
T
,
Rank
>
subview
(
int
id
,
Integers
...
ints
)
{
Shape
start
=
{
id
,
ints
...};
for
(
int
i
=
1
+
sizeof
...(
ints
);
i
<
ndim
();
++
i
)
{
start
.
push_back
(
0
);
}
return
TensorView
<
T
,
Rank
>
(
mPtr
+
rowArrayIdx
(
mShape
,
start
),
mShape
.
subshape
(
sizeof
...(
ints
)
+
1
));
}
std
::
string
repr
()
const
{
std
::
ostringstream
ss
;
if
(
empty
())
return
""
;
if
(
mShape
.
ndim
()
==
0
)
{
ss
<<
*
mPtr
;
// ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
// detail::simpleTypeName<T>());
ss
<<
"Tensor: dtype="
<<
detail
::
simpleTypeName
<
T
>
();
return
ss
.
str
();
}
Shape
counter
=
mShape
;
auto
tensor_flat
=
this
->
view
(
-
1
);
for
(
int
i
=
0
;
i
<
counter
.
ndim
();
++
i
)
{
counter
[
i
]
=
0
;
ss
<<
"["
;
}
for
(
size_t
i
=
0
;
i
<
this
->
size
();
++
i
)
{
ss
<<
tensor_flat
(
rowArrayIdx
(
mShape
,
counter
));
counter
[
counter
.
ndim
()
-
1
]
+=
1
;
int
inc_count
=
0
;
bool
print_comma
=
true
;
for
(
int
c
=
counter
.
ndim
()
-
1
;
c
>=
0
;
--
c
)
{
if
(
counter
[
c
]
==
this
->
dim
(
c
)
&&
c
>
0
)
{
++
inc_count
;
counter
[
c
-
1
]
+=
1
;
counter
[
c
]
=
0
;
print_comma
=
false
;
}
}
if
(
print_comma
&&
i
!=
this
->
size
()
-
1
)
ss
<<
", "
;
for
(
int
j
=
0
;
j
<
inc_count
;
++
j
)
{
ss
<<
"]"
;
}
if
(
i
!=
this
->
size
()
-
1
)
{
if
(
inc_count
!=
0
)
ss
<<
"
\n
"
;
for
(
int
j
=
0
;
j
<
inc_count
;
++
j
)
{
ss
<<
"["
;
}
}
}
ss
<<
"]"
;
// ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
// detail::simpleTypeName<T>());
ss
<<
"Tensor: dtype="
<<
detail
::
simpleTypeName
<
T
>
();
return
ss
.
str
();
}
protected:
// TODO: make this function public.
// currently this function is called unexpectedly when using subview({0, 0}).
TV_HOST_DEVICE_INLINE
TensorView
<
T
,
Rank
>
_subview
(
SimpleVector
<
Slice
>
slice_vec
)
{
Shape
new_shape
;
for
(
int
i
=
0
;
i
<
slice_vec
.
size
();
++
i
)
{
new_shape
.
push_back
(
slice_vec
[
i
][
0
]);
}
Shape
start
=
new_shape
;
TV_ASSERT
(
new_shape
.
ndim
()
<=
mShape
.
ndim
());
TV_ASSERT
(
new_shape
.
ndim
()
!=
0
);
size_t
idxsize
=
new_shape
.
ndim
();
for
(
size_t
i
=
idxsize
;
i
<
mShape
.
ndim
();
++
i
)
{
new_shape
.
push_back
(
0
);
start
.
push_back
(
0
);
}
for
(
size_t
i
=
0
;
i
<
slice_vec
.
size
();
++
i
)
{
if
(
slice_vec
[
i
][
1
]
!=
-
1
)
{
new_shape
[
i
]
=
slice_vec
[
i
][
1
]
-
slice_vec
[
i
][
0
];
TV_ASSERT
(
new_shape
[
i
]
>=
0
);
}
else
{
new_shape
[
i
]
=
1
;
// reduce dim
}
}
auto
offset
=
rowArrayIdx
(
mShape
,
start
);
for
(
size_t
i
=
slice_vec
.
size
();
i
<
mShape
.
ndim
();
++
i
)
{
new_shape
[
i
]
=
mShape
[
i
];
TV_ASSERT
(
new_shape
[
i
]
>=
0
);
}
Shape
reduced_shape
;
for
(
size_t
i
=
0
;
i
<
slice_vec
.
size
();
++
i
)
{
if
(
slice_vec
[
i
][
1
]
!=
-
1
)
{
reduced_shape
.
push_back
(
new_shape
[
i
]);
}
}
for
(
size_t
i
=
slice_vec
.
size
();
i
<
mShape
.
ndim
();
++
i
)
{
reduced_shape
.
push_back
(
new_shape
[
i
]);
}
return
TensorView
<
T
,
Rank
>
(
mPtr
+
offset
,
reduced_shape
);
}
template
<
typename
T1
>
TV_HOST_DEVICE_INLINE
Slice
to_slice
(
T1
s
)
const
{
return
Slice
{
int
(
s
),
-
1
,
-
1
};
}
TV_HOST_DEVICE_INLINE
Slice
to_slice
(
Slice
s
)
const
{
return
Slice
(
s
);
}
T
*
mPtr
=
nullptr
;
Shape
mShape
;
};
template
<
typename
Os
,
typename
T
,
int
Rank
>
Os
&
operator
<<
(
Os
&
os
,
const
TensorView
<
T
,
Rank
>
&
dt
)
{
os
<<
dt
.
repr
();
return
os
;
}
template
<
typename
Os
,
typename
T
,
int
Rank
>
Os
&
operator
<<
(
Os
&
os
,
const
TensorView
<
const
T
,
Rank
>
&
dt
)
{
os
<<
dt
.
repr
();
return
os
;
}
namespace
detail
{
template
<
typename
T
>
constexpr
const
char
*
printfTypeFormat
(
T
val
=
T
());
template
<
>
constexpr
const
char
*
printfTypeFormat
(
float
val
)
{
return
"%.2f"
;
}
template
<
>
constexpr
const
char
*
printfTypeFormat
(
double
val
)
{
return
"%.2f"
;
}
template
<
>
constexpr
const
char
*
printfTypeFormat
(
int
val
)
{
return
"%d"
;
}
template
<
>
constexpr
const
char
*
printfTypeFormat
(
unsigned
val
)
{
return
"%u"
;
}
template
<
>
constexpr
const
char
*
printfTypeFormat
(
long
val
)
{
return
"%ld"
;
}
template
<
>
constexpr
const
char
*
printfTypeFormat
(
unsigned
long
val
)
{
return
"%lu"
;
}
};
// namespace detail
template
<
typename
T
>
TV_HOST_DEVICE
void
printTensorView
(
const
TensorView
<
T
>
tensor
,
const
char
*
format
)
{
if
(
tensor
.
empty
())
return
;
if
(
tensor
.
ndim
()
==
0
)
{
printf
(
format
,
tensor
());
printf
(
"
\n
"
);
return
;
}
Shape
counter
=
tensor
.
shape
();
auto
tensor_flat
=
tensor
.
view
(
-
1
);
for
(
int
i
=
0
;
i
<
counter
.
ndim
();
++
i
)
{
counter
[
i
]
=
0
;
printf
(
"["
);
}
for
(
size_t
i
=
0
;
i
<
tensor
.
size
();
++
i
)
{
printf
(
format
,
tensor_flat
(
rowArrayIdx
(
tensor
.
shape
(),
counter
)));
counter
[
counter
.
ndim
()
-
1
]
+=
1
;
int
inc_count
=
0
;
bool
print_comma
=
true
;
for
(
int
c
=
counter
.
ndim
()
-
1
;
c
>=
0
;
--
c
)
{
if
(
counter
[
c
]
==
tensor
.
dim
(
c
)
&&
c
>
0
)
{
++
inc_count
;
counter
[
c
-
1
]
+=
1
;
counter
[
c
]
=
0
;
print_comma
=
false
;
}
}
if
(
print_comma
&&
i
!=
tensor
.
size
()
-
1
)
printf
(
", "
);
for
(
int
j
=
0
;
j
<
inc_count
;
++
j
)
{
printf
(
"]"
);
}
if
(
i
!=
tensor
.
size
()
-
1
)
{
if
(
inc_count
!=
0
)
printf
(
"
\n
"
);
for
(
int
j
=
0
;
j
<
inc_count
;
++
j
)
{
printf
(
"["
);
}
}
}
printf
(
"]
\n
"
);
}
template
<
typename
T
>
TV_HOST_DEVICE
void
printTensorView
(
TensorView
<
T
>
tensor
)
{
using
Traw
=
typename
std
::
remove_const
<
T
>::
type
;
return
printTensorView
(
tensor
,
detail
::
printfTypeFormat
<
Traw
>
());
}
template
<
typename
T
>
TV_HOST_DEVICE
void
printTensorView
(
const
T
*
ptr
,
Shape
shape
)
{
using
Traw
=
typename
std
::
remove_const
<
T
>::
type
;
return
printTensorView
(
TensorView
<
const
T
>
(
ptr
,
shape
),
detail
::
printfTypeFormat
<
Traw
>
());
}
template
<
typename
T
>
TV_HOST_DEVICE
void
printTensorView
(
const
T
*
ptr
,
Shape
shape
,
const
char
*
format
)
{
return
printTensorView
(
TensorView
<
const
T
>
(
ptr
,
shape
),
format
);
}
}
// namespace tv
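To make the intent of TensorView concrete, here is a minimal host-side sketch; it is not part of the header and assumes the file is reachable as <tensorview/tensorview.h> with a CUDA toolchain available, since the header includes cuda_runtime_api.h.

#include <tensorview/tensorview.h>
#include <vector>

int main() {
  std::vector<float> buf(2 * 3, 0.f);
  tv::TensorView<float> view(buf.data(), 2, 3);  // non-owning view, shape {2, 3}
  view(1, 2) = 5.f;              // row-major: same element as buf[1 * 3 + 2]
  tv::printTensorView(view);     // prints the 2x3 contents with "%.2f"
  return 0;
}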
mmdet3d/ops/spconv/include/torch_utils.h
0 → 100644
View file @
d1aac35d
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <tensorview/tensorview.h>
#include <torch/script.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
namespace tv {

struct TorchGPU : public tv::GPU {
  virtual cudaStream_t getStream() const override {
    return at::cuda::getCurrentCUDAStream();
  }
};

template <typename T>
void check_torch_dtype(const torch::Tensor &tensor) {
  switch (tensor.type().scalarType()) {
  case at::ScalarType::Double: {
    auto val = std::is_same<std::remove_const_t<T>, double>::value;
    TV_ASSERT_RT_ERR(val, "error");
    break;
  }
  case at::ScalarType::Float: {
    auto val = std::is_same<std::remove_const_t<T>, float>::value;
    TV_ASSERT_RT_ERR(val, "error");
    break;
  }
  case at::ScalarType::Int: {
    auto val = std::is_same<std::remove_const_t<T>, int>::value;
    TV_ASSERT_RT_ERR(val, "error");
    break;
  }
  case at::ScalarType::Half: {
    auto val = std::is_same<std::remove_const_t<T>, at::Half>::value;
    TV_ASSERT_RT_ERR(val, "error");
    break;
  }
  case at::ScalarType::Long: {
    auto val = std::is_same<std::remove_const_t<T>, long>::value;
    TV_ASSERT_RT_ERR(val, "error");
    break;
  }
  default:
    TV_ASSERT_RT_ERR(false, "error");
  }
}

template <typename T>
tv::TensorView<T> torch2tv(const torch::Tensor &tensor) {
  check_torch_dtype<T>(tensor);
  tv::Shape shape;
  for (auto i : tensor.sizes()) {
    shape.push_back(i);
  }
  return tv::TensorView<T>(tensor.data<std::remove_const_t<T>>(), shape);
}

}  // namespace tv
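A small sketch (not from the original sources) of the wrapping pattern this header enables inside the custom ops: a CUDA tensor is viewed through torch2tv and the current stream comes from TorchGPU. The function name is hypothetical and the actual kernel launch is omitted.

#include <torch_utils.h>

void scaleFeatures(torch::Tensor features, float factor) {
  auto view = tv::torch2tv<float>(features);  // asserts dtype == float32
  tv::TorchGPU d;  // d.getStream() == current ATen CUDA stream
  // ... launch a kernel over view.size() elements on d.getStream() here ...
  (void)view;
  (void)factor;
}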
mmdet3d/ops/spconv/include/utility/timer.h
0 → 100644
View file @
d1aac35d
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <cuda_runtime_api.h>
#include <iostream>
namespace spconv {

template <typename TimeT = std::chrono::microseconds>
struct CudaContextTimer {
  CudaContextTimer() {
    cudaDeviceSynchronize();
    mCurTime = std::chrono::steady_clock::now();
  }
  typename TimeT::rep report() {
    cudaDeviceSynchronize();
    auto duration = std::chrono::duration_cast<TimeT>(
        std::chrono::steady_clock::now() - mCurTime);
    auto res = duration.count();
    mCurTime = std::chrono::steady_clock::now();
    return res;
  }

 private:
  std::chrono::time_point<std::chrono::steady_clock> mCurTime;
};

template <typename TimeT = std::chrono::microseconds>
struct CPUTimer {
  CPUTimer() { mCurTime = std::chrono::steady_clock::now(); }
  typename TimeT::rep report() {
    auto duration = std::chrono::duration_cast<TimeT>(
        std::chrono::steady_clock::now() - mCurTime);
    auto res = duration.count();
    mCurTime = std::chrono::steady_clock::now();
    return res;
  }

 private:
  std::chrono::time_point<std::chrono::steady_clock> mCurTime;
};

}  // namespace spconv
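Usage sketch (not in the original header): CudaContextTimer synchronizes the device on construction and on every report(), so the returned count covers all GPU work issued in between.

#include <utility/timer.h>
#include <iostream>

void timeSection() {
  spconv::CudaContextTimer<> timer;  // cudaDeviceSynchronize(), then start
  // ... enqueue kernels here ...
  // report() synchronizes again, returns elapsed microseconds, restarts timer
  std::cout << "section took " << timer.report() << " us\n";
}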
mmdet3d/ops/spconv/modules.py
0 → 100644
View file @
d1aac35d
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from collections import OrderedDict

import torch
from torch import nn

from .structure import SparseConvTensor


def is_spconv_module(module):
    spconv_modules = (SparseModule, )
    return isinstance(module, spconv_modules)


def is_sparse_conv(module):
    from .conv import SparseConvolution
    return isinstance(module, SparseConvolution)


def _mean_update(vals, m_vals, t):
    outputs = []
    if not isinstance(vals, list):
        vals = [vals]
    if not isinstance(m_vals, list):
        m_vals = [m_vals]
    for val, m_val in zip(vals, m_vals):
        output = t / float(t + 1) * m_val + 1 / float(t + 1) * val
        outputs.append(output)
    if len(outputs) == 1:
        outputs = outputs[0]
    return outputs


class SparseModule(nn.Module):
    """ place holder,
    All module subclass from this will take sptensor in SparseSequential.
    """
    pass


class SparseSequential(SparseModule):
    r"""A sequential container.
    Modules will be added to it in the order they are passed in the
    constructor.
    Alternatively, an ordered dict of modules can also be passed in.

    To make it easier to understand, given is a small example::

        # Example of using Sequential
        model = SparseSequential(
                  SparseConv2d(1,20,5),
                  nn.ReLU(),
                  SparseConv2d(20,64,5),
                  nn.ReLU()
                )

        # Example of using Sequential with OrderedDict
        model = SparseSequential(OrderedDict([
                  ('conv1', SparseConv2d(1,20,5)),
                  ('relu1', nn.ReLU()),
                  ('conv2', SparseConv2d(20,64,5)),
                  ('relu2', nn.ReLU())
                ]))

        # Example of using Sequential with kwargs(python 3.6+)
        model = SparseSequential(
                  conv1=SparseConv2d(1,20,5),
                  relu1=nn.ReLU(),
                  conv2=SparseConv2d(20,64,5),
                  relu2=nn.ReLU()
                )
    """

    def __init__(self, *args, **kwargs):
        super(SparseSequential, self).__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            for key, module in args[0].items():
                self.add_module(key, module)
        else:
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)
        for name, module in kwargs.items():
            if sys.version_info < (3, 6):
                raise ValueError('kwargs only supported in py36+')
            if name in self._modules:
                raise ValueError('name exists.')
            self.add_module(name, module)
        self._sparity_dict = {}

    def __getitem__(self, idx):
        if not (-len(self) <= idx < len(self)):
            raise IndexError('index {} is out of range'.format(idx))
        if idx < 0:
            idx += len(self)
        it = iter(self._modules.values())
        for i in range(idx):
            next(it)
        return next(it)

    def __len__(self):
        return len(self._modules)

    @property
    def sparity_dict(self):
        return self._sparity_dict

    def add(self, module, name=None):
        if name is None:
            name = str(len(self._modules))
        if name in self._modules:
            raise KeyError('name exists')
        self.add_module(name, module)

    def forward(self, input):
        for k, module in self._modules.items():
            if is_spconv_module(module):
                # use SpConvTensor as input
                assert isinstance(input, SparseConvTensor)
                self._sparity_dict[k] = input.sparity
                input = module(input)
            else:
                if isinstance(input, SparseConvTensor):
                    if input.indices.shape[0] != 0:
                        input.features = module(input.features)
                else:
                    input = module(input)
        return input

    def fused(self):
        """don't use this. no effect.
        """
        from .conv import SparseConvolution
        mods = [v for k, v in self._modules.items()]
        fused_mods = []
        idx = 0
        while idx < len(mods):
            if is_sparse_conv(mods[idx]):
                if idx < len(mods) - 1 and isinstance(mods[idx + 1],
                                                      nn.BatchNorm1d):
                    new_module = SparseConvolution(
                        ndim=mods[idx].ndim,
                        in_channels=mods[idx].in_channels,
                        out_channels=mods[idx].out_channels,
                        kernel_size=mods[idx].kernel_size,
                        stride=mods[idx].stride,
                        padding=mods[idx].padding,
                        dilation=mods[idx].dilation,
                        groups=mods[idx].groups,
                        bias=True,
                        subm=mods[idx].subm,
                        output_padding=mods[idx].output_padding,
                        transposed=mods[idx].transposed,
                        inverse=mods[idx].inverse,
                        indice_key=mods[idx].indice_key,
                        fused_bn=True,
                    )
                    new_module.load_state_dict(mods[idx].state_dict(), False)
                    new_module.to(mods[idx].weight.device)
                    conv = new_module
                    bn = mods[idx + 1]
                    conv.bias.data.zero_()
                    conv.weight.data[:] = conv.weight.data * bn.weight.data / (
                        torch.sqrt(bn.running_var) + bn.eps)
                    conv.bias.data[:] = (
                        conv.bias.data - bn.running_mean) * bn.weight.data / (
                            torch.sqrt(bn.running_var) + bn.eps) + bn.bias.data
                    fused_mods.append(conv)
                    idx += 2
                else:
                    fused_mods.append(mods[idx])
                    idx += 1
            else:
                fused_mods.append(mods[idx])
                idx += 1
        return SparseSequential(*fused_mods)


class ToDense(SparseModule):
    """convert SparseConvTensor to NCHW dense tensor.
    """

    def forward(self, x: SparseConvTensor):
        return x.dense()


class RemoveGrid(SparseModule):
    """remove pre-allocated grid buffer.
    """

    def forward(self, x: SparseConvTensor):
        x.grid = None
        return x
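A minimal usage sketch (not part of modules.py), assuming the package `__init__` re-exports SparseSequential, SubMConv3d and SparseMaxPool3d as in upstream spconv; the input SparseConvTensor would be built elsewhere from voxelized points.

import torch.nn as nn
from mmdet3d.ops import spconv

net = spconv.SparseSequential(
    spconv.SubMConv3d(4, 16, 3, indice_key='subm1'),
    nn.BatchNorm1d(16),   # non-sparse modules are applied to .features
    nn.ReLU(),
    spconv.SparseMaxPool3d(2, 2),
)
# out = net(x)  # x: spconv.SparseConvTensor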
mmdet3d/ops/spconv/ops.py
0 → 100644
View file @
d1aac35d
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch

from . import sparse_conv_ext


def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
    ndim = len(input_size)
    output_size = []
    for i in range(ndim):
        size = (input_size[i] + 2 * padding[i] - dilation[i] *
                (kernel_size[i] - 1) - 1) // stride[i] + 1
        if kernel_size[i] == -1:
            output_size.append(1)
        else:
            output_size.append(size)
    return output_size


def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
                           output_padding):
    ndim = len(input_size)
    output_size = []
    for i in range(ndim):
        if kernel_size[i] == -1:
            raise ValueError("deconv don't support kernel_size < 0")
        size = (input_size[i] - 1) * stride[i] - 2 * padding[i] + kernel_size[
            i] + output_padding[i]
        output_size.append(size)
    return output_size


def get_indice_pairs(indices,
                     batch_size,
                     spatial_shape,
                     ksize=3,
                     stride=1,
                     padding=0,
                     dilation=1,
                     out_padding=0,
                     subm=False,
                     transpose=False,
                     grid=None):
    ndim = indices.shape[1] - 1
    if not isinstance(ksize, (list, tuple)):
        ksize = [ksize] * ndim
    if not isinstance(stride, (list, tuple)):
        stride = [stride] * ndim
    if not isinstance(padding, (list, tuple)):
        padding = [padding] * ndim
    if not isinstance(dilation, (list, tuple)):
        dilation = [dilation] * ndim
    if not isinstance(out_padding, (list, tuple)):
        out_padding = [out_padding] * ndim
    for d, s in zip(dilation, stride):
        assert any([s == 1, d == 1]), "don't support this."
    if not subm:
        if transpose:
            out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
                                               padding, dilation, out_padding)
        else:
            out_shape = get_conv_output_size(spatial_shape, ksize, stride,
                                             padding, dilation)
    else:
        out_shape = spatial_shape
    if grid is None:
        if ndim == 2:
            get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_2d
        elif ndim == 3:
            get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_3d
        elif ndim == 4:
            get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_4d
        else:
            raise NotImplementedError
        return get_indice_pairs_func(indices, batch_size, out_shape,
                                     spatial_shape, ksize, stride, padding,
                                     dilation, out_padding, int(subm),
                                     int(transpose))
    else:
        if ndim == 2:
            get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_grid_2d
        elif ndim == 3:
            get_indice_pairs_func = sparse_conv_ext.get_indice_pairs_grid_3d
        else:
            raise NotImplementedError
        return get_indice_pairs_func(indices, grid, batch_size, out_shape,
                                     spatial_shape, ksize, stride, padding,
                                     dilation, out_padding, int(subm),
                                     int(transpose))


def indice_conv(features,
                filters,
                indice_pairs,
                indice_pair_num,
                num_activate_out,
                inverse=False,
                subm=False):
    if filters.dtype == torch.float32:
        return sparse_conv_ext.indice_conv_fp32(features, filters,
                                                indice_pairs, indice_pair_num,
                                                num_activate_out, int(inverse),
                                                int(subm))
    elif filters.dtype == torch.half:
        return sparse_conv_ext.indice_conv_half(features, filters,
                                                indice_pairs, indice_pair_num,
                                                num_activate_out, int(inverse),
                                                int(subm))
    else:
        raise NotImplementedError


def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
                      num_activate_out, inverse, subm):
    if features.dtype == torch.half:
        func = sparse_conv_ext.fused_indice_conv_half
    elif filters.dtype == torch.float32:
        func = sparse_conv_ext.fused_indice_conv_fp32
    else:
        raise NotImplementedError
    return func(features, filters, bias, indice_pairs, indice_pair_num,
                num_activate_out, int(inverse), int(subm))


def indice_conv_backward(features,
                         filters,
                         out_bp,
                         indice_pairs,
                         indice_pair_num,
                         inverse=False,
                         subm=False):
    if filters.dtype == torch.float32:
        return sparse_conv_ext.indice_conv_backward_fp32(
            features, filters, out_bp, indice_pairs, indice_pair_num,
            int(inverse), int(subm))
    elif filters.dtype == torch.half:
        return sparse_conv_ext.indice_conv_backward_half(
            features, filters, out_bp, indice_pairs, indice_pair_num,
            int(inverse), int(subm))
    else:
        raise NotImplementedError


def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
    if features.dtype == torch.float32:
        return sparse_conv_ext.indice_maxpool_fp32(features, indice_pairs,
                                                   indice_pair_num,
                                                   num_activate_out)
    elif features.dtype == torch.half:
        return sparse_conv_ext.indice_maxpool_half(features, indice_pairs,
                                                   indice_pair_num,
                                                   num_activate_out)
    else:
        raise NotImplementedError


def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
                            indice_pair_num):
    if features.dtype == torch.float32:
        return sparse_conv_ext.indice_maxpool_backward_fp32(
            features, out_features, out_bp, indice_pairs, indice_pair_num)
    elif features.dtype == torch.half:
        return sparse_conv_ext.indice_maxpool_backward_half(
            features, out_features, out_bp, indice_pairs, indice_pair_num)
    else:
        raise NotImplementedError
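The output-size arithmetic above can be sanity-checked on a concrete case; this sketch is not part of ops.py and assumes the compiled `sparse_conv_ext` extension is importable so the module loads.

from mmdet3d.ops.spconv.ops import get_conv_output_size

spatial_shape = [41, 1600, 1408]
out = get_conv_output_size(spatial_shape, [3] * 3, [2] * 3, [1] * 3, [1] * 3)
# (d + 2*1 - 1*(3 - 1) - 1) // 2 + 1 for each dimension d
assert out == [21, 800, 704]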
mmdet3d/ops/spconv/pool.py
0 → 100644
View file @
d1aac35d
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import functional as Fsp
from . import ops
from .modules import SparseModule
from .structure import SparseConvTensor


class SparseMaxPool(SparseModule):

    def __init__(self,
                 ndim,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 subm=False):
        super(SparseMaxPool, self).__init__()
        if not isinstance(kernel_size, (list, tuple)):
            kernel_size = [kernel_size] * ndim
        if not isinstance(stride, (list, tuple)):
            stride = [stride] * ndim
        if not isinstance(padding, (list, tuple)):
            padding = [padding] * ndim
        if not isinstance(dilation, (list, tuple)):
            dilation = [dilation] * ndim

        self.ndim = ndim
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.subm = subm
        self.dilation = dilation

    def forward(self, input):
        assert isinstance(input, SparseConvTensor)
        features = input.features
        device = features.device
        indices = input.indices
        spatial_shape = input.spatial_shape
        batch_size = input.batch_size
        if not self.subm:
            out_spatial_shape = ops.get_conv_output_size(
                spatial_shape, self.kernel_size, self.stride, self.padding,
                self.dilation)
        else:
            out_spatial_shape = spatial_shape
        outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
            indices, batch_size, spatial_shape, self.kernel_size, self.stride,
            self.padding, self.dilation, 0, self.subm)
        out_features = Fsp.indice_maxpool(features, indice_pairs.to(device),
                                          indice_pairs_num.to(device),
                                          outids.shape[0])
        out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape,
                                      batch_size)
        out_tensor.indice_dict = input.indice_dict
        out_tensor.grid = input.grid
        return out_tensor


class SparseMaxPool2d(SparseMaxPool):

    def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
        super(SparseMaxPool2d, self).__init__(2, kernel_size, stride, padding,
                                              dilation)


class SparseMaxPool3d(SparseMaxPool):

    def __init__(self, kernel_size, stride=1, padding=0, dilation=1):
        super(SparseMaxPool3d, self).__init__(3, kernel_size, stride, padding,
                                              dilation)
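A short sketch (not part of pool.py) of SparseMaxPool3d applied to a toy SparseConvTensor; the feature and coordinate shapes here are illustrative assumptions.

import torch
from mmdet3d.ops import spconv

features = torch.randn(1000, 16).cuda()                             # [num_voxels, C]
coords = torch.randint(0, 40, (1000, 4), dtype=torch.int32).cuda()  # [batch, z, y, x]
coords[:, 0] = 0                                                    # single batch
x = spconv.SparseConvTensor(features, coords, [41, 1600, 1408], batch_size=1)

pool = spconv.SparseMaxPool3d(kernel_size=2, stride=2)
y = pool(x)
print(y.spatial_shape)  # [20, 800, 704] with kernel 2, stride 2, padding 0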
mmdet3d/ops/spconv/src/all.cc
0 → 100644
View file @
d1aac35d
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_runtime_api.h>
#include <spconv/fused_spconv_ops.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <torch/extension.h>
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("get_indice_pairs_2d", &spconv::getIndicePair<2>,
        "get_indice_pairs_2d");
  m.def("get_indice_pairs_3d", &spconv::getIndicePair<3>,
        "get_indice_pairs_3d");
  m.def("get_indice_pairs_4d", &spconv::getIndicePair<4>,
        "get_indice_pairs_4d");
  m.def("get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>,
        "get_indice_pairs_grid_2d");
  m.def("get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>,
        "get_indice_pairs_grid_3d");
  m.def("indice_conv_fp32", &spconv::indiceConv<float>, "indice_conv_fp32");
  m.def("indice_conv_backward_fp32", &spconv::indiceConvBackward<float>,
        "indice_conv_backward_fp32");
  m.def("indice_conv_half", &spconv::indiceConv<at::Half>, "indice_conv_half");
  m.def("indice_conv_backward_half", &spconv::indiceConvBackward<at::Half>,
        "indice_conv_backward_half");
  m.def("fused_indice_conv_fp32", &spconv::fusedIndiceConvBatchNorm<float>,
        "fused_indice_conv_fp32");
  m.def("fused_indice_conv_half", &spconv::fusedIndiceConvBatchNorm<at::Half>,
        "fused_indice_conv_half");
  m.def("indice_maxpool_fp32", &spconv::indiceMaxPool<float>,
        "indice_maxpool_fp32");
  m.def("indice_maxpool_backward_fp32", &spconv::indiceMaxPoolBackward<float>,
        "indice_maxpool_backward_fp32");
  m.def("indice_maxpool_half", &spconv::indiceMaxPool<at::Half>,
        "indice_maxpool_half");
  m.def("indice_maxpool_backward_half",
        &spconv::indiceMaxPoolBackward<at::Half>,
        "indice_maxpool_backward_half");
}
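For orientation (not from the original sources): the module compiled from this file is what ops.py imports as `sparse_conv_ext`, and every name registered with m.def() above shows up as a plain Python callable on it. The import path below is an assumption about how the built extension is packaged.

from mmdet3d.ops.spconv import sparse_conv_ext  # assumed import path

print(hasattr(sparse_conv_ext, 'get_indice_pairs_3d'))  # True
print(hasattr(sparse_conv_ext, 'indice_conv_fp32'))     # True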
mmdet3d/ops/spconv/src/indice.cc
0 → 100644
View file @
d1aac35d
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/geometry.h>
#include <spconv/indice.h>
#include <spconv/spconv_ops.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::CPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    if (transpose)
      return getIndicePairsDeConv<Index, IndexGrid, NDim>(
          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
          outSpatialShape.data());
    else
      return getIndicePairsConv<Index, IndexGrid, NDim>(
          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
          outSpatialShape.data());
  }
};

template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::CPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    return getIndicePairsSubM<Index, IndexGrid, NDim>(
        indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(),
        outSpatialShape.data());
  }
};
}  // namespace functor

#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM)                             \
  template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int,   \
                                                       NDIM>;                 \
  template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int,   \
                                                       NDIM>;

#define DECLARE_CPU_INDEX(Index)                                              \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1);                                     \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2);                                     \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 3);                                     \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 4);

DECLARE_CPU_INDEX(int);
DECLARE_CPU_INDEX(long);

#undef DECLARE_CPU_INDEX
#undef DECLARE_CPU_SPECS_INDEX_NDIM
}  // namespace spconv
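These CPU functors only forward to getIndicePairsConv, getIndicePairsDeConv and getIndicePairsSubM from spconv/indice.h; the pair generation itself is not in this file. Conceptually, an indice pair records which active input location feeds which output location for a given kernel offset, using the usual convolution coordinate relation. The 1-D toy below illustrates that relation under assumed names (buildPairs1D, IndicePair); it is a sketch of the idea, not the spconv implementation.

// Toy 1-D illustration of "indice pair" generation: for each active input
// coordinate and each kernel offset, compute the output coordinate it feeds,
// using the convolution relation
//   out = (in + padding - offset * dilation) / stride
// (kept only when divisible and inside the output extent).
// This mirrors the role of getIndicePairsConv, but is not the spconv code.
#include <cstdio>
#include <vector>

struct IndicePair {
  int kernelOffset;
  int inputIdx;    // position in the list of active inputs
  int outputCoord;
};

std::vector<IndicePair> buildPairs1D(const std::vector<int> &activeIn,
                                     int kernelSize, int stride, int padding,
                                     int dilation, int outExtent) {
  std::vector<IndicePair> pairs;
  for (int i = 0; i < static_cast<int>(activeIn.size()); ++i) {
    for (int k = 0; k < kernelSize; ++k) {
      int numer = activeIn[i] + padding - k * dilation;
      if (numer < 0 || numer % stride != 0) continue;
      int out = numer / stride;
      if (out >= 0 && out < outExtent) pairs.push_back({k, i, out});
    }
  }
  return pairs;
}

int main() {
  // Active input coordinates of a sparse 1-D signal of length 8.
  std::vector<int> active = {1, 4, 6};
  auto pairs = buildPairs1D(active, /*kernelSize=*/3, /*stride=*/1,
                            /*padding=*/1, /*dilation=*/1, /*outExtent=*/8);
  for (const auto &p : pairs)
    std::printf("offset %d: input #%d -> output %d\n", p.kernelOffset,
                p.inputIdx, p.outputCoord);
}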
mmdet3d/ops/spconv/src/indice_cuda.cu
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/indice.h>
#include <spconv/indice.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose) {
    Index batchSize = gridsOut.dim(0);
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    // auto timer = spconv::CudaContextTimer<>();
    if (transpose)
      prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                              indiceNum, indicePairUnique, kernelSize, stride,
                              padding, dilation, outSpatialShape);
    else
      prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                              indiceNum, indicePairUnique, kernelSize, stride,
                              padding, dilation, outSpatialShape);
    TV_CHECK_CUDA_ERR();
    // std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
    return 1;
  }
};

template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   tv::TensorView<Index> indicePairUnique,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    Index batchSize = gridsOut.dim(0);
    auto kernelVolume = indicePairs.dim(0);
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    Index numAct = indicePairUnique.dim(0) - 1;
    assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
                            indicePairUnique, outSpatialShape, batchSize);
    TV_CHECK_CUDA_ERR();
    assignIndicePairsKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
                            indicePairUnique, outSpatialShape);
    TV_CHECK_CUDA_ERR();
    if (resetGrid) {
      resetGridKernel<Index, IndexGrid, NDim>
          <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
      TV_CHECK_CUDA_ERR();
    }
    return numAct;
  }
};

template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::GPU &d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    // auto timer = spconv::CudaContextTimer<>();
    prepareSubMGridKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
    TV_CHECK_CUDA_ERR();
    getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
                            kernelSize, stride, padding, dilation,
                            outSpatialShape);
    TV_CHECK_CUDA_ERR();
    // std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
    if (resetGrid) {
      resetGridSubMKernel<Index, IndexGrid, NDim>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
                              numActIn);
      TV_CHECK_CUDA_ERR();
    }
    return numActIn;
  }
};
}  // namespace functor

#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM)                             \
  template struct functor::CreateConvIndicePairFunctor<tv::GPU, Index, int,   \
                                                       NDIM>;                 \
  template struct functor::CreateConvIndicePairFunctorP1<tv::GPU, Index, int, \
                                                         NDIM>;               \
  template struct functor::CreateConvIndicePairFunctorP2<tv::GPU, Index, int, \
                                                         NDIM>;               \
  template struct functor::CreateSubMIndicePairFunctor<tv::GPU, Index, int,   \
                                                       NDIM>;

#define DECLARE_GPU_INDEX(Index)                                              \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1);                                     \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2);                                     \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3);                                     \
  DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);

DECLARE_GPU_INDEX(int);

#undef DECLARE_GPU_INDEX
#undef DECLARE_GPU_SPECS_INDEX_NDIM
}  // namespace spconv
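Every launch in this file uses the pair tv::launch::getBlocks(n) / tv::launch::CUDA_NUM_THREADS from tensorview/helper_launch.h plus the stream held by tv::GPU. A common shape for such a helper, assumed here because the exact tensorview constants are not shown in this excerpt, is a capped block count combined with a grid-stride loop inside the kernel:

// Sketch of a capped-blocks launch-config helper in the style of
// tv::launch::getBlocks / CUDA_NUM_THREADS; the constants and names in
// launch_sketch are assumptions, not necessarily the tensorview values.
#include <algorithm>
#include <cuda_runtime.h>

namespace launch_sketch {
constexpr int kNumThreads = 1024;  // threads per block
constexpr int kMaxBlocks = 65536;  // cap on the 1-D grid size

inline int getBlocks(int n) {
  // Enough blocks to cover n items with kNumThreads threads each,
  // clamped to the cap; kernels then loop with a grid stride.
  return std::max(std::min((n + kNumThreads - 1) / kNumThreads, kMaxBlocks), 1);
}
}  // namespace launch_sketch

// A kernel written for this configuration walks the range with a grid
// stride, so it stays correct even when the block count was clamped.
__global__ void fillKernel(float *data, int n, float value) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    data[i] = value;
  }
}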
mmdet3d/ops/spconv/src/maxpool.cc
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/maxpool.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::CPU, T, Index> {
  void operator()(const tv::CPU &d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const Index> indices, int size) {
    int stride = outFeatures.dim(1);
    auto outFeaturesData = outFeatures.data();
    auto inFeaturesData = inFeatures.data();
    auto indicesIn = indices.subview(0).data();
    auto indicesOut = indices.subview(1).data();
    Index idxi, idxo;
    for (int row = 0; row < size; row++) {
      idxi = indicesIn[row] * stride;
      idxo = indicesOut[row] * stride;
      for (int plane = 0; plane < stride; ++plane)
        if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])
          outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];
    }
  }
};

template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::CPU, T, Index> {
  void operator()(const tv::CPU &d, tv::TensorView<const T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const T> dout, tv::TensorView<T> din,
                  tv::TensorView<const Index> indices, int size) {
    int stride = outFeatures.dim(1);
    auto outFeaturesData = outFeatures.data();
    auto inFeaturesData = inFeatures.data();
    auto doutData = dout.data();
    auto dinData = din.data();
    auto indicesIn = indices.subview(0).data();
    auto indicesOut = indices.subview(1).data();
    Index idxi, idxo;
    for (int row = 0; row < size; row++) {
      idxi = indicesIn[row] * stride;
      idxo = indicesOut[row] * stride;
      for (int plane = 0; plane < stride; ++plane)
        if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])
          dinData[idxi + plane] += doutData[idxo + plane];
    }
  }
};
}  // namespace functor

#define DECLARE_CPU_SPECS_T_INDEX(T, Index)                                   \
  template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>;    \
  template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>;

#define DECLARE_CPU_SPECS(T)                                                  \
  DECLARE_CPU_SPECS_T_INDEX(T, int);                                          \
  DECLARE_CPU_SPECS_T_INDEX(T, long);

DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);

#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
}  // namespace spconv
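The two CPU functors above implement sparse max pooling as a gather-compare over (input, output) index pairs: the forward pass keeps a running maximum per output feature plane, and the backward pass routes dout back only to the inputs that produced the winning maximum, detected by equality of the stored values. The self-contained sketch below reproduces that logic on plain vectors; the data and variable names are invented for illustration and it is not the spconv code itself.

// Toy reproduction of the sparse max-pool forward/backward rule used above:
// forward keeps a per-plane running max, backward credits gradient to the
// input whose value equals the stored output (the argmax), matching the
// equality test in SparseMaxPoolBackwardFunctor.
#include <cstdio>
#include <limits>
#include <vector>

int main() {
  const int numPlanes = 2;
  // Three active inputs, two active outputs, feature planes stored row-major.
  std::vector<float> in = {1.f, 5.f, 3.f, 2.f, 4.f, 0.f};
  std::vector<float> out(2 * numPlanes, std::numeric_limits<float>::lowest());
  std::vector<int> idxIn = {0, 1, 2};   // rows of `in` taking part in pooling
  std::vector<int> idxOut = {0, 0, 1};  // output row each input contributes to

  // Forward: running maximum per output plane.
  for (size_t r = 0; r < idxIn.size(); ++r)
    for (int p = 0; p < numPlanes; ++p) {
      float v = in[idxIn[r] * numPlanes + p];
      float &o = out[idxOut[r] * numPlanes + p];
      if (v > o) o = v;
    }

  // Backward: gradient flows only where input == pooled output.
  std::vector<float> dout = {1.f, 1.f, 1.f, 1.f};
  std::vector<float> din(in.size(), 0.f);
  for (size_t r = 0; r < idxIn.size(); ++r)
    for (int p = 0; p < numPlanes; ++p)
      if (in[idxIn[r] * numPlanes + p] == out[idxOut[r] * numPlanes + p])
        din[idxIn[r] * numPlanes + p] += dout[idxOut[r] * numPlanes + p];

  for (float g : din) std::printf("%.1f ", g);
  std::printf("\n");  // expected: 0.0 1.0 1.0 0.0 1.0 1.0
}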
mmdet3d/ops/spconv/src/maxpool_cuda.cu
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/maxpool.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <type_traits>
namespace spconv {
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures,
                                      const Index *indicesIn,
                                      const Index *indicesOut, int numHot,
                                      int numPlanes) {
  T in, out;
  int ILPStrideY[NumILP];
  Index idxo, idxi;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  for (int ix = blockIdx.x * blockDim.x; ix < numHot;
       ix += blockDim.x * gridDim.x) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      in = inFeatures[idxi];
      out = outFeatures[idxo];
      if (in > out) {
        outFeatures[idxo] = in;
      }
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericBlockKernel(T *outFeatures,
                                             const T *inFeatures,
                                             const Index *indicesIn,
                                             const Index *indicesOut,
                                             int numHot, int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  int ILPStrideX[NumILP];
  Index RI[NumILP];
  Index RO[NumILP];
  T in, out;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        in = inFeatures[RI[ilp] + iy];
        out = outFeatures[RO[ilp] + iy];
        if (in > out) {
          outFeatures[RO[ilp] + iy] = in;
        }
      }
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void maxPoolFwdVecBlockKernel(T *outFeatures, const T *inFeatures,
                                         const Index *indicesIn,
                                         const Index *indicesOut, int numHot,
                                         int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  int ILPStrideY[NumILP];
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
  T bufi[vecloadFactor];
  T bufo[vecloadFactor];
  Index idxi, idxo;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
       ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      reinterpret_cast<VecType *>(bufo)[0] =
          reinterpret_cast<VecType *>(outFeatures)[idxo];
      reinterpret_cast<VecType *>(bufi)[0] =
          reinterpret_cast<const VecType *>(inFeatures)[idxi];
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        if (bufi[i] > bufo[i]) {
          bufo[i] = bufi[i];
        }
      }
      reinterpret_cast<VecType *>(outFeatures)[idxo] =
          reinterpret_cast<VecType *>(bufo)[0];
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures,
                                        const Index *indicesIn,
                                        const Index *indicesOut, int numHot,
                                        int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  int ILPStrideX[NumILP];
  Index RI[NumILP];
  Index RO[NumILP];
  T in, out;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < numHot) {
        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < numHot) {
          in = inFeatures[RI[ilp] + iy];
          out = outFeatures[RO[ilp] + iy];
          if (in > out) {
            outFeatures[RO[ilp] + iy] = in;
          }
        }
      }
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdBlockKernel(const T *outFeatures,
                                      const T *inFeatures, const T *dout,
                                      T *din, const Index *indicesIn,
                                      const Index *indicesOut, int numHot,
                                      int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  T in, out;
  Index idxo, idxi;
  int ILPStrideY[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  dout += blockIdx.y * NumTLP;
  din += blockIdx.y * NumTLP;
  for (int ix = blockIdx.x * blockDim.x; ix < numHot;
       ix += blockDim.x * gridDim.x) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      in = inFeatures[idxi];
      out = outFeatures[idxo];
      if (in == out) {
        din[idxi] += dout[idxo];
      }
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures,
                                             const T *inFeatures,
                                             const T *dout, T *din,
                                             const Index *indicesIn,
                                             const Index *indicesOut,
                                             int numHot, int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  int ILPStrideX[NumILP];
  Index RI[NumILP];
  Index RO[NumILP];
  T in, out;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
      RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        in = inFeatures[RI[ilp] + iy];
        out = outFeatures[RO[ilp] + iy];
        if (in == out) {
          din[RI[ilp] + iy] += dout[RO[ilp] + iy];
        }
      }
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void maxPoolBwdVecBlockKernel(const T *outFeatures,
                                         const T *inFeatures, const T *dout,
                                         T *din, const Index *indicesIn,
                                         const Index *indicesOut, int numHot,
                                         int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  int ILPStrideY[NumILP];
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
  T bufi[vecloadFactor];
  T bufo[vecloadFactor];
  T bufdi[vecloadFactor];
  T bufdo[vecloadFactor];
  Index idxi, idxo;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
  outFeatures += blockIdx.y * NumTLP;
  inFeatures += blockIdx.y * NumTLP;
  for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
       ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      reinterpret_cast<VecType *>(bufo)[0] =
          reinterpret_cast<const VecType *>(outFeatures)[idxo];
      reinterpret_cast<VecType *>(bufi)[0] =
          reinterpret_cast<const VecType *>(inFeatures)[idxi];
      reinterpret_cast<VecType *>(bufdo)[0] =
          reinterpret_cast<const VecType *>(dout)[idxo];
      reinterpret_cast<VecType *>(bufdi)[0] =
          reinterpret_cast<VecType *>(din)[idxi];
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        if (bufi[i] == bufo[i]) {
          bufdi[i] += bufdo[i];
        }
      }
      reinterpret_cast<VecType *>(din)[idxi] =
          reinterpret_cast<VecType *>(bufdi)[0];
    }
  }
}

template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericKernel(const T *outFeatures,
                                        const T *inFeatures, const T *dout,
                                        T *din, const Index *indicesIn,
                                        const Index *indicesOut, int numHot,
                                        int numPlanes) {
  // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
  int ILPStrideX[NumILP];
  Index RI[NumILP];
  Index RO[NumILP];
  T in, out;
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < numHot) {
        RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
        RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
      }
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < numHot) {
          in = inFeatures[RI[ilp] + iy];
          out = outFeatures[RO[ilp] + iy];
          if (in == out) {
            din[RI[ilp] + iy] += dout[RO[ilp] + iy];
          }
        }
      }
    }
  }
}

namespace functor {
template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = inFeatures.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indices,
                                 &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (numHotBlock >= NumTLP) {
            maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                    indices.subview(0).data(),
                                    indices.subview(1).data(), numHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size > numHotBlock) {
            maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(
                    outFeatures.data(), inFeatures.data(),
                    indices.subview(0).data() + numHotBlock,
                    indices.subview(1).data() + numHotBlock,
                    size - numHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });
    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (numHotBlock >= NumTLP) {
        maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(),
                indices.subview(0).data(), indices.subview(1).data(),
                numHotBlock, numPlanes);
        TV_CHECK_CUDA_ERR();
      }
      if (size > numHotBlock) {
        maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(),
                indices.subview(0).data() + numHotBlock,
                indices.subview(1).data() + numHotBlock, size - numHotBlock,
                numPlanes);
        TV_CHECK_CUDA_ERR();
      }
    }
  }
};

template <typename T, typename Index>
struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::GPU &d, tv::TensorView<const T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const T> dout, tv::TensorView<T> din,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = inFeatures.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
    mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout, &din,
                                 &indices, &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (numHotBlock >= NumTLP) {
            maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), inFeatures.data(),
                                    dout.data(), din.data(),
                                    indices.subview(0).data(),
                                    indices.subview(1).data(), numHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size > numHotBlock) {
            maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(
                    outFeatures.data(), inFeatures.data(), dout.data(),
                    din.data(), indices.subview(0).data() + numHotBlock,
                    indices.subview(1).data() + numHotBlock,
                    size - numHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });
    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      int numHotBlock = (size / NumTLP) * NumTLP;
      if (numHotBlock >= NumTLP) {
        maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
            <<<dim3(size / NumTLP, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(), dout.data(),
                din.data(), indices.subview(0).data(),
                indices.subview(1).data(), numHotBlock, numPlanes);
        TV_CHECK_CUDA_ERR();
      }
      if (size > numHotBlock) {
        maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
            <<<dim3(1, tv::launch::DivUp(numPlanes, NumTLP)),
               dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
                outFeatures.data(), inFeatures.data(), dout.data(),
                din.data(), indices.subview(0).data() + numHotBlock,
                indices.subview(1).data() + numHotBlock, size - numHotBlock,
                numPlanes);
        TV_CHECK_CUDA_ERR();
      }
    }
  }
};
}  // namespace functor

#define DECLARE_GPU_SPECS_T_INDEX(T, Index)                                   \
  template struct functor::SparseMaxPoolForwardFunctor<tv::GPU, T, Index>;    \
  template struct functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, Index>;

#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);

DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);

#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
}  // namespace spconv
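The GPU functors above choose between three kernel flavors: a vectorized "VecBlock" kernel when numPlanes is a multiple of the tile size NumTLP, a "GenericBlock" kernel for full tiles without vectorization, and a "Generic" tail kernel for the remaining rows. The vector width comes from reinterpreting the feature pointers as int4 (or int2 for at::Half), so vecloadFactor = sizeof(VecType) / sizeof(T) is 4 for float and half. The host-side sketch below just replays that parameter arithmetic; the tile size of 64 and the use of __half as a stand-in for at::Half are assumptions for illustration.

// Sketch of the tiling arithmetic used by the GPU max-pool functors above:
// the vector type is chosen so one 128-bit (or 64-bit for half) load moves
// several features at once, and the per-thread ILP factor is NumTLP / 4.
// Host-only CUDA C++, illustrative values.
#include <cstdio>
#include <cuda_runtime.h>  // int2 / int4 vector types
#include <cuda_fp16.h>     // __half as a stand-in for at::Half
#include <type_traits>

template <typename T>
using vecload_type_t =
    std::conditional_t<std::is_same<T, __half>::value, int2, int4>;

template <typename T>
void report(const char *name, int numPlanes, int numActive) {
  constexpr int NumTLP = 64;          // tile size tried first
  constexpr int NumILP = NumTLP / 4;  // independent items per thread
  constexpr int vecloadFactor = sizeof(vecload_type_t<T>) / sizeof(T);
  int numHotBlock = (numActive / NumTLP) * NumTLP;  // rows covered by full tiles
  std::printf("%s: planes=%d vec=%d ilp=%d fullTileRows=%d tailRows=%d "
              "vectorized=%s\n",
              name, numPlanes, vecloadFactor, NumILP, numHotBlock,
              numActive - numHotBlock,
              numPlanes % NumTLP == 0 ? "yes" : "no");
}

int main() {
  report<float>("float", 128, 1000);  // 128 % 64 == 0 -> vectorized path
  report<float>("float", 96, 1000);   // 96 % 64 != 0 -> generic fallback
  report<__half>("half", 128, 70);    // int2 loads, factor 4 as well
}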
mmdet3d/ops/spconv/src/reordering.cc
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/reordering.h>
#include <torch/script.h>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseGatherFunctor<tv::CPU, T, Index> {
  void operator()(const tv::CPU &d, tv::TensorView<T> buffer,
                  tv::TensorView<const T> features,
                  tv::TensorView<const Index> indices, int size) {
    int numPlanes = features.dim(1);
    for (int i = 0; i < size; ++i) {
      std::memcpy(buffer.data() + i * numPlanes,
                  features.data() + indices[i] * numPlanes,
                  sizeof(T) * numPlanes);
    }
  }
};

template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::CPU, T, Index> {
  void operator()(const tv::CPU &d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> buffer,
                  tv::TensorView<const Index> indices, int size,
                  bool stable) {
    int numPlanes = outFeatures.dim(1);
    const T *buf = buffer.data();
    T *out = outFeatures.data();
    for (int i = 0; i < size; ++i) {
      buf = buffer.data() + i * numPlanes;
      out = outFeatures.data() + indices[i] * numPlanes;
      for (int j = 0; j < numPlanes; ++j) {
        out[j] += buf[j];
      }
    }
  }
};
}  // namespace functor

#define DECLARE_CPU_SPECS_T_INDEX(T, Index)                                   \
  template struct functor::SparseGatherFunctor<tv::CPU, T, Index>;            \
  template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;

#define DECLARE_CPU_SPECS(T)                                                  \
  DECLARE_CPU_SPECS_T_INDEX(T, int);                                          \
  DECLARE_CPU_SPECS_T_INDEX(T, long);

DECLARE_CPU_SPECS(float);
DECLARE_CPU_SPECS(double);
DECLARE_CPU_SPECS(at::Half);

#undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX
}  // namespace spconv
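SparseGatherFunctor and SparseScatterAddFunctor are the CPU half of the gather-GEMM-scatter scheme behind the indice convolution: feature rows are copied into a contiguous buffer in rulebook order, a dense matrix multiply runs per kernel offset elsewhere (see spconv_ops.h), and the results are accumulated back into the output rows. The toy below reproduces only the gather and scatter-add data movement with invented data; it is not the spconv code.

// Gather rows into a contiguous buffer and scatter-add them back, the same
// data movement performed by SparseGatherFunctor / SparseScatterAddFunctor.
// Illustrative only; the real functors operate on tv::TensorView.
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  const int numPlanes = 3;
  std::vector<float> features = {0, 1, 2, 10, 11, 12, 20, 21, 22};
  std::vector<int> indices = {2, 0};  // rulebook order: row 2 first, then row 0

  // Gather: buffer[i] = features[indices[i]] (row copy, like the memcpy above).
  std::vector<float> buffer(indices.size() * numPlanes);
  for (size_t i = 0; i < indices.size(); ++i)
    std::memcpy(buffer.data() + i * numPlanes,
                features.data() + indices[i] * numPlanes,
                sizeof(float) * numPlanes);

  // (A per-offset GEMM would run on `buffer` here.)

  // Scatter-add: out[indices[i]] += buffer[i], accumulating overlaps.
  std::vector<float> out(features.size(), 0.f);
  for (size_t i = 0; i < indices.size(); ++i)
    for (int j = 0; j < numPlanes; ++j)
      out[indices[i] * numPlanes + j] += buffer[i * numPlanes + j];

  for (float v : out) std::printf("%.0f ", v);
  std::printf("\n");  // expected: 0 1 2 0 0 0 20 21 22
}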
mmdet3d/ops/spconv/src/reordering_cuda.cu
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/reordering.h>
#include <spconv/reordering.cu.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct SparseGatherFunctor<tv::GPU, T, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
                  tv::TensorView<const T> features,
                  tv::TensorView<const Index> indices, int size) {
    if (size <= 0) return;
    int numPlanes = features.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
    mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
                                 &notFound](auto NumTLP) {
      constexpr int NumILP = NumTLP / 4;
      // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
      int nHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (nHotBlock >= NumTLP) {
            gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(buffer.data(), features.data(),
                                    indices.data(), nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size - nHotBlock > 0) {
            gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
                <<<dim3(1, numPlanes / NumTLP),
                   dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
                   d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
                                    features.data(),
                                    indices.data() + nHotBlock,
                                    size - nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });
    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      gatherGenericKernel<T, Index, NumTLP, NumILP>
          <<<dim3(tv::launch::DivUp(size, NumTLP),
                  tv::launch::DivUp(numPlanes, NumTLP)),
             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              buffer.data(), features.data(), indices.data(), size, numPlanes);
      TV_CHECK_CUDA_ERR();
    }
  }
};

template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::GPU, T, Index> {
  using vecload_type_t =
      std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
  using kernel_block_t = mp_list_c<int, 64, 32, 16>;
  void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> buffer,
                  tv::TensorView<const Index> indices, int size,
                  bool stable) {
    if (size <= 0) return;
    int numPlanes = outFeatures.dim(1);
    bool notFound = true;
    constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
    // important for half.
    mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
                                 &notFound](auto NumTLP) {
      // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
      constexpr int NumILP = NumTLP / 4;
      int nHotBlock = (size / NumTLP) * NumTLP;
      if (notFound) {
        if (numPlanes % NumTLP == 0) {
          if (nHotBlock >= NumTLP) {
            scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
                                     vecload_type_t>
                <<<dim3(numPlanes / NumTLP, size / NumTLP),
                   dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
                   d.getStream()>>>(outFeatures.data(), buffer.data(),
                                    indices.data(), nHotBlock,
                                    numPlanes / vecloadFactor);
            TV_CHECK_CUDA_ERR();
          }
          if (size - nHotBlock > 0) {
            scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>
                <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
                   0, d.getStream()>>>(
                    outFeatures.data(),
                    buffer.data() + nHotBlock * numPlanes,
                    indices.data() + nHotBlock, size - nHotBlock, numPlanes);
            TV_CHECK_CUDA_ERR();
          }
          notFound = false;
        }
      }
    });
    if (notFound) {
      constexpr int NumTLP = 64;
      constexpr int NumILP = NumTLP / 4;
      scatterAddGenericKernel<T, Index, NumTLP, NumILP>
          <<<dim3(tv::launch::DivUp(size, NumTLP),
                  tv::launch::DivUp(numPlanes, NumTLP)),
             dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
              outFeatures.data(), buffer.data(), indices.data(), size,
              numPlanes);
      TV_CHECK_CUDA_ERR();
    }
  }
};
}  // namespace functor

#define DECLARE_GPU_SPECS_T_INDEX(T, Index)                                   \
  template struct functor::SparseGatherFunctor<tv::GPU, T, Index>;            \
  template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;

#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);

DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);

#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
}  // namespace spconv
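Both CUDA files in this commit dispatch over candidate tile sizes with mp_for_each<mp_list_c<int, 64, 32, 16>> from spconv/mp_helper.h: the generic lambda is instantiated once per candidate NumTLP, and the notFound flag ensures only the first size that divides numPlanes actually launches a kernel. The same compile-time "try each size, take the first that fits" dispatch can be written with a C++17 fold expression; the sketch below is an equivalent pattern under assumed names, not the mp_helper implementation.

// Compile-time "try each tile size, launch the first that fits" dispatch,
// equivalent in spirit to the mp_for_each loops above. Plain C++17; the
// tile sizes and the work inside the lambda are illustrative.
#include <cstdio>
#include <utility>

template <int... TileSizes, typename F>
void for_each_tile(std::integer_sequence<int, TileSizes...>, F &&f) {
  // Expands to f(c<64>), f(c<32>), f(c<16>): one instantiation per size.
  (f(std::integral_constant<int, TileSizes>{}), ...);
}

int main() {
  int numPlanes = 32;
  bool notFound = true;
  for_each_tile(std::integer_sequence<int, 64, 32, 16>{}, [&](auto NumTLP) {
    constexpr int tile = decltype(NumTLP)::value;
    constexpr int NumILP = tile / 4;  // same ILP rule as the functors above
    if (notFound && numPlanes % tile == 0) {
      // A real kernel launch specialized on `tile` would go here.
      std::printf("picked NumTLP=%d NumILP=%d\n", tile, NumILP);
      notFound = false;
    }
  });
  if (notFound) std::printf("fall back to the generic kernel\n");
}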