Commit 01ed382c authored by yan.yan's avatar yan.yan
Browse files

working on tensor core test

parent 3517290c
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/box_iou.h>
#include <spconv/nms.h>
#include <spconv/point2voxel.h>
namespace py = pybind11;
using namespace pybind11::literals;
// Python extension module `spconv_utils`: exposes the NMS, rotated-box IoU and
// point-to-voxel helpers declared in the spconv headers included above.
//
// Every function is registered twice, once per floating-point element type
// (float and double), so pybind11 overload resolution can pick the variant
// matching the dtype of the arrays passed from Python.
//
// NOTE(review): the `"name"_a = N` expressions give each keyword argument a
// pybind11 default value N. The values look like positional placeholders
// rather than meaningful defaults -- confirm before relying on them.
// NOTE(review): several docstrings ("bbox iou", "matrix tensor_square") look
// copy-pasted between bindings; they are left untouched here because they are
// visible at runtime through Python's help().
PYBIND11_MODULE(spconv_utils, m) {
  m.doc() = "util pybind11 functions for spconv";
#ifdef TV_CUDA
  // GPU NMS is only available when the extension is built with TV_CUDA.
  m.def("non_max_suppression", &spconv::non_max_suppression<double>,
        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
        "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
  m.def("non_max_suppression", &spconv::non_max_suppression<float>,
        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
        "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
#endif
  m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<double>,
        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
        "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
  m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<float>,
        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
        "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
  m.def("rotate_non_max_suppression_cpu",
        &spconv::rotate_non_max_suppression_cpu<float>,
        py::return_value_policy::reference_internal, "bbox iou",
        "box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3,
        "thresh"_a = 4);
  m.def("rotate_non_max_suppression_cpu",
        &spconv::rotate_non_max_suppression_cpu<double>,
        py::return_value_policy::reference_internal, "bbox iou",
        "box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3,
        "thresh"_a = 4);
  m.def("rbbox_iou", &spconv::rbbox_iou<double>,
        py::return_value_policy::reference_internal, "rbbox iou",
        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
        "standup_thresh"_a = 4);
  m.def("rbbox_iou", &spconv::rbbox_iou<float>,
        py::return_value_policy::reference_internal, "rbbox iou",
        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
        "standup_thresh"_a = 4);
  m.def("rbbox_intersection", &spconv::rbbox_intersection<double>,
        py::return_value_policy::reference_internal, "rbbox iou",
        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
        "standup_thresh"_a = 4);
  m.def("rbbox_intersection", &spconv::rbbox_intersection<float>,
        py::return_value_policy::reference_internal, "rbbox iou",
        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
        "standup_thresh"_a = 4);
  m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<float, 3>,
        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
        "voxel_point_mask"_a = 3, "coors"_a = 4, "num_points_per_voxel"_a = 5,
        "coor_to_voxelidx"_a = 6, "voxel_size"_a = 7, "coors_range"_a = 8,
        "max_points"_a = 9, "max_voxels"_a = 10);
  m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<double, 3>,
        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
        "voxel_point_mask"_a = 3, "coors"_a = 4, "num_points_per_voxel"_a = 5,
        "coor_to_voxelidx"_a = 6, "voxel_size"_a = 7, "coors_range"_a = 8,
        "max_points"_a = 9, "max_voxels"_a = 10);
  m.def("points_to_voxel_3d_np_mean",
        &spconv::points_to_voxel_3d_np_mean<float, 3>, "matrix tensor_square",
        "points"_a = 1, "voxels"_a = 2, "voxel_point_mask"_a = 3, "means"_a = 4,
        "coors"_a = 5, "num_points_per_voxel"_a = 6, "coor_to_voxelidx"_a = 7,
        "voxel_size"_a = 8, "coors_range"_a = 9, "max_points"_a = 10,
        "max_voxels"_a = 11);
  m.def("points_to_voxel_3d_np_mean",
        &spconv::points_to_voxel_3d_np_mean<double, 3>, "matrix tensor_square",
        "points"_a = 1, "voxels"_a = 2, "voxel_point_mask"_a = 3, "means"_a = 4,
        "coors"_a = 5, "num_points_per_voxel"_a = 6, "coor_to_voxelidx"_a = 7,
        "voxel_size"_a = 8, "coors_range"_a = 9, "max_points"_a = 10,
        "max_voxels"_a = 11);
  m.def("points_to_voxel_3d_with_filtering",
        &spconv::points_to_voxel_3d_with_filtering<float, 3>,
        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
        "voxel_point_mask"_a = 3, "voxel_mask"_a = 4, "mins"_a = 5,
        "maxs"_a = 6, "coors"_a = 7, "num_points_per_voxel"_a = 8,
        "coor_to_voxelidx"_a = 9, "voxel_size"_a = 10, "coors_range"_a = 11,
        "max_points"_a = 12, "max_voxels"_a = 13, "block_factor"_a = 14,
        "block_size"_a = 15, "height_threshold"_a = 16,
        "height_high_threshold"_a = 17);
  // Double-precision overload. This used to register <float, 3> a second
  // time, which left the function without a native double variant, unlike
  // every other float/double pair in this module.
  m.def("points_to_voxel_3d_with_filtering",
        &spconv::points_to_voxel_3d_with_filtering<double, 3>,
        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
        "voxel_point_mask"_a = 3, "voxel_mask"_a = 4, "mins"_a = 5,
        "maxs"_a = 6, "coors"_a = 7, "num_points_per_voxel"_a = 8,
        "coor_to_voxelidx"_a = 9, "voxel_size"_a = 10, "coors_range"_a = 11,
        "max_points"_a = 12, "max_voxels"_a = 13, "block_factor"_a = 14,
        "block_size"_a = 15, "height_threshold"_a = 16,
        "height_high_threshold"_a = 17);
}
\ No newline at end of file
// ------------------------------------------------------------------
// Deformable Convolutional Networks
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License
// Modified from MATLAB Faster R-CNN
// (https://github.com/shaoqingren/faster_rcnn)
// ------------------------------------------------------------------
#include <cuda_runtime.h>
#include <iostream>
#include <spconv/nms_gpu.h>
#include <vector>
// Evaluates a CUDA runtime call once and, when the result is not cudaSuccess,
// prints the corresponding error string to std::cout. The macro only reports:
// it does not abort, throw, or return, so execution continues after a failure.
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do { \
cudaError_t error = condition; \
if (error != cudaSuccess) { \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
} while (0)
// Integer division of m by n, rounded up when there is a positive remainder.
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
// Number of bits in an unsigned long long. It is used below as the
// BLOCK_THREADS template argument, so one mask word has one bit per thread
// of a block.
int const threadsPerBlock = sizeof(unsigned long long) * 8;
// Intersection-over-union of two axis-aligned boxes.
//
// Each box is read as four values [x1, y1, x2, y2]. Extents are computed with
// a "+ 1", i.e. the coordinates are treated as inclusive.
//
// @param a  pointer to the first box (at least 4 readable values)
// @param b  pointer to the second box (at least 4 readable values)
// @return   overlap area divided by the area of the union
template <typename DType>
__device__ inline DType devIoU(DType const *const a, DType const *const b) {
  // Corners of the overlap rectangle.
  const DType x_lo = max(a[0], b[0]);
  const DType x_hi = min(a[2], b[2]);
  const DType y_lo = max(a[1], b[1]);
  const DType y_hi = min(a[3], b[3]);

  // Clamp to zero so disjoint boxes yield an empty intersection.
  const DType overlap_w = max(x_hi - x_lo + 1, 0.f);
  const DType overlap_h = max(y_hi - y_lo + 1, 0.f);
  const DType overlap_area = overlap_w * overlap_h;

  const DType area_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  const DType area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return overlap_area / (area_a + area_b - overlap_area);
}
// Pairwise-overlap kernel for NMS.
//
// The boxes are split into tiles of BLOCK_THREADS boxes. A thread block
// (blockIdx.y, blockIdx.x) compares the "row" tile blockIdx.y against the
// "column" tile blockIdx.x; each thread owns one row box and produces one
// unsigned long long word whose bit i is set when the row box overlaps
// column box i of the tile by more than nms_overlap_thresh.
//
// n_boxes             number of boxes in dev_boxes
// nms_overlap_thresh  IoU above which a pair counts as overlapping
// dev_boxes           box data, read with a fixed stride of 5 values per box;
//                     devIoU uses the first four of them
//                     (NOTE(review): meaning of the 5th value is not visible
//                     here -- presumably a score, confirm with the caller)
// dev_mask            output, indexed as [box_index * col_blocks + column_tile]
//
// The host launcher in this file starts it with BLOCK_THREADS threads per
// block and a DIVUP(n_boxes, BLOCK_THREADS)-squared grid.
template <typename DType, int BLOCK_THREADS>
__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
const DType *dev_boxes,
unsigned long long *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
// The last tile in each direction may hold fewer than BLOCK_THREADS boxes.
const int row_size = min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
const int col_size = min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
// Stage the column tile in shared memory; every thread copies one box.
__shared__ DType block_boxes[BLOCK_THREADS * 5];
if (threadIdx.x < col_size) {
#pragma unroll
for (int i = 0; i < 5; ++i) {
block_boxes[threadIdx.x * 5 + i] =
dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
}
}
// All column boxes must be staged before any thread starts comparing.
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
const DType *cur_box = dev_boxes + cur_box_idx * 5;
unsigned long long t = 0;
int start = 0;
// On the diagonal tile only compare against boxes with a larger index, so a
// box is never compared with itself or with an earlier box of the same tile.
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (int i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
const int col_blocks = DIVUP(n_boxes, BLOCK_THREADS);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
// Selects `device_id` as the current CUDA device, issuing cudaSetDevice only
// when the device reported by cudaGetDevice differs from the requested one.
// Failures are reported through CUDA_CHECK.
void _set_device(int device_id) {
  int active_device;
  CUDA_CHECK(cudaGetDevice(&active_device));
  if (active_device != device_id) {
    // The call to cudaSetDevice must come before any calls to Get, which
    // may perform initialization using the GPU.
    CUDA_CHECK(cudaSetDevice(device_id));
  }
}
template <typename DType, int BLOCK_THREADS>
int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
int boxes_dim, DType nms_overlap_thresh, int device_id) {
_set_device(device_id);
DType *boxes_dev = NULL;
unsigned long long *mask_dev = NULL;
const int col_blocks = DIVUP(boxes_num, BLOCK_THREADS);
CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(DType)));
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
boxes_num * boxes_dim * sizeof(DType),
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMalloc(&mask_dev,
boxes_num * col_blocks * sizeof(unsigned long long)));
dim3 blocks(DIVUP(boxes_num, BLOCK_THREADS), DIVUP(boxes_num, BLOCK_THREADS));
dim3 threads(BLOCK_THREADS);
nms_kernel<DType, BLOCK_THREADS>
<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev, mask_dev);
std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev,
sizeof(unsigned long long) * boxes_num * col_blocks,
cudaMemcpyDeviceToHost));
std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / BLOCK_THREADS;
int inblock = i % BLOCK_THREADS;
if (!(remv[nblock] & (1ULL << inblock))) {
keep_out[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j];
}
}
}
CUDA_CHECK(cudaFree(boxes_dev));
CUDA_CHECK(cudaFree(mask_dev));
return num_to_keep;
}
// Explicit instantiations of _nms_gpu for float and double boxes, both with
// BLOCK_THREADS = threadsPerBlock. The template is defined in this source
// file, so these are the only variants it emits.
// template<>
template int _nms_gpu<float, threadsPerBlock>(int *keep_out,
const float *boxes_host,
int boxes_num, int boxes_dim,
float nms_overlap_thresh,
int device_id);
// template<>
template int _nms_gpu<double, threadsPerBlock>(int *keep_out,
const double *boxes_host,
int boxes_num, int boxes_dim,
double nms_overlap_thresh,
int device_id);
\ No newline at end of file
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Scratch script: sums a column of floats pasted as a multi-line string.
#
# STR is assigned twice, so only the second assignment is summed. The bare
# triple-quoted strings in between are expression statements with no effect;
# they keep alternative data sets around without using them.
STR = """
0.0016176700592041016
0.002481698989868164
0.0027854442596435547
0.0031723976135253906
0.0017437934875488281
0.0020503997802734375
0.001399993896484375
0.0016183853149414062
0.0007357597351074219
0.0008492469787597656
0.0006558895111083984
0.0007994174957275391
0.000335693359375
0.000347137451171875
"""
"""
0.003921985626220703
0.0049707889556884766
0.0052530765533447266
0.0060312747955322266
0.0036766529083251953
0.00421142578125
0.002129793167114258
0.0023038387298583984
0.0013151168823242188
0.0015285015106201172
0.0008392333984375
0.0008127689361572266
0.0002486705780029297
0.00030994415283203125
"""
# Active data set: this assignment replaces the one above.
STR = """
0.0006084442138671875
0.0005354881286621094
0.0012688636779785156
0.0012619495391845703
0.002301931381225586
0.0019693374633789062
0.0038712024688720703
0.002872467041015625
0.005068302154541016
0.0047588348388671875
0.007832765579223633
0.005643367767333984
0.005807161331176758
0.004715442657470703"""
"""
0.0004992485046386719
0.0003979206085205078
0.0013720989227294922
0.0015933513641357422
0.0027768611907958984
0.0024590492248535156
0.004837512969970703
0.004601001739501953
0.009881019592285156
0.008889913558959961
0.017162084579467773
0.009079217910766602
0.009355545043945312
0.0068836212158203125
"""
# One float per line of the active block; print their total.
nums = [float(entry) for entry in STR.strip().split("\n")]
print(sum(nums))
\ No newline at end of file
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time import time
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
import torch import torch
from torch import nn from torch import nn
from cumm import tensorview as tv
import spconv import spconv.pytorch as spconv
from spconv.utils import VoxelGeneratorV2 from spconv.utils import Point2VoxelCPU3d
def waymo_data(batch_size=1): def waymo_data(batch_size=1):
gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1, gen = Point2VoxelCPU3d([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3,
150000) 150000, 1)
# gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
# 150000)
data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz") data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
pc = data["pc"] pc = np.ascontiguousarray(data["pc"])
data = gen.generate(pc) print(pc.shape)
voxels = data["voxels"].reshape(-1, 3) voxels_tv, indices_tv, _ = gen.point_to_voxel(tv.from_numpy(pc))
coors = data["coordinates"] voxels = voxels_tv.numpy().reshape(-1, 3)
coors = indices_tv.numpy()
N = coors.shape[0] N = coors.shape[0]
coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1) coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
return voxels, coors, gen.grid_size return voxels, coors, gen.grid_size
...@@ -28,14 +44,25 @@ class Net(nn.Module): ...@@ -28,14 +44,25 @@ class Net(nn.Module):
self.net = spconv.SparseSequential( self.net = spconv.SparseSequential(
spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0", spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0",
algo=algo), algo=algo),
# spconv.SubMConv3d(32,
# 32,
# 3,
# bias=False,
# indice_key="c0",
# algo=algo),
# # nn.BatchNorm1d(32),
# # nn.ReLU(),
# # spconv.SparseConv3d(64, 64, 2, 2, bias=False,
# # algo=algo),
# spconv.SubMConv3d(32, 64, 3, bias=False, indice_key="c0",
# algo=algo),
spconv.SubMConv3d(64, spconv.SubMConv3d(64,
64, 64,
3, 3,
bias=False, bias=False,
indice_key="c0", indice_key="c0",
algo=algo), algo=algo),
# nn.BatchNorm1d(32),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2), spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(64, spconv.SubMConv3d(64,
96, 96,
...@@ -137,27 +164,111 @@ class Net(nn.Module): ...@@ -137,27 +164,111 @@ class Net(nn.Module):
self.grid) self.grid)
return self.net(x) return self.net(x)
class Net2(nn.Module):
def __init__(self, shape, algo):
super().__init__()
self.net = spconv.SparseSequential(
spconv.SubMConv3d(3, 256, 3, bias=False, indice_key="c0",
algo=algo),
# spconv.SubMConv3d(32,
# 32,
# 3,
# bias=False,
# indice_key="c0",
# algo=algo),
# # nn.BatchNorm1d(32),
# # nn.ReLU(),
# # spconv.SparseConv3d(64, 64, 2, 2, bias=False,
# # algo=algo),
# spconv.SubMConv3d(32, 64, 3, bias=False, indice_key="c0",
# algo=algo),
spconv.SubMConv3d(256,
256,
3,
bias=False,
indice_key="c0",
algo=algo),
# nn.BatchNorm1d(32),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2),
spconv.SubMConv3d(256,
512,
3,
bias=False,
indice_key="c1",
algo=algo),
spconv.SubMConv3d(512,
512,
3,
bias=False,
indice_key="c1",
algo=algo),
)
max_batch_size = 1
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
self.grid = torch.full([max_batch_size, *shape], -1,
dtype=torch.int32).cuda()
# self.grid = None
self.shape = shape
def forward(self, features, coors, batch_size):
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
self.grid)
return self.net(x)
def main(): def main():
voxels, coors, spatial_shape = waymo_data() import pickle
np.random.seed(50051)
torch.manual_seed(50051)
# voxels, coors, spatial_shape = waymo_data()
# with open("/home/yy/test_spconv.pkl", "wb") as f:
# pickle.dump((voxels, coors, spatial_shape), f)
with open("/home/yy/test_spconv.pkl", "rb") as f:
(voxels, coors, spatial_shape) = pickle.load(f)
print(spatial_shape)
print(voxels.shape)
# voxels = voxels[:100]
# coors = coors[:100]
voxels_th = torch.from_numpy(voxels).cuda().float() voxels_th = torch.from_numpy(voxels).cuda().float()
coors_th = torch.from_numpy(coors).cuda().int() coors_th = torch.from_numpy(coors).cuda().int()
voxels_th.requires_grad = True
algo = spconv.ConvAlgo.Native algo = spconv.ConvAlgo.Native
net = Net(spatial_shape[::-1], algo).cuda().eval().float() net = Net(spatial_shape, algo).cuda().eval().float()
print(coors_th.shape) print(coors_th.shape)
out = net(voxels_th, coors_th, 1) out = net(voxels_th, coors_th, 1)
print(out.spatial_shape) print(out.spatial_shape)
print(voxels.mean(), voxels.max(), voxels.min())
dout = np.random.uniform(-0.2, 0.2,
out.features.shape).astype(np.float32)
dout_t = torch.from_numpy(dout).cuda()
print(out.spatial_shape, out.features.mean(), out.features.max(), out.features.min())
times = [] times = []
with torch.no_grad(): with torch.no_grad():
for i in range(20): for i in range(20):
print("------------")
torch.cuda.synchronize() torch.cuda.synchronize()
t = time.time() t = time.time()
out = net(voxels_th, coors_th, 1) out_nograd = net(voxels_th, coors_th, 1)
torch.cuda.synchronize() torch.cuda.synchronize()
times.append(time.time() - t) times.append(time.time() - t)
print("spconv time", np.mean(times[10:]))
times = []
for i in range(10):
out = net(voxels_th, coors_th, 1)
print("------------")
torch.cuda.synchronize()
t = time.time()
out.features.backward(dout_t)
torch.cuda.synchronize()
times.append(time.time() - t)
# print((net.grid == -1).float().sum(), net.grid.numel()) # print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t) # print("spconv time", time.time() - t)
print("spconv time", np.mean(times[10:])) print("spconv bw time", np.mean(times[5:]))
if __name__ == "__main__": if __name__ == "__main__":
......
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
import spconv
from spconv.utils import VoxelGeneratorV2
def waymo_data(batch_size=1):
    """Voxelize the bundled benchmark point cloud on the CPU.

    Loads ``data/benchmark-pc.npz`` next to this file, runs it through a
    ``VoxelGeneratorV2`` and prepends a batch-index column of zeros to the
    voxel coordinates.

    Args:
        batch_size: accepted but not used by this function.

    Returns:
        Tuple ``(voxels, coors, grid_size)``: the generated voxels reshaped
        to ``(-1, 3)``, the coordinates with the extra leading column, and
        the generator's ``grid_size``.
    """
    voxelizer = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6],
                                 1, 150000)
    archive = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
    generated = voxelizer.generate(archive["pc"])
    voxels = generated["voxels"].reshape(-1, 3)
    coordinates = generated["coordinates"]
    # Single-sample benchmark: every voxel belongs to batch index 0.
    batch_column = np.full([coordinates.shape[0], 1], 0, coordinates.dtype)
    coors = np.concatenate([batch_column, coordinates], axis=1)
    return voxels, coors, voxelizer.grid_size
class Net(nn.Module):
    """Benchmark network: seven stages of two named SubMConv3d layers each.

    Channel widths go 3 -> 64 -> 96 -> 128 -> 160 -> 192 -> 224 -> 256. Both
    convolutions of stage ``s`` share ``indice_key="c<s>"`` and are named
    ``subm-<s>-0`` / ``subm-<s>-1``. Consecutive stages are separated by a
    ``SparseMaxPool3d(2, 2)`` named ``pool-<s>``.

    Args:
        shape: spatial shape handed to ``SparseConvTensor`` and used to size
            the pre-allocated grid.
        algo: convolution algorithm forwarded to every ``SubMConv3d``.
    """

    def __init__(self, shape, algo):
        super().__init__()
        widths = [3, 64, 96, 128, 160, 192, 224, 256]
        layers = []
        # Modules are created in the same order in which they run.
        for stage in range(len(widths) - 1):
            c_in = widths[stage]
            c_out = widths[stage + 1]
            key = "c%d" % stage
            if stage > 0:
                layers.append(
                    spconv.SparseMaxPool3d(2, 2, name="pool-%d" % (stage - 1)))
            layers.append(
                spconv.SubMConv3d(c_in,
                                  c_out,
                                  3,
                                  bias=False,
                                  indice_key=key,
                                  algo=algo,
                                  name="subm-%d-0" % stage))
            layers.append(
                spconv.SubMConv3d(c_out,
                                  c_out,
                                  3,
                                  bias=False,
                                  indice_key=key,
                                  algo=algo,
                                  name="subm-%d-1" % stage))
        self.net = spconv.SparseSequential(*layers)

        max_batch_size = 1
        # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
        self.grid = torch.full([max_batch_size, *shape], -1,
                               dtype=torch.int32).cuda()
        # self.grid = None
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a SparseConvTensor (benchmark mode on) and run the net."""
        sparse_input = spconv.SparseConvTensor(features,
                                               coors,
                                               self.shape,
                                               batch_size,
                                               self.grid,
                                               benchmark=True)
        return self.net(sparse_input)
def main():
    """Time 20 forward passes and print the per-layer benchmark records.

    Prints the mean wall-clock time of the last 10 passes, followed by the
    collected "time" and "indice_gen_time" entries of layer "subm-6-0".
    """
    dtype = torch.float32
    voxels, coors, spatial_shape = waymo_data()
    voxels_th = torch.from_numpy(voxels).cuda().to(dtype)
    coors_th = torch.from_numpy(coors).cuda().int()
    algo = spconv.ConvAlgo.Minkowski
    net = Net(spatial_shape[::-1], algo).cuda().eval().to(dtype)
    print(coors_th.shape)
    # One pass before the timing loop.
    out = net(voxels_th, coors_th, 1)
    print(out.spatial_shape)

    times = []
    detail_bench = {}
    detail_ind_gen_bench = {}
    with torch.no_grad():
        for _ in range(20):
            torch.cuda.synchronize()
            started = time.time()
            out = net(voxels_th, coors_th, 1)
            # Accumulate the per-layer records across iterations.
            for layer_name, record in out.benchmark_record.items():
                detail_bench.setdefault(layer_name, [])
                detail_ind_gen_bench.setdefault(layer_name, [])
                detail_bench[layer_name].extend(record["time"])
                detail_ind_gen_bench[layer_name].extend(
                    record["indice_gen_time"])
            torch.cuda.synchronize()
            times.append(time.time() - started)
    # print((net.grid == -1).float().sum(), net.grid.numel())
    # print("spconv time", time.time() - t)
    # The first 10 iterations are left out of the mean.
    print("spconv time", np.mean(times[10:]))
    print(detail_bench["subm-6-0"])
    print(detail_ind_gen_bench["subm-6-0"])


if __name__ == "__main__":
    main()
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
import spconv
from spconv.utils import VoxelGeneratorV2, VoxelGeneratorV3
def waymo_data_gpu(batch_size=1):
    """Voxelize the benchmark point cloud with VoxelGeneratorV3 and time it.

    Runs the generator once, then 200 timed repetitions, and prints the mean
    of the last 100 timings as "voxelization time".

    Args:
        batch_size: accepted but not used by this function.

    Returns:
        Tuple ``(voxels, coors, grid_size)`` where ``coors`` has a leading
        batch-index column of zeros.
    """
    print('gpu with total points available per voxel')
    archive = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
    points = torch.from_numpy(archive['pc']).cuda().float()

    def like_points(values):
        # Same dtype and device as the point cloud.
        return torch.Tensor(values).to(points.dtype).to(points.device)

    voxel_size = like_points([0.1, 0.1, 0.1])
    coors_range = like_points([-80, -80, -2, 80, 80, 6])
    gen = VoxelGeneratorV3(voxel_size,
                           coors_range,
                           max_points=200000,
                           num_features=points.shape[1],
                           dtype=points.dtype,
                           device=points.device)
    # One run before the timing loop.
    voxels, coors = gen.generate(points)
    times = []
    with torch.no_grad():
        for _ in range(200):
            torch.cuda.synchronize()
            started = time.time()
            voxels, coors = gen.generate(points)
            torch.cuda.synchronize()
            times.append(time.time() - started)
    print("voxelization time", np.mean(times[100:]))

    voxel_count = coors.shape[0]
    batch_id = torch.zeros([voxel_count, 1],
                           dtype=coors.dtype,
                           device=coors.device)
    coors = torch.cat([batch_id, coors], dim=1)
    return voxels, coors, gen.grid_size
def waymo_data_cpu(max_points_per_voxel=1, batch_size=1):
    """Voxelize the benchmark point cloud with VoxelGeneratorV2 and time it.

    Runs the generator once, then 200 timed repetitions, and prints the mean
    of the last 100 timings as "voxelization time".

    Args:
        max_points_per_voxel: forwarded to ``VoxelGeneratorV2``.
        batch_size: accepted but not used by this function.

    Returns:
        Tuple ``(voxels, coors, grid_size)`` where ``voxels`` is reshaped to
        ``(-1, 3)`` and ``coors`` has a leading batch-index column of zeros.
    """
    print('cpu with %d max points per voxel' % max_points_per_voxel)
    gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6],
                           max_points_per_voxel, 150000)
    archive = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
    pc = archive["pc"]
    # One run before the timing loop.
    result = gen.generate(pc)
    times = []
    with torch.no_grad():
        for _ in range(200):
            torch.cuda.synchronize()
            started = time.time()
            result = gen.generate(pc)
            torch.cuda.synchronize()
            times.append(time.time() - started)
    print("voxelization time", np.mean(times[100:]))

    voxels = result["voxels"].reshape(-1, 3)
    coordinates = result["coordinates"]
    batch_column = np.full([coordinates.shape[0], 1], 0, coordinates.dtype)
    coors = np.concatenate([batch_column, coordinates], axis=1)
    return voxels, coors, gen.grid_size
def get_index(coor, grid_size):
    """Flatten a coordinate into a single row-major index.

    ``coor[0]`` is the starting value; each following component is folded in
    as ``index * extent + component``, pairing ``coor[1:]`` with
    ``grid_size`` and stopping at the shorter of the two.

    Args:
        coor: indexable coordinate whose first entry is the leading index.
        grid_size: extents matching ``coor[1:]``.

    Returns:
        The flattened index, of the same kind as the elements of ``coor``.
    """
    flat = coor[0]
    for component, extent in zip(coor[1:], grid_size):
        # Out-of-place on purpose: coor[0] may be a view into the caller's data.
        flat = flat * extent + component
    return flat
def main():
    """Compare GPU and CPU voxelization of the benchmark point cloud.

    Checks that both generators report the same grid size and voxel count,
    then that every CPU voxel has a GPU voxel at the same flattened index
    whose first three features differ by less than 0.1.
    """
    voxels_gpu, coors_gpu, grid_size_gpu = waymo_data_gpu()
    voxels_cpu, coors_cpu, grid_size_cpu = waymo_data_cpu(1)
    # Extra CPU runs whose results are discarded; only their timing output is used.
    waymo_data_cpu(10)
    waymo_data_cpu(40)
    print('...')

    grid_size_gpu = grid_size_gpu[::-1]
    grid_size_cpu = grid_size_cpu[::-1]
    assert len(grid_size_gpu) == len(grid_size_cpu), "mismatch grid size"
    for axis in range(3):
        assert grid_size_gpu[axis] == grid_size_cpu[axis], "mismatch grid size"
    assert coors_gpu.shape[0] == coors_cpu.shape[0], "mismatch coors shape"

    # Flattened voxel index -> first three features of the GPU voxel.
    gpu_voxel_by_index = dict()
    for coor, voxel in zip(coors_gpu, voxels_gpu):
        flat = get_index(coor, grid_size_gpu).item()
        gpu_voxel_by_index[flat] = voxel[:3].cpu()

    for coor, voxel in zip(coors_cpu, voxels_cpu):
        flat = get_index(coor, grid_size_cpu).item()
        assert flat in gpu_voxel_by_index, "mismatch index: " + str(flat)
        assert (gpu_voxel_by_index.pop(flat) - voxel[:3]).abs().max() < 0.1, \
            "voxel diff should be smaller than voxel_size 0.1"
    print('Perfect GPU Voxelization!!!')


if __name__ == "__main__":
    main()
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
import spconv
from spconv.utils import VoxelGeneratorV3
def waymo_data(batch_size=1):
    """Voxelize the bundled benchmark point cloud with VoxelGeneratorV3.

    Args:
        batch_size: accepted but not used by this function.

    Returns:
        Tuple ``(voxels, coors, grid_size)`` where ``coors`` has a leading
        batch-index column of zeros.
    """
    archive = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
    points = torch.from_numpy(archive['pc']).cuda().float()

    def like_points(values):
        # Same dtype and device as the point cloud.
        return torch.Tensor(values).to(points.dtype).to(points.device)

    voxel_size = like_points([0.1, 0.1, 0.1])
    coors_range = like_points([-80, -80, -2, 80, 80, 6])
    gen = VoxelGeneratorV3(voxel_size, coors_range)
    voxels, coors = gen.generate(points)

    voxel_count = coors.shape[0]
    batch_id = torch.zeros([voxel_count, 1],
                           dtype=coors.dtype,
                           device=coors.device)
    coors = torch.cat([batch_id, coors], dim=1)
    return voxels, coors, gen.grid_size
class Net(nn.Module):
    """Benchmark network: seven stages of two SubMConv3d layers each.

    Channel widths go 3 -> 64 -> 96 -> 128 -> 160 -> 192 -> 224 -> 256. Both
    convolutions of stage ``s`` share ``indice_key="c<s>"``, and consecutive
    stages are separated by a ``SparseMaxPool3d(2, 2)``.

    Args:
        shape: spatial shape handed to ``SparseConvTensor`` and used to size
            the pre-allocated grid.
        algo: convolution algorithm forwarded to every ``SubMConv3d``.
        device: device on which the pre-allocated grid is created.
    """

    def __init__(self, shape, algo, device):
        super().__init__()
        self.device = device
        widths = [3, 64, 96, 128, 160, 192, 224, 256]
        layers = []
        # Modules are created in the same order in which they run.
        for stage in range(len(widths) - 1):
            c_in = widths[stage]
            c_out = widths[stage + 1]
            key = "c%d" % stage
            if stage > 0:
                layers.append(spconv.SparseMaxPool3d(2, 2))
            layers.append(
                spconv.SubMConv3d(c_in,
                                  c_out,
                                  3,
                                  bias=False,
                                  indice_key=key,
                                  algo=algo))
            layers.append(
                spconv.SubMConv3d(c_out,
                                  c_out,
                                  3,
                                  bias=False,
                                  indice_key=key,
                                  algo=algo))
        self.net = spconv.SparseSequential(*layers)

        max_batch_size = 1
        # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
        self.grid = torch.full([max_batch_size, *shape],
                               -1,
                               dtype=torch.int32,
                               device=self.device)
        # self.grid = None
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a SparseConvTensor and run the network on it."""
        sparse_input = spconv.SparseConvTensor(features, coors, self.shape,
                                               batch_size, self.grid)
        return self.net(sparse_input)
def main():
    """Time 20 forward passes of Net on GPU-voxelized benchmark data.

    Prints the coordinate shape, the output spatial shape, and the mean
    wall-clock time of the last 10 passes.
    """
    voxels, coors, spatial_shape = waymo_data()
    # The generator already returns tensors, so no host-to-device copy is needed.
    voxels_th, coors_th = voxels, coors
    algo = spconv.ConvAlgo.Native
    target_device = voxels_th.device
    net = Net(spatial_shape[::-1], algo, target_device)
    net = net.cuda(device=target_device).eval().float()
    print(coors_th.shape)
    # One pass before the timing loop.
    out = net(voxels_th, coors_th, 1)
    print(out.spatial_shape)

    times = []
    with torch.no_grad():
        for _ in range(20):
            torch.cuda.synchronize()
            started = time.time()
            out = net(voxels_th, coors_th, 1)
            torch.cuda.synchronize()
            times.append(time.time() - started)
    # print((net.grid == -1).float().sum(), net.grid.numel())
    # print("spconv time", time.time() - t)
    # The first 10 iterations are left out of the mean.
    print("spconv time", np.mean(times[10:]))


if __name__ == "__main__":
    main()
// 000-CatchMain.cpp
// In a Catch project with multiple files, dedicate one file to compile the
// source code of Catch itself and reuse the resulting object file for linking.
// Let Catch provide main():
#define CATCH_CONFIG_MAIN
#include "catch.hpp"
// That's it
// Compile implementation of Catch for use with files that do contain tests:
// - g++ -std=c++11 -Wall -I$(CATCH_SINGLE_INCLUDE) -c 000-CatchMain.cpp
// - cl -EHsc -I%CATCH_SINGLE_INCLUDE% -c 000-CatchMain.cpp
#include <algorithm>
#include <iostream>
#include <map>
#include "catch.hpp"
#include <prettyprint.h>
#include <string>
#include <vector>
#include <exception>
#include <numeric>
#include <pybind11/embed.h> // everything needed for embedding
#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <tuple>
#include <pybind11_utils.h>
#include <spconv/spconv_ops.h>
namespace py = pybind11;
// Debug-style Catch test around spconv::getIndicePair<3>.
//
// An embedded Python interpreter builds random sparse test data with NumPy
// (globals `features`, `indices`, `filters`); the section below wraps those
// arrays as torch tensors and calls getIndicePair. The test contains no
// REQUIRE/CHECK assertions, so it only verifies that the calls complete.
TEST_CASE("GetConvIndPair", "[SpConvNet]")
{
using namespace py::literals;
py::scoped_interpreter guard{}; // start the interpreter and keep it alive
// The raw string below is Python source executed in the interpreter's
// global namespace; its text is runtime data and is left untouched.
py::exec(R"(
from __future__ import print_function
import numpy as np
import math
# import spconv
# import torch
def get_convolution_output_size(input_size,
kernel_size,
stride,
padding=None,
rate=None):
ndim = len(input_size)
if padding is None:
padding = [0] * ndim
output_size = []
for i in range(ndim):
output_size.append((input_size[i] + 2 * padding[i] - (
(kernel_size[i] - 1) + 1)) // stride[i] + 1)
return output_size
def get_test_sparse_data(shape,
num_points,
num_channels,
integer=False,
dtype=np.float32):
dense_shape = shape
ndim = len(dense_shape)
# num_points = np.random.randint(10, 100, size=[batch_size, ndim])
num_points = np.array(num_points)
# num_points = np.array([3, 2])
batch_size = len(num_points)
batch_indices = []
coors_total = np.stack(
np.meshgrid(*[np.arange(0, s) for s in shape]), axis=-1)
coors_total = coors_total.reshape(-1, ndim)
for i in range(batch_size):
np.random.shuffle(coors_total)
inds_total = coors_total[:num_points[i]]
inds_total = np.pad(
inds_total, ((0, 0), (0, 1)), mode="constant", constant_values=i)
batch_indices.append(inds_total)
if integer:
sparse_data = np.random.randint(
20, 100, size=[num_points.sum(), num_channels]).astype(dtype)
else:
sparse_data = np.random.uniform(
-1, 1, size=[num_points.sum(), num_channels]).astype(dtype)
# sparse_data = np.arange(1, num_points.sum() + 1).astype(np.float32).reshape(5, 1)
dense_data = np.zeros(
[batch_size, num_channels, *dense_shape], dtype=sparse_data.dtype)
start = 0
for i, inds in enumerate(batch_indices):
for j, ind in enumerate(inds):
dense_slice = (i, slice(None), *ind[:-1])
dense_data[dense_slice] = sparse_data[start + j]
start += len(inds)
batch_indices = np.concatenate(batch_indices, axis=0)
return {
"features": sparse_data.astype(dtype),
"indices": batch_indices.astype(np.int32),
"features_dense": dense_data.astype(dtype),
}
shape = [50, 30, 30]
num_points = [5000] * 1
# np.random.seed(np.random.randint(1, 100000))
in_channels = 64
sparse_dict = get_test_sparse_data(shape, num_points, in_channels)
features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"]
# indices_t = torch.from_numpy(indices)
filters = np.random.uniform(0, 1, size=[3, 3, 3, 64, 64]).astype(np.float32)
# print(outids.shape)
)");
SECTION("DebugTest"){
// Wrap the NumPy globals created above as torch tensors without copying
// (torch::from_blob), then move them to the target device.
auto inds = array2TensorView<int>(py::array(py::globals()["indices"]));
auto inds_tensor = torch::from_blob(inds.data(), {inds.dim(0), inds.dim(1)}, torch::dtype(torch::kInt32));
// NOTE(review): despite its name, inds_gpu is placed on the CPU device here,
// while features and filters go to CUDA device 0 -- confirm this is intended.
auto inds_gpu = inds_tensor.to(torch::Device(torch::kCPU));
auto features = array2TensorView<float>(py::array(py::globals()["features"]));
auto features_tensor = torch::from_blob(features.data(), {features.dim(0), features.dim(1)}, torch::dtype(torch::kFloat));
auto features_gpu = features_tensor.to(torch::Device(torch::kCUDA, 0));
auto filters = array2TensorView<float>(py::array(py::globals()["filters"]));
auto filters_tensor = torch::from_blob(filters.data(), {filters.dim(0), filters.dim(1), filters.dim(2), filters.dim(3), filters.dim(4)}, torch::dtype(torch::kFloat));
auto filters_gpu = filters_tensor.to(torch::Device(torch::kCUDA, 0));
// NOTE(review): the argument meanings are not visible in this file; the lists
// presumably are output shape, input shape, kernel size, stride, padding,
// dilation and output padding -- verify against spconv/spconv_ops.h.
auto outputs = spconv::getIndicePair<3>(inds_gpu, 1, {46, 26, 26}, {50, 30, 30}, {3, 3, 3},
{1, 1, 1}, {0, 0, 0}, {2, 2, 2}, {0, 0, 0}, 0, 0, 0);
// std::cout << outputs[2] << std::endl;
/*
auto output = spconv::indiceConv<float>(features_gpu, filters_gpu, outputs[1], outputs[2], outputs[0].size(0), false);
std::cout << output << std::endl;*/
}
}
\ No newline at end of file
# Copyright 2019-2020 Yan Yan # Copyright 2021 Yan Yan
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -20,9 +20,9 @@ import numpy as np ...@@ -20,9 +20,9 @@ import numpy as np
import torch import torch
from torch import nn from torch import nn
import spconv import spconv.pytorch as spconv
from spconv.test_utils import TestCase, generate_sparse_data, params_grid from spconv.test_utils import TestCase, generate_sparse_data, params_grid
from spconv.constants import FILTER_HWIO
# import sparseconvnet as scn # import sparseconvnet as scn
...@@ -37,7 +37,7 @@ class SparseConv3dTestTorch(nn.Module): ...@@ -37,7 +37,7 @@ class SparseConv3dTestTorch(nn.Module):
stride, stride,
padding, padding,
dilation, dilation,
algo=spconv.ConvAlgo.Minkowski): algo=spconv.ConvAlgo.Native):
super().__init__() super().__init__()
layers = [ layers = [
spconv.SparseConv3d(in_channels, spconv.SparseConv3d(in_channels,
...@@ -47,7 +47,6 @@ class SparseConv3dTestTorch(nn.Module): ...@@ -47,7 +47,6 @@ class SparseConv3dTestTorch(nn.Module):
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False, bias=False,
use_hash=False,
algo=algo) algo=algo)
] ]
for i in range(1, num_layers): for i in range(1, num_layers):
...@@ -59,7 +58,6 @@ class SparseConv3dTestTorch(nn.Module): ...@@ -59,7 +58,6 @@ class SparseConv3dTestTorch(nn.Module):
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False, bias=False,
use_hash=False,
algo=algo)) algo=algo))
self.net = spconv.SparseSequential(*layers, ) self.net = spconv.SparseSequential(*layers, )
# self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda() # self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
...@@ -359,6 +357,9 @@ class TestSpConv(TestCase): ...@@ -359,6 +357,9 @@ class TestSpConv(TestCase):
strides = [1, 2, 3] strides = [1, 2, 3]
paddings = [0, 1, 2] paddings = [0, 1, 2]
dilations = [1, 2, 3] dilations = [1, 2, 3]
# strides = [1]
# paddings = [0]
# dilations = [1]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid( for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes, devices, shapes, batchsizes, in_channels, out_channels, ksizes,
...@@ -367,7 +368,6 @@ class TestSpConv(TestCase): ...@@ -367,7 +368,6 @@ class TestSpConv(TestCase):
continue # don't support this. continue # don't support this.
device = torch.device(dev) device = torch.device(dev)
num_points = [1000] * bs num_points = [1000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC) sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype( features = np.ascontiguousarray(sparse_dict["features"]).astype(
...@@ -375,8 +375,13 @@ class TestSpConv(TestCase): ...@@ -375,8 +375,13 @@ class TestSpConv(TestCase):
indices = np.ascontiguousarray( indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32) features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, if FILTER_HWIO:
OC]).astype(np.float32) filters = np.random.uniform(0, 1, size=[k, k, k, IC,
OC]).astype(np.float32)
else:
filters = np.random.uniform(0, 1, size=[k, k, k, OC,
IC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device) indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device) features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True features_t.requires_grad = True
...@@ -387,11 +392,19 @@ class TestSpConv(TestCase): ...@@ -387,11 +392,19 @@ class TestSpConv(TestCase):
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device) d).to(device)
filters_t = torch.from_numpy(filters).to(device) filters_t = torch.from_numpy(filters).to(device)
net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1, if FILTER_HWIO:
2).contiguous() net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
2).contiguous()
else:
net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
2).contiguous()
net.net[0].weight.data[:] = filters_t net.net[0].weight.data[:] = filters_t
out_ref = net_ref(features_dense_t) out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs).dense() out = net(features_t, indices_t, bs).dense()
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
dout = np.random.uniform(-0.2, 0.2, dout = np.random.uniform(-0.2, 0.2,
out_ref.shape).astype(features.dtype) out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device) dout_t = torch.from_numpy(dout).to(device)
...@@ -401,18 +414,21 @@ class TestSpConv(TestCase): ...@@ -401,18 +414,21 @@ class TestSpConv(TestCase):
1).contiguous() 1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long()) din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach() din = features_t.grad.detach()
din_np = din.cpu().numpy() din_np = din.cpu().numpy()
din_sparse_np = din_sparse.cpu().numpy() din_sparse_np = din_sparse.cpu().numpy()
self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
for layer, layer_ref in zip(net.net, net_ref.net): for layer, layer_ref in zip(net.net, net_ref.net):
dw = layer.weight.grad.detach().cpu().numpy() dw = layer.weight.grad.detach().cpu().numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().numpy() dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
dw = dw.transpose(4, 3, 0, 1, 2) if FILTER_HWIO:
dw = dw.transpose(4, 3, 0, 1, 2)
else:
dw = dw.transpose(3, 4, 0, 1, 2)
self.assertAllClose(dw, dw_ref, atol=1e-4) self.assertAllClose(dw, dw_ref, atol=1e-4)
self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpDeConv3d(self): def testSpDeConv3d(self):
np.random.seed(484) np.random.seed(484)
...@@ -454,7 +470,7 @@ class TestSpConv(TestCase): ...@@ -454,7 +470,7 @@ class TestSpConv(TestCase):
net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device) d).to(device)
filters_t = torch.from_numpy(filters).to(device) filters_t = torch.from_numpy(filters).to(device)
net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1, net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
2).contiguous() 2).contiguous()
net.net[0].weight.data[:] = filters_t net.net[0].weight.data[:] = filters_t
out_ref = net_ref(features_dense_t) out_ref = net_ref(features_dense_t)
...@@ -474,7 +490,7 @@ class TestSpConv(TestCase): ...@@ -474,7 +490,7 @@ class TestSpConv(TestCase):
for layer, layer_ref in zip(net.net, net_ref.net): for layer, layer_ref in zip(net.net, net_ref.net):
dw = layer.weight.grad.detach().cpu().numpy() dw = layer.weight.grad.detach().cpu().numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().numpy() dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
dw = dw.transpose(3, 4, 0, 1, 2) dw = dw.transpose(4, 3, 0, 1, 2)
self.assertAllClose(dw, dw_ref, atol=1e-4) self.assertAllClose(dw, dw_ref, atol=1e-4)
out_np = out.detach().cpu().numpy() out_np = out.detach().cpu().numpy()
...@@ -551,12 +567,16 @@ class TestSpConv(TestCase): ...@@ -551,12 +567,16 @@ class TestSpConv(TestCase):
shapes = [[19, 18, 17]] shapes = [[19, 18, 17]]
batchsizes = [1, 2] batchsizes = [1, 2]
in_channels = [62] in_channels = [64]
out_channels = [62] out_channels = [64]
ksizes = [2, 3] ksizes = [2, 3]
strides = [1, 2, 3] strides = [1, 2, 3]
paddings = [0, 1] paddings = [0, 1]
dilations = [1, 2, 3] dilations = [1, 2, 3]
ksizes = [2]
strides = [2]
paddings = [0]
dilations = [1]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid( for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes, devices, shapes, batchsizes, in_channels, out_channels, ksizes,
...@@ -565,6 +585,7 @@ class TestSpConv(TestCase): ...@@ -565,6 +585,7 @@ class TestSpConv(TestCase):
continue # don't support this. continue # don't support this.
device = torch.device(dev) device = torch.device(dev)
num_points = [1000] * bs num_points = [1000] * bs
# when data contains negative, sparse maxpool is not equal to dense maxpool. # when data contains negative, sparse maxpool is not equal to dense maxpool.
sparse_dict = generate_sparse_data(shape, sparse_dict = generate_sparse_data(shape,
num_points, num_points,
...@@ -576,8 +597,8 @@ class TestSpConv(TestCase): ...@@ -576,8 +597,8 @@ class TestSpConv(TestCase):
indices = np.ascontiguousarray( indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32) features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC, filters = np.random.uniform(0, 1, size=[k, k, k, OC,
OC]).astype(np.float32) IC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device) indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device) features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True features_t.requires_grad = True
...@@ -588,11 +609,15 @@ class TestSpConv(TestCase): ...@@ -588,11 +609,15 @@ class TestSpConv(TestCase):
out_ref = net_ref(features_dense_t) out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs) out = net(features_t, indices_t, bs)
outids = out.indices outids = out.indices
outfeatures = out.features outfeatures = out.features
outids_dev = outids.float() outids_dev = outids.float()
out_dense = out.dense(channels_first=False) out_dense = out.dense(channels_first=False)
out = out_dense.permute(0, 4, 1, 2, 3).contiguous() out = out_dense.permute(0, 4, 1, 2, 3).contiguous()
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
dout_sparse = np.random.uniform( dout_sparse = np.random.uniform(
-0.2, 0.2, outfeatures.shape).astype(features.dtype) -0.2, 0.2, outfeatures.shape).astype(features.dtype)
...@@ -607,9 +632,6 @@ class TestSpConv(TestCase): ...@@ -607,9 +632,6 @@ class TestSpConv(TestCase):
din_sparse = gather_nd(din_dense, indices_t.long()) din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach() din = features_t.grad.detach()
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
din_np = din.cpu().numpy() din_np = din.cpu().numpy()
din_sparse_np = din_sparse.cpu().numpy() din_sparse_np = din_sparse.cpu().numpy()
self.assertAllClose(din_np, din_sparse_np, atol=1e-4) self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Subproject commit fd7e058d0cb3e4bf743edc530c7778a210cb168b
Subproject commit 29764aad4881fde809af6a025c12012e47a55515
Subproject commit 3b1dbebabc801c9cf6f0953a4c20b904d444f879
<!--
Copyright 2021 Yan Yan
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
## How to debug manylinux build
```Bash
docker run --rm -it -e PLAT=manylinux2014_x86_64 -v `pwd`:/io -v $HOME:/myhome scrin/manylinux2014-cuda:cu114-devel bash
/io/tools/build-wheels.sh
```
\ No newline at end of file
#!/bin/bash
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e -u -x
# Repair a single wheel with auditwheel.
#   $1 - path of the wheel to repair
#   $2 - directory that receives the repaired wheel
# Reads the PLAT environment variable for the target platform tag. A wheel for
# which `auditwheel show` fails is reported and skipped instead of repaired.
function repair_wheel {
    # `local` keeps these names from leaking into the caller's global scope.
    local wheel="$1"
    local outpath="$2"
    if ! auditwheel show "$wheel"; then
        echo "Skipping non-platform wheel $wheel"
    else
        auditwheel repair "$wheel" --plat "$PLAT" -w "$outpath"
    fi
}
export SPCONV_DISABLE_JIT="1"
export CUMM_CUDA_ARCH_LIST="all"
# Compile wheels, we only support 3.7-3.10.
# "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
"/opt/python/cp37-cp37m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
"/opt/python/cp38-cp38/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
"/opt/python/cp39-cp39/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
"/opt/python/cp310-cp310/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
# Bundle external shared libraries into the wheels
for whl in /io/wheelhouse_tmp/*.whl; do
repair_wheel "$whl" /io/dist
done
rm -rf /io/wheelhouse_tmp
\ No newline at end of file
## -------------------
## Constants
## -------------------
# Dictionary of known CUDA versions and their download URLs, which do not follow a consistent pattern :(
$CUDA_KNOWN_URLS = @{
"10.2" = "http://developer.download.nvidia.com/compute/cuda/10.2/Prod/network_installers/cuda_10.2.89_win10_network.exe";
"11.0" = "http://developer.download.nvidia.com/compute/cuda/11.0.3/network_installers/cuda_11.0.3_win10_network.exe";
"11.1" = "https://developer.download.nvidia.com/compute/cuda/11.1.1/network_installers/cuda_11.1.1_win10_network.exe";
"11.2" = "https://developer.download.nvidia.com/compute/cuda/11.2.2/network_installers/cuda_11.2.2_win10_network.exe";
"11.3" = "https://developer.download.nvidia.com/compute/cuda/11.3.1/network_installers/cuda_11.3.1_win10_network.exe";
"11.4" = "https://developer.download.nvidia.com/compute/cuda/11.4.2/network_installers/cuda_11.4.2_win10_network.exe";
}
# cuda_runtime.h is in nvcc <= 10.2, but cudart >= 11.0
# @todo - make this easier to vary per CUDA version.
$CUDA_PACKAGES_IN = @(
"nvcc";
"visual_studio_integration";
"curand_dev";
"nvrtc_dev";
"cudart";
)
## -------------------
## Select CUDA version
## -------------------
# Get the cuda version from the environment as env:cuda.
$CUDA_VERSION_FULL = $env:cuda
# Make sure CUDA_VERSION_FULL is set and valid, otherwise error.
# Validate CUDA version, extracting components via regex
$cuda_ver_matched = $CUDA_VERSION_FULL -match "^(?<major>[1-9][0-9]*)\.(?<minor>[0-9]+)$"
if(-not $cuda_ver_matched){
Write-Output "Invalid CUDA version specified, <major>.<minor> required. '$CUDA_VERSION_FULL'."
exit 1
}
$CUDA_MAJOR=$Matches.major
$CUDA_MINOR=$Matches.minor
## ------------------------------------------------
## Select CUDA packages to install from environment
## ------------------------------------------------
$CUDA_PACKAGES = ""
# for CUDA >= 11 cudart is a required package.
# if([version]$CUDA_VERSION_FULL -ge [version]"11.0") {
# if(-not $CUDA_PACKAGES_IN -contains "cudart") {
# $CUDA_PACKAGES_IN += 'cudart'
# }
# }
Foreach ($package in $CUDA_PACKAGES_IN) {
# Make sure the correct package name is used for nvcc.
if($package -eq "nvcc" -and [version]$CUDA_VERSION_FULL -lt [version]"9.1"){
$package="compiler"
} elseif($package -eq "compiler" -and [version]$CUDA_VERSION_FULL -ge [version]"9.1") {
$package="nvcc"
}
$CUDA_PACKAGES += " $($package)_$($CUDA_MAJOR).$($CUDA_MINOR)"
}
echo "$($CUDA_PACKAGES)"
## -----------------
## Prepare download
## -----------------
# Select the download link if known, otherwise have a guess.
# Inputs:  $CUDA_KNOWN_URLS, $CUDA_VERSION_FULL, $CUDA_MAJOR, $CUDA_MINOR.
# Outputs: $CUDA_REPO_PKG_REMOTE (installer URL) and $CUDA_REPO_PKG_LOCAL
#          (file name the installer is saved under).
$CUDA_REPO_PKG_REMOTE=""
if($CUDA_KNOWN_URLS.containsKey($CUDA_VERSION_FULL)){
    $CUDA_REPO_PKG_REMOTE=$CUDA_KNOWN_URLS[$CUDA_VERSION_FULL]
} else{
    # Guess what the url is given the most recent pattern (at the time of writing, 10.1)
    # Use the $(...) subexpression form here: "${$NAME}" would look up a variable
    # whose name literally starts with '$' and therefore expand to nothing.
    Write-Output "note: URL for CUDA $($CUDA_VERSION_FULL) not known, estimating."
    $CUDA_REPO_PKG_REMOTE="http://developer.download.nvidia.com/compute/cuda/$($CUDA_MAJOR).$($CUDA_MINOR)/Prod/network_installers/cuda_$($CUDA_VERSION_FULL)_win10_network.exe"
}
$CUDA_REPO_PKG_LOCAL="cuda_$($CUDA_VERSION_FULL)_win10_network.exe"
## ------------
## Install CUDA
## ------------
# Get CUDA network installer
Write-Output "Downloading CUDA Network Installer for $($CUDA_VERSION_FULL) from: $($CUDA_REPO_PKG_REMOTE)"
Invoke-WebRequest $CUDA_REPO_PKG_REMOTE -OutFile $CUDA_REPO_PKG_LOCAL | Out-Null
if(Test-Path -Path $CUDA_REPO_PKG_LOCAL){
Write-Output "Downloading Complete"
} else {
Write-Output "Error: Failed to download $($CUDA_REPO_PKG_LOCAL) from $($CUDA_REPO_PKG_REMOTE)"
exit 1
}
# Invoke silent install of CUDA (via network installer).
Write-Output "Installing CUDA $($CUDA_VERSION_FULL). Subpackages $($CUDA_PACKAGES)"
# -PassThru returns the process object so the installer's own exit code can be
# inspected. Start-Process does not populate $LASTEXITCODE, and $? only tells
# whether the cmdlet itself ran, not whether the installer succeeded.
$cuda_installer = Start-Process -Wait -PassThru -FilePath .\"$($CUDA_REPO_PKG_LOCAL)" -ArgumentList "-s $($CUDA_PACKAGES)"

# Check the return status of the CUDA installer.
if ($null -eq $cuda_installer) {
    # No process object means the installer could not be launched at all.
    Write-Output "Error: CUDA installer could not be started."
    exit 1
}
if ($cuda_installer.ExitCode -ne 0) {
    Write-Output "Error: CUDA installer reported error. $($cuda_installer.ExitCode)"
    exit 1
}
# Store the CUDA_PATH in the environment for the current session, to be forwarded in the action.
# $CUDA_PATH is the toolkit directory for this version; $CUDA_PATH_VX_Y is the
# *name* of the matching versioned variable (CUDA_PATH_V<major>_<minor>).
$CUDA_PATH = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$($CUDA_MAJOR).$($CUDA_MINOR)"
$CUDA_PATH_VX_Y = "CUDA_PATH_V$($CUDA_MAJOR)_$($CUDA_MINOR)"
# Set environment variables in this session.
# Note: $env:CUDA_PATH_VX_Y receives the versioned variable's name, not a path;
# it is expanded further down to build the "<name>=<path>" line.
$env:CUDA_PATH = "$($CUDA_PATH)"
$env:CUDA_PATH_VX_Y = "$($CUDA_PATH_VX_Y)"
Write-Output "CUDA_PATH $($CUDA_PATH)"
Write-Output "CUDA_PATH_VX_Y $($CUDA_PATH_VX_Y)"
# PATH needs updating elsewhere, anything in here won't persist.
# For that reason PATH is not modified directly in this script; the bin
# directory is appended to the file named by $env:GITHUB_PATH instead.
# If executing on GitHub Actions, emit the appropriate lines to update environment variables.
if (Test-Path "env:GITHUB_ACTIONS") {
# Set paths for subsequent steps, using $env:CUDA_PATH
echo "Adding CUDA to CUDA_PATH, CUDA_PATH_X_Y and PATH"
# Appends "CUDA_PATH=<toolkit directory>" to the file named by $env:GITHUB_ENV.
echo "CUDA_PATH=$env:CUDA_PATH" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
# Appends "CUDA_PATH_V<major>_<minor>=<toolkit directory>" to the same file.
echo "$env:CUDA_PATH_VX_Y=$env:CUDA_PATH" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
# Appends "<toolkit directory>/bin" to the file named by $env:GITHUB_PATH.
echo "$env:CUDA_PATH/bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
}
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Enter a Visual Studio developer shell (x64) in the current PowerShell session.
# Ask vswhere.exe for the Visual Studio installation path.
# NOTE(review): vswhere is called without a filter such as -latest; if it
# reports more than one installation, $installPath would presumably hold
# several paths -- confirm on the build machine.
$installPath = &"C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe" -property installationpath
# Load the DevShell module shipped under that installation.
Import-Module (Join-Path $installPath "Common7\Tools\Microsoft.VisualStudio.DevShell.dll")
# -SkipAutomaticLocation is passed so the call does not change the working
# directory (presumed from the switch name); the developer command prompt
# receives '-arch=x64 -no_logo'.
Enter-VsDevShell -VsInstallPath $installPath -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
\ No newline at end of file
2.0.0
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment