"csrc/vscode:/vscode.git/clone" did not exist on "abba6adde6b7a2d5bf3651c58b9edd670d13c986"
Unverified Commit f0d7a46d authored by Yan Yan's avatar Yan Yan Committed by GitHub
Browse files

Merge branch 'master' into master

parents 999c834c 83344f71
set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc) set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc)
if (SPCONV_BuildCUDA) if (SPCONV_BuildCUDA)
set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu) set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu)
endif() endif()
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <torch/script.h>
#include <spconv/pool_ops.h> #include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h> #include <spconv/spconv_ops.h>
#include <spconv/pillar_scatter_ops.h> #include <spconv/pillar_scatter_ops.h>
...@@ -19,16 +20,14 @@ ...@@ -19,16 +20,14 @@
#include <spconv/nms_ops.h> #include <spconv/nms_ops.h>
static auto registry = static auto registry =
torch::RegisterOperators("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>) torch::RegisterOperators()
.op("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>)
.op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>) .op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>)
.op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>) .op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>)
.op("spconv::get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>) .op("spconv::get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>)
.op("spconv::get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>) .op("spconv::get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>)
.op("spconv::indice_conv_fp32", &spconv::indiceConv<float>) .op("spconv::indice_conv", &spconv::indiceConv)
.op("spconv::indice_conv_backward_fp32", &spconv::indiceConvBackward<float>) .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
.op("spconv::indice_conv_half", &spconv::indiceConv<at::Half>)
.op("spconv::indice_conv_backward_half",
&spconv::indiceConvBackward<at::Half>)
.op("spconv::fused_indice_conv_fp32", &spconv::fusedIndiceConvBatchNorm<float>) .op("spconv::fused_indice_conv_fp32", &spconv::fusedIndiceConvBatchNorm<float>)
.op("spconv::fused_indice_conv_half", &spconv::fusedIndiceConvBatchNorm<at::Half>) .op("spconv::fused_indice_conv_half", &spconv::fusedIndiceConvBatchNorm<at::Half>)
.op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>) .op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)
......
#include <spconv/spconv_ops.h>
namespace spconv {
// Forward pass of a sparse convolution, computed per kernel offset as
// gather -> GEMM -> scatter-add.
//
// features:    size(1) is used as the number of input planes.
// filters:     viewed as {-1, numInPlanes, numOutPlanes}; its last dimension
//              (index filters.dim() - 1) is used as the number of output planes.
// indicePairs: size(0) is used as the kernel volume; subview(offset, 0 or 1)
//              yields the int index list used for gathering / scattering.
// indiceNum:   per-offset pair counts, copied to the CPU and read as int.
// numActOut:   number of rows of the returned tensor.
// _inverse:    non-zero swaps which half of each pair is used for gathering
//              and which one for scattering.
// _subM:       non-zero enables the shortcut for the offset that has the most
//              pairs (see below).
//
// Returns a zero-initialised {numActOut, numOutPlanes} tensor, with the dtype
// and device of `features`, into which all contributions were accumulated.
// Fails through TV_ASSERT_INVALID_ARG when the device is neither CPU nor (when
// built with SPCONV_CUDA) CUDA.
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) {
// The integer flags arrive as int64_t; turn them into booleans once.
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
// The pair counts are needed on the host to drive the loop below.
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
// Locate the (first) offset with the largest pair count: its count sizes the
// scratch buffers and its position is the offset skipped in subM mode.
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
indicePairNumVec.begin(), indicePairNumVec.end());
indicePairMaxSize = *indicePairVecMaxSizeIter;
}*/
// All tensors allocated here share the dtype and device of `features`.
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
// Scratch buffers large enough for the busiest offset; reused by every
// iteration of the loop below.
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
// Flatten the leading filter dimensions so filters[i] is the
// {numInPlanes, numOutPlanes} matrix of kernel offset i.
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
// The offset with the most pairs is treated as that center: the whole
// feature matrix is multiplied directly into `output`.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
// Timing accumulators; only referenced by the commented-out profiling code.
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
// Instantiate the loop body for the runtime scalar type of `features`
// (float, double or at::Half).
tv::torch_dispatch<float, double, at::Half>(
features.scalar_type(), [&](auto I) {
using T = decltype(I);
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
// Skip offsets without pairs and, in subM mode, the offset already
// handled by the direct matrix product above.
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
// Non-owning views over the first nHot rows of the scratch buffers.
auto outputBufferBlob = torch::from_blob(
outputBuffer.data_ptr<T>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
{nHot, numInPlanes}, options);
// Gather: copy the feature rows selected by this offset's index list
// into inputBuffer.
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
{nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalGatherTime += timer.report() / 1000.0;
// GEMM: gathered rows times this offset's filter matrix, written into
// the output scratch buffer through its view.
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0;
// Scatter-add: accumulate the GEMM result into `output` using the other
// half of the pair (!inverse). The meaning of the trailing `true` flag is
// defined by SparseScatterAddFunctor, which is not visible here.
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(
tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(
tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalSAddTime += timer.report() / 1000.0;
}
});
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
// Backward pass matching indiceConv: computes the gradients with respect to
// the input features and the filters.
//
// features:    forward-pass input; size(1) is used as the number of input
//              planes.
// filters:     forward-pass filters; viewed as {-1, numInPlanes, numOutPlanes}.
// outGrad:     gradient with respect to the forward output.
// indicePairs: size(0) is used as the kernel volume; subview(offset, 0 or 1)
//              yields the int index lists.
// indiceNum:   per-offset pair counts, copied to the CPU and read as int.
// _inverse:    non-zero swaps the roles of the two halves of each pair.
// _subM:       non-zero enables the shortcut for the offset that has the most
//              pairs.
//
// Returns {inputGrad, filtersGrad}: inputGrad has the shape of `features`,
// filtersGrad is viewed back to the original shape of `filters`.
// Fails through TV_ASSERT_INVALID_ARG when the device is neither CPU nor (when
// built with SPCONV_CUDA) CUDA.
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) {
// The integer flags arrive as int64_t; turn them into booleans once.
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
// The pair counts are needed on the host to drive the loop below.
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
// Locate the (first) offset with the largest pair count: its count sizes the
// scratch buffers and its position is the offset skipped in subM mode.
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
// All tensors allocated here share the dtype and device of `features`.
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// Remember the caller's filter shape so the gradient can be returned in it.
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
// Scratch buffers large enough for the busiest offset; reused by every
// iteration of the loop below.
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
// Flatten the leading filter dimensions so index i selects the
// {numInPlanes, numOutPlanes} matrix of kernel offset i.
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
if (subM) {
// Shortcut for the offset with the most pairs: both gradients are plain
// matrix products over the full tensors, without gather or scatter.
auto filterGradSub = filtersGrad[indicePairMaxOffset];
torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
}
// Instantiate the loop body for the runtime scalar type of `features`
// (float, double or at::Half).
tv::torch_dispatch<float, double,
at::Half>(features.scalar_type(), [&](auto I) {
using T = decltype(I);
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
// Skip offsets without pairs and, in subM mode, the offset already
// handled by the direct matrix products above.
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// Gather the feature rows (index half `inverse`) into inputBuffer and the
// output-gradient rows (index half `!inverse`) into outputBuffer.
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
auto filterGradSub = filtersGrad[i];
// Non-owning views over the first nHot rows of the scratch buffers.
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
{nHot, numInPlanes}, options);
// Filter gradient for this offset: gathered inputs^T x gathered outGrad.
torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
// Input gradient contribution: gathered outGrad x filter^T. This overwrites
// the gathered features in inputBuffer, so it must come after the filter
// gradient above.
torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
// Scatter-add the contribution back into inputGrad at the rows the
// features were gathered from.
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
}
#ifdef SPCONV_CUDA
else if (device == torch::kCUDA) {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
}
});
// Hand the filter gradient back in the caller's original filter shape.
return {inputGrad, filtersGrad.view(filterShape)};
}
} // namespace spconv
\ No newline at end of file
import horovod.torch as hvd
import time
from pathlib import Path
import fire
import numpy as np
import torch
import torch.nn.functional as F
import tqdm
from torch import distributed, nn
from torch.utils import data
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
import spconv
from spconv.test_utils import generate_sparse_data
class FakeSparseDataset(Dataset):
    """Synthetic dataset of randomly generated sparse samples.

    Every item is built on the fly from two independently generated sparse
    samples that are concatenated along the first axis. Each sample is drawn
    with a feature value range selected by a random class label in [0, 4).
    """

    # Feature value range handed to generate_sparse_data for each class label.
    _DATA_RANGES = {
        0: (-1, 1),
        1: (0, 2),
        2: (-2, 0),
        3: (-2, -2),
    }

    def __len__(self):
        """Return the nominal dataset size (items are random, not stored)."""
        return 500

    def _make_sample(self, label):
        """Generate one sparse sample for ``label``.

        Returns:
            (features, indices): features as float32 and indices as int32
            with the columns reordered by [3, 0, 1, 2] (presumably moving a
            batch column to the front -- confirm against
            generate_sparse_data).
        """
        sample = generate_sparse_data([16, 64, 64], [16 * 64 * 64 // 2],
                                      3,
                                      data_range=list(self._DATA_RANGES[label]),
                                      with_dense=False)
        features = np.ascontiguousarray(sample["features"]).astype(np.float32)
        indices = np.ascontiguousarray(
            sample["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
        return features, indices

    def __getitem__(self, idx):
        """Build one random item; ``idx`` is not used.

        Returns:
            (features, indices, labels): the concatenated features and
            indices of two generated samples, and the length-2 array with
            the class label of each sample.
        """
        labels = np.random.randint(0, 4, size=[2])
        # Generate the samples in label order so the sequence of random draws
        # stays the same as when both samples were written out by hand.
        samples = [self._make_sample(label) for label in labels]
        features = np.ascontiguousarray(
            np.concatenate([feats for feats, _ in samples]))
        indices = np.ascontiguousarray(
            np.concatenate([inds for _, inds in samples]))
        return features, indices, labels
class FakeClassifier(nn.Module):
    """Small sparse 3-D CNN followed by a linear layer with 4 outputs.

    The backbone alternates submanifold convolutions with stride-2 sparse
    convolutions (3 -> 8 -> 16 -> 32 -> 64 channels), each followed by batch
    normalisation and ReLU, and ends with a conversion to a dense tensor.
    """

    def __init__(self):
        super().__init__()
        self.net = spconv.SparseSequential(
            spconv.SubMConv3d(3, 8, 3, indice_key="subm1", padding=1, use_hash=False),
            nn.BatchNorm1d(8),
            nn.ReLU(),
            spconv.SparseConv3d(8, 16, 3, stride=2, padding=1, use_hash=False),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            spconv.SubMConv3d(16, 16, 3, indice_key="subm2", padding=1, use_hash=False),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            spconv.SparseConv3d(16, 32, 3, stride=2, padding=1, use_hash=False),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            spconv.SubMConv3d(32, 32, 3, indice_key="subm3", padding=1, use_hash=False),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            spconv.SparseConv3d(32, 64, 3, stride=2, padding=1, use_hash=False),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            spconv.SubMConv3d(64, 64, 3, indice_key="subm4", padding=1, use_hash=False),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            spconv.ToDense()  # presumably [64, 2, 8, 8] per sample -- confirm
        )
        # The linear layer expects 64 * 2 * 8 * 8 flattened values per sample.
        self.linear = nn.Linear(64 * 2 * 8 * 8, 4)

    def forward(self, features, indices, batch_size=2):
        """Classify a batch of sparse samples.

        Args:
            features: feature tensor handed to spconv.SparseConvTensor.
            indices: index tensor; converted to int32 before use.
            batch_size: number of samples contained in ``indices``. Defaults
                to 2, the value that used to be hard-coded here.

        Returns:
            The output of the linear layer, one row per sample.
        """
        indices = indices.int()
        x = spconv.SparseConvTensor(features, indices, [16, 64, 64], batch_size)
        x = self.net(x)
        # Flatten everything except the batch dimension for the linear layer.
        x = x.view(batch_size, -1)
        x = self.linear(x)
        return x
def run():
    """Train FakeClassifier for 100 steps on synthetic data with Horovod.

    Initialises Horovod, binds this process to the CUDA device given by its
    local rank, seeds NumPy per local rank, broadcasts the initial model and
    optimizer state from rank 0 and then runs the optimisation loop with an
    averaging distributed optimizer.
    """
    hvd.init()
    torch.cuda.set_device(hvd.local_rank())
    # A different seed per local rank gives every process its own data stream.
    np.random.seed(50051 + hvd.local_rank())

    dataset = FakeSparseDataset()
    cuda = torch.device('cuda')
    net = FakeClassifier()
    net.to(cuda)

    base_optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
    # Start every worker from the state held by rank 0.
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(base_optimizer, root_rank=0)
    optimizer = hvd.DistributedOptimizer(
        base_optimizer,
        named_parameters=net.named_parameters(),
        compression=hvd.Compression.none,
        op=hvd.Average)

    for step in tqdm.tqdm(range(100)):
        features, indices, label = dataset[step % len(dataset)]
        features_t = torch.from_numpy(features).to(cuda)
        indices_t = torch.from_numpy(indices).to(cuda)
        target = torch.from_numpy(label).to(cuda)

        loss = F.cross_entropy(net(features_t, indices_t), target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
def dev():
    """Smoke test: push generated samples through a fresh FakeClassifier on CUDA.

    NOTE(review): the indentation of this function was lost in the source, so
    the extent of the loop body is reconstructed (every statement after the
    ``for`` line is assumed to belong to it) -- confirm against the original.
    """
    dataset = FakeSparseDataset()
    for sample_idx in range(10):
        features, indices, label = dataset[sample_idx]
        print(indices[:10])
        features_cuda = torch.from_numpy(features.astype(np.float32)).cuda()
        indices_cuda = torch.from_numpy(indices.astype(np.int32)).cuda()
        classifier = FakeClassifier().cuda()
        classifier(features_cuda, indices_cuda)
def main():
    """Command-line entry point: run the training loop."""
    run()


# Expose main() as a command-line interface when executed as a script.
if __name__ == "__main__":
    fire.Fire(main)
import time
from pathlib import Path
import fire
import numpy as np
import torch
import torch.nn.functional as F
import tqdm
from torch import distributed, nn
from torch.utils import data
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
import spconv
from spconv.test_utils import generate_sparse_data
class FakeSparseDataset(Dataset):
    """Synthetic dataset of randomly generated sparse samples.

    Every item is built on the fly from two independently generated sparse
    samples that are concatenated along the first axis. Each sample is drawn
    with a feature value range selected by a random class label in [0, 4).
    """

    # Feature value range handed to generate_sparse_data for each class label.
    _DATA_RANGES = {
        0: (-1, 1),
        1: (0, 2),
        2: (-2, 0),
        3: (-2, -2),
    }

    def __len__(self):
        """Return the nominal dataset size (items are random, not stored)."""
        return 500

    def _make_sample(self, label):
        """Generate one sparse sample for ``label``.

        Returns:
            (features, indices): features as float32 and indices as int32
            with the columns reordered by [3, 0, 1, 2] (presumably moving a
            batch column to the front -- confirm against
            generate_sparse_data).
        """
        sample = generate_sparse_data([16, 64, 64], [16 * 64 * 64 // 2],
                                      3,
                                      data_range=list(self._DATA_RANGES[label]),
                                      with_dense=False)
        features = np.ascontiguousarray(sample["features"]).astype(np.float32)
        indices = np.ascontiguousarray(
            sample["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
        return features, indices

    def __getitem__(self, idx):
        """Build one random item; ``idx`` is not used.

        Returns:
            (features, indices, labels): the concatenated features and
            indices of two generated samples, and the length-2 array with
            the class label of each sample.
        """
        labels = np.random.randint(0, 4, size=[2])
        # Generate the samples in label order so the sequence of random draws
        # stays the same as when both samples were written out by hand.
        samples = [self._make_sample(label) for label in labels]
        features = np.ascontiguousarray(
            np.concatenate([feats for feats, _ in samples]))
        indices = np.ascontiguousarray(
            np.concatenate([inds for _, inds in samples]))
        return features, indices, labels
class FakeClassifier(nn.Module):
    """Small sparse 3-D CNN followed by a linear layer with 4 outputs.

    The backbone alternates submanifold convolutions with stride-2 sparse
    convolutions (3 -> 8 -> 16 -> 32 -> 64 channels), each followed by batch
    normalisation and ReLU, and ends with a conversion to a dense tensor.
    """

    def __init__(self):
        super().__init__()
        self.net = spconv.SparseSequential(
            spconv.SubMConv3d(3, 8, 3, indice_key="subm1", padding=1, use_hash=False),
            nn.BatchNorm1d(8),
            nn.ReLU(),
            spconv.SparseConv3d(8, 16, 3, stride=2, padding=1, use_hash=False),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            spconv.SubMConv3d(16, 16, 3, indice_key="subm2", padding=1, use_hash=False),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            spconv.SparseConv3d(16, 32, 3, stride=2, padding=1, use_hash=False),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            spconv.SubMConv3d(32, 32, 3, indice_key="subm3", padding=1, use_hash=False),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            spconv.SparseConv3d(32, 64, 3, stride=2, padding=1, use_hash=False),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            spconv.SubMConv3d(64, 64, 3, indice_key="subm4", padding=1, use_hash=False),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            spconv.ToDense()  # presumably [64, 2, 8, 8] per sample -- confirm
        )
        # The linear layer expects 64 * 2 * 8 * 8 flattened values per sample.
        self.linear = nn.Linear(64 * 2 * 8 * 8, 4)

    def forward(self, features, indices, batch_size=2):
        """Classify a batch of sparse samples.

        Args:
            features: feature tensor handed to spconv.SparseConvTensor.
            indices: index tensor; converted to int32 before use.
            batch_size: number of samples contained in ``indices``. Defaults
                to 2, the value that used to be hard-coded here.

        Returns:
            The output of the linear layer, one row per sample.
        """
        indices = indices.int()
        x = spconv.SparseConvTensor(features, indices, [16, 64, 64], batch_size)
        x = self.net(x)
        # Flatten everything except the batch dimension for the linear layer.
        x = x.view(batch_size, -1)
        x = self.linear(x)
        return x
def run():
    """Train FakeClassifier for 100 steps on synthetic data on one CUDA device.

    Seeds NumPy with a fixed value, builds the dataset, model and Adam
    optimizer, then performs one optimisation step per generated item.
    """
    np.random.seed(50051)

    dataset = FakeSparseDataset()
    cuda = torch.device('cuda')
    net = FakeClassifier()
    net.to(cuda)
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

    for step in tqdm.tqdm(range(100)):
        features, indices, label = dataset[step % len(dataset)]
        features_t = torch.from_numpy(features).to(cuda)
        indices_t = torch.from_numpy(indices).to(cuda)
        target = torch.from_numpy(label).to(cuda)

        loss = F.cross_entropy(net(features_t, indices_t), target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
def dev():
    """Smoke test: push generated samples through a fresh FakeClassifier on CUDA.

    NOTE(review): the indentation of this function was lost in the source, so
    the extent of the loop body is reconstructed (every statement after the
    ``for`` line is assumed to belong to it) -- confirm against the original.
    """
    dataset = FakeSparseDataset()
    for sample_idx in range(10):
        features, indices, label = dataset[sample_idx]
        print(indices[:10])
        features_cuda = torch.from_numpy(features.astype(np.float32)).cuda()
        indices_cuda = torch.from_numpy(indices.astype(np.int32)).cuda()
        classifier = FakeClassifier().cuda()
        classifier(features_cuda, indices_cuda)
def main():
    """Command-line entry point: run the training loop."""
    run()


# Expose main() as a command-line interface when executed as a script.
if __name__ == "__main__":
    fire.Fire(main)
...@@ -581,7 +581,7 @@ def main(): ...@@ -581,7 +581,7 @@ def main():
if all([s > 1, d > 1]): if all([s > 1, d > 1]):
continue continue
device = torch.device(dev) device = torch.device(dev)
num_points = [5] * bs num_points = [500] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC) sparse_dict = generate_sparse_data(shape, num_points, IC)
...@@ -601,7 +601,7 @@ def main(): ...@@ -601,7 +601,7 @@ def main():
net.net[0].weight[:] = filters_t net.net[0].weight[:] = filters_t
out_ref = net_ref(features_dense_t) out_ref = net_ref(features_dense_t)
times = [] times = []
for i in range(0): for i in range(10):
t = time.time() t = time.time()
out = net(features_t, indices_t, bs) out = net(features_t, indices_t, bs)
torch.cuda.synchronize() torch.cuda.synchronize()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment