Merge branch 'master' into master

f0d7a46d · Yan Yan · GitHub · 999c834c · 83344f71 · f0d7a46d
Unverified Commit f0d7a46d authored May 20, 2020 by Yan Yan Committed by GitHub May 20, 2020
6 changed files
--- a/src/spconv/CMakeLists.txt
+++ b/src/spconv/CMakeLists.txt
-set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc)
+set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc)  # spconv_ops.cc is the new file in this commit that defines indiceConv / indiceConvBackward
 if (SPCONV_BuildCUDA)
    set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu)
 endif()

--- a/src/spconv/all.cc
+++ b/src/spconv/all.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include <torch/script.h>
 #include <spconv/pool_ops.h>
 #include <spconv/spconv_ops.h>
 #include <spconv/pillar_scatter_ops.h>
@@ -19,16 +20,14 @@
 #include <spconv/nms_ops.h>
 static auto registry =
-    torch::RegisterOperators("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>)
+    torch::RegisterOperators()
+        .op("spconv::get_indice_pairs_2d", &spconv::getIndicePair<2>)
        .op("spconv::get_indice_pairs_3d", &spconv::getIndicePair<3>)
        .op("spconv::get_indice_pairs_4d", &spconv::getIndicePair<4>)
        .op("spconv::get_indice_pairs_grid_2d", &spconv::getIndicePairPreGrid<2>)
        .op("spconv::get_indice_pairs_grid_3d", &spconv::getIndicePairPreGrid<3>)
-        .op("spconv::indice_conv_fp32", &spconv::indiceConv<float>)
+        .op("spconv::indice_conv", &spconv::indiceConv)
-        .op("spconv::indice_conv_backward_fp32", &spconv::indiceConvBackward<float>)
+        .op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
-        .op("spconv::indice_conv_half", &spconv::indiceConv<at::Half>)
-        .op("spconv::indice_conv_backward_half",
-            &spconv::indiceConvBackward<at::Half>)
        .op("spconv::fused_indice_conv_fp32", &spconv::fusedIndiceConvBatchNorm<float>)
        .op("spconv::fused_indice_conv_half", &spconv::fusedIndiceConvBatchNorm<at::Half>)
        .op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)

--- a/src/spconv/spconv_ops.cc
+++ b/src/spconv/spconv_ops.cc
+#include <spconv/spconv_ops.h>
+namespace spconv {
+torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, // forward pass: returns a [numActOut, numOutPlanes] tensor that starts at zero and is accumulated per kernel offset
+                         torch::Tensor indicePairs, torch::Tensor indiceNum,
+                         int64_t numActOut, int64_t _inverse, int64_t _subM) { // _inverse and _subM are integer flags; any nonzero value means true
+  bool subM = _subM != 0;
+  bool inverse = _inverse != 0;
+  auto device = features.device().type(); // picks the CPU or CUDA functor branch inside the loop
+  auto ndim = filters.dim() - 2;
+  auto kernelVolume = indicePairs.size(0); // number of kernel offsets the loop below iterates over
+  auto numInPlanes = features.size(1);
+  auto numOutPlanes = filters.size(ndim + 1); // ndim + 1 is the last dimension of filters
+  auto indicePairNumCpu = indiceNum.to({torch::kCPU}); // host copy so the counts can be read through data_ptr<int>() (assumes indiceNum holds 32-bit ints - TODO confirm)
+  auto indicePairMaxSizeIter =
+      std::max_element(indicePairNumCpu.data_ptr<int>(),
+                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
+  int indicePairMaxOffset =
+      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>(); // index of the kernel offset with the largest pair count
+  int indicePairMaxSize = *indicePairMaxSizeIter; // largest pair count; used as the row count of both scratch buffers
+  /*if (_subM){
+  std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
+  indicePairNumCpu.data_ptr<int>() + kernelVolume);
+  indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
+  auto indicePairVecMaxSizeIter = std::max_element(
+      indicePairNumVec.begin(), indicePairNumVec.end());
+  indicePairMaxSize = *indicePairVecMaxSizeIter;
+  }*/
+  auto options =
+      torch::TensorOptions().dtype(features.dtype()).device(features.device()); // every tensor allocated here shares features' dtype and device
+  // auto indicePairOptions =
+  //     torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
+  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
+  torch::Tensor inputBuffer =
+      torch::zeros({indicePairMaxSize, numInPlanes}, options); // scratch for gathered input rows, reused for every kernel offset
+  torch::Tensor outputBuffer =
+      torch::zeros({indicePairMaxSize, numOutPlanes}, options); // scratch for the per-offset GEMM result, reused for every kernel offset
+  filters = filters.view({-1, numInPlanes, numOutPlanes}); // collapse the leading dims so filters[i] is one [numInPlanes, numOutPlanes] matrix
+  if (subM) { // the center index of subm conv doesn't need gather and scatter
+    // add.
+    torch::mm_out(output, features, filters[indicePairMaxOffset]); // output = features x filters[indicePairMaxOffset], written directly without the scratch buffers
+  }
+  double totalGatherTime = 0; // only referenced by the commented-out timing code below
+  double totalGEMMTime = 0; // only referenced by the commented-out timing code below
+  double totalSAddTime = 0; // only referenced by the commented-out timing code below
+  tv::torch_dispatch<float, double, at::Half>(
+      features.scalar_type(), [&](auto I) { // presumably invoked with a value whose C++ type matches features.scalar_type() - confirm against tv::torch_dispatch
+        using T = decltype(I);
+        for (int i = 0; i < kernelVolume; ++i) {
+          auto nHot = indicePairNumCpu.data_ptr<int>()[i]; // pair count for kernel offset i
+          if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { // skip empty offsets, and for subM the offset already handled by the mm_out above
+            continue;
+          }
+          // auto timer = spconv::CudaContextTimer<>();
+          auto outputBufferBlob = torch::from_blob(
+              outputBuffer.data_ptr<T>(), {nHot, numOutPlanes}, options); // [nHot, numOutPlanes] tensor over the start of outputBuffer's memory
+          auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
+                                                  {nHot, numInPlanes}, options); // [nHot, numInPlanes] tensor over the start of inputBuffer's memory
+          if (device == torch::kCPU) {
+            functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
+            gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
+                       tv::torch2tv<const T>(features),
+                       tv::torch2tv<const int>(indicePairs).subview(i, inverse), // indices taken from indicePairs[i][inverse]
+                       nHot);
+          }
+#ifdef SPCONV_CUDA
+          else if (device == torch::kCUDA) {
+            functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
+            gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
+                       tv::torch2tv<const T>(features),
+                       tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                       nHot);
+            TV_CHECK_CUDA_ERR();
+            /* slower than SparseGatherFunctor, may be due to int->long conversion
+    auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
+    auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
+    {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
+    features, 0, indicePairBlob);*/
+          }
+#endif
+          else {
+            TV_ASSERT_INVALID_ARG(false, "unknown device type");
+          }
+          // totalGatherTime += timer.report() / 1000.0;
+          torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]); // per-offset GEMM: first nHot rows of outputBuffer = first nHot rows of inputBuffer x filters[i]
+          // totalGEMMTime += timer.report() / 1000.0;
+          if (device == torch::kCPU) {
+            functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
+            scatterFtor(
+                tv::CPU(), tv::torch2tv<T>(output),
+                tv::torch2tv<const T>(outputBuffer),
+                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot, // uses the opposite half of the pair from the gather above
+                true);
+          }
+#ifdef SPCONV_CUDA
+          else if (device == torch::kCUDA) {
+            functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
+            scatterFtor(
+                tv::TorchGPU(), tv::torch2tv<T>(output),
+                tv::torch2tv<const T>(outputBuffer),
+                tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
+                true);
+            TV_CHECK_CUDA_ERR();
+          }
+#endif
+          else {
+            TV_ASSERT_INVALID_ARG(false, "unknown device type");
+          }
+          // totalSAddTime += timer.report() / 1000.0;
+        }
+      });
+  // std::cout << "gather time " << totalGatherTime << std::endl;
+  // std::cout << "gemm time " << totalGEMMTime << std::endl;
+  // std::cout << "scatteradd time " << totalSAddTime << std::endl;
+  return output;
+}
+std::vector<torch::Tensor> // backward pass: returns {inputGrad, filtersGrad}
+indiceConvBackward(torch::Tensor features, torch::Tensor filters,
+                   torch::Tensor outGrad, torch::Tensor indicePairs,
+                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM) { // _inverse and _subM are integer flags; any nonzero value means true
+  bool subM = _subM != 0;
+  bool inverse = _inverse != 0;
+  auto device = features.device().type(); // picks the CPU or CUDA functor branch inside the loop
+  auto ndim = filters.dim() - 2;
+  auto kernelVolume = indicePairs.size(0); // number of kernel offsets the loop below iterates over
+  auto numInPlanes = features.size(1);
+  auto numOutPlanes = filters.size(ndim + 1); // ndim + 1 is the last dimension of filters
+  auto indicePairNumCpu = indiceNum.to({torch::kCPU}); // host copy so the counts can be read through data_ptr<int>() (assumes indiceNum holds 32-bit ints - TODO confirm)
+  auto indicePairMaxSizeIter =
+      std::max_element(indicePairNumCpu.data_ptr<int>(),
+                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
+  int indicePairMaxOffset =
+      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>(); // index of the kernel offset with the largest pair count
+  int indicePairMaxSize = *indicePairMaxSizeIter; // largest pair count; used as the row count of both scratch buffers
+  auto options =
+      torch::TensorOptions().dtype(features.dtype()).device(features.device()); // every tensor allocated here shares features' dtype and device
+  auto filterShape = filters.sizes(); // remembered so filtersGrad can be viewed back to this shape on return
+  torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
+  torch::Tensor filtersGrad = torch::zeros(filterShape, options);
+  torch::Tensor inputBuffer =
+      torch::zeros({indicePairMaxSize, numInPlanes}, options); // scratch: gathered feature rows, later overwritten with input-gradient rows
+  torch::Tensor outputBuffer =
+      torch::zeros({indicePairMaxSize, numOutPlanes}, options); // scratch: gathered outGrad rows
+  filters = filters.view({-1, numInPlanes, numOutPlanes}); // collapse the leading dims so filters[i] is one [numInPlanes, numOutPlanes] matrix
+  filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes}); // same layout as filters so filtersGrad[i] pairs with filters[i]
+  if (subM) { // for subM the max-count offset is computed directly, without gather/scatter
+    auto filterGradSub = filtersGrad[indicePairMaxOffset];
+    torch::mm_out(filterGradSub, features.t(), outGrad); // filtersGrad[indicePairMaxOffset] = features^T x outGrad
+    torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t()); // inputGrad = outGrad x filters[indicePairMaxOffset]^T
+  }
+  tv::torch_dispatch<float, double,
+                     at::Half>(features.scalar_type(), [&](auto I) { // presumably invoked with a value whose C++ type matches features.scalar_type() - confirm against tv::torch_dispatch
+    using T = decltype(I);
+    for (int i = 0; i < kernelVolume; ++i) {
+      auto nHot = indicePairNumCpu.data_ptr<int>()[i]; // pair count for kernel offset i
+      if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { // skip empty offsets, and for subM the offset already handled above
+        continue;
+      }
+      if (device == torch::kCPU) {
+        functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
+        functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
+        gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
+                   tv::torch2tv<const T>(features),
+                   tv::torch2tv<const int>(indicePairs).subview(i, inverse), // features are gathered with indicePairs[i][inverse]
+                   nHot);
+        gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
+                      tv::torch2tv<const T>(outGrad),
+                      tv::torch2tv<const int>(indicePairs).subview(i, !inverse), // outGrad is gathered with the opposite half of the pair
+                      nHot);
+      }
+#ifdef SPCONV_CUDA
+      else if (device == torch::kCUDA) {
+        functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
+        functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
+        gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
+                   tv::torch2tv<const T>(features),
+                   tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                   nHot);
+        TV_CHECK_CUDA_ERR();
+        gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
+                      tv::torch2tv<const T>(outGrad),
+                      tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
+                      nHot);
+        TV_CHECK_CUDA_ERR();
+      }
+#endif
+      else {
+        TV_ASSERT_INVALID_ARG(false, "unknown device type");
+      }
+      auto filterGradSub = filtersGrad[i]; // destination slice for this offset's filter gradient
+      auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
+                                               {nHot, numOutPlanes}, options); // [nHot, numOutPlanes] tensor over the start of outputBuffer's memory
+      auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
+                                              {nHot, numInPlanes}, options); // [nHot, numInPlanes] tensor over the start of inputBuffer's memory
+      torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob); // filtersGrad[i] = gathered features^T x gathered outGrad
+      torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t()); // must run after the line above: overwrites the gathered features in inputBuffer with gathered outGrad x filters[i]^T
+      if (device == torch::kCPU) {
+        functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
+        scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
+                    tv::torch2tv<const T>(inputBuffer),
+                    tv::torch2tv<const int>(indicePairs).subview(i, inverse), // same index half that the features were gathered with
+                    nHot);
+      }
+#ifdef SPCONV_CUDA
+      else if (device == torch::kCUDA) {
+        functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
+        scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
+                    tv::torch2tv<const T>(inputBuffer),
+                    tv::torch2tv<const int>(indicePairs).subview(i, inverse),
+                    nHot);
+        TV_CHECK_CUDA_ERR();
+      }
+#endif
+      else {
+        TV_ASSERT_INVALID_ARG(false, "unknown device type");
+      }
+    }
+  });
+  return {inputGrad, filtersGrad.view(filterShape)}; // filtersGrad is viewed back to the original filters shape
+}
+} // namespace spconv
\ No newline at end of file
--- a/test/fake_dist_train.py
+++ b/test/fake_dist_train.py
+import horovod.torch as hvd
+import time
+from pathlib import Path
+import fire
+import numpy as np
+import torch
+import torch.nn.functional as F
+import tqdm
+from torch import distributed, nn
+from torch.utils import data
+from torch.utils.data import DataLoader, Dataset
+from torchvision import datasets, transforms
+import spconv
+from spconv.test_utils import generate_sparse_data
+class FakeSparseDataset(Dataset):  # synthetic dataset: each item concatenates two generated sparse samples and returns their two labels
+    def __len__(self):
+        return 500  # fixed nominal length; items are generated on demand
+    def __getitem__(self, idx):  # idx is never read; every call draws new labels via np.random
+        data_ranges = {  # label -> data_range argument passed to generate_sparse_data
+            0: [-1, 1],
+            1: [0, 2],
+            2: [-2, 0],
+            3: [-2, -2],
+        }
+        l = np.random.randint(0, 4, size=[2])  # two labels in [0, 4), one per generated sample
+        data = generate_sparse_data([16, 64, 64], [16 * 64 * 64 // 2],
+                                    3,
+                                    data_range=data_ranges[l[0]],  # first sample uses the range for label l[0]
+                                    with_dense=False)
+        data2 = generate_sparse_data([16, 64, 64], [16 * 64 * 64 // 2],
+                                     3,
+                                     data_range=data_ranges[l[1]],  # second sample uses the range for label l[1]
+                                     with_dense=False)
+        features = np.ascontiguousarray(data["features"]).astype(np.float32)
+        indices = np.ascontiguousarray(
+            data["indices"][:, [3, 0, 1, 2]]).astype(np.int32)  # reorder columns so column 3 comes first
+        features2 = np.ascontiguousarray(data2["features"]).astype(np.float32)
+        indices2 = np.ascontiguousarray(
+            data2["indices"][:, [3, 0, 1, 2]]).astype(np.int32)  # same column reordering for the second sample
+        features = np.ascontiguousarray(np.concatenate([features, features2]))  # stack both samples' rows into one array
+        indices = np.ascontiguousarray(np.concatenate([indices, indices2]))
+        return features, indices, l  # l holds both labels
+class FakeClassifier(nn.Module):  # sparse 3D conv stack followed by a linear layer with 4 outputs
+    def __init__(self):
+        super().__init__()
+        self.net = spconv.SparseSequential(  # four SubMConv3d stages with three stride-2 SparseConv3d layers between them, each followed by BatchNorm1d + ReLU
+            spconv.SubMConv3d(3, 8, 3, indice_key="subm1", padding=1, use_hash=False),
+            nn.BatchNorm1d(8),
+            nn.ReLU(),
+            spconv.SparseConv3d(8, 16, 3, stride=2, padding=1, use_hash=False),
+            nn.BatchNorm1d(16),
+            nn.ReLU(),
+            spconv.SubMConv3d(16, 16, 3, indice_key="subm2", padding=1, use_hash=False),
+            nn.BatchNorm1d(16),
+            nn.ReLU(),
+            spconv.SparseConv3d(16, 32, 3, stride=2, padding=1, use_hash=False),
+            nn.BatchNorm1d(32),
+            nn.ReLU(),
+            spconv.SubMConv3d(32, 32, 3, indice_key="subm3", padding=1, use_hash=False),
+            nn.BatchNorm1d(32),
+            nn.ReLU(),
+            spconv.SparseConv3d(32, 64, 3, stride=2, padding=1, use_hash=False),
+            nn.BatchNorm1d(64),
+            nn.ReLU(),
+            spconv.SubMConv3d(64, 64, 3, indice_key="subm4", padding=1, use_hash=False),
+            nn.BatchNorm1d(64),
+            nn.ReLU(),
+            spconv.ToDense()  # [64, 2, 8, 8]
+        )
+        self.linear = nn.Linear(64 * 2 * 8 * 8, 4)  # input size is the product of the shape noted on ToDense above
+    def forward(self, features, indices):
+        indices = indices.int()
+        x = spconv.SparseConvTensor(features, indices, [16, 64, 64], 2)  # the trailing 2 is presumably the batch size - TODO confirm against SparseConvTensor
+        x = self.net(x)
+        x = x.view(2, -1)  # leading dimension is hard-coded to 2
+        x = self.linear(x)
+        return x
+def run():  # Horovod training loop: 100 steps of forward, cross-entropy, backward, optimizer step
+    hvd.init()
+    torch.cuda.set_device(hvd.local_rank())  # CUDA device index is taken from the Horovod local rank
+    np.random.seed(50051 + hvd.local_rank())  # seed is offset by the local rank
+    ds = FakeSparseDataset()
+    device = torch.device('cuda')
+    model = FakeClassifier()
+    model.to(device)
+    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+    hvd.broadcast_parameters(model.state_dict(), root_rank=0)  # model state is broadcast from rank 0
+    hvd.broadcast_optimizer_state(optimizer, root_rank=0)  # optimizer state is broadcast from rank 0
+    compression = hvd.Compression.none
+    optimizer = hvd.DistributedOptimizer(optimizer,  # wraps the Adam optimizer; op=hvd.Average is passed below
+                                         named_parameters=model.named_parameters(),
+                                         compression=compression,
+                                         op=hvd.Average)
+    for i in tqdm.tqdm(list(range(100))):
+        # for j in range(4):
+        #     features, indices, label = ds[(i * 4 + j) % len(ds)]
+        features, indices, label = ds[i % len(ds)]  # one dataset item per step; it carries two labels
+        features_t = torch.from_numpy(features)
+        indices_t = torch.from_numpy(indices)
+        features_t = features_t.to(device)
+        indices_t = indices_t.to(device)
+        target = torch.from_numpy(label).to(device)
+        output = model(features_t, indices_t)
+        # print(output.shape)
+        loss = F.cross_entropy(output, target)
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+def dev():  # manual check: prints the first 10 index rows of 10 items, then runs one forward pass
+    ds = FakeSparseDataset()
+    for i in range(10):
+        features, indices, label = ds[i]
+        print(indices[:10])
+    features_t = torch.from_numpy(features.astype(np.float32)).cuda()  # uses the item left over from the last loop iteration
+    indices_t = torch.from_numpy(indices.astype(np.int32)).cuda()
+    net = FakeClassifier().cuda()
+    net(features_t, indices_t)  # result is discarded
+def main():  # command-line entry point; only calls run()
+    run()
+if __name__ == "__main__":
+    fire.Fire(main)  # main is exposed through python-fire
--- a/test/fake_train.py
+++ b/test/fake_train.py
+import time
+from pathlib import Path
+import fire
+import numpy as np
+import torch
+import torch.nn.functional as F
+import tqdm
+from torch import distributed, nn
+from torch.utils import data
+from torch.utils.data import DataLoader, Dataset
+from torchvision import datasets, transforms
+import spconv
+from spconv.test_utils import generate_sparse_data
+class FakeSparseDataset(Dataset):  # synthetic dataset: each item concatenates two generated sparse samples and returns their two labels
+    def __len__(self):
+        return 500  # fixed nominal length; items are generated on demand
+    def __getitem__(self, idx):  # idx is never read; every call draws new labels via np.random
+        data_ranges = {  # label -> data_range argument passed to generate_sparse_data
+            0: [-1, 1],
+            1: [0, 2],
+            2: [-2, 0],
+            3: [-2, -2],
+        }
+        l = np.random.randint(0, 4, size=[2])  # two labels in [0, 4), one per generated sample
+        data = generate_sparse_data([16, 64, 64], [16 * 64 * 64 // 2],
+                                    3,
+                                    data_range=data_ranges[l[0]],  # first sample uses the range for label l[0]
+                                    with_dense=False)
+        data2 = generate_sparse_data([16, 64, 64], [16 * 64 * 64 // 2],
+                                     3,
+                                     data_range=data_ranges[l[1]],  # second sample uses the range for label l[1]
+                                     with_dense=False)
+        features = np.ascontiguousarray(data["features"]).astype(np.float32)
+        indices = np.ascontiguousarray(
+            data["indices"][:, [3, 0, 1, 2]]).astype(np.int32)  # reorder columns so column 3 comes first
+        features2 = np.ascontiguousarray(data2["features"]).astype(np.float32)
+        indices2 = np.ascontiguousarray(
+            data2["indices"][:, [3, 0, 1, 2]]).astype(np.int32)  # same column reordering for the second sample
+        features = np.ascontiguousarray(np.concatenate([features, features2]))  # stack both samples' rows into one array
+        indices = np.ascontiguousarray(np.concatenate([indices, indices2]))
+        return features, indices, l  # l holds both labels
+class FakeClassifier(nn.Module):  # sparse 3D conv stack followed by a linear layer with 4 outputs
+    def __init__(self):
+        super().__init__()
+        self.net = spconv.SparseSequential(  # four SubMConv3d stages with three stride-2 SparseConv3d layers between them, each followed by BatchNorm1d + ReLU
+            spconv.SubMConv3d(3, 8, 3, indice_key="subm1", padding=1, use_hash=False),
+            nn.BatchNorm1d(8),
+            nn.ReLU(),
+            spconv.SparseConv3d(8, 16, 3, stride=2, padding=1, use_hash=False),
+            nn.BatchNorm1d(16),
+            nn.ReLU(),
+            spconv.SubMConv3d(16, 16, 3, indice_key="subm2", padding=1, use_hash=False),
+            nn.BatchNorm1d(16),
+            nn.ReLU(),
+            spconv.SparseConv3d(16, 32, 3, stride=2, padding=1, use_hash=False),
+            nn.BatchNorm1d(32),
+            nn.ReLU(),
+            spconv.SubMConv3d(32, 32, 3, indice_key="subm3", padding=1, use_hash=False),
+            nn.BatchNorm1d(32),
+            nn.ReLU(),
+            spconv.SparseConv3d(32, 64, 3, stride=2, padding=1, use_hash=False),
+            nn.BatchNorm1d(64),
+            nn.ReLU(),
+            spconv.SubMConv3d(64, 64, 3, indice_key="subm4", padding=1, use_hash=False),
+            nn.BatchNorm1d(64),
+            nn.ReLU(),
+            spconv.ToDense()  # [64, 2, 8, 8]
+        )
+        self.linear = nn.Linear(64 * 2 * 8 * 8, 4)  # input size is the product of the shape noted on ToDense above
+    def forward(self, features, indices):
+        indices = indices.int()
+        x = spconv.SparseConvTensor(features, indices, [16, 64, 64], 2)  # the trailing 2 is presumably the batch size - TODO confirm against SparseConvTensor
+        x = self.net(x)
+        x = x.view(2, -1)  # leading dimension is hard-coded to 2
+        x = self.linear(x)
+        return x
+def run():  # single-process training loop: 100 steps of forward, cross-entropy, backward, optimizer step
+    np.random.seed(50051)  # fixed seed for the np.random draws made by the dataset
+    ds = FakeSparseDataset()
+    device = torch.device('cuda')
+    model = FakeClassifier()
+    model.to(device)
+    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+    for i in tqdm.tqdm(list(range(100))):
+        # for j in range(4):
+        #     features, indices, label = ds[(i * 4 + j) % len(ds)]
+        features, indices, label = ds[i % len(ds)]  # one dataset item per step; it carries two labels
+        features_t = torch.from_numpy(features)
+        indices_t = torch.from_numpy(indices)
+        features_t = features_t.to(device)
+        indices_t = indices_t.to(device)
+        target = torch.from_numpy(label).to(device)
+        output = model(features_t, indices_t)
+        # print(output.shape)
+        loss = F.cross_entropy(output, target)
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+def dev():  # manual check: prints the first 10 index rows of 10 items, then runs one forward pass
+    ds = FakeSparseDataset()
+    for i in range(10):
+        features, indices, label = ds[i]
+        print(indices[:10])
+    features_t = torch.from_numpy(features.astype(np.float32)).cuda()  # uses the item left over from the last loop iteration
+    indices_t = torch.from_numpy(indices.astype(np.int32)).cuda()
+    net = FakeClassifier().cuda()
+    net(features_t, indices_t)  # result is discarded
+def main():  # command-line entry point; only calls run()
+    run()
+if __name__ == "__main__":
+    fire.Fire(main)  # main is exposed through python-fire
--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -581,7 +581,7 @@ def main():
        if all([s > 1, d > 1]):
            continue
        device = torch.device(dev)
-        num_points = [5] * bs
+        num_points = [500] * bs
        sparse_dict = generate_sparse_data(shape, num_points, IC)
@@ -601,7 +601,7 @@ def main():
        net.net[0].weight[:] = filters_t
        out_ref = net_ref(features_dense_t)
        times = []
-        for i in range(0):
+        for i in range(10):
            t = time.time()
            out = net(features_t, indices_t, bs)
            torch.cuda.synchronize()