apply low nk params

e8bc31ec · EvernightAurora · 2b195e43 · e8bc31ec · e8bc31ec
Commit e8bc31ec authored Aug 29, 2022 by EvernightAurora
Hide whitespace changes
Inline Side-by-side

Showing with 123 additions and 1 deletion

spconv/core.py spconv/core.py +79 -0

test/benchmark.py test/benchmark.py +44 -1

No files found.
--- a/spconv/core.py
+++ b/spconv/core.py
@@ -487,6 +487,56 @@ IMPLGEMM_VOLTA_PARAMS = [
 ]

 IMPLGEMM_TURING_PARAMS = [
+
+    *gen_conv_params(ConvFwdAndBwdInput, (32, 16, 16), (16, 16, 16),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((16, 8, 8)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=0),
+    *gen_conv_params(ConvFwdAndBwdInput, (32, 16, 16), (16, 16, 16),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((16, 8, 8)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1),    
+    *gen_conv_params(ConvFwdAndBwdInput, (64, 32, 16), (32, 16, 16),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((16, 8, 8)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=0),
+    *gen_conv_params(ConvFwdAndBwdInput, (64, 32, 16), (32, 16, 16),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((16, 8, 8)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1),
+
    *gen_conv_params(ConvFwdAndBwdInput, (32, 64, 32), (32, 32, 16),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
@@ -658,6 +708,35 @@ IMPLGEMM_TURING_PARAMS = [
                     mask_sparse=True,
                     increment_k_first=True,
                     access_per_vector=1),
+
+
+    *gen_conv_params(ConvBwdWeight, (64, 16, 32), (32, 16, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2,
+                     "f16,f16,f16,f32,f32",
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((16, 8, 8)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=0),
+    *gen_conv_params(ConvBwdWeight, (64, 16, 32), (32, 16, 32),
+                     NDIM_DONT_CARE,
+                     ConvIterAlgo.Optimized,
+                     2,
+                     "f16,f16,f16,f32,f32",
+                     NHWC,
+                     NHWC,
+                     NHWC,
+                     GemmAlgo.Turing,
+                     TensorOp((16, 8, 8)),
+                     mask_sparse=True,
+                     increment_k_first=True,
+                     access_per_vector=1),
+    
    # *gen_conv_params(ConvBwdWeight, (32, 64, 32), (32, 32, 16), NDIM_DONT_CARE, ConvIterAlgo.Optimized, 2, "f16,f16,f16,f32,f32",
    #     NHWC, NHWC, NHWC, GemmAlgo.Turing, TensorOp((16, 8, 8)), mask_sparse=True, increment_k_first=True, access_per_vector=1),


--- a/test/benchmark.py
+++ b/test/benchmark.py
@@ -290,6 +290,49 @@ class Net2(nn.Module):
        return self.net(x)


+# NetSm: small benchmark net of four stacked SubMConv3d layers (3 -> 8 -> 16 -> 32 -> 64), all with bias=False.
+class NetSm(nn.Module):
+    def __init__(self, shape, algo):
+        super().__init__()
+        self.net = spconv.SparseSequential(
+            spconv.SubMConv3d(3,
+                              8,
+                              3,
+                              bias=False,
+                              indice_key="c0",
+                              algo=algo),
+            spconv.SubMConv3d(8,
+                              16,
+                              3,
+                              bias=False,
+                              indice_key="c0",
+                              algo=algo),
+            spconv.SubMConv3d(16,
+                              32,
+                              3,
+                              bias=False,
+                              indice_key="c0",
+                              algo=algo),
+            spconv.SubMConv3d(32,
+                              64,
+                              3,
+                              bias=False,
+                              indice_key="c0",
+                              algo=algo),
+            # NOTE(review): all four layers pass indice_key="c0" -- presumably so they share indice data; confirm.
+        )
+        max_batch_size = 1  # leading dimension of the pre-allocated grid below
+        # The grid (dense map) is used for indice generation; using a pre-allocated grid can run faster.
+        self.grid = torch.full([max_batch_size, *shape], -1,
+                               dtype=torch.int32).cuda()
+        # Alternative kept for reference (no pre-allocated grid): self.grid = None
+        self.shape = shape
+
+    def forward(self, features, coors, batch_size, enable_timer: bool = False):
+        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
+                                    self.grid, enable_timer=enable_timer)
+        return self.net(x)  # run the wrapped sparse tensor through the four conv layers
+
 import numpy as np
 from cumm import tensorview as tv
 from spconv.core_cc.csrc.sparse.all import SpconvOps
@@ -358,7 +401,7 @@ def main():
    # MaskImpGemm: 51.0ms
    # MaskSplitImpGemm: 41.1ms
    # algo = None
-    net = Net(spatial_shape, algo).to(device).eval().to(dtype)# .train()
+    net = NetSm(spatial_shape, algo).to(device).eval().to(dtype)# .train()
    # net.load_state_dict(net.state_dict())
    spconv.assign_name_for_sparse_modules(net)
    print(coors_th.shape)