Commit bf011c76 authored by yan.yan's avatar yan.yan
Browse files

temp commit

parent 4791f582
......@@ -35,6 +35,20 @@ from spconv.utils import nullcontext
FILTER_HWIO = False
def expand_nd(val: Union[int, List[int], Tuple[int, ...]], ndim: int) -> List[int]:
    """Expand a scalar or per-dimension sequence into a list of ``ndim`` ints.

    Args:
        val: a single int (repeated for every dimension), or a list/tuple
            that already holds exactly ``ndim`` entries.
        ndim: number of dimensions the result must cover.

    Returns:
        A new list with ``ndim`` entries. The caller's own sequence is never
        returned, so mutating the result cannot change the argument.

    Raises:
        AssertionError: if a list/tuple does not contain ``ndim`` entries.
        NotImplementedError: if ``val`` is not an int, list or tuple.
    """
    if isinstance(val, int):
        return [val] * ndim
    if isinstance(val, (list, tuple)):
        assert len(val) == ndim, f"expected {ndim} values, got {len(val)}"
        # Copy list input too, so lists and tuples behave the same and the
        # returned list never aliases the caller's object.
        return list(val)
    raise NotImplementedError
def _calculate_fan_in_and_fan_out_hwio(tensor, algo: ConvAlgo):
dimensions = tensor.ndimension()
if dimensions < 2:
......@@ -110,7 +124,9 @@ class SparseConvolution(SparseModule):
self.out_channels = out_channels
self.kernel_size = kernel_size
kv = int(np.prod(kernel_size))
self.conv1x1 = kv == 1
kv_stride = int(np.prod(kernel_size))
self.conv1x1 = kv == 1 and kv_stride == 1
self.stride = stride
self.padding = padding
self.dilation = dilation
......
......@@ -104,7 +104,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
indice_dict: Optional[dict] = None,
benchmark: bool = False,
permanent_thrust_allocator: bool = False,
enable_timer: bool = False):
enable_timer: bool = False,
force_algo: Optional[ConvAlgo] = None):
"""
Args:
features: [num_points, num_features] feature tensor
......@@ -115,6 +116,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
is very large.
benchmark: whether to enable benchmark. if enabled, all sparse operators will be record to
SparseConvTensor.
enable_timer: if True, the run time of all spconv internal ops will be recorded in _timer.
force_algo: force conv/pool layers to use this algo; should only be used for debugging.
"""
ndim = indices.shape[1] - 1
assert features.ndim == 2
......@@ -139,6 +142,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
if permanent_thrust_allocator:
self.thrust_allocator = ThrustSortAllocator(features.device)
self._timer = CUDAKernelTimer(enable_timer)
self.force_algo = force_algo
def replace_feature(self, feature: torch.Tensor):
"""we need to replace x.features = F.relu(x.features) with x = x.replace_feature(F.relu(x.features))
......@@ -152,6 +156,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
new_spt.benchmark_record = self.benchmark_record
new_spt.thrust_allocator = self.thrust_allocator
new_spt._timer = self._timer
new_spt.force_algo = self.force_algo
return new_spt
@property
......@@ -217,4 +223,5 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
tensor.benchmark_record = self.benchmark_record
tensor.thrust_allocator = self.thrust_allocator
tensor._timer = self._timer
tensor.force_algo = self.force_algo
return tensor
......@@ -30,7 +30,8 @@ _TORCH_DTYPE_TO_TV = {
def torch_tensor_to_tv(ten: torch.Tensor,
dtype: Optional[int] = None,
shape: Optional[List[int]] = None):
shape: Optional[List[int]] = None,
stride: Optional[List[int]] = None):
# assert ten.is_contiguous(), "must be contiguous tensor"
ptr = ten.data_ptr()
device = ten.device
......@@ -40,11 +41,20 @@ def torch_tensor_to_tv(ten: torch.Tensor,
tv_device = 0
else:
raise NotImplementedError
if shape is None:
shape = list(ten.shape)
if dtype is None:
dtype = _TORCH_DTYPE_TO_TV[ten.dtype]
return tv.from_blob(ptr, shape, list(ten.stride()), dtype, tv_device)
if stride is None:
stride = list(ten.stride())
if shape is None:
shape = list(ten.shape)
else:
if not ten.is_contiguous():
msg = "if you provide custom shape for non-contig tensor, stride must not None"
assert stride is not None, msg
else:
# custom shape, if tensor is contiguous, we use from_blob and calc strides
return tv.from_blob(ptr, shape, dtype, tv_device)
return tv.from_blob_strided(ptr, shape, stride, dtype, tv_device)
def get_current_stream():
......
......@@ -137,6 +137,7 @@ class SparseSequential(SparseModule):
input = module(input)
else:
if isinstance(input, spconv.SparseConvTensor):
print(input.features.shape)
if input.indices.shape[0] != 0:
input = input.replace_feature(module(input.features))
else:
......
......@@ -1066,7 +1066,7 @@ def indice_conv_backward(features: torch.Tensor,
alpha=1.0,
beta=beta)
if not FILTER_HWIO:
if is_KC_not_CK:
a = out_bp_tv
b = features_tv
a_inds = out_indices
......@@ -1376,6 +1376,9 @@ def implicit_gemm_backward(features: torch.Tensor,
mask_width=-1,
beta=beta,
stream=stream)
# for backward weight, beta = 0 because each split
# handle different kernel locations.
# TODO remove D iterator in backward weight kernel
CONV.run_with_tuned_result(
wgrad_tune_res,
ConvOpType.kBackwardWeight,
......@@ -1389,7 +1392,7 @@ def implicit_gemm_backward(features: torch.Tensor,
reverse_mask=False,
mask_filter=masks[j].item(),
mask_width=mask_width,
beta=beta,
beta=0,
workspace=workspace_tv,
stream=stream)
......@@ -1403,6 +1406,8 @@ def indice_maxpool(features: torch.Tensor, indice_pairs: torch.Tensor,
# stream = get_current_stream()
# CONV.stream_synchronize(stream)
# t = time.time()
if not features.is_contiguous():
features = features.contiguous()
out_channel = features.shape[-1]
out_features = torch.zeros((num_activate_out, out_channel),
......@@ -1474,6 +1479,8 @@ def indice_maxpool_implicit_gemm(features: torch.Tensor,
stream = get_current_stream()
# CONV.stream_synchronize(stream)
# t = time.time()
if not features.is_contiguous():
features = features.contiguous()
out_channel = features.shape[-1]
out_features = torch.empty((num_activate_out, out_channel),
......
......@@ -71,36 +71,72 @@ class PointToVoxel(object):
pc: torch.Tensor,
clear_voxels: bool = True,
empty_mean: bool = False):
"""generate voxels/indices/num_point_per_voxel/pc_voxel_ids from
point cloud.
This function doesn't return pc_voxel_id, for backward compatibility.
pc_voxel_id will be added in spconv 2.2.
Args:
pc: [N, 3+] point cloud.
clear_voxels: if True, call zero on voxels
empty_mean: if True, fill empty locations of voxels with mean.
Returns:
voxels: voxels
indices: quantized coords
num_per_voxel: number of points in a voxel
"""
res = self.generate_voxel_with_id(pc, clear_voxels, empty_mean)
return res[0], res[1], res[2]
def generate_voxel_with_id(self,
pc: torch.Tensor,
clear_voxels: bool = True,
empty_mean: bool = False):
"""generate voxels/indices/num_point_per_voxel/pc_voxel_ids from
point cloud.
Args:
pc: [N, 3+] point cloud.
clear_voxels: if True, call zero on voxels
empty_mean: if True, fill empty locations of voxels with mean.
Returns:
voxels: voxels
indices: quantized coords
num_per_voxel: number of points in a voxel
pc_voxel_id: voxel id for every point. if not exists, -1.
"""
assert pc.device.type == self.device.type, "your pc device is wrong"
expected_hash_data_num = pc.shape[0] * 2
with torch.no_grad():
pc_voxel_id = torch.empty([pc.shape[0]],
dtype=torch.int64,
device=self.device)
pc_voxel_id_tv = torch_tensor_to_tv(pc_voxel_id)
if self.device.type != "cpu":
if self.hashdata.shape[0] < expected_hash_data_num:
self.hashdata = torch.empty([expected_hash_data_num, 2],
dtype=torch.int64,
device=self.device)
hashdata = torch.empty([expected_hash_data_num, 2],
dtype=torch.int64,
device=pc.device)
point_indice_data = torch.empty([pc.shape[0]],
dtype=torch.int64,
device=pc.device)
if self.point_indice_data.shape[0] < pc.shape[0]:
self.point_indice_data = torch.empty([pc.shape[0]],
dtype=torch.int64,
device=self.device)
pc_tv = torch_tensor_to_tv(pc)
stream = get_current_stream()
voxels_tv = torch_tensor_to_tv(self.voxels)
indices_tv = torch_tensor_to_tv(self.indices)
num_per_voxel_tv = torch_tensor_to_tv(self.num_per_voxel)
hashdata_tv = torch_tensor_to_tv(
self.hashdata,
hashdata,
dtype=tv.custom128,
shape=[self.hashdata.shape[0]])
point_indice_data_tv = torch_tensor_to_tv(
self.point_indice_data)
res = SpconvOps.point2voxel_cuda(
pc_tv, voxels_tv, indices_tv, num_per_voxel_tv,
hashdata_tv, point_indice_data_tv, self.vsize,
self.grid_size, self.grid_stride, self.coors_range,
empty_mean, clear_voxels, stream)
shape=[hashdata.shape[0]])
point_indice_data_tv = torch_tensor_to_tv(point_indice_data)
with torch.cuda.device(pc.device):
res = SpconvOps.point2voxel_cuda(
pc_tv, voxels_tv, indices_tv, num_per_voxel_tv,
hashdata_tv, point_indice_data_tv, pc_voxel_id_tv, self.vsize,
self.grid_size, self.grid_stride, self.coors_range,
empty_mean, clear_voxels, stream)
num_voxels = res[0].shape[0]
else:
pc_tv = torch_tensor_to_tv(pc)
......@@ -111,6 +147,7 @@ class PointToVoxel(object):
hashdata_tv = torch_tensor_to_tv(self.hashdata, dtype=tv.int32)
res = SpconvOps.point2voxel_cpu(pc_tv, voxels_tv, indices_tv,
num_per_voxel_tv, hashdata_tv,
pc_voxel_id_tv,
self.vsize, self.grid_size,
self.grid_stride,
self.coors_range, empty_mean,
......@@ -118,4 +155,4 @@ class PointToVoxel(object):
num_voxels = res[0].shape[0]
return (self.voxels[:num_voxels], self.indices[:num_voxels],
self.num_per_voxel[:num_voxels])
self.num_per_voxel[:num_voxels], pc_voxel_id)
......@@ -24,7 +24,7 @@ from spconv.core import ConvAlgo
import spconv.pytorch as spconv
from spconv.utils import Point2VoxelCPU3d
# torch.backends.cudnn.enabled = False
def waymo_data(batch_size=1):
gen = Point2VoxelCPU3d([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3,
150000, 1)
......@@ -289,7 +289,7 @@ def main():
voxels_th = torch.from_numpy(voxels).to(device).to(dtype)
coors_th = torch.from_numpy(coors).to(device).int()
voxels_th.requires_grad = True
algo = spconv.ConvAlgo.Native
algo = spconv.ConvAlgo.MaskImplicitGemm
# 3080 Laptop
# MaskImpGemm: 11.2ms
# MaskSplitImpGemm: 12.2ms
......@@ -324,26 +324,26 @@ def main():
print(out.spatial_shape, out.features.mean(), out.features.max(),
out.features.min())
# times = []
# with torch.no_grad():
# for i in range(20):
# print("------------")
# torch.cuda.synchronize()
# t = time.time()
# out_nograd = net(voxels_th, coors_th, 1, False)
# timer = out_nograd._timer
# # res = timer.collect_by_name("forward", timer.get_all_pair_time())
# # res2 = timer.collect_by_name("forward0", timer.get_all_pair_time())
times = []
with torch.no_grad():
for i in range(20):
print("------------")
torch.cuda.synchronize()
t = time.time()
out_nograd = net(voxels_th, coors_th, 1, False)
timer = out_nograd._timer
# res = timer.collect_by_name("forward", timer.get_all_pair_time())
# res2 = timer.collect_by_name("forward0", timer.get_all_pair_time())
# # print(sum(res.values()) + sum(res2.values()))
# # print(timer.get_all_pair_time())
# print(sum(res.values()) + sum(res2.values()))
# print(timer.get_all_pair_time())
# # print(sum(timer.get_all_pair_time().values()))
# torch.cuda.synchronize()
# # sort_bench()
# times.append(time.time() - t)
# print("spconv time", np.mean(times[10:]))
# times = []
# print(sum(timer.get_all_pair_time().values()))
torch.cuda.synchronize()
# sort_bench()
times.append(time.time() - t)
print("spconv time", np.mean(times[10:]))
times = []
# for i in range(10):
# out = net(voxels_th, coors_th, 1)
......
This diff is collapsed.
This diff is collapsed.
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from spconv.core_cc.csrc.sparse.all import SpconvOps
......@@ -12,9 +12,330 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compare results between different algo:
CPU: gather-mm-scatter
"""Compare results between different algos:
CPU: simple gather-mm-scatter
Native: Fused gather-mm-scatter
ImplicitGemm
ImplicitGemm: implicit gemm
"""
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
from cumm import tensorview as tv
from spconv.core import ConvAlgo
import spconv.pytorch as spconv
import pickle
from spconv.test_utils import generate_sparse_data, params_grid
class Net(nn.Module):
    """Sparse 3D network used to compare conv/pool implementations.

    The layer stack is: submanifold convs, two strided ``SparseConv3d``
    layers (indice keys "m0", "m1"), three ``SparseMaxPool3d`` layers (the
    last two with indice keys "m3", "m4"), and finally two
    ``SparseInverseConv3d`` layers that reference the "m4" and "m3" keys.

    Args:
        shape: spatial shape passed to the ``SparseConvTensor`` built in
            ``forward``.
        algo: algorithm passed as ``algo=`` to every conv and pool layer.
    """

    def __init__(self, shape, algo):
        super().__init__()
        pool_algo = algo
        # pool_algo = ConvAlgo.Native

        def subm(in_channels, out_channels, indice_key):
            # Every SubMConv3d in this net uses kernel size 3 and no bias;
            # only channel counts and the indice key differ.
            return spconv.SubMConv3d(in_channels,
                                     out_channels,
                                     3,
                                     bias=False,
                                     indice_key=indice_key,
                                     algo=algo)

        # NOTE: layers are constructed in exactly this order. The test seeds
        # torch before building each net, so construction order determines
        # which random weights each layer receives.
        self.net = spconv.SparseSequential(
            subm(3, 32, "c0"),
            subm(32, 32, "c0"),
            subm(32, 64, "c0"),
            subm(64, 64, "c0"),
            # Positional args are presumably kernel_size=3, stride=2,
            # padding=1 -- TODO confirm against the spconv signature.
            spconv.SparseConv3d(64, 64, 3, 2, 1, bias=False, indice_key="m0", algo=algo),
            subm(64, 96, "c1"),
            subm(96, 96, "c1"),
            spconv.SparseConv3d(96, 96, 2, 2, bias=False, indice_key="m1", algo=algo),
            subm(96, 128, "c2"),
            subm(128, 128, "c2"),
            spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
            subm(128, 160, "c3"),
            subm(160, 160, "c3"),
            spconv.SparseMaxPool3d(2, 2, algo=pool_algo, indice_key="m3"),
            subm(160, 192, "c4"),
            subm(192, 192, "c4"),
            spconv.SparseMaxPool3d(2, 2, indice_key="m4", algo=pool_algo),
            subm(192, 224, "c5"),
            subm(224, 224, "c5"),
            spconv.SparseInverseConv3d(224, 128, 2, indice_key="m4", bias=False, algo=algo),
            spconv.SparseInverseConv3d(128, 64, 2, indice_key="m3", bias=False, algo=algo),
        )
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a ``SparseConvTensor`` and run the layer stack.

        Args:
            features: feature tensor handed to ``SparseConvTensor``.
            coors: indices tensor handed to ``SparseConvTensor``.
            batch_size: batch size handed to ``SparseConvTensor``.

        Returns:
            Whatever ``self.net`` returns for the constructed sparse tensor.
        """
        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
        return self.net(x)
class NetLight(nn.Module):
    """Smaller sparse 3D network for the comparison test.

    The layer stack is: submanifold convs with two strided ``SparseConv3d``
    layers (indice keys "m0", "m1"), followed by two ``SparseInverseConv3d``
    layers that reference the "m1" and "m0" keys.

    Args:
        shape: spatial shape passed to the ``SparseConvTensor`` built in
            ``forward``.
        algo: algorithm passed as ``algo=`` to every conv layer.
    """

    def __init__(self, shape, algo):
        super().__init__()

        def subm(in_channels, out_channels, indice_key):
            # Every SubMConv3d in this net uses kernel size 3 and no bias;
            # only channel counts and the indice key differ.
            return spconv.SubMConv3d(in_channels,
                                     out_channels,
                                     3,
                                     bias=False,
                                     indice_key=indice_key,
                                     algo=algo)

        # NOTE: layers are constructed in exactly this order. The test seeds
        # torch before building each net, so construction order determines
        # which random weights each layer receives.
        self.net = spconv.SparseSequential(
            subm(3, 32, "c0"),
            subm(32, 32, "c0"),
            subm(32, 64, "c0"),
            subm(64, 64, "c0"),
            # Positional args are presumably kernel_size=3, stride=2,
            # padding=1 -- TODO confirm against the spconv signature.
            spconv.SparseConv3d(64, 64, 3, 2, 1, bias=False, indice_key="m0", algo=algo),
            subm(64, 96, "c1"),
            subm(96, 96, "c1"),
            spconv.SparseConv3d(96, 96, 2, 2, bias=False, indice_key="m1", algo=algo),
            spconv.SparseInverseConv3d(96, 64, 2, indice_key="m1", bias=False, algo=algo),
            spconv.SparseInverseConv3d(64, 32, 3, indice_key="m0", bias=False, algo=algo),
        )
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a ``SparseConvTensor`` and run the layer stack.

        Args:
            features: feature tensor handed to ``SparseConvTensor``.
            coors: indices tensor handed to ``SparseConvTensor``.
            batch_size: batch size handed to ``SparseConvTensor``.

        Returns:
            Whatever ``self.net`` returns for the constructed sparse tensor.
        """
        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
        return self.net(x)
def _test_multi_impl(dtype: torch.dtype) -> None:
    """Compare forward outputs and weight gradients across four backends.

    Builds the same network four times with identical torch seeds (CPU
    Native, GPU Native, GPU MaskImplicitGemm, GPU MaskSplitImplicitGemm),
    runs forward + backward with one shared upstream gradient, and asserts
    that the dense outputs and the weight gradients agree within fixed
    thresholds.

    Args:
        dtype: torch dtype the inputs and networks are converted to. For
            ``torch.float16`` a small generated input and ``NetLight`` are
            used; otherwise a pickled fixture and ``Net`` are used.

    Raises:
        AssertionError: if any compared error exceeds its threshold.
    """
    # TODO remove or release this when tf32 op is ready
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False
    # Fixed numpy seed: drives the upstream gradient `dout` below.
    np.random.seed(50051)
    if dtype != torch.float16:
        # Fixture file shipped next to this test; it is a local, trusted file.
        with open(Path(__file__).parent / "data" / "test_spconv.pkl", "rb") as f:
            (voxels, coors, spatial_shape) = pickle.load(f)
    else:
        # CPU fp16 is very slow, so we use a small data here.
        spatial_shape = [19, 18, 17]
        sparse_dict = generate_sparse_data(spatial_shape, [1500] * 1, 3)
        voxels = np.ascontiguousarray(sparse_dict["features"]).astype(
            np.float32)
        # Reorder index columns so the last generated column comes first
        # (presumably the batch index -- TODO confirm against
        # generate_sparse_data).
        coors = np.ascontiguousarray(
            sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
    device = torch.device("cuda:0")
    device_cpu = torch.device("cpu:0")
    # Same numpy inputs, one copy per device.
    voxels_th = torch.from_numpy(voxels).to(device_cpu).to(dtype)
    coors_th = torch.from_numpy(coors).to(device_cpu).int()
    voxels_th_cuda = torch.from_numpy(voxels).to(device).to(dtype)
    coors_th_cuda = torch.from_numpy(coors).to(device).int()
    net_cls = Net
    if dtype == torch.float16:
        # CPU fp16 is very slow, so we use a small network here.
        net_cls = NetLight
    # Re-seed torch before every construction so the four networks are
    # built from the same random state.
    # cpu
    torch.manual_seed(50051)
    net_native_cpu = net_cls(spatial_shape, ConvAlgo.Native).to(device_cpu).to(dtype)
    # gpu_native
    torch.manual_seed(50051)
    net_native_gpu = net_cls(spatial_shape, ConvAlgo.Native).to(device).to(dtype)
    torch.manual_seed(50051)
    net_imp_gpu = net_cls(spatial_shape, ConvAlgo.MaskImplicitGemm).to(device).to(dtype)
    torch.manual_seed(50051)
    net_simp_gpu = net_cls(spatial_shape, ConvAlgo.MaskSplitImplicitGemm).to(device).to(dtype)
    spconv.assign_name_for_sparse_modules(net_native_cpu)
    spconv.assign_name_for_sparse_modules(net_native_gpu)
    spconv.assign_name_for_sparse_modules(net_imp_gpu)
    spconv.assign_name_for_sparse_modules(net_simp_gpu)
    # Gradient-free forward pass, used only to learn the dense output shape.
    with torch.no_grad():
        out: torch.Tensor = net_native_cpu(voxels_th, coors_th, 1).dense()
    # One random upstream gradient, shared by all four backward passes.
    dout = np.random.uniform(-0.2, 0.2, out.shape).astype(np.float32)
    dout_t = torch.from_numpy(dout).to(device_cpu).to(dtype)
    dout_t_cu = torch.from_numpy(dout).to(device).to(dtype)
    out_cpu = net_native_cpu(voxels_th, coors_th, 1).dense()
    out_cpu.backward(dout_t)
    out = net_native_gpu(voxels_th_cuda, coors_th_cuda, 1).dense()
    out.backward(dout_t_cu)
    out_imp = net_imp_gpu(voxels_th_cuda, coors_th_cuda, 1).dense()
    out_imp.backward(dout_t_cu)
    out_simp = net_simp_gpu(voxels_th_cuda, coors_th_cuda, 1).dense()
    out_simp.backward(dout_t_cu)
    # Forward comparison: every GPU result is measured against the CPU result.
    with torch.no_grad():
        dense_cpu = out_cpu.cuda()
        dense_native = out
        dense_imp = out_imp
        dense_simp = out_simp
        error_native = torch.linalg.norm(dense_cpu - dense_native).cpu().item()
        error_imp = torch.linalg.norm(dense_cpu - dense_imp).cpu().item()
        error_simp = torch.linalg.norm(dense_cpu - dense_simp).cpu().item()
    print("error_native", error_native)
    print("error_imp", error_imp)
    print("error_simp", error_simp)
    # float32 uses a tight threshold; every other dtype a much looser one.
    if dtype == torch.float32:
        assert error_native < 0.01
        assert error_imp < 0.01
        assert error_simp < 0.01
    else:
        assert error_native < 10
        assert error_imp < 10
        assert error_simp < 10
    # Weight-gradient comparison, parameter by parameter (matched by name).
    cpu_params = dict(net_native_cpu.named_parameters())
    native_params = dict(net_native_gpu.named_parameters())
    imp_params = dict(net_imp_gpu.named_parameters())
    simp_params = dict(net_simp_gpu.named_parameters())
    for k, cpu_w in cpu_params.items():
        native_w = native_params[k]
        imp_w = imp_params[k]
        simp_w = simp_params[k]
        cpu_w_grad = cpu_w.grad.detach().cuda()
        native_w_grad = native_w.grad.detach()
        imp_w_grad = imp_w.grad.detach()
        simp_w_grad = simp_w.grad.detach()
        # Here the reference is the GPU Native gradient, not the CPU one.
        # The CPU-vs-native error is only printed, never asserted.
        error_native = torch.linalg.norm(native_w_grad - cpu_w_grad).cpu().item()
        error_imp = torch.linalg.norm(native_w_grad - imp_w_grad).cpu().item()
        error_simp = torch.linalg.norm(native_w_grad - simp_w_grad).cpu().item()
        print(k, error_native, error_imp, error_simp)
        assert error_imp < 1
        assert error_simp < 1
def test_multi_impl():
    """Pytest entry point: run the backend comparison for fp32, then fp16."""
    for dtype in (torch.float32, torch.float16):
        _test_multi_impl(dtype)
# Allow running the comparison directly as a script, without pytest.
if __name__ == "__main__":
    test_multi_impl()
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# developers must run this file before push or pull request.
# this script contains three parts:
# 1. unit tests for all gemm/conv kernels
# 2. comparison test: compare network fwd/bwd results between CPU, Native, ImplicitGemm
# 3. f32/f16 train/eval test based on mnist and some small datasets
# Stop at the first failing step so a broken unit test cannot be hidden by
# a later step succeeding; the script then exits with that step's status.
set -e

echo "-------------UNIT TEST START--------------"
pytest ./test
echo "-------------UNIT TEST END--------------"
# fp16 training/eval example on MNIST.
python ./example/mnist_sparse.py --fp16
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment