# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Compare results between sparse and dense layers:
SparseConvXd
SparseConvTransposeXd
SparseMaxPoolXd
"""

import time
import unittest
from pathlib import Path

import numpy as np
import torch
from torch import nn
from spconv.core import ConvAlgo

import spconv.pytorch as spconv
from spconv.test_utils import TestCase, generate_sparse_data, params_grid
from spconv.constants import ALL_WEIGHT_IS_KRSC, FILTER_HWIO

# TF32 must be disabled for both CUDA matmul and cuDNN so that the dense
# reference results are computed with higher (full float32) precision.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False

class SparseConv3dTestTorch(nn.Module):
    """Stack of bias-free ``spconv.SparseConv3d`` layers (sparse side of the test).

    The first layer maps ``in_channels`` to ``out_channels``; every further
    layer (``num_layers - 1`` of them) maps ``out_channels`` to
    ``out_channels``. All layers share ``kernel_size``, ``stride``,
    ``padding``, ``dilation`` and ``algo``. ``ndim`` is accepted but unused.
    """

    def __init__(self,
                 num_layers,
                 ndim,
                 shape,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 dilation,
                 algo=spconv.ConvAlgo.MaskSplitImplicitGemm):
        super().__init__()

        def make_conv(channels_in):
            # Every layer differs only in its number of input channels.
            return spconv.SparseConv3d(channels_in,
                                       out_channels,
                                       kernel_size,
                                       stride,
                                       padding=padding,
                                       dilation=dilation,
                                       bias=False,
                                       algo=algo)

        stack = [make_conv(in_channels)]
        stack.extend(make_conv(out_channels) for _ in range(1, num_layers))

        self.algo = algo
        self.shape = shape
        # No grid is supplied; None is forwarded to SparseConvTensor as its
        # fifth positional argument.
        self.grid = None
        self.net = spconv.SparseSequential(*stack)

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a ``SparseConvTensor`` and run the layer stack.

        ``coors`` is cast with ``.int()`` before the sparse tensor is built.
        Returns whatever ``self.net`` returns (callers in this file call
        ``.dense()`` on it).
        """
        sparse_input = spconv.SparseConvTensor(features, coors.int(),
                                               self.shape, batch_size,
                                               self.grid)
        return self.net(sparse_input)

class Conv3dTestTorch(nn.Module):
    """Stack of bias-free ``nn.Conv3d`` layers (dense reference of the test).

    The first layer maps ``in_channels`` to ``out_channels``; every further
    layer (``num_layers - 1`` of them) maps ``out_channels`` to
    ``out_channels``. ``ndim`` is accepted but unused; ``shape`` is only
    stored on the instance.
    """

    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
                 kernel_size, stride, padding, dilation):
        super().__init__()

        def make_conv(channels_in):
            # Every layer differs only in its number of input channels.
            return nn.Conv3d(channels_in,
                             out_channels,
                             kernel_size=kernel_size,
                             stride=stride,
                             padding=padding,
                             dilation=dilation,
                             bias=False)

        stack = [make_conv(in_channels)]
        stack.extend(make_conv(out_channels) for _ in range(1, num_layers))

        self.shape = shape
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the convolution stack to the dense tensor ``x``."""
        return self.net(x)

class SparseDeConv3dTestTorch(nn.Module):
    """Stack of bias-free ``spconv.SparseConvTranspose3d`` layers.

    The first layer maps ``in_channels`` to ``out_channels``; every further
    layer (``num_layers - 1`` of them) maps ``out_channels`` to
    ``out_channels``. All layers share ``kernel_size``, ``stride``,
    ``padding``, ``dilation`` and ``algo``. ``ndim`` is accepted but unused.
    """

    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
                 kernel_size, stride, padding, dilation, algo):
        super().__init__()

        def make_deconv(channels_in):
            # Every layer differs only in its number of input channels.
            return spconv.SparseConvTranspose3d(channels_in,
                                                out_channels,
                                                kernel_size,
                                                stride,
                                                padding=padding,
                                                dilation=dilation,
                                                bias=False,
                                                algo=algo)

        stack = [make_deconv(in_channels)]
        stack.extend(make_deconv(out_channels) for _ in range(1, num_layers))

        self.algo = algo
        self.shape = shape
        self.net = spconv.SparseSequential(*stack)

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a ``SparseConvTensor`` and run the layer stack.

        ``coors`` is cast with ``.int()`` before the sparse tensor is built.
        """
        sparse_input = spconv.SparseConvTensor(features, coors.int(),
                                               self.shape, batch_size)
        return self.net(sparse_input)


class DeConv3dTestTorch(nn.Module):
    """Stack of bias-free ``nn.ConvTranspose3d`` layers (dense reference).

    The first layer maps ``in_channels`` to ``out_channels``; every further
    layer (``num_layers - 1`` of them) maps ``out_channels`` to
    ``out_channels``. ``ndim`` is accepted but unused; ``shape`` is only
    stored on the instance.
    """

    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
                 kernel_size, stride, padding, dilation):
        super().__init__()

        def make_deconv(channels_in):
            # Every layer differs only in its number of input channels.
            return nn.ConvTranspose3d(channels_in,
                                      out_channels,
                                      kernel_size=kernel_size,
                                      stride=stride,
                                      padding=padding,
                                      dilation=dilation,
                                      bias=False)

        stack = [make_deconv(in_channels)]
        stack.extend(make_deconv(out_channels) for _ in range(1, num_layers))

        self.shape = shape
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the transposed-convolution stack to the dense tensor ``x``."""
        return self.net(x)


class SparseMaxPoolTestTorch(nn.Module):
    """Stack of identical ``spconv.SparseMaxPool3d`` layers.

    Builds one pooling layer plus ``num_layers - 1`` further copies, all with
    the same ``kernel_size``, ``stride``, ``padding``, ``dilation`` and
    ``algo``. ``ndim`` is accepted but unused.
    """

    def __init__(self, num_layers, ndim, shape, kernel_size, stride, padding,
                 dilation, algo):
        super().__init__()

        def make_pool():
            return spconv.SparseMaxPool3d(kernel_size,
                                          stride,
                                          padding,
                                          dilation,
                                          algo=algo)

        stack = [make_pool()]
        stack.extend(make_pool() for _ in range(1, num_layers))

        self.algo = algo
        self.shape = shape
        self.net = spconv.SparseSequential(*stack)

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a ``SparseConvTensor`` and run the pooling stack.

        ``coors`` is cast with ``.int()`` before the sparse tensor is built.
        """
        sparse_input = spconv.SparseConvTensor(features, coors.int(),
                                               self.shape, batch_size)
        return self.net(sparse_input)


class MaxPool3dTestTorch(nn.Module):
    """Stack of identical ``nn.MaxPool3d`` layers (dense reference).

    Builds one pooling layer plus ``num_layers - 1`` further copies, all with
    the same ``kernel_size``, ``stride``, ``padding`` and ``dilation``.
    ``ndim`` is accepted but unused; ``shape`` is only stored on the instance.
    """

    def __init__(self, num_layers, ndim, shape, kernel_size, stride, padding,
                 dilation):
        super().__init__()

        def make_pool():
            return nn.MaxPool3d(kernel_size,
                                stride=stride,
                                padding=padding,
                                dilation=dilation)

        stack = [make_pool()]
        stack.extend(make_pool() for _ in range(1, num_layers))

        self.shape = shape
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the pooling stack to the dense tensor ``x``."""
        return self.net(x)

def gather_nd(params, indices):
    """Gather slices of ``params`` addressed by the rows of ``indices``.

    PyTorch counterpart of TensorFlow's ``gather_nd``. The last axis of
    ``indices`` (size ``K``) holds one coordinate into the first ``K`` axes of
    ``params``; the remaining axes of ``params`` are returned whole.

    Args:
        params: tensor to read from.
        indices: integer index tensor whose last dimension has size ``K``,
            with ``K <= params.dim()``.

    Returns:
        Tensor of shape ``indices.shape[:-1] + params.shape[K:]``.

    Note:
        The original author records a limit of MAX_ADVINDEX_CALC_DIMS=5 for
        this kind of advanced indexing.
    """
    ndim = indices.shape[-1]
    output_shape = list(indices.shape[:-1]) + list(params.shape[ndim:])
    flat_indices = indices.reshape(-1, ndim)
    # Index with a tuple: PyTorch deprecates multidimensional indexing with a
    # non-tuple sequence, and a list containing Ellipsis cannot be
    # interpreted as a single index tensor.
    index = tuple(flat_indices[:, i] for i in range(ndim)) + (Ellipsis, )
    # reshape (rather than view(*shape)) also handles an empty output shape,
    # which occurs when ``indices`` is 1-D and addresses a single element.
    return params[index].reshape(output_shape)


def scatter_nd(indices, updates, shape):
    """PyTorch version of TensorFlow's ``scatter_nd``.

    Creates a zero tensor of the given ``shape`` and writes ``updates`` into
    it at the coordinates given by the rows of ``indices``.

    This function performs no error handling, so use it carefully. Repeated
    indices are NOT accumulated (TensorFlow adds them); with duplicates the
    surviving value is not defined.

    Args:
        indices: integer index tensor whose last dimension has size ``K``,
            with ``K <= len(shape)``.
        updates: tensor holding ``indices.shape[:-1] + shape[K:]`` worth of
            values to write.
        shape: full shape of the result; any sequence of ints (list, tuple
            or ``torch.Size``).

    Returns:
        Tensor of shape ``shape`` with the dtype and device of ``updates``.
    """
    # Normalise so tuples / torch.Size work with the list concatenation below.
    shape = list(shape)
    ndim = indices.shape[-1]
    ret = torch.zeros(shape, dtype=updates.dtype, device=updates.device)
    flat_indices = indices.reshape(-1, ndim)
    # Index with a tuple: PyTorch deprecates multidimensional indexing with a
    # non-tuple sequence.
    index = tuple(flat_indices[:, i] for i in range(ndim)) + (Ellipsis, )
    # The indexed target has one leading axis (the flattened index rows), so
    # flatten the leading axes of ``updates`` the same way.
    flat_shape = [flat_indices.shape[0]] + shape[ndim:]
    ret[index] = updates.reshape(flat_shape)
    return ret

def test_spconv3d():
    """Compare ``spconv.SparseConv3d`` with ``nn.Conv3d`` over a parameter grid.

    For every grid combination (skipping stride > 1 together with
    dilation > 1) a one-layer sparse net and a one-layer dense reference are
    built on ``cuda:0``, loaded with the same random filter (permuted into
    the dense weight layout), and checked with ``atol=1e-4`` for agreement of:

    * the forward output (sparse result densified with ``.dense()``),
    * the weight gradient of each layer,
    * the input gradient at the active sparse locations.
    """
    test_case = TestCase()
    np.random.seed(484)
    torch.manual_seed(48848)
    devices = ["cuda:0"]
    shapes = [[19, 18, 17]]
    batchsizes = [1, 2]

    in_channels = [32]
    out_channels = [32, 48, 64]
    ksizes = [2, 3]
    strides = [1, 2, 3]
    paddings = [0, 1, 2]
    dilations = [1, 2, 3]
    algos = [
        ConvAlgo.Native, ConvAlgo.MaskImplicitGemm,
        ConvAlgo.MaskSplitImplicitGemm
    ]
    # NOTE: this reassignment replaces the list above, so only
    # ConvAlgo.Native is actually exercised by this test.
    algos = [ConvAlgo.Native]

    for dev, shape, bs, IC, OC, k, s, p, d, al in params_grid(
            devices, shapes, batchsizes, in_channels, out_channels, ksizes,
            strides, paddings, dilations, algos):
        if all([s > 1, d > 1]):
            continue  # don't support this.
        # print(dev, shape, bs, IC, OC, k, s, p, d)
        device = torch.device(dev)
        num_points = [1500] * bs
        dtype = torch.float32
        net = SparseConv3dTestTorch(1,
                                    3,
                                    shape,
                                    IC,
                                    OC,
                                    k,
                                    s,
                                    p,
                                    d,
                                    algo=al).to(device).to(dtype)
        net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
                                    d).to(device).to(dtype)

        sparse_dict = generate_sparse_data(shape, num_points, IC)

        features = np.ascontiguousarray(sparse_dict["features"]).astype(
            np.float32)
        # Reorder the index columns so that column 3 comes first
        # (presumably the batch index -- confirm in generate_sparse_data).
        indices = np.ascontiguousarray(
            sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
        features_dense = sparse_dict["features_dense"].astype(np.float32)
        indices_t = torch.from_numpy(indices).int().to(device)
        features_t = torch.from_numpy(features).to(device).to(dtype)
        features_t.requires_grad = True
        features_dense_t = torch.from_numpy(features_dense).to(device).to(
            dtype)
        features_dense_t.requires_grad = True
        # Draw a random filter in the layout used for the sparse layer's
        # weight and copy a permuted view of it, shaped (OC, IC, k, k, k),
        # into the dense reference.
        if net.algo == ConvAlgo.Native and not ALL_WEIGHT_IS_KRSC:
            if FILTER_HWIO:
                filters = np.random.uniform(-1, 1,
                                            size=[k, k, k, IC,
                                                    OC]).astype(np.float32)
            else:
                filters = np.random.uniform(-1, 1,
                                            size=[k, k, k, OC,
                                                    IC]).astype(np.float32)
            filters_t = torch.from_numpy(filters).to(device).to(dtype)
            if FILTER_HWIO:
                net_ref.net[0].weight.data[:] = filters_t.permute(
                    4, 3, 0, 1, 2).contiguous()
            else:
                net_ref.net[0].weight.data[:] = filters_t.permute(
                    3, 4, 0, 1, 2).contiguous()
        else:
            # (OC, k, k, k, IC) filter, permuted to (OC, IC, k, k, k).
            filters = np.random.uniform(-1, 1,
                                        size=[OC, k, k, k,
                                                IC]).astype(np.float32)
            filters_t = torch.from_numpy(filters).to(device).to(dtype)
            net_ref.net[0].weight.data[:] = filters_t.permute(
                0, 4, 1, 2, 3).contiguous()
        net.net[0].weight.data[:] = filters_t
        out_ref = net_ref(features_dense_t)
        out = net(features_t, indices_t, bs).dense()
        out_np = out.detach().cpu().numpy()
        out_ref_np = out_ref.detach().cpu().numpy()
        test_case.assertAllClose(out_np, out_ref_np, atol=1e-4)

        # Backpropagate the same random upstream gradient through both nets.
        dout = np.random.uniform(-0.2, 0.2,
                                    out_ref.shape).astype(features.dtype)
        dout_t = torch.from_numpy(dout).to(device)
        out.backward(dout_t)
        out_ref.backward(dout_t)
        # Move the channel axis of the dense input gradient last, then pick
        # the rows addressed by the sparse indices so it can be compared with
        # the sparse input gradient.
        din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
                                                            1).contiguous()
        din_sparse = gather_nd(din_dense, indices_t.long())
        din = features_t.grad.detach()

        din_np = din.cpu().numpy()
        din_sparse_np = din_sparse.cpu().numpy()
        for layer, layer_ref in zip(net.net, net_ref.net):
            dw = layer.weight.grad.detach().cpu().numpy()
            dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
            # Bring the sparse weight gradient into the dense layout using
            # the same permutation that was applied to the filter above.
            if net.algo == ConvAlgo.Native and not ALL_WEIGHT_IS_KRSC:
                if FILTER_HWIO:
                    dw = dw.transpose(4, 3, 0, 1, 2)
                else:
                    dw = dw.transpose(3, 4, 0, 1, 2)
            else:
                # (OC, k, k, k, IC) -> (OC, IC, k, k, k)
                dw = dw.transpose(0, 4, 1, 2, 3)

            test_case.assertAllClose(dw, dw_ref, atol=1e-4)
        test_case.assertAllClose(din_np, din_sparse_np, atol=1e-4)

def test_spdeconv3d():
    """Compare ``spconv.SparseConvTranspose3d`` with ``nn.ConvTranspose3d``.

    For every grid combination (skipping stride > 1 together with
    dilation > 1) a one-layer sparse net and a one-layer dense reference are
    built on ``cuda:0``, loaded with the same random filter (permuted into
    the dense weight layout), and checked with ``atol=1e-4`` for agreement of
    the forward output, the input gradient at the active sparse locations,
    and the weight gradient of each layer.
    """
    test_case = TestCase()

    np.random.seed(484)
    devices = ["cuda:0"]
    shapes = [[19, 18, 17]]
    batchsizes = [1, 2]

    in_channels = [64]
    out_channels = [32, 48, 64]
    ksizes = [2, 3]
    strides = [2, 3]
    paddings = [0, 1, 2]
    dilations = [1, 2, 3]

    algos = [
        ConvAlgo.Native, ConvAlgo.MaskImplicitGemm,
        ConvAlgo.MaskSplitImplicitGemm
    ]

    for dev, shape, bs, IC, OC, k, s, p, d, al in params_grid(
            devices, shapes, batchsizes, in_channels, out_channels, ksizes,
            strides, paddings, dilations, algos):
        # Every stride in this grid is > 1, so this skip leaves only the
        # dilation == 1 combinations.
        if all([s > 1, d > 1]):
            continue  # don't support this.
        device = torch.device(dev)
        num_points = [1000] * bs
        dtype = torch.float32

        sparse_dict = generate_sparse_data(shape, num_points, IC)

        features = np.ascontiguousarray(sparse_dict["features"]).astype(
            np.float32)
        # Reorder the index columns so that column 3 comes first
        # (presumably the batch index -- confirm in generate_sparse_data).
        indices = np.ascontiguousarray(
            sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
        features_dense = sparse_dict["features_dense"].astype(np.float32)
        net = SparseDeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
                                        d, al).to(device)
        net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
                                    d).to(device)

        # Draw a random filter in the layout used for the sparse layer's
        # weight and copy a permuted view of it, shaped (IC, OC, k, k, k),
        # into the dense transposed-convolution reference.
        if net.algo == ConvAlgo.Native and not ALL_WEIGHT_IS_KRSC:
            if FILTER_HWIO:
                filters = np.random.uniform(-1, 1,
                                            size=[k, k, k, IC,
                                                    OC]).astype(np.float32)
            else:
                filters = np.random.uniform(-1, 1,
                                            size=[k, k, k, OC,
                                                    IC]).astype(np.float32)
            filters_t = torch.from_numpy(filters).to(device).to(dtype)
            if FILTER_HWIO:
                net_ref.net[0].weight.data[:] = filters_t.permute(
                    3, 4, 0, 1, 2).contiguous()
            else:
                net_ref.net[0].weight.data[:] = filters_t.permute(
                    4, 3, 0, 1, 2).contiguous()
        else:
            # (OC, k, k, k, IC) filter, permuted to (IC, OC, k, k, k).
            filters = np.random.uniform(-1, 1,
                                        size=[OC, k, k, k,
                                                IC]).astype(np.float32)
            filters_t = torch.from_numpy(filters).to(device).to(dtype)
            net_ref.net[0].weight.data[:] = filters_t.permute(
                4, 0, 1, 2, 3).contiguous()
        net.net[0].weight.data[:] = filters_t

        indices_t = torch.from_numpy(indices).int().to(device)
        features_t = torch.from_numpy(features).to(device)
        features_t.requires_grad = True
        features_dense_t = torch.from_numpy(features_dense).to(device)
        features_dense_t.requires_grad = True
        # NOTE: filters_t is rebuilt here but is not read again in this
        # iteration; the weights were already copied above.
        filters_t = torch.from_numpy(filters).to(device)
        out_ref = net_ref(features_dense_t)
        out = net(features_t, indices_t, bs).dense()
        out_np = out.detach().cpu().numpy()
        out_ref_np = out_ref.detach().cpu().numpy()
        test_case.assertAllClose(out_np, out_ref_np, atol=1e-4)

        # Backpropagate the same random upstream gradient through both nets.
        dout = np.random.uniform(-0.2, 0.2,
                                    out_ref.shape).astype(features.dtype)
        dout_t = torch.from_numpy(dout).to(device)
        out.backward(dout_t)
        out_ref.backward(dout_t)
        # Move the channel axis of the dense input gradient last, then pick
        # the rows addressed by the sparse indices so it can be compared with
        # the sparse input gradient.
        din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
                                                            1).contiguous()
        din_sparse = gather_nd(din_dense, indices_t.long())
        din = features_t.grad.detach()
        din_np = din.cpu().numpy()
        din_sparse_np = din_sparse.cpu().numpy()
        test_case.assertAllClose(din_np, din_sparse_np, atol=1e-4)
        for layer, layer_ref in zip(net.net, net_ref.net):
            dw = layer.weight.grad.detach().cpu().numpy()
            dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
            # Bring the sparse weight gradient into the dense layout using
            # the same permutation that was applied to the filter above.
            if net.algo == ConvAlgo.Native and not ALL_WEIGHT_IS_KRSC:
                if FILTER_HWIO:
                    dw = dw.transpose(3, 4, 0, 1, 2)
                else:
                    dw = dw.transpose(4, 3, 0, 1, 2)
            else:
                # (OC, k, k, k, IC) -> (IC, OC, k, k, k): the transposed
                # convolution reference stores input channels first.
                dw = dw.transpose(4, 0, 1, 2, 3)
            test_case.assertAllClose(dw, dw_ref, atol=1e-4)

def test_spmaxpool3d():
    """Compare ``spconv.SparseMaxPool3d`` with ``nn.MaxPool3d`` over a grid.

    For every grid combination (skipping stride > 1 together with
    dilation > 1) a one-layer sparse pooling net and a one-layer dense
    reference are built on ``cuda:0`` and checked with ``atol=1e-4`` for
    agreement of the forward output and of the input gradient at the active
    sparse locations. The upstream gradient is non-zero only at the active
    output locations of the sparse result.
    """
    test_case = TestCase()

    np.random.seed(485)
    devices = ["cuda:0"]
    shapes = [[19, 18, 17]]
    batchsizes = [1, 2]

    in_channels = [64]
    out_channels = [64]
    ksizes = [2, 3]
    strides = [1, 2, 3]
    paddings = [0, 1]
    dilations = [1, 2, 3]
    # ksizes = [2]
    # strides = [2]
    # paddings = [0]
    # dilations = [1]
    algos = [
        ConvAlgo.Native, ConvAlgo.MaskImplicitGemm,
        ConvAlgo.MaskSplitImplicitGemm
    ]


    # OC is unpacked from the grid but not used by the pooling layers.
    for dev, shape, bs, IC, OC, k, s, p, d, al in params_grid(
            devices, shapes, batchsizes, in_channels, out_channels, ksizes,
            strides, paddings, dilations, algos):
        if all([s > 1, d > 1]):
            continue  # don't support this.
        device = torch.device(dev)
        num_points = [1000] * bs

        # When the data contains negative values, sparse max-pooling is not
        # equal to dense max-pooling, hence data_range=[0.1, 1].
        sparse_dict = generate_sparse_data(shape,
                                            num_points,
                                            IC,
                                            data_range=[0.1, 1])

        features = np.ascontiguousarray(sparse_dict["features"]).astype(
            np.float32)
        # Reorder the index columns so that column 3 comes first
        # (presumably the batch index -- confirm in generate_sparse_data).
        indices = np.ascontiguousarray(
            sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
        features_dense = sparse_dict["features_dense"].astype(np.float32)
        indices_t = torch.from_numpy(indices).int().to(device)
        features_t = torch.from_numpy(features).to(device)
        features_t.requires_grad = True
        features_dense_t = torch.from_numpy(features_dense).to(device)
        features_dense_t.requires_grad = True
        net = SparseMaxPoolTestTorch(1, 3, shape, k, s, p, d, al).to(device)
        net_ref = MaxPool3dTestTorch(1, 3, shape, k, s, p, d).to(device)

        out_ref = net_ref(features_dense_t)
        out = net(features_t, indices_t, bs)

        outids = out.indices
        outfeatures = out.features
        # NOTE: outids_dev is not used later in this function.
        outids_dev = outids.float()
        # Densify with channels last, then move channels to axis 1 so the
        # layout matches the dense reference output.
        out_dense = out.dense(channels_first=False)
        out = out_dense.permute(0, 4, 1, 2, 3).contiguous()
        out_np = out.detach().cpu().numpy()
        out_ref_np = out_ref.detach().cpu().numpy()
        test_case.assertAllClose(out_np, out_ref_np, atol=1e-4)

        # Random upstream gradient defined per active output point, scattered
        # into a zero dense tensor and permuted to the channels-first layout.
        dout_sparse = np.random.uniform(
            -0.2, 0.2, outfeatures.shape).astype(features.dtype)
        dout_sparse_t = torch.from_numpy(dout_sparse).to(device)
        dout_t = scatter_nd(outids.long(), dout_sparse_t,
                            list(out_dense.shape))
        dout_t = dout_t.permute(0, 4, 1, 2, 3).contiguous()
        out.backward(dout_t)
        out_ref.backward(dout_t)
        # Move the channel axis of the dense input gradient last, then pick
        # the rows addressed by the sparse indices so it can be compared with
        # the sparse input gradient.
        din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
                                                            1).contiguous()
        din_sparse = gather_nd(din_dense, indices_t.long())
        din = features_t.grad.detach()

        din_np = din.cpu().numpy()
        din_sparse_np = din_sparse.cpu().numpy()
        test_case.assertAllClose(din_np, din_sparse_np, atol=1e-4)


# Running this file as a script executes only the sparse-vs-dense convolution
# comparison; the other test functions in this file are not invoked here.
if __name__ == "__main__":
    test_spconv3d()