import gc

import numpy
import pytest

import cupy
from cupy._core import _routines_linalg as _linalg
from cupy import testing
from cupy.cuda import device
from cupy.cuda import cutensor as ct

if ct.available:
    from cupyx import cutensor


@testing.parameterize(
    {'dtype': numpy.float16, 'tol': 3e-3},
    {'dtype': numpy.float32, 'tol': 1e-6},
    {'dtype': numpy.float64, 'tol': 1e-12},
    {'dtype': numpy.complex64, 'tol': 1e-6},
    {'dtype': numpy.complex128, 'tol': 1e-12},
)
@pytest.mark.skipif(not ct.available, reason='cuTensor is unavailable')
class TestCuTensor:

    @pytest.fixture(autouse=True)
    def setUp(self):
        self.a = testing.shaped_random(
            (20, 40, 30), cupy, self.dtype, seed=0)
        self.b = testing.shaped_random(
            (40, 30, 20), cupy, self.dtype, seed=1)
        self.c = testing.shaped_random(
            (30, 20, 40), cupy, self.dtype, seed=2)
        self.mode_a = ('y', 'z', 'x')
        self.mode_b = ('z', 'x', 'y')
        self.mode_c = ('x', 'y', 'z')
        self.alpha = 1.1
        self.beta = 1.2
        self.gamma = 1.3
        self.a_transposed = self.a.transpose(2, 0, 1).copy()
        self.b_transposed = self.b.transpose(1, 2, 0).copy()
        self.c_transposed = self.c.copy()

    def test_elementwise_trinary(self):
        desc_a = cutensor.create_tensor_descriptor(self.a)
        desc_b = cutensor.create_tensor_descriptor(self.b)
        desc_c = cutensor.create_tensor_descriptor(self.c)

        d = cutensor.elementwise_trinary(
            self.alpha, self.a, desc_a, self.mode_a,
            self.beta, self.b, desc_b, self.mode_b,
            self.gamma, self.c, desc_c, self.mode_c)

        assert d.dtype == self.dtype
        testing.assert_allclose(
            self.alpha * self.a_transposed +
            self.beta * self.b_transposed +
            self.gamma * self.c_transposed,
            d,
            rtol=self.tol, atol=self.tol)

    def test_elementwise_trinary_out(self):
        out = testing.shaped_random(
            (30, 20, 40), cupy, self.dtype, seed=3)
        desc_a = cutensor.create_tensor_descriptor(self.a)
        desc_b = cutensor.create_tensor_descriptor(self.b)
        desc_c = cutensor.create_tensor_descriptor(self.c)

        d = cutensor.elementwise_trinary(
            self.alpha, self.a, desc_a, self.mode_a,
            self.beta, self.b, desc_b, self.mode_b,
            self.gamma, self.c, desc_c, self.mode_c, out=out)

        assert d is out
        testing.assert_allclose(
            self.alpha * self.a_transposed +
            self.beta * self.b_transposed +
            self.gamma * self.c,
            d,
            rtol=self.tol, atol=self.tol)

    def test_elementwise_binary(self):
        desc_a = cutensor.create_tensor_descriptor(self.a)
        desc_c = cutensor.create_tensor_descriptor(self.c)

        d = cutensor.elementwise_binary(
            self.alpha, self.a, desc_a, self.mode_a,
            self.gamma, self.c, desc_c, self.mode_c)

        assert d.dtype == self.dtype
        testing.assert_allclose(
            self.alpha * self.a_transposed +
            self.gamma * self.c_transposed,
            d,
            rtol=self.tol, atol=self.tol)

    def test_elementwise_binary_out(self):
        out = testing.shaped_random(
            (30, 20, 40), cupy, self.dtype, seed=3)
        desc_a = cutensor.create_tensor_descriptor(self.a)
        desc_c = cutensor.create_tensor_descriptor(self.c)

        d = cutensor.elementwise_binary(
            self.alpha, self.a, desc_a, self.mode_a,
            self.gamma, self.c, desc_c, self.mode_c, out=out)

        assert d is out
        testing.assert_allclose(
            self.alpha * self.a_transposed +
            self.gamma * self.c_transposed,
            d,
            rtol=self.tol, atol=self.tol)

    def test_contraction(self):
        compute_capability = int(device.get_compute_capability())
        if compute_capability < 70 and self.dtype == numpy.float16:
            pytest.skip('Not supported.')
        desc_a = cutensor.create_tensor_descriptor(self.a)
        desc_b = cutensor.create_tensor_descriptor(self.b)
        desc_c = cutensor.create_tensor_descriptor(self.c)

        d = cutensor.contraction(
            self.alpha, self.a, desc_a, self.mode_a,
            self.b, desc_b, self.mode_b,
            self.beta, self.c, desc_c, self.mode_c)

        assert self.c is d
        testing.assert_allclose(
            self.alpha * self.a_transposed * self.b_transposed +
            self.beta * self.c_transposed,
            d,
            rtol=self.tol, atol=self.tol)

    def test_reduction(self):
        if self.dtype == numpy.float16:
            pytest.skip('Not supported.')
        c = testing.shaped_random((30,), cupy, self.dtype, seed=2)
        c_orig = c.copy()
        desc_a = cutensor.create_tensor_descriptor(self.a)
        desc_c = cutensor.create_tensor_descriptor(c)

        d = cutensor.reduction(
            self.alpha, self.a, desc_a, self.mode_a,
            self.beta, c, desc_c, ('x',))

        assert c is d
        testing.assert_allclose(
            self.alpha * self.a_transposed.sum(axis=(1, 2)) +
            self.beta * c_orig,
            d,
            rtol=self.tol, atol=self.tol)
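

# Note on TestCuTensor.test_contraction above: with mode_a=('y', 'z', 'x'),
# mode_b=('z', 'x', 'y') and mode_c=('x', 'y', 'z'), every mode also appears
# in the output, so no index is summed over and the "contraction" degenerates
# to a transposed elementwise product; that is why the reference value is
# alpha * a_transposed * b_transposed + beta * c_transposed. A minimal sketch
# of the equivalence, using only helpers already exercised above (the
# function name is hypothetical and nothing here is called by the tests):
def _sketch_contraction_without_summed_modes():
    a = testing.shaped_random((2, 4, 3), cupy, numpy.float32, seed=0)
    b = testing.shaped_random((4, 3, 2), cupy, numpy.float32, seed=1)
    c = cupy.zeros((3, 2, 4), dtype=numpy.float32)
    cutensor.contraction(
        1.0, a, cutensor.create_tensor_descriptor(a), ('y', 'z', 'x'),
        b, cutensor.create_tensor_descriptor(b), ('z', 'x', 'y'),
        0.0, c, cutensor.create_tensor_descriptor(c), ('x', 'y', 'z'))
    # Same result as transposing both inputs into the output layout and
    # multiplying elementwise.
    cupy.testing.assert_allclose(
        c, a.transpose(2, 0, 1) * b.transpose(1, 2, 0), rtol=1e-6)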


@pytest.mark.skipif(not ct.available, reason='cuTensor is unavailable')
class TestMode:

    def test_create_mode_int(self):
        m = cutensor.create_mode(10, 11, 12)
        assert m.ndim == 3
        assert repr(m) == 'mode(10, 11, 12)'

    def test_create_mode_ascii(self):
        m = cutensor.create_mode('x', 'y')
        assert m.ndim == 2
        assert repr(m) == 'mode(120, 121)'

    def test_mode_compare(self):
        m1 = cutensor.create_mode(10, 11, 12)
        m2 = cutensor.create_mode(10, 11, 12)
        assert m1 == m2
        assert m1.data == m2.data  # cached
        m2 = cutensor.create_mode(12, 11, 10)
        assert m1 != m2
        assert m1.data != m2.data


@pytest.mark.skipif(not ct.available, reason='cuTensor is unavailable')
class TestScalar:

    def test_create(self):
        s = cutensor._Scalar(10, cupy.float32)
        assert repr(s) == 'scalar(10.0, dtype=float32)'
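

# The second argument of create_tensor_descriptor attaches a unary operator
# (ct.OP_SQRT, ct.OP_TANH, ...) that is applied elementwise to the operand
# before it enters the elementwise or reduction expression, so the reference
# computations in the class below wrap each operand in the matching cupy
# ufunc.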


@pytest.mark.skipif(not ct.available, reason='cuTensor is unavailable')
class TestCuTensorDescriptor:

    @pytest.fixture(autouse=True)
    def setUp(self):
        self.a = testing.shaped_random(
            (20, 40, 30), cupy, numpy.float32, seed=0)
        self.b = testing.shaped_random(
            (40, 30, 20), cupy, numpy.float32, seed=1)
        self.c = testing.shaped_random(
            (30, 20, 40), cupy, numpy.float32, seed=2)
        self.mode_a = ('y', 'z', 'x')
        self.mode_b = ('z', 'x', 'y')
        self.mode_c = ('x', 'y', 'z')
        self.alpha = 1.1
        self.beta = 1.2
        self.gamma = 1.3
        self.a_transposed = self.a.transpose(2, 0, 1).copy()
        self.b_transposed = self.b.transpose(1, 2, 0).copy()
        self.c_transposed = self.c.copy()

    def test_elementwise_trinary(self):
        desc_a = cutensor.create_tensor_descriptor(self.a, ct.OP_SQRT)
        desc_b = cutensor.create_tensor_descriptor(self.b, ct.OP_TANH)
        desc_c = cutensor.create_tensor_descriptor(self.c, ct.OP_COS)

        d = cutensor.elementwise_trinary(
            self.alpha, self.a, desc_a, self.mode_a,
            self.beta, self.b, desc_b, self.mode_b,
            self.gamma, self.c, desc_c, self.mode_c,
            op_AB=ct.OP_ADD, op_ABC=ct.OP_MUL)

        testing.assert_allclose(
            (self.alpha * cupy.sqrt(self.a_transposed) +
             self.beta * cupy.tanh(self.b_transposed)) *
            self.gamma * cupy.cos(self.c),
            d,
            rtol=1e-6, atol=1e-6)

    def test_elementwise_binary(self):
        desc_a = cutensor.create_tensor_descriptor(self.a, ct.OP_SIGMOID)
        desc_c = cutensor.create_tensor_descriptor(self.c, ct.OP_ABS)

        d = cutensor.elementwise_binary(
            self.alpha, self.a, desc_a, self.mode_a,
            self.gamma, self.c, desc_c, self.mode_c,
            op_AC=ct.OP_MUL)

        testing.assert_allclose(
            self.alpha * (1 / (1 + cupy.exp(-self.a_transposed))) *
            self.gamma * cupy.abs(self.c),
            d,
            rtol=1e-6, atol=1e-6)

    def test_reduction(self):
        c = testing.shaped_random((30,), cupy, numpy.float32, seed=2)
        c_orig = c.copy()
        desc_a = cutensor.create_tensor_descriptor(self.a, ct.OP_COS)
        desc_c = cutensor.create_tensor_descriptor(c, ct.OP_TANH)

        d = cutensor.reduction(
            self.alpha, self.a, desc_a, self.mode_a,
            self.beta, c, desc_c, ('x',), reduce_op=ct.OP_MAX)

        assert c is d
        testing.assert_allclose(
            self.alpha * cupy.cos(self.a_transposed).max(axis=(1, 2)) +
            self.beta * cupy.tanh(c_orig),
            d,
            rtol=1e-6, atol=1e-6)


@testing.parameterize(*testing.product({
    'dtype_combo': ['eee', 'fff', 'ddd', 'FFF', 'DDD', 'dDD', 'DdD'],
    'compute_type_hint': [None, 'down-convert', 'TF32'],
    # Keep the last two dims equal so the descriptor cache is exercised.
    'shape': [(40, 20, 20)],
    'alpha': [1.0],
    'beta': [0.0, 1.0],
}))
@pytest.mark.skipif(not ct.available, reason='cuTensor is unavailable')
class TestCuTensorContraction:
    _tol = {'e': 1e-3, 'f': 1e-6, 'd': 1e-12}

    def make_random_array(self, shape, dtype):
        return testing.shaped_random(shape, cupy, dtype=dtype, scale=1)

    def make_matrix(self, shape, dtype):
        r_dtype = dtype
        if dtype == numpy.complex64:
            r_dtype = numpy.float32
        elif dtype == numpy.complex128:
            r_dtype = numpy.float64
        a = self.make_random_array(shape, r_dtype)
        if dtype.char in 'FD':
            a = a + 1j * self.make_random_array(shape, r_dtype)
        return a

    @pytest.fixture(autouse=True)
    def setUp(self):
        compute_capability = int(device.get_compute_capability())
        if compute_capability < 70 and 'e' in self.dtype_combo:
            pytest.skip('Not supported.')
        dtype_chars = list(self.dtype_combo)
        self.a_dtype = numpy.dtype(dtype_chars[0])
        self.b_dtype = numpy.dtype(dtype_chars[1])
        self.c_dtype = numpy.dtype(dtype_chars[2])
        self.tol = self._tol[dtype_chars[2].lower()]
        self.compute_type = _linalg.COMPUTE_TYPE_DEFAULT
        if self.compute_type_hint == 'down-convert':
            if self.c_dtype.char in 'fF':
                self.compute_type = _linalg.COMPUTE_TYPE_FP16
                self.tol = self._tol['e']
            elif self.c_dtype.char in 'dD':
                self.compute_type = _linalg.COMPUTE_TYPE_FP32
                self.tol = self._tol['f']
        elif self.compute_type_hint == 'TF32':
            if self.c_dtype.char in 'fF':
                self.compute_type = _linalg.COMPUTE_TYPE_TF32
                self.tol = self._tol['e']
        m, n, k = self.shape
        self.a = self.make_matrix((m, k), self.a_dtype)
        self.b = self.make_matrix((k, n), self.b_dtype)
        self.c = self.make_matrix((m, n), self.c_dtype)
        self.c_ref = self.alpha * cupy.matmul(self.a, self.b)
        self.c_ref += self.beta * self.c
        old_compute_type = cupy._core.get_compute_type(self.c_dtype)
        cupy._core.set_compute_type(self.c_dtype, self.compute_type)
        yield
        cupy._core.set_compute_type(self.c_dtype, old_compute_type)

    def test_contraction(self):
        desc_a = cutensor.create_tensor_descriptor(self.a)
        desc_b = cutensor.create_tensor_descriptor(self.b)
        desc_c = cutensor.create_tensor_descriptor(self.c)
        mode_a = cutensor.create_mode('m', 'k')
        mode_b = cutensor.create_mode('k', 'n')
        mode_c = cutensor.create_mode('m', 'n')
        cutensor.contraction(self.alpha, self.a, desc_a, mode_a,
                             self.b, desc_b, mode_b,
                             self.beta, self.c, desc_c, mode_c)
        cupy.testing.assert_allclose(self.c, self.c_ref,
                                     rtol=self.tol, atol=self.tol)

        # Test the contraction descriptor cache (issues #7318, #7812).
        del mode_b
        gc.collect()
        mode_b = cutensor.create_mode('n', 'k')  # flipped
        self.c_ref = self.alpha * cupy.matmul(self.a, self.b.T)
        self.c_ref += self.beta * self.c
        cutensor.contraction(self.alpha, self.a, desc_a, mode_a,
                             self.b, desc_b, mode_b,
                             self.beta, self.c, desc_c, mode_c)
        cupy.testing.assert_allclose(self.c, self.c_ref,
                                     rtol=self.tol, atol=self.tol)
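

# Illustrative sketch, not part of the suite above (the helper name is
# hypothetical and nothing calls it): with the mode labels used in
# TestCuTensorContraction, cutensor.contraction computes a plain GEMM,
#     C[m, n] = alpha * sum_k A[m, k] * B[k, n] + beta * C[m, n],
# because 'k' appears in both inputs but not in the output and is therefore
# summed over.
def _sketch_contraction_as_gemm():
    a = testing.shaped_random((4, 3), cupy, numpy.float32, seed=0)
    b = testing.shaped_random((3, 5), cupy, numpy.float32, seed=1)
    c = cupy.zeros((4, 5), dtype=numpy.float32)
    cutensor.contraction(
        1.0, a, cutensor.create_tensor_descriptor(a),
        cutensor.create_mode('m', 'k'),
        b, cutensor.create_tensor_descriptor(b),
        cutensor.create_mode('k', 'n'),
        0.0, c, cutensor.create_tensor_descriptor(c),
        cutensor.create_mode('m', 'n'))
    # Should match the plain matrix product.
    cupy.testing.assert_allclose(c, a @ b, rtol=1e-6)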