import unittest

from numba.cuda.testing import CUDATestCase, skip_on_cudasim
from numba.tests.support import captured_stdout


@skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
class TestReduction(CUDATestCase):
    """
    Test shared memory reduction.

    Runs a sum-reduction kernel that stages its input in a shared memory
    array and compares the device result against a host-side sum.
    """

    def setUp(self):
        """Capture stdout so the example's ``print`` calls stay quiet."""
        # Prevent output from this test showing up when running the test suite
        self._captured_stdout = captured_stdout()
        self._captured_stdout.__enter__()
        # Undo the capture through a cleanup instead of tearDown(): unittest
        # still runs cleanups when setUp() raises or skips part-way through,
        # but it does not call tearDown() in that case, which would leave
        # the capture active for every test that runs afterwards.
        # No exception type, value, or traceback
        self.addCleanup(self._captured_stdout.__exit__, None, None, None)
        super().setUp()

    def test_ex_reduction(self):
        """
        Sum ``np.arange(1024)`` on the device and check the result.

        The ``ex_reduction.*.begin`` / ``ex_reduction.*.end`` comments
        delimit named regions of the example (presumably extracted by the
        documentation build -- keep the markers and the code between them
        intact).

        Note that the kernel calls ``cuda.syncthreads()`` inside the
        ``if tid < size`` branch. With the launch used here (one block of
        ``nelem`` threads over ``nelem`` elements) every thread in the
        block takes that branch, so all of them reach each barrier.
        """
        # ex_reduction.import.begin
        import numpy as np
        from numba import cuda
        from numba.types import int32
        # ex_reduction.import.end

        # ex_reduction.allocate.begin
        # generate data
        a = cuda.to_device(np.arange(1024))
        nelem = len(a)
        # ex_reduction.allocate.end

        # ex_reduction.kernel.begin
        @cuda.jit
        def array_sum(data):
            tid = cuda.threadIdx.x
            size = len(data)
            if tid < size:
                i = cuda.grid(1)

                # Declare an array in shared memory
                shr = cuda.shared.array(nelem, int32)
                shr[tid] = data[i]

                # Ensure writes to shared memory are visible
                # to all threads before reducing
                cuda.syncthreads()

                s = 1
                while s < cuda.blockDim.x:
                    if tid % (2 * s) == 0:
                        # Stride by `s` and add
                        shr[tid] += shr[tid + s]
                    s *= 2
                    cuda.syncthreads()

                # After the loop, the zeroth element contains the sum
                if tid == 0:
                    data[tid] = shr[tid]
        # ex_reduction.kernel.end

        # ex_reduction.launch.begin
        array_sum[1, nelem](a)
        print(a[0])                  # 523776
        print(sum(np.arange(1024)))  # 523776
        # ex_reduction.launch.end

        np.testing.assert_equal(a[0], sum(np.arange(1024)))


if __name__ == "__main__":
    # Allow running this file directly as a script; unittest.main() discovers
    # and runs the TestCase classes defined in this module.
    unittest.main()