test_driver.py

import ctypes
import os
import threading

import numpy as np

import unittest
from numba.roc.hsadrv.driver import hsa, Queue, Program, Executable,\
                                    BrigModule, Context, dgpu_present

from numba.roc.hsadrv.driver import hsa as roc
import numba.roc.api as hsaapi
from numba import float32, float64, vectorize

from numba.roc.hsadrv import drvapi
from numba.roc.hsadrv import enums
from numba.roc.hsadrv import enums_ext

from numba.core import config

try:
    import queue
except ImportError:
    import Queue as queue

class TestLowLevelApi(unittest.TestCase):
    """This test checks that all the functions defined in drvapi
    bind properly using ctypes."""

    def test_functions_available(self):
        missing_functions = []
        for fname in drvapi.API_PROTOTYPES.keys():
            try:
                getattr(hsa, fname)
            except Exception as e:
                missing_functions.append("'{0}': {1}".format(fname, str(e)))

        self.assertEqual(len(missing_functions), 0,
                         msg='\n'.join(missing_functions))


class TestAgents(unittest.TestCase):
    def test_agents_init(self):
        self.assertGreater(len(roc.agents), 0)

    def test_agents_create_queue_single(self):
        for agent in roc.agents:
            if agent.is_component:
                # sugon: adapt for DTK
                # queue = agent.create_queue_single(2 ** 5)
                queue = agent.create_queue_multi(2 ** 6)
                self.assertIsInstance(queue, Queue)

    def test_agents_create_queue_multi(self):
        for agent in roc.agents:
            if agent.is_component:
                # sugon: adapt for DTK
                # queue = agent.create_queue_multi(2 ** 5)
                queue = agent.create_queue_multi(2 ** 6)
                self.assertIsInstance(queue, Queue)

    def test_agent_wavebits(self):
        for agent in roc.agents:
            if agent.is_component:
                if agent.name.decode() in ['gfx803', 'gfx900']:
                    self.assertEqual(agent.wavebits, 6)


class _TestBase(unittest.TestCase):
    def setUp(self):
        self.gpu = [a for a in roc.agents if a.is_component][0]
        self.cpu = [a for a in roc.agents if not a.is_component][0]
        self.queue = self.gpu.create_queue_multi(self.gpu.queue_max_size)

    def tearDown(self):
        del self.queue
        del self.gpu
        del self.cpu


def get_brig_file():
    path = os.path.join('/opt/rocm/hsa/sample/vector_copy_full.brig')
    assert os.path.isfile(path)
    return path

def _check_example_file():
    try:
        get_brig_file()
    except Exception:
        return False
    return True

has_brig_example = _check_example_file()


@unittest.skipUnless(has_brig_example, "Brig example not found")
class TestBrigModule(unittest.TestCase):
    def test_from_file(self):
        brig_file = get_brig_file()
        brig_module = BrigModule.from_file(brig_file)
        self.assertGreater(len(brig_module), 0)


@unittest.skipUnless(has_brig_example, "Brig example not found")
class TestProgram(_TestBase):
    def test_create_program(self):
        brig_file = get_brig_file()
        symbol = '&__vector_copy_kernel'
        brig_module = BrigModule.from_file(brig_file)
        program = Program()
        program.add_module(brig_module)
        code = program.finalize(self.gpu.isa)

        ex = Executable()
        ex.load(self.gpu, code)
        ex.freeze()

        sym = ex.get_symbol(self.gpu, symbol)
        self.assertGreater(sym.kernarg_segment_size, 0)


class TestMemory(_TestBase):
    def test_region_list(self):
        self.assertGreater(len(self.gpu.regions.globals), 0)
        self.assertGreater(len(self.gpu.regions.groups), 0)
        # The following maybe empty
        # print(self.gpu.regions.privates)
        # print(self.gpu.regions.readonlys)

    def test_register(self):
        src = np.random.random(1024).astype(np.float32)
        roc.hsa_memory_register(src.ctypes.data, src.nbytes)
        roc.hsa_memory_deregister(src.ctypes.data, src.nbytes)

    def test_allocate(self):
        regions = self.gpu.regions
        # More than one region
        self.assertGreater(len(regions), 0)
        # Find kernel argument regions
        kernarg_regions = list()
        for r in regions:
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG):
                kernarg_regions.append(r)

        self.assertGreater(len(kernarg_regions), 0)
        # Test allocating at the kernel argument region
        kernarg_region = kernarg_regions[0]
        nelem = 10
        ptr = kernarg_region.allocate(ctypes.sizeof(ctypes.c_float) * nelem)
        self.assertNotEqual(ctypes.addressof(ptr), 0,
                            "pointer must not be NULL")
        # Test writing to it
        src = np.random.random(nelem).astype(np.float32)
        ctypes.memmove(ptr, src.ctypes.data, src.nbytes)

        ref = (ctypes.c_float * nelem).from_address(ptr.value)
        for i in range(src.size):
            self.assertEqual(ref[i], src[i])
        roc.hsa_memory_free(ptr)

    @unittest.skipUnless(dgpu_present, "dGPU only")
    def test_coarse_grained_allocate(self):
        """
        Tests the coarse grained allocation works on a dGPU.
        It performs a data copying round trip via:
        memory
          |
        HSA cpu memory
          |
        HSA dGPU host accessible memory <---|
          |                                 |
        HSA dGPU memory --------------------|
        """
        gpu_regions = self.gpu.regions
        gpu_only_coarse_regions = list()
        gpu_host_accessible_coarse_regions = list()
        for r in gpu_regions:
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
                if r.host_accessible:
                    gpu_host_accessible_coarse_regions.append(r)
                else:
                    gpu_only_coarse_regions.append(r)

        # check we have 1+ coarse gpu region(s) of each type
        self.assertGreater(len(gpu_only_coarse_regions), 0)
        self.assertGreater(len(gpu_host_accessible_coarse_regions), 0)

        cpu_regions = self.cpu.regions
        cpu_coarse_regions = list()
        for r in cpu_regions:
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
                cpu_coarse_regions.append(r)
        # check we have 1+ coarse cpu region(s)
        self.assertGreater(len(cpu_coarse_regions), 0)

        # ten elements of data used
        nelem = 10

        # allocation
        cpu_region = cpu_coarse_regions[0]
        cpu_ptr = cpu_region.allocate(ctypes.sizeof(ctypes.c_float) * nelem)
        self.assertNotEqual(ctypes.addressof(cpu_ptr), 0,
                "pointer must not be NULL")

        gpu_only_region = gpu_only_coarse_regions[0]
        gpu_only_ptr = gpu_only_region.allocate(ctypes.sizeof(ctypes.c_float) *
                nelem)
        self.assertNotEqual(ctypes.addressof(gpu_only_ptr), 0,
                "pointer must not be NULL")

        gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0]
        gpu_host_accessible_ptr = gpu_host_accessible_region.allocate(
                ctypes.sizeof(ctypes.c_float) * nelem)
        self.assertNotEqual(ctypes.addressof(gpu_host_accessible_ptr), 0,
                "pointer must not be NULL")

        # Test writing to allocated area
        src = np.random.random(nelem).astype(np.float32)
        roc.hsa_memory_copy(cpu_ptr, src.ctypes.data, src.nbytes)
        roc.hsa_memory_copy(gpu_host_accessible_ptr, cpu_ptr, src.nbytes)
        roc.hsa_memory_copy(gpu_only_ptr, gpu_host_accessible_ptr, src.nbytes)

        # check write is correct
        cpu_ref = (ctypes.c_float * nelem).from_address(cpu_ptr.value)
        for i in range(src.size):
            self.assertEqual(cpu_ref[i], src[i])

        gpu_ha_ref = (ctypes.c_float * nelem).\
            from_address(gpu_host_accessible_ptr.value)
        for i in range(src.size):
            self.assertEqual(gpu_ha_ref[i], src[i])

        # zero out host accessible GPU memory and CPU memory
        z0 = np.zeros(nelem).astype(np.float32)
        roc.hsa_memory_copy(cpu_ptr, z0.ctypes.data, z0.nbytes)
        roc.hsa_memory_copy(gpu_host_accessible_ptr, cpu_ptr, z0.nbytes)

        # check zeroing is correct
        for i in range(z0.size):
            self.assertEqual(cpu_ref[i], z0[i])

        for i in range(z0.size):
            self.assertEqual(gpu_ha_ref[i], z0[i])

        # copy back the data from the GPU
        roc.hsa_memory_copy(gpu_host_accessible_ptr, gpu_only_ptr, src.nbytes)

        # check the copy back is ok
        for i in range(src.size):
            self.assertEqual(gpu_ha_ref[i], src[i])

        # free
        roc.hsa_memory_free(cpu_ptr)
        roc.hsa_memory_free(gpu_only_ptr)
        roc.hsa_memory_free(gpu_host_accessible_ptr)

    @unittest.skipUnless(has_brig_example, "Brig example not found")
    @unittest.skipUnless(dgpu_present, "dGPU only")
    @unittest.skip("Permanently skip? HSA spec violation causes corruption")
    def test_coarse_grained_kernel_execution(self):
        """
        This tests the execution of a kernel on a dGPU using coarse memory
        regions for the buffers.
        NOTE: the code violates the HSA spec in that it uses a coarse region
        for kernargs, this is a performance hack.
        """

        from numba.roc.hsadrv.driver import BrigModule, Program, hsa,\
                Executable

        # get a brig file
        brig_file = get_brig_file()
        brig_module = BrigModule.from_file(brig_file)
        self.assertGreater(len(brig_module), 0)

        # use existing GPU regions for computation space
        gpu_regions = self.gpu.regions
        gpu_only_coarse_regions = list()
        gpu_host_accessible_coarse_regions = list()
        for r in gpu_regions:
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
                if r.host_accessible:
                    gpu_host_accessible_coarse_regions.append(r)
                else:
                    gpu_only_coarse_regions.append(r)

        # check we have 1+ coarse gpu region(s) of each type
        self.assertGreater(len(gpu_only_coarse_regions), 0)
        self.assertGreater(len(gpu_host_accessible_coarse_regions), 0)

        # Compilation phase:

        # FIXME: this is dubious, assume launching agent is indexed first
        agent = roc.components[0]

        prog = Program()
        prog.add_module(brig_module)

        # get kernel and load
        code = prog.finalize(agent.isa)

        ex = Executable()
        ex.load(agent, code)
        ex.freeze()

        # extract symbols
        sym = ex.get_symbol(agent, "&__vector_copy_kernel")
        self.assertNotEqual(sym.kernel_object, 0)
        self.assertGreater(sym.kernarg_segment_size, 0)

        # attempt kernel execution
        import ctypes
        import numpy as np

        # Do memory allocations

        # allocate and initialise memory
        nelem = 1024 * 1024

        src = np.random.random(nelem).astype(np.float32)
        z0 = np.zeros_like(src)

        # alloc host accessible memory
        nbytes = ctypes.sizeof(ctypes.c_float) * nelem
        gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0]
        host_in_ptr = gpu_host_accessible_region.allocate(nbytes)
        self.assertNotEqual(host_in_ptr.value, None,
                "pointer must not be NULL")
        host_out_ptr = gpu_host_accessible_region.allocate(nbytes)
        self.assertNotEqual(host_out_ptr.value, None,
                "pointer must not be NULL")

        # init mem with data
        roc.hsa_memory_copy(host_in_ptr, src.ctypes.data, src.nbytes)
        roc.hsa_memory_copy(host_out_ptr, z0.ctypes.data, z0.nbytes)

        # alloc gpu only memory
        gpu_only_region = gpu_only_coarse_regions[0]
        gpu_in_ptr = gpu_only_region.allocate(nbytes)
        self.assertNotEqual(gpu_in_ptr.value, None, "pointer must not be NULL")
        gpu_out_ptr = gpu_only_region.allocate(nbytes)
        self.assertNotEqual(gpu_out_ptr.value, None,
            "pointer must not be NULL")

        # copy memory from host accessible location to gpu only
        roc.hsa_memory_copy(gpu_in_ptr, host_in_ptr, src.nbytes)

        # Do kernargs

        # Find a coarse region (for better performance on dGPU) in which
        # to place kernargs. NOTE: This violates the HSA spec
        kernarg_regions = list()
        for r in gpu_host_accessible_coarse_regions:
           # NOTE: VIOLATION
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG):
                kernarg_regions.append(r)
        self.assertGreater(len(kernarg_regions), 0)

        # use first region for args
        kernarg_region = kernarg_regions[0]

        kernarg_ptr = kernarg_region.allocate(
                2 * ctypes.sizeof(ctypes.c_void_p))

        self.assertNotEqual(kernarg_ptr, None, "pointer must not be NULL")

        # wire in gpu memory
        argref = (2 * ctypes.c_size_t).from_address(kernarg_ptr.value)
        argref[0] = gpu_in_ptr.value
        argref[1] = gpu_out_ptr.value

        # signal
        sig = roc.create_signal(1)

        # create queue and dispatch job

        queue = agent.create_queue_single(32)
        queue.dispatch(sym, kernarg_ptr, workgroup_size=(256, 1, 1),
                           grid_size=(nelem, 1, 1),signal=None)

        # copy result back to host accessible memory to check
        roc.hsa_memory_copy(host_out_ptr, gpu_out_ptr, src.nbytes)

        # check the data is recovered
        ref = (nelem * ctypes.c_float).from_address(host_out_ptr.value)
        np.testing.assert_equal(ref, src)

        # free
        roc.hsa_memory_free(host_in_ptr)
        roc.hsa_memory_free(host_out_ptr)
        roc.hsa_memory_free(gpu_in_ptr)
        roc.hsa_memory_free(gpu_out_ptr)


class TestContext(_TestBase):
    """Tests the Context class behaviour is correct."""

    def test_memalloc(self):
        """
            Tests Context.memalloc() for a given, in the parlance of HSA,\
            `component`. Testing includes specialisations for the supported
            components of dGPUs and APUs.
        """
        n = 10 # things to alloc
        nbytes = ctypes.sizeof(ctypes.c_double) * n

        # run if a dGPU is present
        if dgpu_present:
            # find a host accessible region
            dGPU_agent = self.gpu
            CPU_agent = self.cpu
            gpu_ctx = Context(dGPU_agent)
            gpu_only_mem = gpu_ctx.memalloc(nbytes, hostAccessible=False)
            ha_mem = gpu_ctx.memalloc(nbytes, hostAccessible=True)

            # on dGPU systems, all host mem is host accessible
            cpu_ctx = Context(CPU_agent)
            cpu_mem = cpu_ctx.memalloc(nbytes, hostAccessible=True)

            # Test writing to allocated area
            src = np.random.random(n).astype(np.float64)
            roc.hsa_memory_copy(cpu_mem.device_pointer, src.ctypes.data, src.nbytes)
            roc.hsa_memory_copy(ha_mem.device_pointer, cpu_mem.device_pointer, src.nbytes)
            roc.hsa_memory_copy(gpu_only_mem.device_pointer, ha_mem.device_pointer, src.nbytes)

            # clear
            z0 = np.zeros_like(src)
            roc.hsa_memory_copy(ha_mem.device_pointer, z0.ctypes.data, z0.nbytes)
            ref = (n * ctypes.c_double).from_address(ha_mem.device_pointer.value)
            for k in range(n):
                self.assertEqual(ref[k], 0)

            # copy back from dGPU
            roc.hsa_memory_copy(ha_mem.device_pointer, gpu_only_mem.device_pointer, src.nbytes)
            for k in range(n):
                self.assertEqual(ref[k], src[k])

        else: #TODO: write APU variant
            pass

    def check_mempools(self, agent, has_fine_grain=True):
        # get allocation-allowed pools
        mp_alloc_list = [mp for mp in agent.mempools if mp.alloc_allowed]
        mpdct = {'global': [], 'readonly': [], 'private': [], 'group': []}

        for mp in mp_alloc_list:
            mpdct[mp.kind].append(mp)

        # only globals are allocation-allowed
        if has_fine_grain:
            self.assertEqual(len(mpdct['global']), 2)
        else:
            self.assertEqual(len(mpdct['global']), 1)
        self.assertEqual(len(mpdct['readonly']), 0)
        self.assertEqual(len(mpdct['private']), 0)
        self.assertEqual(len(mpdct['group']), 0)

        self.assertEqual(len(agent.mempools.globals), len(mpdct['global']))

        # the global-pools are coarse-grain and fine-grain pools
        glbs = mpdct['global']
        coarsegrain = None
        finegrain = None
        for gmp in glbs:
            if gmp.supports(enums_ext.HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED):
                coarsegrain = gmp
            if gmp.supports(enums_ext.HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED):
                finegrain = gmp

        self.assertIsNotNone(coarsegrain)
        if has_fine_grain:
            self.assertIsNotNone(finegrain)
        else:
            self.assertIsNone(finegrain)
        self.assertIsNot(coarsegrain, finegrain)

    def test_cpu_mempool_property(self):
        self.check_mempools(self.cpu)

    @unittest.skipUnless(dgpu_present, "dGPU only")
    def test_gpu_mempool_property(self):
        self.check_mempools(self.gpu, has_fine_grain=False)

    @unittest.skipUnless(dgpu_present, "dGPU only")
    def test_mempool(self):
        n = 10 # things to alloc
        nbytes = ctypes.sizeof(ctypes.c_double) * n

        dGPU_agent = self.gpu
        CPU_agent = self.cpu

        # allocate a GPU memory pool
        gpu_ctx = Context(dGPU_agent)
        gpu_only_mem = gpu_ctx.mempoolalloc(nbytes)

        # allocate a CPU memory pool, allow the GPU access to it
        cpu_ctx = Context(CPU_agent)
        cpu_mem = cpu_ctx.mempoolalloc(nbytes, allow_access_to=[gpu_ctx.agent])

        ## Test writing to allocated area
        src = np.random.random(n).astype(np.float64)
        roc.hsa_memory_copy(cpu_mem.device_pointer, src.ctypes.data, src.nbytes)
        roc.hsa_memory_copy(gpu_only_mem.device_pointer, cpu_mem.device_pointer, src.nbytes)


        # clear
        z0 = np.zeros_like(src)
        roc.hsa_memory_copy(cpu_mem.device_pointer, z0.ctypes.data, z0.nbytes)
        ref = (n * ctypes.c_double).from_address(cpu_mem.device_pointer.value)
        for k in range(n):
            self.assertEqual(ref[k], 0)

        # copy back from dGPU
        roc.hsa_memory_copy(cpu_mem.device_pointer, gpu_only_mem.device_pointer, src.nbytes)
        for k in range(n):
            self.assertEqual(ref[k], src[k])

    def check_mempool_with_flags(self, finegrain):
        dGPU_agent = self.gpu
        gpu_ctx = Context(dGPU_agent)

        CPU_agent = self.cpu
        cpu_ctx = Context(CPU_agent)

        # get mempool with specific flags
        cpu_ctx.mempoolalloc(1024, allow_access_to=[gpu_ctx._agent])

    @unittest.skipUnless(dgpu_present, 'dGPU only')
    def test_mempool_finegrained(self):
        self.check_mempool_with_flags(finegrain=True)

    @unittest.skipUnless(dgpu_present, 'dGPU only')
    def test_mempool_coarsegrained(self):
        self.check_mempool_with_flags(finegrain=False)

    @unittest.skipUnless(dgpu_present, 'dGPU only')
    def test_mempool_amd_example(self):
        dGPU_agent = self.gpu
        gpu_ctx = Context(dGPU_agent)
        CPU_agent = self.cpu
        cpu_ctx = Context(CPU_agent)

        kNumInt = 1024
        kSize = kNumInt * ctypes.sizeof(ctypes.c_int)

        dependent_signal = roc.create_signal(0)
        completion_signal = roc.create_signal(0)

        ## allocate host src and dst, allow gpu access
        flags = dict(allow_access_to=[gpu_ctx.agent], finegrain=False)
        host_src = cpu_ctx.mempoolalloc(kSize, **flags)
        host_dst = cpu_ctx.mempoolalloc(kSize, **flags)

        # there's a loop in `i` here over GPU hardware
        i = 0

        # get gpu local pool
        local_memory = gpu_ctx.mempoolalloc(kSize)

        host_src_view = (kNumInt * ctypes.c_int).from_address(host_src.device_pointer.value)
        host_dst_view = (kNumInt * ctypes.c_int).from_address(host_dst.device_pointer.value)

        host_src_view[:] = i + 2016 + np.arange(0, kNumInt, dtype=np.int32)
        host_dst_view[:] = np.zeros(kNumInt, dtype=np.int32)

        # print("GPU: %s"%gpu_ctx._agent.name)
        # print("CPU: %s"%cpu_ctx._agent.name)

        roc.hsa_signal_store_relaxed(completion_signal, 1);

        q = queue.Queue()

        class validatorThread(threading.Thread):
            def run(self):
                # sugon: adapt for DTK
                val = roc.hsa_signal_wait_scacquire(
                    completion_signal,
                    enums.HSA_SIGNAL_CONDITION_EQ,
                    0,
                    ctypes.c_uint64(-1),
                    enums.HSA_WAIT_STATE_ACTIVE)

                q.put(val)  # wait_res

        # this could be a call on the signal itself dependent_signal.store_relaxed(1)
        roc.hsa_signal_store_relaxed(dependent_signal, 1);

        h2l_start = threading.Semaphore(value=0)

        class l2hThread(threading.Thread):
            def run(self):
                dep_signal = drvapi.hsa_signal_t(dependent_signal._id)
                roc.hsa_amd_memory_async_copy(host_dst.device_pointer.value,
                                        cpu_ctx._agent._id,
                                        local_memory.device_pointer.value,
                                        gpu_ctx._agent._id, kSize, 1,
                                        ctypes.byref(dep_signal),
                                        completion_signal)
                h2l_start.release()  # signal h2l to start

        class h2lThread(threading.Thread):
            def run(self):
                h2l_start.acquire()  # to wait until l2h thread has started
                roc.hsa_amd_memory_async_copy(local_memory.device_pointer.value,
                                            gpu_ctx._agent._id,
                                            host_src.device_pointer.value,
                                            cpu_ctx._agent._id, kSize, 0,
                                            None,
                                            dependent_signal)

        timeout = 10  # 10 seconds timeout
        # # init thread instances
        validator = validatorThread()
        l2h = l2hThread()
        h2l = h2lThread()
        # run them
        validator.start()
        l2h.start()
        h2l.start()
        # join
        l2h.join(timeout)
        h2l.join(timeout)
        validator.join(timeout)
        # verify
        wait_res = q.get()
        self.assertEqual(wait_res, 0)
        np.testing.assert_allclose(host_dst_view, host_src_view)

    @unittest.skipUnless(dgpu_present, "dGPU only")
    def test_to_device_to_host(self):
        """
            Tests .to_device() and .copy_to_host()
        """
        n  = 10
        data = np.zeros(n)
        output = np.zeros(n)
        @vectorize("float64(float64)", target='roc')
        def func(x):
            return x + 1

        hsaapi.to_device(data)
        out_device = hsaapi.to_device(output)
        func(data, out=out_device)
        host_output = out_device.copy_to_host()
        np.testing.assert_equal(np.ones(n), host_output)


if __name__ == '__main__':
    unittest.main()