# nvprof --print-gpu-trace python examples/stream/cupy_memcpy.py
import cupy
import numpy

# Create a pinned-memory pool and install its ``malloc`` method as CuPy's
# pinned-memory allocator.
pinned_memory_pool = cupy.cuda.PinnedMemoryPool()
cupy.cuda.set_pinned_memory_allocator(pinned_memory_pool.malloc)


def _pin_memory(array):
    """Return a copy of ``array`` that lives in pinned memory.

    A buffer of ``array.nbytes`` bytes is requested from
    ``cupy.cuda.alloc_pinned_memory``, viewed as a NumPy array with the
    same dtype and shape as ``array``, and filled with ``array``'s contents.
    """
    pinned_buf = cupy.cuda.alloc_pinned_memory(array.nbytes)
    flat = numpy.frombuffer(pinned_buf, dtype=array.dtype, count=array.size)
    pinned = flat.reshape(array.shape)
    pinned[...] = array
    return pinned


# Number of float32 elements in each source array.
SIZE = 1024 * 1024
# Source data for the copies: one array built with NumPy, one with CuPy.
x_cpu_src = numpy.arange(SIZE, dtype=numpy.float32)
x_gpu_src = cupy.arange(SIZE, dtype=numpy.float32)


# synchronous
# Allocate the destination before the timer starts so that, as in the
# asynchronous section below, only the two copies are measured.
x_gpu_dst = cupy.empty(x_cpu_src.shape, x_cpu_src.dtype)

stream = cupy.cuda.Stream.null
start = stream.record()
x_gpu_dst.set(x_cpu_src)     # host -> device, from an ordinary NumPy array
x_cpu_dst = x_gpu_src.get()  # device -> host, into a new NumPy array
end = stream.record()
# The end event must have completed before the elapsed time can be queried;
# otherwise the query may fail because the event is not ready yet.
end.synchronize()

print('Synchronous Device to Host / Host to Device (ms)')
print(cupy.cuda.get_elapsed_time(start, end))


# asynchronous
x_gpu_dst = cupy.empty(x_cpu_src.shape, x_cpu_src.dtype)
x_cpu_dst = numpy.empty(x_gpu_src.shape, x_gpu_src.dtype)

# Stage both host-side buffers in pinned memory.  x_cpu_dst is uninitialized,
# so for the destination only the shape and dtype of the pinned copy matter.
x_pinned_cpu_src = _pin_memory(x_cpu_src)
x_pinned_cpu_dst = _pin_memory(x_cpu_dst)

# Each copy is issued inside its own stream context (presumably so the two
# transfers can overlap -- confirm with the nvprof trace from the header).
with cupy.cuda.stream.Stream() as stream_htod:
    start = stream_htod.record()
    x_gpu_dst.set(x_pinned_cpu_src)
    with cupy.cuda.stream.Stream() as stream_dtoh:
        x_gpu_src.get(out=x_pinned_cpu_dst)
        stream_dtoh.synchronize()
    stream_htod.synchronize()
    end = stream_htod.record()
    # The end event is recorded after the stream syncs above, so wait for the
    # event itself before querying the elapsed time; otherwise the query may
    # fail because the event is not ready yet.
    end.synchronize()

print('Asynchronous Device to Host / Host to Device (ms)')
print(cupy.cuda.get_elapsed_time(start, end))