map_reduce.py 1.24 KB
Newer Older
root's avatar
root committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import cupy
import time

# Handle to the current device; synchronized later before the timer is read.
device = cupy.cuda.Device()

# Install a dedicated memory pool as the allocator so that its size can be
# reported at the end of the script.
memory_pool = cupy.cuda.MemoryPool()
cupy.cuda.set_allocator(memory_pool.malloc)

# Seeded generator used to produce the operands of every map task.
rand = cupy.random.RandomState(seed=1)

# Number of map tasks; each one gets its own stream.
n = 10
# `zs` collects the per-task results, `stop_events` the event recorded on
# each map stream once its work has been issued.
zs, stop_events = [], []
reduce_stream = cupy.cuda.stream.Stream()
map_streams = [cupy.cuda.stream.Stream() for _ in range(n)]

# Time the whole map/reduce pipeline. perf_counter() is a monotonic,
# high-resolution clock, so the measured interval is not affected by system
# clock adjustments the way a time.time() difference can be.
start_time = time.perf_counter()

# Map: on each stream, generate a (1, 1024**2) row and a (1024**2, 1) column
# and multiply them, keeping the resulting array for the reduce step.
for stream in map_streams:
    with stream:
        x = rand.normal(size=(1, 1024**2))
        y = rand.normal(size=(1024**2, 1))
        z = cupy.matmul(x, y)
        zs.append(z)
    # Record an event on this stream after its work has been issued.
    stop_events.append(stream.record())

# Block the `reduce_stream` until all events occur. This does not block host.
# This is not required when reduction is performed in the default (Stream.null)
# stream unless streams are created with `non_blocking=True` flag.
for stop_event in stop_events:
    reduce_stream.wait_event(stop_event)

# Reduce: add up the per-stream results on the reduce stream.
with reduce_stream:
    z = sum(zs)

# Synchronize the device before reading the clock so the elapsed time covers
# the submitted work rather than only the time spent issuing it.
device.synchronize()
elapsed_time = time.perf_counter() - start_time
print('elapsed time', elapsed_time)
# Pool size before anything is released.
print('total bytes', memory_pool.total_bytes())

# Ask the pool to free the blocks it holds for each of the map streams, then
# report the pool size again for comparison.
for map_stream in map_streams:
    memory_pool.free_all_blocks(stream=map_stream)
print('total bytes', memory_pool.total_bytes())