"""Map/reduce over several CUDA streams using CuPy.

Each "map" stream draws two random vectors and multiplies them; a separate
"reduce" stream waits on the map streams' events and sums the partial
results. The script prints the elapsed time and the memory pool's total
byte count before and after freeing the per-stream blocks.
"""
import time

import cupy

device = cupy.cuda.Device()
# Route all CuPy allocations through an explicit pool so its usage can be
# inspected and released below.
memory_pool = cupy.cuda.MemoryPool()
cupy.cuda.set_allocator(memory_pool.malloc)
rand = cupy.random.RandomState(seed=1)

n = 10
zs = []
stop_events = []
reduce_stream = cupy.cuda.stream.Stream()
map_streams = [cupy.cuda.stream.Stream() for _ in range(n)]

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Map
for stream in map_streams:
    with stream:
        x = rand.normal(size=(1, 1024**2))
        y = rand.normal(size=(1024**2, 1))
        z = cupy.matmul(x, y)
        zs.append(z)
    # Record an event on this stream so the reduce stream can wait on it.
    stop_event = stream.record()
    stop_events.append(stop_event)

# Block the `reduce_stream` until all events occur. This does not block host.
# This is not required when reduction is performed in the default (Stream.null)
# stream unless streams are created with `non_blocking=True` flag.
for stop_event in stop_events:
    reduce_stream.wait_event(stop_event)

# Reduce
with reduce_stream:
    z = sum(zs)

# Wait for the device to finish before reading the clock.
device.synchronize()
elapsed_time = time.perf_counter() - start_time
print('elapsed time', elapsed_time)
print('total bytes', memory_pool.total_bytes())

# Free all blocks in the memory pool of streams
for stream in map_streams:
    memory_pool.free_all_blocks(stream=stream)
print('total bytes', memory_pool.total_bytes())