# Run this script with the following command:
#
#     mpiexec -n 2 python multiple_devices.py
#
# This script performs simple communication and computation with 2 MPI
# processes, each of which uses a different GPU.

import cupy
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

if size != 2:
    raise RuntimeError("run this script with 2 processes: mpiexec -n 2 ...")

device_count = cupy.cuda.runtime.getDeviceCount()
if device_count < 2:
    raise RuntimeError("this script requires 2 GPUs")

# Select the device based on the local MPI rank.
# Caveat: for simplicity we assume local_rank == rank here, which may or may
# not be the case depending on how the MPI processes are launched and how your
# code is written. For more robust usage, consult the user manual of your MPI
# library. For example (requires `import os`):
#     local_rank = int(os.getenv("OMPI_COMM_WORLD_LOCAL_RANK"))  # Open MPI
#     local_rank = int(os.getenv("MV2_COMM_WORLD_LOCAL_RANK"))   # MVAPICH2
local_rank = rank
cupy.cuda.Device(local_rank).use()

# send-recv
if rank == 0:
    arr = cupy.empty(100, dtype=cupy.int64)
    comm.Recv(arr, source=1, tag=87)
    assert (arr == cupy.arange(100).astype(cupy.int64)).all()
else:
    arr = cupy.arange(100).astype(cupy.int64)
    comm.Send(arr, dest=0, tag=87)

# allreduce
arr1 = cupy.empty(1000)
arr2 = cupy.random.random(1000)
arr_total = arr2.copy()
comm.Allreduce(MPI.IN_PLACE, arr_total)  # in-place reduction
if rank == 0:
    comm.Recv(arr1, source=1, tag=88)
    comm.Send(arr2, dest=1, tag=89)
else:
    comm.Send(arr2, dest=0, tag=88)
    comm.Recv(arr1, source=0, tag=89)
assert (arr1 + arr2 == arr_total).all()

print("process {}: finished".format(rank))