Unverified Commit 59c67318 authored by Prabhat Roy's avatar Prabhat Roy Committed by GitHub
Browse files

Updated all_gather() to make use of all_gather_object() (#3857)

parent 3c47bfdf
from collections import defaultdict, deque from collections import defaultdict, deque
import datetime import datetime
import pickle import errno
import os
import time import time
import torch import torch
import torch.distributed as dist import torch.distributed as dist
import errno
import os
class SmoothedValue(object): class SmoothedValue(object):
"""Track a series of values and provide access to smoothed values over a """Track a series of values and provide access to smoothed values over a
...@@ -83,35 +81,8 @@ def all_gather(data): ...@@ -83,35 +81,8 @@ def all_gather(data):
world_size = get_world_size() world_size = get_world_size()
if world_size == 1: if world_size == 1:
return [data] return [data]
data_list = [None] * world_size
# serialized to a Tensor dist.all_gather_object(data_list, data)
buffer = pickle.dumps(data)
storage = torch.ByteStorage.from_buffer(buffer)
tensor = torch.ByteTensor(storage).to("cuda")
# obtain Tensor size of each rank
local_size = torch.tensor([tensor.numel()], device="cuda")
size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
dist.all_gather(size_list, local_size)
size_list = [int(size.item()) for size in size_list]
max_size = max(size_list)
# receiving Tensor from all ranks
# we pad the tensor because torch all_gather does not support
# gathering tensors of different shapes
tensor_list = []
for _ in size_list:
tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
if local_size != max_size:
padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
tensor = torch.cat((tensor, padding), dim=0)
dist.all_gather(tensor_list, tensor)
data_list = []
for size, tensor in zip(size_list, tensor_list):
buffer = tensor.cpu().numpy().tobytes()[:size]
data_list.append(pickle.loads(buffer))
return data_list return data_list
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment