Unverified commit 59c67318, authored by Prabhat Roy, committed by GitHub

Updated all_gather() to make use of all_gather_object() (#3857)

parent 3c47bfdf
 from collections import defaultdict, deque
 import datetime
-import pickle
+import errno
+import os
 import time

 import torch
 import torch.distributed as dist

-import errno
-import os


 class SmoothedValue(object):
     """Track a series of values and provide access to smoothed values over a
@@ -83,35 +81,8 @@ def all_gather(data):
     world_size = get_world_size()
     if world_size == 1:
         return [data]
-
-    # serialized to a Tensor
-    buffer = pickle.dumps(data)
-    storage = torch.ByteStorage.from_buffer(buffer)
-    tensor = torch.ByteTensor(storage).to("cuda")
-
-    # obtain Tensor size of each rank
-    local_size = torch.tensor([tensor.numel()], device="cuda")
-    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
-    dist.all_gather(size_list, local_size)
-    size_list = [int(size.item()) for size in size_list]
-    max_size = max(size_list)
-
-    # receiving Tensor from all ranks
-    # we pad the tensor because torch all_gather does not support
-    # gathering tensors of different shapes
-    tensor_list = []
-    for _ in size_list:
-        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
-    if local_size != max_size:
-        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
-        tensor = torch.cat((tensor, padding), dim=0)
-    dist.all_gather(tensor_list, tensor)
-
-    data_list = []
-    for size, tensor in zip(size_list, tensor_list):
-        buffer = tensor.cpu().numpy().tobytes()[:size]
-        data_list.append(pickle.loads(buffer))
-
+    data_list = [None] * world_size
+    dist.all_gather_object(data_list, data)
     return data_list
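
For reference, dist.all_gather_object (added in PyTorch 1.8) pickles an arbitrary Python object on each rank and gathers the results into a pre-sized list, which is what makes the manual serialize/pad/all_gather sequence in the removed lines unnecessary. Below is a minimal standalone sketch, not part of this commit, showing the same call outside torchvision; the gloo backend, the 127.0.0.1:29500 rendezvous address, and the worker/spawn scaffolding are illustrative assumptions for a single-machine, CPU-only run.

# Standalone illustration (assumed setup, not part of this commit): two local
# processes gather an arbitrary picklable object with dist.all_gather_object.
import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank, world_size):
    # Hypothetical single-machine rendezvous; gloo keeps the example CPU-only.
    dist.init_process_group(
        backend="gloo",
        init_method="tcp://127.0.0.1:29500",
        rank=rank,
        world_size=world_size,
    )
    # Any picklable object works, e.g. a per-rank dict of metrics.
    data = {"rank": rank, "loss": 0.1 * rank}
    data_list = [None] * world_size
    dist.all_gather_object(data_list, data)
    if rank == 0:
        print(data_list)  # one entry per rank, ordered by rank
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)

In the training scripts the default process group is assumed to be initialized elsewhere before all_gather is called, so the function itself only needs the two added lines plus the existing return.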