Commit bf2fd0c0 authored by Rick Ho

support private APIs of multiple PyTorch versions

parent 481f5c4f
 import torch
 import torch.nn as nn
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+from .utils import get_torch_default_comm

 class DistributedGroupedDataParallel(nn.Module):
@@ -17,9 +18,9 @@ class DistributedGroupedDataParallel(nn.Module):
         if dp_group is not None:
             self.comms['dp'] = dp_group
         else:
-            self.comms['dp'] = torch.distributed.distributed_c10d._get_default_group()
+            self.comms['dp'] = get_torch_default_comm()
         if world_group is None:
-            self.comms['world'] = torch.distributed.distributed_c10d._get_default_group()
+            self.comms['world'] = get_torch_default_comm()
         else:
             self.comms['world'] = world_group
...
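For context, a minimal usage sketch, not part of this commit: it assumes the class is importable as fmoe.distributed.DistributedGroupedDataParallel and that its constructor takes the wrapped module plus optional dp_group / world_group process groups, as the hunk above suggests. When world_group is omitted, the wrapper now resolves it through get_torch_default_comm() instead of calling the private _get_default_group() directly.

import torch.distributed as dist
import torch.nn as nn

from fmoe.distributed import DistributedGroupedDataParallel  # assumed import path

# Assumes a torchrun/env:// style launch so rank and world size are available.
dist.init_process_group(backend='nccl')
dp_group = dist.new_group(ranks=list(range(dist.get_world_size())))

model = nn.Linear(16, 16).cuda()
# world_group is left unset, so the wrapper falls back to get_torch_default_comm().
model = DistributedGroupedDataParallel(model, dp_group=dp_group)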
@@ -7,6 +7,7 @@ computation.
 import torch
 from torch.autograd import Function
 import fmoe_cuda
+from .utils import get_torch_default_comm

 def moe_prepare_forward(gate, num_expert, world_size, comm=None):
@@ -21,7 +22,7 @@ def moe_prepare_forward(gate, num_expert, world_size, comm=None):
         comm: the communicator of all workers in the expert-parallel group.
     """
     if comm is None:
-        comm = torch.distributed.distributed_c10d._get_default_group()
+        comm = get_torch_default_comm()
     if world_size > 1:
         fmoe_cuda.ensure_nccl(comm, gate)
...
import torch.distributed as dist


def get_torch_default_comm():
    r"""
    Return PyTorch's default communicator (process group), trying the
    private APIs of different PyTorch versions one by one.
    """
    try:
        # Newer PyTorch versions expose the default group via this helper.
        comm = dist.distributed_c10d._get_default_group()
        return comm
    except Exception as _:
        pass
    try:
        # Older versions keep the default group in a private module attribute.
        comm = dist.distributed_c10d._default_pg
        if comm is not None:
            return comm
    except Exception as _:
        pass
    raise RuntimeError('Unsupported PyTorch version')
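As a quick sanity check, not part of the commit: assuming the new file is the package's utils module referenced by the `from .utils import get_torch_default_comm` lines above (e.g. importable as fmoe.utils), the helper can be exercised with a single-process gloo group. The port number is arbitrary.

import torch.distributed as dist

from fmoe.utils import get_torch_default_comm  # assumed module path for the new file

# A single-process gloo group is enough to make the default group available.
dist.init_process_group(
    backend='gloo', init_method='tcp://127.0.0.1:29500', rank=0, world_size=1
)

comm = get_torch_default_comm()
print(type(comm))  # the concrete ProcessGroup type differs across PyTorch versions
dist.destroy_process_group()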