Commit d155925d authored by Jiezhong Qiu

Merge remote-tracking branch 'origin/master' into checkpoint

Conflicts:
	fmoe/transformer.py
Parents: f49a9ec4, 98b4b0be
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''
---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Compile with "..."
2. Run "..." with "..." processes on "..." nodes

**Expected behavior**
A clear and concise description of what you expected to happen.

**Logs**
If applicable, add logs to help explain your problem.

**Platform**
- Device: [e.g. NVIDIA V100]
- OS: [e.g. Debian 10.2 buster]
- CUDA version: [e.g. 11.1]
- NCCL version: [e.g. 2.7.8-1]

**Additional context**
Add any other context about the problem here.

---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''
---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
@@ -10,3 +10,4 @@ a.out
 build
 *swp
 logs
+dist
@@ -117,15 +117,29 @@ public:
     ncclComm_t getcomm(at::Device dev) {
         auto key = std::to_string(dev.index());
 #ifdef ENABLE_NCCL_P2P_SUPPORT
-        auto v = getNCCLComm(key, {dev}, c10d::OpType::ALLTOALL);
+        ncclUniqueId ncclID;
+        int rank = getRank();
+        if (rank == 0) {
+            ncclGetUniqueId(&ncclID);
+        }
+        broadcastUniqueNCCLID(&ncclID,
+                c10d::OpType::SEND,
+                "fastmoe_nccl_comm",
+                rank);
+        ncclComm_t comm;
+        ncclCommInitRank(&comm, getSize(), ncclID, rank);
+        return comm;
 #else
         auto v = getNCCLComm(key, {dev});
-#endif
         if (v.size() == 0) {
             std::cerr << "PyTorch has nothing\n";
             return 0;
         }
+        int count;
+        ncclCommCount(v[0]->getNcclComm(), &count);
+        std::cerr << "PyTorch has " << v.size() << " comms, comm 0 size " << count << "\n";
         return v[0]->getNcclComm();
+#endif
     }
 };
...
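For context on the P2P branch above: instead of borrowing one of PyTorch's existing NCCL communicators, it bootstraps a dedicated one. Rank 0 generates a `ncclUniqueId`, `broadcastUniqueNCCLID` ships it to every rank, and all ranks then enter `ncclCommInitRank` together. Below is a minimal sketch of the same rendezvous pattern at the Python level, using `torch.distributed` purely for illustration; `make_comm_id` and `init_with_id` are hypothetical stand-ins for the C++ calls.

```python
import torch.distributed as dist

def bootstrap_comm(make_comm_id, init_with_id):
    """Sketch of the ID-broadcast rendezvous used above.

    make_comm_id: rank-0-only factory for an opaque communicator ID
                  (plays the role of ncclGetUniqueId).
    init_with_id: collective join (plays the role of ncclCommInitRank).
    """
    rank, world_size = dist.get_rank(), dist.get_world_size()
    # Only rank 0 creates the ID; the one-element list is the broadcast buffer.
    holder = [make_comm_id() if rank == 0 else None]
    # Deliver the ID to every rank over an already-initialized process group.
    dist.broadcast_object_list(holder, src=0)
    # All ranks join the new communicator collectively.
    return init_with_id(holder[0], world_size, rank)
```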
+## v0.1.2
+### Compilation
+- Remove dependency on the CUDA examples repository.
+### Distributed
+- Fix a bug related to PyTorch v1.8.0. FastMoE can now operate on multiple GPUs
+  on multiple nodes with PyTorch v1.8.0.
+### Misc
+- Fix tons of typos.
+- Format the code.
 ## v0.1.1
 ### Distributed
...
@@ -47,7 +47,8 @@ class DistributedGroupedDataParallel(nn.Module):
         else:
             self.comms["world"] = world_group
-        def allreduce_params(no_scale=False, reduce_after=False, fp32_allreduce=False):
+        def allreduce_params(no_scale=False,
+                reduce_after=False, fp32_allreduce=False):
             groups = dict()
             for p in self.module.parameters():
                 if not p.requires_grad or p.grad is None:
...
@@ -40,7 +40,8 @@ def moe_prepare_forward(gate, num_expert, world_size, comm=None):
         )
     else:
         global_expert_count = local_expert_count
-    fwd_expert_count = global_expert_count.view(world_size, num_expert).sum(dim=0)
+    fwd_expert_count = global_expert_count.view(world_size,
+            num_expert).sum(dim=0)
     fwd_batch_size = int(fwd_expert_count.sum().item())
     return (
         pos,
...
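To make the reflowed line above concrete: `global_expert_count` holds one token count per (worker, expert) pair, and `view(world_size, num_expert).sum(dim=0)` collapses the worker axis so each local expert learns its total incoming batch. A tiny worked example with invented counts:

```python
import torch

world_size, num_expert = 2, 2
# Tokens each remote worker sends to each local expert (flattened, illustrative).
global_expert_count = torch.tensor([3, 1, 2, 4])

fwd_expert_count = global_expert_count.view(world_size, num_expert).sum(dim=0)
print(fwd_expert_count)                     # tensor([5, 5]): tokens per local expert
print(int(fwd_expert_count.sum().item()))   # 10 == fwd_batch_size
```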
@@ -23,7 +23,8 @@ class ZeroGate(nn.Module):
         idx = torch.zeros(
             inp.shape[0] * self.top_k, dtype=torch.int64, device=inp.device
         )
-        score = torch.ones(inp.shape[0] * self.top_k, device=inp.device) / self.top_k
+        score = torch.ones(inp.shape[0] * self.top_k,
+                device=inp.device) / self.top_k
         return idx, score.reshape(-1, 1, self.top_k)
...
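For orientation, the hunk above shows that `ZeroGate` routes every token to expert 0 with a uniform weight of `1 / top_k`. A quick shape check, sketched with `top_k = 2` and a batch of 3 tokens:

```python
import torch

top_k, batch = 2, 3
idx = torch.zeros(batch * top_k, dtype=torch.int64)   # every token -> expert 0
score = torch.ones(batch * top_k) / top_k             # uniform 1/top_k weights
print(idx.shape)                                      # torch.Size([6])
print(score.reshape(-1, 1, top_k).shape)              # torch.Size([3, 1, 2])
```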
@@ -114,7 +114,8 @@ def _fmoe_general_global_forward(inp, gate, expert_fn, num_expert, world_size):
         fwd_batch_size,
     ) = moe_prepare_forward(gate, num_expert, world_size)
     x = MOEScatter.apply(
-        inp, pos, local_expert_count, global_expert_count, fwd_batch_size, world_size
+        inp, pos,
+        local_expert_count, global_expert_count, fwd_batch_size, world_size
     )
     x = expert_fn(x, fwd_expert_count)
     x = MOEGather.apply(
@@ -165,7 +166,8 @@ class FMoE(nn.Module):
         self.top_k = top_k
         self.gate = gate(d_model, num_expert, world_size, top_k)
         if expert is not None:
-            self.experts = nn.ModuleList([expert(d_model) for _ in range(num_expert)])
+            self.experts = nn.ModuleList([expert(d_model)
+                                          for _ in range(num_expert)])
             self.experts_fused = False
         else:
             self.experts_fused = True
...
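The second hunk shows `FMoE`'s two expert paths: given an `expert` constructor it builds one module per local expert via `nn.ModuleList`, otherwise it falls back to the fused path. A minimal sketch of supplying a custom expert; the keyword names and import below are inferred from this diff, not checked against the full `FMoE` signature:

```python
import torch.nn as nn
from fmoe import FMoE  # assumed export, matching the imports seen elsewhere in this diff

class TinyExpert(nn.Module):
    """Toy expert: FMoE calls expert(d_model) once per local expert."""
    def __init__(self, d_model):
        super().__init__()
        self.fc = nn.Linear(d_model, d_model)

    def forward(self, inp, fwd_expert_count=None):
        return self.fc(inp)

# Builds nn.ModuleList([TinyExpert(16) for _ in range(4)]) internally,
# as in the hunk above, and sets experts_fused = False.
moe = FMoE(num_expert=4, d_model=16, expert=TinyExpert)
```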
@@ -45,7 +45,7 @@ def _megatron_init_method(self, rng, sigma):
     device = self.weight.device
     dtype = self.weight.dtype
     weight = rng.normal(loc=0.0, scale=sigma, size=tuple(self.weight.size()))
-    self.weight.data = torch.tensor(weight, dtype=dtype, device=device)
+    self.weight.data = torch.from_numpy(weight).to(dtype=dtype, device=device)
     if self.bias is not None:
         # Always initialize bias to zero.
@@ -64,13 +64,13 @@ def _random_init_weight(self, rng):
     device = self.weight.device
     dtype = self.weight.dtype
     weight = rng.uniform(-bound, bound, size=tuple(self.weight.size()))
-    self.weight.data = torch.tensor(weight, dtype=dtype, device=device)
+    self.weight.data = torch.from_numpy(weight).to(dtype=dtype, device=device)
     if self.bias is not None:
         fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight[0])
         bound = 1 / math.sqrt(fan_in)
         bias = rng.uniform(-bound, bound, size=tuple(self.bias.size()))
-        self.bias.data = torch.tensor(bias, dtype=dtype, device=device)
+        self.bias.data = torch.from_numpy(bias).to(dtype=dtype, device=device)
 class MegatronMLP(FMoETransformerMLP):
@@ -81,7 +81,8 @@ class MegatronMLP(FMoETransformerMLP):
     def __init__(self, args, group):
         assert (
-            args.seq_length * args.micro_batch_size % args.tensor_model_parallel_size
+            args.seq_length * args.micro_batch_size
+            % args.tensor_model_parallel_size
             == 0
         ), "Batch size x sequence length should be multiple of mp size"
         if not args.distributed_experts:
...
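The repeated `torch.tensor(...)` to `torch.from_numpy(...).to(...)` substitution above trades an always-copying constructor for a zero-copy wrap of the NumPy buffer followed by one explicit dtype/device conversion. A quick equivalence sketch, with shape and scale invented for the example:

```python
import numpy as np
import torch

rng = np.random.default_rng(0)
weight = rng.normal(loc=0.0, scale=0.02, size=(4, 4))  # float64 ndarray

# Old style: torch.tensor always copies the NumPy data.
w_old = torch.tensor(weight, dtype=torch.float32)

# New style: from_numpy shares the buffer; .to() then converts dtype
# (and would move to a device) in one explicit step.
w_new = torch.from_numpy(weight).to(dtype=torch.float32)

assert torch.equal(w_old, w_new)  # identical values either way
```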
@@ -15,10 +15,8 @@ class _Expert(nn.Module):
     def __init__(self, num_expert, d_model, d_hidden, activation, rank=0):
         super().__init__()
-        self.htoh4 = FMoELinear(num_expert, d_model, d_hidden,
-                bias=True, rank=rank)
-        self.h4toh = FMoELinear(num_expert, d_hidden, d_model,
-                bias=True, rank=rank)
+        self.htoh4 = FMoELinear(num_expert, d_model, d_hidden, bias=True, rank=rank)
+        self.h4toh = FMoELinear(num_expert, d_hidden, d_model, bias=True, rank=rank)
         self.activation = activation
     def forward(self, inp, fwd_expert_count):
...
@@ -14,7 +14,7 @@ if os.environ.get('USE_NCCL', '0') == '1':
 if __name__ == '__main__':
     setuptools.setup(
         name='fastmoe',
-        version='0.1.1',
+        version='0.1.2',
         description='An efficient Mixture-of-Experts system for PyTorch',
         author='Jiaao He, Jiezhong Qiu and Aohan Zeng',
         author_email='hja20@mails.tsinghua.edu.cn',
...
+import torch
+import torch.nn as nn
 from fmoe import FMoETransformerMLP
 from fmoe.gates import NaiveGate
 from moe import BruteForceMoELinear
-import torch
-import torch.nn as nn
 import time
 import sys
 import os
...
@@ -25,7 +25,10 @@ PYTHON_VERSION=$($PYTHON_EXEC --version)
 PYTHON_REVISION=${PYTHON_VERSION:7:3}
 SCRIPT_PATH=$(dirname $(dirname $(realpath $0)))
-source ~/scripts/torch.env
 export PYTHONPATH=$SCRIPT_PATH:$SCRIPT_PATH/build/lib.linux-x86_64-$PYTHON_REVISION:$PYTHONPATH
+export LD_LIBRARY_PATH=/home/laekov/.local/lib/python$PYTHON_REVISION/site-packages/torch/lib:$LD_LIBRARY_PATH
-exec $PYTHON_EXEC $@ 2>logs/$RANK.log
+core0=$(expr $OMPI_COMM_WORLD_LOCAL_RANK \* 4)
+cores=$core0-$(expr $core0 + 3)
+exec numactl -C $cores $PYTHON_EXEC $@ 2>logs/$RANK.log
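Reading the launcher change: each local rank is pinned to a four-core window (rank 0 on cores 0-3, rank 1 on cores 4-7, and so on) via `numactl -C`, which assumes an OpenMPI-style launch where `OMPI_COMM_WORLD_LOCAL_RANK` is set.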