Merged with master

f957c299 · TiagoMAntunes · 7001bc3e · 26824495 · f957c299 · f957c299
Commit f957c299 authored Mar 29, 2021 by TiagoMAntunes
20 changed files
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Compile with "..."
+2. Run "..." with "..." processes on "..." nodes
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Logs**
+If applicable, add logs to help explain your problem.
+
+**Platform**
+ - Device: [e.g. NVIDIA V100]
+ - OS: [e.g. Debian 10.2 buster]
+ - CUDA version: [e.g. 11.1]
+ - NCCL version: [e.g. 2.7.8-1]
+
+**Additional context**
+Add any other context about the problem here.
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
--- a/.pylintrc
+++ b/.pylintrc
@@ -179,7 +179,7 @@ score=yes
 [REFACTORING]

 # Maximum number of nested blocks for function / method body
-max-nested-blocks=5
+max-nested-blocks=32

 # Complete name of functions that never returns. When checking for
 # inconsistent-return-statements if a never returning function is called then
@@ -402,7 +402,7 @@ indent-after-paren=4
 indent-string='    '

 # Maximum number of characters on a single line.
-max-line-length=81
+max-line-length=120

 # Maximum number of lines in a module.
 max-module-lines=1000
@@ -563,10 +563,10 @@ max-attributes=32
 max-bool-expr=5

 # Maximum number of branch for function / method body.
-max-branches=12
+max-branches=32

 # Maximum number of locals for function / method body.
-max-locals=15
+max-locals=32

 # Maximum number of parents for a class (see R0901).
 max-parents=7
@@ -578,7 +578,7 @@ max-public-methods=20
 max-returns=6

 # Maximum number of statements in function / method body.
-max-statements=50
+max-statements=128

 # Minimum number of public methods for a class (see R0903).
 min-public-methods=2

--- a/README.md
+++ b/README.md
@@ -99,6 +99,17 @@ FastMoE's model parallel requires sophiscated parallel strategies that neither P
 Megatron-LM provides. The `fmoe.DistributedGroupedDataParallel` module is
 introduced to replace PyTorch's DDP module.

+## Citation
+
+```
+@article{he2021fastmoe,
+      title={FastMoE: A Fast Mixture-of-Expert Training System}, 
+      author={Jiaao He and Jiezhong Qiu and Aohan Zeng and Zhilin Yang and Jidong Zhai and Jie Tang},
+      journal={arXiv preprint arXiv:2103.13262},
+      year={2021}
+}
+```
+
 ## Troubleshootings / Discussion

 If you have any problem using FastMoE, or you are interested in getting involved in developing FastMoE, feel free to join the [our slack channel](https://join.slack.com/t/fastmoe/shared_invite/zt-mz0ai6ol-ggov75D62YsgHfzShw8KYw).
--- a/cuda/moe.cpp
+++ b/cuda/moe.cpp
@@ -180,4 +180,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
 #endif
  m.def("forward", &moe_forward, "MoE forward (CUDA)");
  m.def("backward", &moe_backward, "MoE backward (CUDA)");
-}
+}
\ No newline at end of file
--- a/fmoe/balance.py
+++ b/fmoe/balance.py
+import torch
+import torch.nn.functional as F
+
+metrics = {
+    "coefficient-variation": lambda c_e: torch.std(c_e) / torch.mean(c_e),
+    "Lmax-over-Lmin": lambda c_e: (torch.max(c_e) + 1) / (torch.min(c_e) + 1),
+    "Lmax-over-Lmean": lambda c_e: torch.max(c_e) / torch.mean(c_e),
+}
+
+
+def reset_balance_profile(balance_dict, num_layers, balance_strategy):
+    for key in metrics:
+        balance_dict[key] = [None for _ in range(num_layers)]
+    if balance_strategy:
+        balance_dict[f"{balance_strategy}_loss"] = [None for _ in range(num_layers)]
+
+
+def update_balance_profile(
+    balance_dict,
+    gate_top_k_idx,
+    _gate_score_top_k,
+    gate_context,
+    layer_idx,
+    num_expert,
+    balance_strategy,
+):
+    c_e = torch.scatter_add(
+        torch.zeros(num_expert, device=gate_top_k_idx.device),
+        0,
+        gate_top_k_idx,
+        torch.ones_like(gate_top_k_idx, dtype=torch.float),
+    )
+    for key in metrics:
+        balance_dict[key][layer_idx] = metrics[key](c_e)
+    S = gate_top_k_idx.shape[0]
+    if balance_strategy == "gshard":
+        gate_score_all = gate_context
+        m_e = torch.sum(F.softmax(gate_score_all, dim=1), dim=0) / S
+        balance_dict["gshard_loss"][layer_idx] = torch.sum(c_e * m_e) / num_expert / S
+    elif balance_strategy == "noisy":
+        balance_dict["noisy_loss"][layer_idx] = gate_context
--- a/fmoe/gates.py
+++ b/fmoe/gates.py
@@ -5,6 +5,7 @@ The `NaiveGate` is the reference to implement any other gate.
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.distributions.normal import Normal


 class ZeroGate(nn.Module):
@@ -12,8 +13,9 @@ class ZeroGate(nn.Module):
    Guide all input samples to gate 0.
    """

-    def __init__(self, _1, _2, _3, top_k=2):
+    def __init__(self, _1, num_expert, _3, top_k=2):
        super().__init__()
+        self.num_expert = num_expert
        self.top_k = top_k

    def forward(self, inp):
@@ -23,9 +25,12 @@ class ZeroGate(nn.Module):
        idx = torch.zeros(
            inp.shape[0] * self.top_k, dtype=torch.int64, device=inp.device
        )
-        score = torch.ones(inp.shape[0] * self.top_k,
-                device=inp.device) / self.top_k
-        return idx, score.reshape(-1, 1, self.top_k)
+        gate_score = (
+            torch.ones(inp.shape[0] * self.top_k, device=inp.device) / self.top_k
+        )
+        gate_score_all = torch.zeros(inp.shape[0], self.num_expert, device=inp.device)
+        gate_score_all[:, 0] = 1
+        return idx, gate_score.reshape(-1, 1, self.top_k), gate_score_all


 class NaiveGate(nn.Module):
@@ -58,4 +63,129 @@ class NaiveGate(nn.Module):
        gate_score = F.softmax(gate_top_k_val, dim=-1).unsqueeze(1)
        gate_top_k_idx = gate_top_k_idx.view(-1)  # (BxLxtop_k)

-        return gate_top_k_idx, gate_score
+        return gate_top_k_idx, gate_score, gate
+
+
+class NoisyGate(nn.Module):
+    def __init__(self, d_model, num_expert, world_size, top_k=2):
+        super().__init__()
+        self.num_expert = num_expert * world_size
+        self.w_gate = nn.Parameter(
+            torch.zeros(d_model, num_expert * world_size), requires_grad=True
+        )
+        self.w_noise = nn.Parameter(
+            torch.zeros(d_model, num_expert * world_size), requires_grad=True
+        )
+        self.top_k = top_k
+        self.softplus = nn.Softplus()
+        self.softmax = nn.Softmax(1)
+
+        self.noise_epsilon = 1e-2
+
+    def _gates_to_load(self, gates):
+        """Compute the true load per expert, given the gates.
+        The load is the number of examples for which the corresponding gate is >0.
+        Args:
+        gates: a `Tensor` of shape [batch_size, n]
+        Returns:
+        an integer `Tensor` of shape [n] (count of examples with gate > 0)
+        """
+        return (gates > 0).sum(0)
+
+    def _prob_in_top_k(
+        self, clean_values, noisy_values, noise_stddev, noisy_top_values
+    ):
+        """Helper function to NoisyTopKGating.
+        Computes the probability that value is in top k, given different random noise.
+        This gives us a way of backpropagating from a loss that balances the number
+        of times each expert is in the top k experts per example.
+        In the case of no noise, pass in None for noise_stddev, and the result will
+        not be differentiable.
+        Args:
+        clean_values: a `Tensor` of shape [batch, n].
+        noisy_values: a `Tensor` of shape [batch, n].  Equal to clean values plus
+          normally distributed noise with standard deviation noise_stddev.
+        noise_stddev: a `Tensor` of shape [batch, n], or None
+        noisy_top_values: a `Tensor` of shape [batch, m].
+           The "values" output of `noisy_values.topk(m)`, with m >= k+1.
+        Returns:
+        a `Tensor` of shape [batch, n].
+        """
+
+        batch = clean_values.size(0)
+        m = noisy_top_values.size(1)
+        top_values_flat = noisy_top_values.flatten()
+        threshold_positions_if_in = (
+            torch.arange(batch, device=clean_values.device) * m + self.top_k
+        )
+        threshold_if_in = torch.unsqueeze(
+            torch.gather(top_values_flat, 0, threshold_positions_if_in), 1
+        )
+        is_in = torch.gt(noisy_values, threshold_if_in)
+        threshold_positions_if_out = threshold_positions_if_in - 1
+        threshold_if_out = torch.unsqueeze(
+            torch.gather(top_values_flat, 0, threshold_positions_if_out), 1
+        )
+        # is each value currently in the top k.
+        normal = Normal(
+            torch.tensor([0.0], device=clean_values.device),
+            torch.tensor([1.0], device=clean_values.device),
+        )
+        prob_if_in = normal.cdf((clean_values - threshold_if_in) / noise_stddev)
+        prob_if_out = normal.cdf((clean_values - threshold_if_out) / noise_stddev)
+        prob = torch.where(is_in, prob_if_in, prob_if_out)
+        return prob
+
+    def cv_squared(self, x):
+        """The squared coefficient of variation of a sample.
+        Useful as a loss to encourage a positive distribution to be more uniform.
+        Epsilons added for numerical stability.
+        Returns 0 for an empty Tensor.
+        Args:
+        x: a `Tensor`.
+        Returns:
+        a `Scalar`.
+        """
+        eps = 1e-10
+        # if only num_expert = 1
+        if x.shape[0] == 1:
+            return torch.Tensor([0])
+        return x.float().var() / (x.float().mean() ** 2 + eps)
+
+    def forward(self, inp):
+        clean_logits = inp @ self.w_gate
+        raw_noise_stddev = inp @ self.w_noise
+        noise_stddev = (
+            self.softplus(raw_noise_stddev) + self.noise_epsilon
+        ) * self.training
+        noisy_logits = clean_logits + (torch.randn_like(clean_logits) * noise_stddev)
+        logits = noisy_logits
+
+        # calculate topk + 1 that will be needed for the noisy gates
+        top_logits, top_indices = logits.topk(
+            min(self.top_k + 1, self.num_expert), dim=1
+        )
+        top_k_logits = top_logits[:, : self.top_k]
+        top_k_indices = top_indices[:, : self.top_k]
+        top_k_gates = self.softmax(top_k_logits)
+
+        zeros = torch.zeros_like(logits, requires_grad=True)
+        gates = zeros.scatter(1, top_k_indices, top_k_gates)
+
+        if self.top_k < self.num_expert:
+            load = (
+                self._prob_in_top_k(
+                    clean_logits, noisy_logits, noise_stddev, top_logits
+                )
+            ).sum(0)
+        else:
+            load = self._gates_to_load(gates)
+
+        importance = gates.sum(0)
+        loss = self.cv_squared(importance) + self.cv_squared(load)
+
+        return (
+            top_k_indices.contiguous().view(-1),
+            top_k_gates.contiguous().unsqueeze(1),
+            loss,
+        )
--- a/fmoe/layers.py
+++ b/fmoe/layers.py
@@ -121,6 +121,7 @@ class FMoE(nn.Module):
        top_k=2,
        gate=NaiveGate,
        expert=None,
+        gate_hook=None,
    ):
        super().__init__()
        self.num_expert = num_expert
@@ -141,6 +142,7 @@ class FMoE(nn.Module):
            self.experts_fused = False
        else:
            self.experts_fused = True
+        self.gate_hook = gate_hook

    def expert_fn(self, inp, fwd_expert_count):
        r"""
@@ -182,7 +184,9 @@ class FMoE(nn.Module):
        if self.mp_size > 1:
            inp = Slice.apply(inp, self.mp_rank, self.mp_size, self.mp_group)

-        gate_top_k_idx, gate_score = self.gate(inp)
+        gate_top_k_idx, gate_score, gate_state_dict = self.gate(inp)
+        if self.gate_hook:
+            self.gate_hook(gate_top_k_idx, gate_score, gate_state_dict)
        # to: (BxLxtop_k) x d_model
        inp = inp.repeat_interleave(repeats=self.top_k, dim=0)
        x = _fmoe_general_global_forward(

--- a/fmoe/megatron/Megatron.LICENSE
+++ b/fmoe/megatron/Megatron.LICENSE
+Part of our code in megatron.py is copied from NVIDIA's Megatron-LM 
+codebase with modification.
+
+------------- LICENSE FOR NVIDIA Megatron-LM --------------
+
+The following applies to all files unless otherwise noted:
+
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--
+
+This repository also contains code from Hugging Face Inc., Google Research,
+and Facebook (from their Fairseq project). Files from these
+organizations have notices at the top of each file. Below are licenses
+used in those files, as indicated.
+
+
+------------- LICENSE FOR huggingface and Google Research code  --------------
+
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+------------- LICENSE FOR Facebook Fairseq code --------------
+
+MIT License
+
+Copyright (c) Facebook, Inc. and its affiliates.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/fmoe/megatron/__init__.py
+++ b/fmoe/megatron/__init__.py
+r"""
+A set of modules for plugging FastMoE into Megatron-LM
+"""
+from .utils import add_fmoe_args
+
+from .layers import MegatronMLP
+from .layers import fmoefy
+
+from .checkpoint import save_checkpoint
+from .checkpoint import load_checkpoint
+
+from .distributed import DistributedDataParallel
+
+from .balance import reset_gate_hook
+from .balance import get_balance_profile
+from .balance import generate_megatron_gate_hook
+from .balance import add_balance_log
+from .balance import patch_forward_step
+from .balance import patch_model_provider
--- a/fmoe/megatron/balance.py
+++ b/fmoe/megatron/balance.py
+r"""
+Support for monitoring loss in Megatron
+"""
+import torch
+from fmoe.balance import reset_balance_profile
+from fmoe.balance import update_balance_profile
+from fmoe.utils import get_torch_default_comm
+
+
+balance_dict = {}
+num_layers = 0
+
+
+def reset_gate_hook(_num_layers=None):
+    from megatron import get_args
+
+    global balance_dict, num_layers
+    if _num_layers is not None:
+        num_layers = _num_layers
+    reset_balance_profile(balance_dict, num_layers, get_args().balance_strategy)
+
+
+def get_balance_profile():
+    global balance_dict
+    return balance_dict
+
+
+def generate_megatron_gate_hook(layer_idx, num_expert_global):
+    from megatron import get_args
+
+    balance_strategy = get_args().balance_strategy
+
+    def megatron_gate_hook(gate_top_k_idx, gate_score_top_k, gate_context):
+        global balance_dict
+        update_balance_profile(
+            balance_dict,
+            gate_top_k_idx,
+            gate_score_top_k,
+            gate_context,
+            layer_idx,
+            num_expert_global,
+            balance_strategy,
+        )
+
+    return megatron_gate_hook
+
+
+def add_balance_log(writer, iteration):
+    from megatron import is_last_rank
+
+    balance_dict_tensor = torch.vstack(
+        [torch.tensor(item, device=item[0].device) for item in balance_dict.values()]
+    ).detach()
+    world_group = get_torch_default_comm()
+    world_size = torch.distributed.get_world_size(group=world_group)
+    torch.distributed.all_reduce(balance_dict_tensor, group=world_group)
+    balance_dict_tensor /= world_size
+
+    if writer and is_last_rank():
+        for idx, metric_name in enumerate(balance_dict):
+            for layer_id, val in enumerate(balance_dict_tensor[idx]):
+                writer.add_scalar(
+                    f"balance-{metric_name}/layer-{layer_id}", val.item(), iteration
+                )
+            writer.add_scalar(
+                f"balance-{metric_name}/all",
+                balance_dict_tensor[idx].mean().item(),
+                iteration,
+            )
+
+    reset_gate_hook()
+
+
+def patch_forward_step(forward_step_func):
+    r"""
+    Patch model's forward_step_func to support balance loss
+    """
+
+    from megatron.mpu import is_pipeline_last_stage
+    from megatron import get_args
+
+    if not get_args().balance_strategy:
+        return forward_step_func
+
+    def forward_step_with_balance_loss(data_iterator, model, input_tensor):
+        args = get_args()
+        output = forward_step_func(data_iterator, model, input_tensor)
+
+        if not is_pipeline_last_stage():
+            return output
+        loss_name = args.balance_strategy + "_loss"
+
+        (loss, state_dict), bal_loss = (
+            output,
+            (
+                torch.tensor(
+                    balance_dict[loss_name],
+                    device=balance_dict[loss_name][0].device,
+                ).mean()
+                * args.balance_loss_weight
+            ).float(),
+        )
+
+        # average across the world group
+        world_group = get_torch_default_comm()
+        world_size = torch.distributed.get_world_size(group=world_group)
+        averaged_bal_loss = bal_loss.clone().detach()
+        torch.distributed.all_reduce(averaged_bal_loss, group=world_group)
+        averaged_bal_loss /= world_size
+
+        loss += bal_loss
+        state_dict[loss_name] = averaged_bal_loss
+
+        return loss, state_dict
+
+    return forward_step_with_balance_loss
+
+
+def patch_model_provider(model_provider):
+    """Wrap ``model_provider`` so that the model it builds is passed to fmoefy.
+
+    Args:
+        model_provider: zero-argument callable that builds the original model.
+
+    Returns:
+        A zero-argument callable returning ``fmoefy(model_provider(), ...)``
+        configured from Megatron's global arguments.
+    """
+    from megatron import get_args
+
+    def fmoefied_model_provider():
+        from .layers import fmoefy
+
+        megatron_args = get_args()
+        base_model = model_provider()
+        expert_count = megatron_args.num_experts
+        # The 4x hidden width is divided by top_k.
+        expert_hidden_size = 4 * megatron_args.hidden_size // megatron_args.top_k
+        return fmoefy(
+            base_model,
+            num_experts=expert_count,
+            hidden_hidden_size=expert_hidden_size,
+            top_k=megatron_args.top_k,
+        )
+
+    return fmoefied_model_provider
--- a/fmoe/megatron/checkpoint.py
+++ b/fmoe/megatron/checkpoint.py
+r"""
+Support for Megatron to enable saving parameters of different experts on
+different ranks.
+"""
+import os
+import sys
+import random
+from collections import OrderedDict
+import numpy as np
+import torch
+
+
+def get_fmoe_checkpoint_name(
+    checkpoints_path, iteration, release=False, data_parallel_rank=-1
+):
+    """Build the checkpoint file path for a given data parallel rank.
+
+    Args:
+        checkpoints_path: root directory of the checkpoints.
+        iteration: iteration number used for the ``iter_XXXXXXX`` directory.
+        release: use the ``release`` directory instead of an iteration one.
+        data_parallel_rank: rank to build the name for; ``-1`` means the
+            current data parallel rank reported by ``mpu``.
+
+    Returns:
+        Megatron's native checkpoint name for data parallel rank 0, otherwise
+        a path whose rank directory additionally carries a ``dp_rank`` suffix.
+    """
+    from megatron import mpu
+    from megatron.checkpointing import get_checkpoint_name
+
+    dp_rank = (
+        mpu.get_data_parallel_rank()
+        if data_parallel_rank == -1
+        else data_parallel_rank
+    )
+    if dp_rank == 0:
+        return get_checkpoint_name(checkpoints_path, iteration, release)
+
+    directory = "release" if release else "iter_{:07d}".format(iteration)
+
+    # The pipeline MP rank is part of the name only when pipeline
+    # parallelism is actually in use.
+    if mpu.get_pipeline_model_parallel_world_size() == 1:
+        rank_directory = "mp_rank_{:02d}_dp_rank_{:04d}".format(
+            mpu.get_tensor_model_parallel_rank(), dp_rank
+        )
+    else:
+        rank_directory = "mp_rank_{:02d}_{:03d}_dp_rank_{:04d}".format(
+            mpu.get_tensor_model_parallel_rank(),
+            mpu.get_pipeline_model_parallel_rank(),
+            dp_rank,
+        )
+    return os.path.join(
+        checkpoints_path, directory, rank_directory, "model_optim_rng.pt"
+    )
+
+
+def save_checkpoint(iteration, model, optimizer, lr_scheduler):
+    """Save a model checkpoint with expert parallel.
+
+    Data parallel rank 0 delegates to Megatron's native ``save_checkpoint``.
+    Every other data parallel rank writes its own file (named by
+    ``get_fmoe_checkpoint_name``) that holds only the entries tagged with
+    ``dp_comm == "none"``: the matching model parameters and, unless
+    ``args.no_save_optim`` is set, their optimizer state.
+
+    Args:
+        iteration: training iteration, used for the checkpoint name and the
+            tracker file.
+        model: model to save; unwrapped through ``.module`` when present.
+        optimizer: optimizer whose state is filtered and saved, or ``None``.
+        lr_scheduler: only forwarded to the native saver on rank 0.
+    """
+    # TODO: update patch
+    from megatron import get_args
+    from megatron import mpu
+    from megatron import print_rank_last
+
+    expert_dp_comm = "none"
+
+    if mpu.get_data_parallel_rank() == 0:
+        # at dp rank 0, we still follow the native save_checkpoint by megatron
+        from megatron.checkpointing import save_checkpoint as save_checkpoint_native
+
+        save_checkpoint_native(iteration, model, optimizer, lr_scheduler)
+        return
+
+    args = get_args()
+
+    # From here on we are on a data parallel rank > 0.
+    # Use the wrapped model when a wrapper exposing `.module` was passed in.
+    if hasattr(model, 'module'):
+        model = model.module
+
+    print_rank_last(
+        "saving checkpoint at iteration {:7d} to {}".format(iteration, args.save)
+    )
+
+    # Arguments, iteration, and model.
+    state_dict = {}
+    # keep_vars is always True on this path (rank 0 returned above);
+    # presumably this keeps the parameter objects in the state dict so that
+    # their `dp_comm` tag can be inspected below.
+    state_dict["model"] = model.state_dict_for_save_checkpoint(
+        keep_vars=(mpu.get_data_parallel_rank() > 0)
+    )
+
+    def extract_expert_param(state_dict, expert_dp_comm="none"):
+        # Recursively keep only entries tagged with the expert dp_comm,
+        # dropping nested dicts that end up empty.
+        state_dict_new = state_dict.__class__()
+        for k, v in state_dict.items():
+            # megatron uses both dict and OrderedDict in its state_dict
+            if isinstance(v, (OrderedDict, dict)):
+                v_new = extract_expert_param(v, expert_dp_comm)
+                if len(v_new) > 0:
+                    state_dict_new[k] = v_new
+            elif hasattr(v, "dp_comm") and v.dp_comm == expert_dp_comm:
+                state_dict_new[k] = v.detach()
+        return state_dict_new
+
+    state_dict["model"] = extract_expert_param(state_dict["model"], expert_dp_comm)
+
+    # Optimizer stuff.
+    if not args.no_save_optim:
+        if optimizer is not None:
+            state_dict["optimizer"] = optimizer.state_dict()
+            param_global_idx = 0
+            for param_group in optimizer.optimizer.param_groups:
+                for param in param_group["params"]:
+                    if not (
+                        hasattr(param, "dp_comm") and param.dp_comm == expert_dp_comm
+                    ):
+                        # this parameter is not an expert parameter
+                        # thus there is no need to save its state in current rank
+                        # since it has been saved by data parallel rank 0
+                        if args.fp16:
+                            # fp16 optimizer may have empty state due to overflow
+                            state_dict["optimizer"]["optimizer"]["state"].pop(
+                                param_global_idx, None
+                            )
+                        else:
+                            # a parameter without optimizer state has no
+                            # entry here either, so tolerate a missing key
+                            # instead of raising KeyError
+                            state_dict["optimizer"]["state"].pop(
+                                param_global_idx, None
+                            )
+                    param_global_idx += 1
+            if args.fp16:
+                state_dict["optimizer"]["optimizer"].pop("param_groups")
+                # fp32_from_fp16_params in state_dict is not a copy
+                # but a reference to optimizer.fp32_from_fp16_params,
+                # changing it in state_dict will change
+                # optimizer.fp32_from_fp16_params as well
+                # thus we create an empty fp32_from_fp16_params in state_dict
+                # and only insert expert parameters.
+                fp32_from_fp16_params = state_dict["optimizer"]["fp32_from_fp16_params"]
+                state_dict["optimizer"]["fp32_from_fp16_params"] = []
+                for param_group in fp32_from_fp16_params:
+                    param_group_copy = []
+                    for param in param_group:
+                        # None keeps the position of non-expert parameters so
+                        # that indices still line up when merging on load
+                        param_copy = (
+                            param
+                            if hasattr(param, "dp_comm")
+                            and param.dp_comm == expert_dp_comm
+                            else None
+                        )
+                        param_group_copy.append(param_copy)
+                    state_dict["optimizer"]["fp32_from_fp16_params"].append(
+                        param_group_copy
+                    )
+            else:
+                state_dict["optimizer"].pop("param_groups")
+
+    # Save.
+    checkpoint_name = get_fmoe_checkpoint_name(args.save, iteration)
+    from megatron.checkpointing import ensure_directory_exists
+    from megatron.checkpointing import get_checkpoint_tracker_filename
+
+    ensure_directory_exists(checkpoint_name)
+    torch.save(state_dict, checkpoint_name)
+
+    # Wait so everyone is done (necessary)
+    torch.distributed.barrier()
+    if torch.distributed.get_rank() == 0:
+        print(
+            "  successfully saved checkpoint at iteration {:7d} to {}".format(
+                iteration, args.save
+            ),
+            flush=True,
+        )
+    # And update the latest iteration
+    if torch.distributed.get_rank() == 0:
+        tracker_filename = get_checkpoint_tracker_filename(args.save)
+        with open(tracker_filename, "w") as f:
+            f.write(str(iteration))
+    # Wait so everyone is done (not necessary)
+    torch.distributed.barrier()
+
+
+def merge_state_dict(state_dict_rank0, state_dict_local, fp16):
+    """Merge two state dicts, one from data parallel rank 0 and another that
+    only contains expert states.
+
+    ``state_dict_rank0`` is updated in place: every leaf found in
+    ``state_dict_local`` overwrites the entry at the same (nested) key.
+
+    Args:
+        state_dict_rank0: full state dict loaded from data parallel rank 0.
+        state_dict_local: state dict of the current rank, holding only
+            expert entries.
+        fp16: whether the optimizer state uses the fp16 layout, i.e. the
+            per-parameter state lives under ``["optimizer"]["optimizer"]``
+            and ``["optimizer"]["fp32_from_fp16_params"]`` is present.
+
+    Returns:
+        ``state_dict_rank0`` after merging.
+    """
+    # from megatron import print_rank_last
+
+    def merge_model(state_dict_rank0, state_dict_local):
+        for k, v in state_dict_local.items():
+            # megatron uses both dict and OrderedDict in its state_dict
+            if isinstance(v, (OrderedDict, dict)):
+                merge_model(state_dict_rank0[k], v)
+            else:
+                state_dict_rank0[k] = v
+
+    merge_model(state_dict_rank0["model"], state_dict_local["model"])
+
+    # save_checkpoint only writes an "optimizer" entry when optimizer state
+    # is being saved; without it there is nothing to merge, so return the
+    # merged model instead of failing with a KeyError.
+    if "optimizer" not in state_dict_rank0 or "optimizer" not in state_dict_local:
+        return state_dict_rank0
+
+    optimizer_rank0 = (
+        state_dict_rank0["optimizer"]["optimizer"]
+        if fp16
+        else state_dict_rank0["optimizer"]
+    )
+    optimizer_local = (
+        state_dict_local["optimizer"]["optimizer"]
+        if fp16
+        else state_dict_local["optimizer"]
+    )
+
+    for k, v in optimizer_local["state"].items():
+        optimizer_rank0["state"][k] = v
+
+    if fp16:
+        for group_idx, param_group in enumerate(
+            state_dict_local["optimizer"]["fp32_from_fp16_params"]
+        ):
+            for param_in_group_idx, param in enumerate(param_group):
+                # None marks a slot that is not stored locally; keep the
+                # rank 0 value for it
+                if param is not None:
+                    state_dict_rank0["optimizer"]["fp32_from_fp16_params"][group_idx][
+                        param_in_group_idx
+                    ] = param
+
+    return state_dict_rank0
+
+
+def load_checkpoint(model, optimizer, lr_scheduler, load_arg="load"):
+    """Load a model checkpoint and return the iteration.
+
+    Data parallel rank 0 delegates to Megatron's native ``load_checkpoint``.
+    Every other rank loads both the rank 0 checkpoint and its own
+    expert-only checkpoint, merges them with ``merge_state_dict`` and
+    restores model, optimizer, lr scheduler and rng states from the result.
+
+    Args:
+        model: model to load into; unwrapped through ``.module`` if present.
+        optimizer: optimizer to restore, or ``None``.
+        lr_scheduler: learning rate scheduler to restore, or ``None``.
+        load_arg: name of the attribute of the Megatron args that holds the
+            checkpoint directory.
+
+    Returns:
+        The iteration to resume from; 0 when no tracker file exists, when
+        fine-tuning, or when a release checkpoint is loaded.
+    """
+
+    from megatron import get_args
+    from megatron import mpu
+    from megatron import print_rank_last
+    from megatron.checkpointing import get_checkpoint_tracker_filename
+    from megatron.checkpointing import set_checkpoint_version
+    from megatron.checkpointing import check_checkpoint_args
+    from megatron.checkpointing import update_num_microbatches
+
+    if mpu.get_data_parallel_rank() == 0:
+        # at dp rank 0, we still follow the native load_checkpoint by megatron
+        from megatron.checkpointing import load_checkpoint as load_checkpoint_native
+
+        return load_checkpoint_native(model, optimizer, lr_scheduler, load_arg)
+
+    args = get_args()
+    load_dir = getattr(args, load_arg)
+
+    if hasattr(model, 'module'):
+        model = model.module
+    # Read the tracker file and set the iteration.
+    tracker_filename = get_checkpoint_tracker_filename(load_dir)
+
+    # If no tracker file, return iteration zero.
+    if not os.path.isfile(tracker_filename):
+        print_rank_last(
+            "WARNING: could not find the metadata file {} ".format(tracker_filename)
+        )
+        print_rank_last(
+            "    will not load any checkpoints and will start from " "random"
+        )
+        return 0
+
+    # Otherwise, read the tracker file and either set the iteration or
+    # mark it as a release checkpoint.
+    iteration = 0
+    release = False
+    with open(tracker_filename, "r") as f:
+        metastring = f.read().strip()
+        try:
+            iteration = int(metastring)
+        except ValueError:
+            release = metastring == "release"
+            if not release:
+                print_rank_last(
+                    "ERROR: Invalid metadata file {}. Exiting".format(tracker_filename)
+                )
+                sys.exit()
+
+    assert iteration > 0 or release, "error parsing metadata file {}".format(
+        tracker_filename
+    )
+
+    # Checkpoint.
+    checkpoint_name_rank0 = get_fmoe_checkpoint_name(load_dir, iteration, release, 0)
+    checkpoint_name_local = get_fmoe_checkpoint_name(
+        load_dir, iteration, release, mpu.get_data_parallel_rank()
+    )
+    print_rank_last(
+        " loading checkpoint at rank 0 from {} and rank {} from {} at iteration {}, will merge them later".format(
+            checkpoint_name_rank0,
+            mpu.get_data_parallel_rank(),
+            checkpoint_name_local,
+            iteration,
+        )
+    )
+
+    # Load the checkpoint.
+    # NOTE: torch.load unpickles the file, so only trusted checkpoints
+    # should be loaded.
+    def load_state_dict(checkpoint_name):
+        try:
+            state_dict = torch.load(checkpoint_name, map_location="cpu")
+        except ModuleNotFoundError:
+            from megatron.fp16_deprecated import loss_scaler
+
+            # For backward compatibility.
+            print_rank_last(" > deserializing using the old code structure ...")
+            # Temporarily alias the old module paths so that unpickling can
+            # resolve them, then remove the aliases again.
+            sys.modules["fp16.loss_scaler"] = sys.modules[
+                "megatron.fp16_deprecated.loss_scaler"
+            ]
+            sys.modules["megatron.fp16.loss_scaler"] = sys.modules[
+                "megatron.fp16_deprecated.loss_scaler"
+            ]
+            state_dict = torch.load(checkpoint_name, map_location="cpu")
+            sys.modules.pop("fp16.loss_scaler", None)
+            sys.modules.pop("megatron.fp16.loss_scaler", None)
+        return state_dict
+
+    state_dict_rank0 = load_state_dict(checkpoint_name_rank0)
+    state_dict_local = load_state_dict(checkpoint_name_local)
+
+    state_dict = merge_state_dict(state_dict_rank0, state_dict_local, args.fp16)
+
+    # set checkpoint version
+    set_checkpoint_version(state_dict.get("checkpoint_version", 0))
+
+    # Set iteration.
+    if args.finetune or release:
+        iteration = 0
+    else:
+        try:
+            iteration = state_dict["iteration"]
+        except KeyError:
+            try:  # Backward compatible with older checkpoints
+                iteration = state_dict["total_iters"]
+            except KeyError:
+                print_rank_last(
+                    "A metadata file exists but unable to load "
+                    "iteration from checkpoint {}, exiting".format(
+                        checkpoint_name_local
+                    )
+                )
+                sys.exit()
+
+    # Check arguments.
+    assert args.consumed_train_samples == 0
+    assert args.consumed_valid_samples == 0
+    if "args" in state_dict:
+        checkpoint_args = state_dict["args"]
+        check_checkpoint_args(checkpoint_args)
+        args.consumed_train_samples = getattr(
+            checkpoint_args, "consumed_train_samples", 0
+        )
+        update_num_microbatches(consumed_samples=args.consumed_train_samples)
+        args.consumed_valid_samples = getattr(
+            checkpoint_args, "consumed_valid_samples", 0
+        )
+    else:
+        print_rank_last("could not find arguments in the checkpoint ...")
+
+    # Model.
+    model.load_state_dict(state_dict["model"])
+
+    # Optimizer.
+    if not release and not args.finetune and not args.no_load_optim:
+        try:
+            if optimizer is not None:
+                optimizer.load_state_dict(state_dict["optimizer"])
+            if lr_scheduler is not None:
+                lr_scheduler.load_state_dict(state_dict["lr_scheduler"])
+        except KeyError:
+            print_rank_last(
+                "Unable to load optimizer from checkpoint {}. "
+                "Specify --no-load-optim or --finetune to prevent "
+                "attempting to load the optimizer state, "
+                "exiting ...".format(checkpoint_name_local)
+            )
+            sys.exit()
+
+    # rng states.
+    if not release and not args.finetune and not args.no_load_rng:
+        try:
+            random.setstate(state_dict["random_rng_state"])
+            np.random.set_state(state_dict["np_rng_state"])
+            torch.set_rng_state(state_dict["torch_rng_state"])
+            torch.cuda.set_rng_state(state_dict["cuda_rng_state"])
+            mpu.get_cuda_rng_tracker().set_states(state_dict["rng_tracker_states"])
+        except KeyError:
+            # this branch is about the rng states, so say so instead of
+            # repeating the optimizer message
+            print_rank_last(
+                "Unable to load rng state from checkpoint {}. "
+                "Specify --no-load-rng or --finetune to prevent "
+                "attempting to load the rng state, "
+                "exiting ...".format(checkpoint_name_local)
+            )
+            sys.exit()
+
+    torch.distributed.barrier()
+    # report the directory that was actually used, which differs from
+    # args.load whenever load_arg is not "load"
+    print_rank_last(
+        "  successfully loaded checkpoint (with expert parametes updated) from {} at iteration {}".format(
+            load_dir, iteration
+        )
+    )
+
+    return iteration
--- a/fmoe/megatron/distributed.py
+++ b/fmoe/megatron/distributed.py
+r"""
+distributed support for Megatron
+"""
+from fmoe.distributed import DistributedGroupedDataParallel
+
+
+class DistributedDataParallel(DistributedGroupedDataParallel):
+    r"""
+    Drop-in replacement for the DDP module provided by Megatron, built on
+    DistributedGroupedDataParallel so that the parallel and reduction
+    strategies of Fast MoE can be used.
+    """
+
+    def __init__(self, module):
+        from megatron import mpu
+
+        # Communication groups are taken from Megatron's mpu.
+        model_parallel_group = mpu.get_model_parallel_group()
+        data_parallel_group = mpu.get_data_parallel_group()
+        super().__init__(
+            module,
+            mp_group=model_parallel_group,
+            dp_group=data_parallel_group,
+        )
+
+    def state_dict(self, *args, **kwargs):
+        r"""
+        Forward to the wrapped module, as Megatron's DDP wrapper does
+        """
+        wrapped = self.module
+        return wrapped.state_dict(*args, **kwargs)
+
+    def state_dict_for_save_checkpoint(self, *args, **kwargs):
+        r"""
+        Forward to the wrapped module, as Megatron's DDP wrapper does
+        """
+        wrapped = self.module
+        return wrapped.state_dict_for_save_checkpoint(*args, **kwargs)
+
+    def load_state_dict(self, *args, **kwargs):
+        r"""
+        Forward to the wrapped module, as Megatron's DDP wrapper does
+        """
+        wrapped = self.module
+        return wrapped.load_state_dict(*args, **kwargs)
--- a/fmoe/megatron.py
+++ b/fmoe/megatron.py
 r"""
-The adaptor to seamlessly enable FastMoE in Megatron-LM v2.0 with at most two
-lines of modification.
-See `examples/megatron` for usage instructions.
+nn modules to replace Megatron's native ones
 """
 import math
 import numpy as np
@@ -9,8 +7,9 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

-from .transformer import FMoETransformerMLP
-from .distributed import DistributedGroupedDataParallel
+from fmoe.transformer import FMoETransformerMLP
+from .balance import reset_gate_hook
+from .balance import generate_megatron_gate_hook


 class _FakeMegatronMLP(nn.Module):
@@ -75,16 +74,26 @@ class MegatronMLP(FMoETransformerMLP):
    communication group `group` to replace the original MLP layer in Megatron.
    """

-    def __init__(self, args, group):
+    def __init__(self, args, group, layer_idx):
        assert (
-            args.seq_length * args.micro_batch_size
-            % args.tensor_model_parallel_size
+            args.seq_length * args.micro_batch_size % args.tensor_model_parallel_size
            == 0
        ), "Batch size x sequence length should be multiple of mp size"
        if not args.distributed_experts:
            world_size = 1
        else:
            world_size = args.world_size
+        gate = None
+        if not args.balance_strategy or args.balance_strategy == "gshard":
+            from fmoe.gates import NaiveGate
+
+            gate = NaiveGate
+        elif args.balance_strategy == "noisy":
+            from fmoe.gates import NoisyGate
+
+            gate = NoisyGate
+        else:
+            assert False, "Undefined balance strategy {}" % (args.balance_strategy)
        super().__init__(
            args.num_experts,
            top_k=args.top_k,
@@ -93,6 +102,10 @@ class MegatronMLP(FMoETransformerMLP):
            world_size=world_size,
            mp_group=group,
            expert_dp_comm="none" if args.distributed_experts else "dp",
+            gate_hook=generate_megatron_gate_hook(
+                layer_idx, args.num_experts * world_size
+            ),
+            gate=gate,
        )
        self.hidden_size = args.hidden_size
        if args.distributed_experts:
@@ -166,41 +179,11 @@ def fmoefy(
    if distributed_experts is not None:
        args.distributed_experts = distributed_experts

-    for l in model.language_model.transformer.layers:
-        l.mlp = MegatronMLP(args, mpu.get_model_parallel_group())
-    return model
-
-
-class DistributedDataParallel(DistributedGroupedDataParallel):
-    r"""
-    A wrapper that is used to replace the DDP module provided by Megatron, which
-    is adapted to enable the sophiscated parallel and reduction strategies in
-    Fast MoE.
-    """
-
-    def __init__(self, module):
-        from megatron import mpu
-
-        super().__init__(
-            module,
-            mp_group=mpu.get_model_parallel_group(),
-            dp_group=mpu.get_data_parallel_group(),
-        )
-
-    def state_dict(self, *args, **kwargs):
-        r"""
-        Keep consitency with Megatron
-        """
-        return self.module.state_dict(*args, **kwargs)
+    for idx, l in enumerate(model.language_model.transformer.layers):
+        l.mlp = MegatronMLP(args, mpu.get_model_parallel_group(), idx)

-    def state_dict_for_save_checkpoint(self, *args, **kwargs):
-        r"""
-        Keep consitency with Megatron
-        """
-        return self.module.state_dict_for_save_checkpoint(*args, **kwargs)
+    # initialize gate hook
+    num_layers = len(model.language_model.transformer.layers)
+    reset_gate_hook(num_layers)

-    def load_state_dict(self, *args, **kwargs):
-        r"""
-        Keep consitency with Megatron
-        """
-        return self.module.load_state_dict(*args, **kwargs)
+    return model
--- a/fmoe/megatron/utils.py
+++ b/fmoe/megatron/utils.py
+r"""
+Utility in Megatron
+"""
+def add_fmoe_args(parser):
+    """Register the FastMoE command line options on ``parser``.
+
+    The options are added to an argument group titled "fastmoe".
+
+    Args:
+        parser: the argument parser to extend.
+
+    Returns:
+        The same ``parser`` object.
+    """
+    fmoe_group = parser.add_argument_group(title="fastmoe")
+
+    # (flag, keyword arguments), registered in this order
+    option_specs = (
+        ("--fmoefy", {"action": "store_true"}),
+        ("--num-experts", {"type": int, "default": None}),
+        ("--top-k", {"type": int, "default": 2}),
+        ("--balance-loss-weight", {"type": float, "default": 1}),
+        ("--balance-strategy", {"type": str, "default": None}),
+    )
+    for flag, options in option_specs:
+        fmoe_group.add_argument(flag, **options)
+
+    return parser
--- a/fmoe/transformer.py
+++ b/fmoe/transformer.py
@@ -15,10 +15,8 @@ class _Expert(nn.Module):

    def __init__(self, num_expert, d_model, d_hidden, activation, rank=0):
        super().__init__()
-        self.htoh4 = FMoELinear(num_expert, d_model, d_hidden, bias=True,
-                rank=rank)
-        self.h4toh = FMoELinear(num_expert, d_hidden, d_model, bias=True,
-                rank=rank)
+        self.htoh4 = FMoELinear(num_expert, d_model, d_hidden, bias=True, rank=rank)
+        self.h4toh = FMoELinear(num_expert, d_hidden, d_model, bias=True, rank=rank)
        self.activation = activation

    def forward(self, inp, fwd_expert_count):
@@ -50,6 +48,7 @@ class FMoETransformerMLP(FMoE):
        gate=NaiveGate,
        top_k=2,
        expert_dp_comm="none",
+        gate_hook=None,
    ):
        super().__init__(
            num_expert=num_expert,
@@ -58,6 +57,7 @@ class FMoETransformerMLP(FMoE):
            top_k=top_k,
            world_size=world_size,
            mp_group=mp_group,
+            gate_hook=gate_hook,
        )
        self.experts = _Expert(
            num_expert, d_model, d_hidden, activation, rank=self.mp_rank

--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ if __name__ == '__main__':
        author_email='hja20@mails.tsinghua.edu.cn',
        license='Apache-2',
        url='https://github.com/laekov/fastmoe',
-        packages=['fmoe'],
+        packages=['fmoe', 'fmoe.megatron'],
        ext_modules=[
            CUDAExtension(
                name='fmoe_cuda', 

--- a/tests/benchmark_mlp.py
+++ b/tests/benchmark_mlp.py
@@ -40,7 +40,7 @@ class BruteForceMoE(nn.Module):
    def forward(self, inp):
        if self.pre_lnorm:
            inp = self.layer_norm(inp)
-        gate_top_k_idx, gate_score = self.gate(inp)
+        gate_top_k_idx, gate_score, _ = self.gate(inp)
        inp = inp.repeat_interleave(repeats=self.top_k, dim=0)
        x = self.mlp(inp, gate_top_k_idx, gate_score)
        if not self.pre_lnorm:

--- a/tests/test_numerical.py
+++ b/tests/test_numerical.py
@@ -12,7 +12,7 @@ from fmoe.gates import NaiveGate
 from fmoe.layers import FMoE
 from fmoe.transformer import _Expert
 from fmoe.distributed import DistributedGroupedDataParallel as LocalDDP
-from fmoe.megatron import _megatron_init_method
+from fmoe.megatron.layers import _megatron_init_method
 from moe import BruteForceMoELinear, BruteForceMoE, NaiveExpert, LinearExpert


@@ -38,7 +38,7 @@ def _perform_forward(
    inp.requires_grad = True

    inp_raw.requires_grad = True
-    gate_idx, gate_score = moe.gate(inp_raw)
+    gate_idx, gate_score, _ = moe.gate(inp_raw)
    inp_repeated = inp_raw.repeat_interleave(repeats=top_k, dim=0)
    moe_out = moe(inp)
    raw_out = moe_raw(inp_repeated, gate_idx, gate_score)

--- a/tests/test_zero.py
+++ b/tests/test_zero.py
+import torch
+from fmoe.layers import _fmoe_general_global_forward
+from fmoe import FMoETransformerMLP
+
+
+class ConstantGate(torch.nn.Module):
+    """Gate stub that sends every token to expert 0 with a score of 0.5.
+
+    ``d_model``, ``num_expert`` and ``world_size`` are accepted but not used.
+    """
+
+    def __init__(self, d_model, num_expert, world_size, top_k=1):
+        super().__init__()
+        self.top_k = top_k
+
+    def forward(self, inp):
+        """Return ``(idx, score, None)`` for the rows of ``inp``.
+
+        ``idx`` is an int64 zero vector with ``rows * top_k`` entries and
+        ``score`` has shape ``(rows, 1, top_k)`` filled with 0.5.
+        """
+        num_rows = inp.shape[0]
+        target_device = inp.device
+        idx_shape = (num_rows * self.top_k,)
+        score_shape = (num_rows, 1, self.top_k)
+        expert_idx = torch.zeros(idx_shape, dtype=torch.int64, device=target_device)
+        half_score = torch.ones(score_shape, device=target_device) / 2
+        return expert_idx, half_score, None
+
+
+def test_zero_fwd(num_expert=2, batch_size=4, d_hidden=8, world_size=1):
+    """Smoke test: run the global forward with every row routed to index 0."""
+
+    def passthrough_expert(expert_inp, _expert_count):
+        # Stand-in expert function that returns its input unchanged.
+        return expert_inp
+
+    inp = torch.rand(batch_size, d_hidden).cuda()
+    gate = torch.zeros(batch_size, dtype=torch.int64).cuda()
+    # Only checks that the call completes; the result is not inspected.
+    _fmoe_general_global_forward(
+        inp, gate, passthrough_expert, num_expert, world_size
+    )
+
+
+def test_zero_transformer(num_expert=2, batch_size=4, d_hidden=8, world_size=1):
+    """Smoke test: run FMoETransformerMLP with ConstantGate as its gate."""
+    inp = torch.rand(batch_size, d_hidden).cuda()
+    mlp = FMoETransformerMLP(
+        num_expert, d_hidden, d_hidden * 4, world_size, gate=ConstantGate
+    )
+    mlp = mlp.cuda()
+    # Only checks that the forward pass completes; the output is not inspected.
+    mlp(inp)
+
+
+if __name__ == '__main__':
+    # Join the default process group over NCCL and bind this process to the
+    # CUDA device whose index equals its rank.
+    torch.distributed.init_process_group(backend="nccl")
+    torch.cuda.set_device(torch.distributed.get_rank())
+    # test_zero_fwd is currently disabled:
+    # test_zero_fwd(world_size=torch.distributed.get_world_size())
+    group_size = torch.distributed.get_world_size()
+    test_zero_transformer(
+        num_expert=16, batch_size=4096, d_hidden=1024, world_size=group_size
+    )
+    print('done')