Merge remote-tracking branch 'origin/master' into balance_public

69121432 · Sengxian · 89d6c794 · c1e67585 · 69121432 · 69121432
Commit 69121432 authored Mar 22, 2021 by Sengxian
6 changed files
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+---
+**Describe the bug**
+A clear and concise description of what the bug is.
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Compile with "..."
+2. Run "..." with "..." processes on "..." nodes
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+**Logs**
+If applicable, add logs to help explain your problem.
+**Platform**
+ - Device: [e.g. NVIDIA V100]
+ - OS: [e.g. Debian 10.2 buster]
+ - CUDA version: [e.g. 11.1]
+ - NCCL version: [e.g. 2.7.8-1]
+**Additional context**
+Add any other context about the problem here.
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: ''
+assignees: ''
+---
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+**Additional context**
+Add any other context or screenshots about the feature request here.
--- a/.pylintrc
+++ b/.pylintrc
@@ -402,7 +402,7 @@ indent-after-paren=4
 indent-string='    '
 # Maximum number of characters on a single line.
-max-line-length=81
+max-line-length=120
 # Maximum number of lines in a module.
 max-module-lines=1000

--- a/fmoe/Megatron.LICENSE
+++ b/fmoe/Megatron.LICENSE
+Part of our code in megatron.py is copied from NVIDIA's Megatron-LM 
+codebase with modification.
+------------- LICENSE FOR NVIDIA Megatron-LM --------------
+The following applies to all files unless otherwise noted:
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+--
+This repository also contains code from Hugging Face Inc., Google Research,
+and Facebook (from their Fairseq project). Files from these
+organizations have notices at the top of each file. Below are licenses
+used in those files, as indicated.
+------------- LICENSE FOR huggingface and Google Research code  --------------
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+------------- LICENSE FOR Facebook Fairseq code --------------
+MIT License
+Copyright (c) Facebook, Inc. and its affiliates.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/fmoe/megatron.py
+++ b/fmoe/megatron.py
@@ -3,7 +3,11 @@ The adaptor to seamlessly enable FastMoE in Megatron-LM v2.0 with at most two
 lines of modification.
 See `examples/megatron` for usage instructions.
 """
+import os
+import sys
 import math
+import random
+from collections import OrderedDict
 import numpy as np
 import torch
 import torch.nn as nn
@@ -361,3 +365,382 @@ class DistributedDataParallel(DistributedGroupedDataParallel):
        Keep consistency with Megatron
        """
        return self.module.load_state_dict(*args, **kwargs)
+def get_fmoe_checkpoint_name(
+    checkpoints_path, iteration, release=False, data_parallel_rank=-1
+):
+    """Return the checkpoint file path used by a given data parallel rank.
+    A ``data_parallel_rank`` of -1 selects the data parallel rank of the
+    calling process. Rank 0 gets the name produced by Megatron's
+    ``get_checkpoint_name``. Any other rank gets
+    ``<checkpoints_path>/<release|iter_NNNNNNN>/mp_rank_XX[_YYY]_dp_rank_ZZZZ/model_optim_rng.pt``.
+    """
+    from megatron import mpu
+    from megatron.checkpointing import get_checkpoint_name
+    dp_rank = data_parallel_rank
+    if dp_rank == -1:
+        dp_rank = mpu.get_data_parallel_rank()
+    if dp_rank == 0:
+        return get_checkpoint_name(checkpoints_path, iteration, release)
+    step_dir = "release" if release else "iter_{:07d}".format(iteration)
+    # The tensor MP rank is always part of the name; the pipeline MP rank is
+    # added only when the pipeline model parallel world size is not 1.
+    if mpu.get_pipeline_model_parallel_world_size() == 1:
+        rank_dir = "mp_rank_{:02d}_dp_rank_{:04d}".format(
+            mpu.get_tensor_model_parallel_rank(), dp_rank
+        )
+    else:
+        rank_dir = "mp_rank_{:02d}_{:03d}_dp_rank_{:04d}".format(
+            mpu.get_tensor_model_parallel_rank(),
+            mpu.get_pipeline_model_parallel_rank(),
+            dp_rank,
+        )
+    return os.path.join(checkpoints_path, step_dir, rank_dir, "model_optim_rng.pt")
+def save_checkpoint(iteration, model, optimizer, lr_scheduler):
+    """Save a model checkpoint with expert parallelism.
+    Data parallel rank 0 delegates to Megatron's native ``save_checkpoint``.
+    Every other data parallel rank writes a reduced checkpoint, at the path
+    given by ``get_fmoe_checkpoint_name``, that holds only the entries whose
+    ``dp_comm`` attribute equals ``"none"`` (the expert parameters) together
+    with the optimizer state that belongs to them.
+    Args:
+        iteration: training iteration; used in the checkpoint path and
+            written to the tracker file.
+        model: the model to save; a ``DistributedDataParallel`` wrapper is
+            unwrapped first.
+        optimizer: optimizer whose state is saved unless it is None or
+            ``args.no_save_optim`` is set.
+        lr_scheduler: only forwarded to the native save on dp rank 0.
+    """
+    # TODO: update patch
+    from megatron import get_args
+    from megatron import mpu
+    from megatron import print_rank_last
+    from megatron.checkpointing import ensure_directory_exists
+    from megatron.checkpointing import get_checkpoint_tracker_filename
+    expert_dp_comm = "none"
+    if mpu.get_data_parallel_rank() == 0:
+        # at dp rank 0, we still follow the native save_checkpoint by megatron
+        from megatron.checkpointing import save_checkpoint as save_checkpoint_native
+        save_checkpoint_native(iteration, model, optimizer, lr_scheduler)
+        return
+    args = get_args()
+    # Past this point the data parallel rank is non-zero, so only the expert
+    # states held by this rank are written to disk.
+    if isinstance(model, DistributedDataParallel):
+        model = model.module
+    print_rank_last(
+        "saving checkpoint at iteration {:7d} to {}".format(iteration, args.save)
+    )
+    def is_expert(tensor):
+        """Tell whether a parameter is tagged as an expert parameter."""
+        return hasattr(tensor, "dp_comm") and tensor.dp_comm == expert_dp_comm
+    def extract_expert_param(state_dict):
+        """Recursively keep only the expert entries of a (nested) state dict."""
+        state_dict_new = state_dict.__class__()
+        for k, v in state_dict.items():
+            # megatron uses both dict and OrderedDict in its state_dict
+            if isinstance(v, (OrderedDict, dict)):
+                v_new = extract_expert_param(v)
+                if len(v_new) > 0:
+                    state_dict_new[k] = v_new
+            elif is_expert(v):
+                state_dict_new[k] = v.detach()
+        return state_dict_new
+    # Arguments, iteration, and model.
+    # keep_vars is always True here (the rank is known to be non-zero),
+    # presumably so that the returned values still carry their dp_comm tag.
+    state_dict = {
+        "model": extract_expert_param(
+            model.state_dict_for_save_checkpoint(keep_vars=True)
+        )
+    }
+    # Optimizer stuff.
+    if not args.no_save_optim and optimizer is not None:
+        optim_state = optimizer.state_dict()
+        state_dict["optimizer"] = optim_state
+        # with fp16 the wrapped optimizer's state dict sits one level deeper
+        inner_state = optim_state["optimizer"] if args.fp16 else optim_state
+        param_global_idx = 0
+        for param_group in optimizer.optimizer.param_groups:
+            for param in param_group["params"]:
+                if not is_expert(param):
+                    # this parameter is not an expert parameter
+                    # thus there is no need to save its state in current rank
+                    # since it has been saved by data parallel rank 0.
+                    # The entry may be absent (e.g. fp16 overflow, or a
+                    # parameter without any optimizer state), so never raise.
+                    inner_state["state"].pop(param_global_idx, None)
+                param_global_idx += 1
+        inner_state.pop("param_groups")
+        if args.fp16:
+            # fp32_from_fp16_params in state_dict is not a copy
+            # but a reference to optimizer.fp32_from_fp16_params,
+            # changing it in state_dict will change
+            # optimizer.fp32_from_fp16_params as well
+            # thus we build new lists in state_dict
+            # that only contain expert parameters (None elsewhere).
+            optim_state["fp32_from_fp16_params"] = [
+                [param if is_expert(param) else None for param in param_group]
+                for param_group in optim_state["fp32_from_fp16_params"]
+            ]
+    # Save.
+    checkpoint_name = get_fmoe_checkpoint_name(args.save, iteration)
+    ensure_directory_exists(checkpoint_name)
+    torch.save(state_dict, checkpoint_name)
+    # Wait so everyone is done (necessary)
+    torch.distributed.barrier()
+    # NOTE(review): global rank 0 presumably always has dp rank 0 and so never
+    # reaches this branch; kept to mirror the native implementation - confirm.
+    if torch.distributed.get_rank() == 0:
+        print(
+            "  successfully saved checkpoint at iteration {:7d} to {}".format(
+                iteration, args.save
+            ),
+            flush=True,
+        )
+        # And update the latest iteration
+        tracker_filename = get_checkpoint_tracker_filename(args.save)
+        with open(tracker_filename, "w") as f:
+            f.write(str(iteration))
+    # Wait so everyone is done (not necessary)
+    torch.distributed.barrier()
+def merge_state_dict(state_dict_rank0, state_dict_local, fp16):
+    """Merge the expert-only state dict of this rank into the rank 0 one.
+    ``state_dict_rank0`` is modified in place and returned: every leaf found
+    in ``state_dict_local`` overwrites the entry with the same key.
+    Args:
+        state_dict_rank0: state dict loaded from data parallel rank 0.
+        state_dict_local: state dict that only contains expert states.
+        fp16: whether the optimizer state uses the fp16 layout, i.e. the
+            optimizer state is nested under ``["optimizer"]["optimizer"]``
+            and ``fp32_from_fp16_params`` is present.
+    Returns:
+        The updated ``state_dict_rank0``.
+    """
+    def merge_model(target, source):
+        """Recursively copy the leaves of ``source`` into ``target``."""
+        for k, v in source.items():
+            # megatron uses both dict and OrderedDict in its state_dict
+            if isinstance(v, (OrderedDict, dict)):
+                merge_model(target[k], v)
+            else:
+                target[k] = v
+    merge_model(state_dict_rank0["model"], state_dict_local["model"])
+    # save_checkpoint leaves out the "optimizer" entry when args.no_save_optim
+    # is set or no optimizer is given; there is nothing to merge in that case.
+    if "optimizer" not in state_dict_rank0 or "optimizer" not in state_dict_local:
+        return state_dict_rank0
+    optimizer_rank0 = state_dict_rank0["optimizer"]
+    optimizer_local = state_dict_local["optimizer"]
+    if fp16:
+        optimizer_rank0 = optimizer_rank0["optimizer"]
+        optimizer_local = optimizer_local["optimizer"]
+    optimizer_rank0["state"].update(optimizer_local["state"])
+    if fp16:
+        fp32_rank0 = state_dict_rank0["optimizer"]["fp32_from_fp16_params"]
+        fp32_local = state_dict_local["optimizer"]["fp32_from_fp16_params"]
+        for group_idx, param_group in enumerate(fp32_local):
+            for param_in_group_idx, param in enumerate(param_group):
+                # None marks a non-expert slot that must keep the rank 0 value
+                if param is not None:
+                    fp32_rank0[group_idx][param_in_group_idx] = param
+    return state_dict_rank0
+def load_checkpoint(model, optimizer, lr_scheduler, load_arg="load"):
+    """Load a model checkpoint and return the iteration.
+    Data parallel rank 0 delegates to Megatron's native ``load_checkpoint``.
+    Every other data parallel rank loads both the rank 0 checkpoint and its
+    own expert-only checkpoint, merges them with ``merge_state_dict`` and
+    restores the model, optimizer, lr scheduler and RNG states from the
+    merged result.
+    Args:
+        model: model to restore; a ``DistributedDataParallel`` wrapper is
+            unwrapped first.
+        optimizer: optimizer to restore, or None.
+        lr_scheduler: learning rate scheduler to restore, or None.
+        load_arg: name of the attribute of the Megatron args that holds the
+            checkpoint directory.
+    Returns:
+        The iteration to resume from; 0 when no tracker file exists, when
+        fine-tuning, or when loading a release checkpoint.
+    """
+    from megatron import get_args
+    from megatron import mpu
+    from megatron import print_rank_last
+    from megatron.checkpointing import get_checkpoint_tracker_filename
+    from megatron.checkpointing import set_checkpoint_version
+    from megatron.checkpointing import check_checkpoint_args
+    from megatron.checkpointing import update_num_microbatches
+    if mpu.get_data_parallel_rank() == 0:
+        # at dp rank 0, we still follow the native load_checkpoint by megatron
+        from megatron.checkpointing import load_checkpoint as load_checkpoint_native
+        return load_checkpoint_native(model, optimizer, lr_scheduler, load_arg)
+    args = get_args()
+    load_dir = getattr(args, load_arg)
+    if isinstance(model, DistributedDataParallel):
+        model = model.module
+    # Read the tracker file and set the iteration.
+    tracker_filename = get_checkpoint_tracker_filename(load_dir)
+    # If no tracker file, return iteration zero.
+    if not os.path.isfile(tracker_filename):
+        print_rank_last(
+            "WARNING: could not find the metadata file {} ".format(tracker_filename)
+        )
+        print_rank_last(
+            "    will not load any checkpoints and will start from " "random"
+        )
+        return 0
+    # Otherwise, read the tracker file and either set the iteration or
+    # mark it as a release checkpoint.
+    iteration = 0
+    release = False
+    with open(tracker_filename, "r") as f:
+        metastring = f.read().strip()
+    try:
+        iteration = int(metastring)
+    except ValueError:
+        release = metastring == "release"
+        if not release:
+            print_rank_last(
+                "ERROR: Invalid metadata file {}. Exiting".format(tracker_filename)
+            )
+            sys.exit()
+    assert iteration > 0 or release, "error parsing metadata file {}".format(
+        tracker_filename
+    )
+    # Checkpoint.
+    checkpoint_name_rank0 = get_fmoe_checkpoint_name(load_dir, iteration, release, 0)
+    checkpoint_name_local = get_fmoe_checkpoint_name(
+        load_dir, iteration, release, mpu.get_data_parallel_rank()
+    )
+    print_rank_last(
+        " loading checkpoint at rank 0 from {} and rank {} from {} at iteration {}, will merge them later".format(
+            checkpoint_name_rank0,
+            mpu.get_data_parallel_rank(),
+            checkpoint_name_local,
+            iteration,
+        )
+    )
+    # Load the checkpoint.
+    def load_state_dict(checkpoint_name):
+        """Deserialize one checkpoint file onto the CPU, exiting on failure."""
+        try:
+            state_dict = torch.load(checkpoint_name, map_location="cpu")
+        except ModuleNotFoundError:
+            from megatron.fp16_deprecated import loss_scaler
+            # For backward compatibility: alias the old module paths while
+            # unpickling, then remove the aliases again.
+            print_rank_last(" > deserializing using the old code structure ...")
+            sys.modules["fp16.loss_scaler"] = sys.modules[
+                "megatron.fp16_deprecated.loss_scaler"
+            ]
+            sys.modules["megatron.fp16.loss_scaler"] = sys.modules[
+                "megatron.fp16_deprecated.loss_scaler"
+            ]
+            state_dict = torch.load(checkpoint_name, map_location="cpu")
+            sys.modules.pop("fp16.loss_scaler", None)
+            sys.modules.pop("megatron.fp16.loss_scaler", None)
+        except BaseException:
+            print_rank_last("could not load the checkpoint")
+            sys.exit()
+        return state_dict
+    state_dict_rank0 = load_state_dict(checkpoint_name_rank0)
+    state_dict_local = load_state_dict(checkpoint_name_local)
+    state_dict = merge_state_dict(state_dict_rank0, state_dict_local, args.fp16)
+    # set checkpoint version
+    set_checkpoint_version(state_dict.get("checkpoint_version", 0))
+    # Set iteration.
+    if args.finetune or release:
+        iteration = 0
+    else:
+        try:
+            iteration = state_dict["iteration"]
+        except KeyError:
+            try:  # Backward compatible with older checkpoints
+                iteration = state_dict["total_iters"]
+            except KeyError:
+                print_rank_last(
+                    "A metadata file exists but unable to load "
+                    "iteration from checkpoint {}, exiting".format(
+                        checkpoint_name_local
+                    )
+                )
+                sys.exit()
+    # Check arguments.
+    assert args.consumed_train_samples == 0
+    assert args.consumed_valid_samples == 0
+    if "args" in state_dict:
+        checkpoint_args = state_dict["args"]
+        check_checkpoint_args(checkpoint_args)
+        args.consumed_train_samples = getattr(
+            checkpoint_args, "consumed_train_samples", 0
+        )
+        update_num_microbatches(consumed_samples=args.consumed_train_samples)
+        args.consumed_valid_samples = getattr(
+            checkpoint_args, "consumed_valid_samples", 0
+        )
+    else:
+        print_rank_last("could not find arguments in the checkpoint ...")
+    # Model.
+    model.load_state_dict(state_dict["model"])
+    # Optimizer.
+    if not release and not args.finetune and not args.no_load_optim:
+        try:
+            if optimizer is not None:
+                optimizer.load_state_dict(state_dict["optimizer"])
+            if lr_scheduler is not None:
+                lr_scheduler.load_state_dict(state_dict["lr_scheduler"])
+        except KeyError:
+            print_rank_last(
+                "Unable to load optimizer from checkpoint {}. "
+                "Specify --no-load-optim or --finetune to prevent "
+                "attempting to load the optimizer state, "
+                "exiting ...".format(checkpoint_name_local)
+            )
+            sys.exit()
+    # rng states.
+    if not release and not args.finetune and not args.no_load_rng:
+        try:
+            random.setstate(state_dict["random_rng_state"])
+            np.random.set_state(state_dict["np_rng_state"])
+            torch.set_rng_state(state_dict["torch_rng_state"])
+            torch.cuda.set_rng_state(state_dict["cuda_rng_state"])
+            mpu.get_cuda_rng_tracker().set_states(state_dict["rng_tracker_states"])
+        except KeyError:
+            # This failure concerns the RNG state, not the optimizer.
+            print_rank_last(
+                "Unable to load rng state from checkpoint {}. "
+                "Specify --no-load-rng or --finetune to prevent "
+                "attempting to load the rng state, "
+                "exiting ...".format(checkpoint_name_local)
+            )
+            sys.exit()
+    torch.distributed.barrier()
+    # Report the directory that was actually read, which differs from
+    # args.load whenever load_arg is not "load".
+    print_rank_last(
+        "  successfully loaded checkpoint (with expert parameters updated) from {} at iteration {}".format(
+            load_dir, iteration
+        )
+    )
+    return iteration
--- a/fmoe/transformer.py
+++ b/fmoe/transformer.py
@@ -15,10 +15,8 @@ class _Expert(nn.Module):
    def __init__(self, num_expert, d_model, d_hidden, activation, rank=0):
        super().__init__()
-        self.htoh4 = FMoELinear(num_expert, d_model, d_hidden, bias=True,
+        self.htoh4 = FMoELinear(num_expert, d_model, d_hidden, bias=True, rank=rank)
-                rank=rank)
+        self.h4toh = FMoELinear(num_expert, d_hidden, d_model, bias=True, rank=rank)
-        self.h4toh = FMoELinear(num_expert, d_hidden, d_model, bias=True,
-                rank=rank)
        self.activation = activation
    def forward(self, inp, fwd_expert_count):