Merge pull request #64 from laekov/fp16_bal_loss

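Fix fp16 training with balance loss: unwrap every nested `.module` wrapper (`while` instead of `if`) before accessing the model's layers.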

Merge pull request #64 from laekov/fp16_bal_loss
fix fp16 training with balance loss
bba5f289 · Rick Ho · GitHub · 537679a8 · ebefe2b1 · bba5f289
Unverified commit bba5f289, authored Jul 20, 2021 by Rick Ho; committed by GitHub on Jul 20, 2021.
Hide whitespace changes
Inline Side-by-side

Showing with 2 additions and 2 deletions

fmoe/megatron/balance.py fmoe/megatron/balance.py +2 -2

No files found.
--- a/fmoe/megatron/balance.py
+++ b/fmoe/megatron/balance.py
@@ -49,7 +49,7 @@ def generate_megatron_gate_hook(layer_idx, num_expert_global):
 def add_balance_log(model, writer, iteration):
    from megatron import is_last_rank

-    if hasattr(model, 'module'):
+    while hasattr(model, 'module'):
        model = model.module

    balance_dict_tensor = torch.vstack(
@@ -92,7 +92,7 @@ def patch_forward_step(forward_step_func):
            return output
        loss_name = args.balance_strategy + "_loss"

-        if hasattr(model, 'module'):
+        while hasattr(model, 'module'):
            model = model.module

        loss_list = [l.mlp.gate.get_loss(clear=False).view(1)